From 83381018d3b165008cf9678117a77ef40c66ce18 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 5 Nov 2019 11:52:43 +0100 Subject: [PATCH] Add load_from_docbin example [ci skip] TODO: upload the file somewhere --- examples/load_from_docbin.py | 45 ++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 examples/load_from_docbin.py diff --git a/examples/load_from_docbin.py b/examples/load_from_docbin.py new file mode 100644 index 000000000..f26e7fc49 --- /dev/null +++ b/examples/load_from_docbin.py @@ -0,0 +1,45 @@ +# coding: utf-8 +""" +Example of loading previously parsed text using spaCy's DocBin class. The example +performs an entity count to show that the annotations are available. +For more details, see https://spacy.io/usage/saving-loading#docs +Installation: +python -m spacy download en_core_web_lg +Usage: +python examples/load_from_docbin.py en_core_web_lg RC_2015-03-9.spacy +""" +from __future__ import unicode_literals + +import spacy +from spacy.tokens import DocBin +from timeit import default_timer as timer +from collections import Counter + +EXAMPLE_PARSES_PATH = "RC_2015-03-9.spacy" + + +def main(model="en_core_web_lg", docbin_path=EXAMPLE_PARSES_PATH): + nlp = spacy.load(model) + print("Reading data from {}".format(docbin_path)) + with open(docbin_path, "rb") as file_: + bytes_data = file_.read() + nr_word = 0 + start_time = timer() + entities = Counter() + docbin = DocBin().from_bytes(bytes_data) + for doc in docbin.get_docs(nlp.vocab): + nr_word += len(doc) + entities.update((e.label_, e.text) for e in doc.ents) + end_time = timer() + msg = "Loaded {nr_word} words in {seconds} seconds ({wps} words per second)" + wps = nr_word / (end_time - start_time) + print(msg.format(nr_word=nr_word, seconds=end_time - start_time, wps=wps)) + print("Most common entities:") + for (label, entity), freq in entities.most_common(30): + print(freq, entity, label) + + +if __name__ == "__main__": + import plac + + plac.call(main)