spaCy/examples/load_from_docbin.py

# coding: utf-8
"""
Example of loading previously parsed text using spaCy's DocBin class. The example
performs an entity count to show that the annotations are available.
For more details, see https://spacy.io/usage/saving-loading#docs
Installation:
python -m spacy download en_core_web_lg
Usage:
python examples/load_from_docbin.py en_core_web_lg RC_2015-03-9.spacy
"""
from __future__ import unicode_literals

import spacy
from spacy.tokens import DocBin
from timeit import default_timer as timer
from collections import Counter

# Default path of the serialized DocBin file; main() falls back to it when no
# docbin_path argument is supplied.
EXAMPLE_PARSES_PATH = "RC_2015-03-9.spacy"


def main(model="en_core_web_lg", docbin_path=EXAMPLE_PARSES_PATH):
    # Load the docs stored in a DocBin file and print simple statistics.
    #
    # model:       name handed to spacy.load(); the loaded pipeline's vocab is
    #              what get_docs() uses to rebuild the Doc objects.
    # docbin_path: file containing the bytes of a serialized DocBin.
    #
    # Output: the summed len(doc) of all docs with the elapsed time and
    # throughput, followed by the 30 most frequent (label, text) entity pairs.
    pipeline = spacy.load(model)
    print("Reading data from {}".format(docbin_path))
    with open(docbin_path, "rb") as handle:
        raw = handle.read()

    # The clock starts only after the file has been read, so the reported time
    # covers deserialization and iteration over the docs, not the disk read.
    token_total = 0
    started = timer()
    entity_counts = Counter()
    for parsed in DocBin().from_bytes(raw).get_docs(pipeline.vocab):
        token_total += len(parsed)
        # Key each entity by (label, text) so identical strings with
        # different labels are tallied separately.
        for ent in parsed.ents:
            entity_counts[(ent.label_, ent.text)] += 1
    stopped = timer()

    elapsed = stopped - started
    template = "Loaded {nr_word} words in {seconds} seconds ({wps} words per second)"
    throughput = token_total / elapsed
    print(template.format(nr_word=token_total, seconds=elapsed, wps=throughput))

    print("Most common entities:")
    for pair, count in entity_counts.most_common(30):
        label, text = pair
        print(count, text, label)


if __name__ == "__main__":
    # plac is imported only when the file is executed as a script, so merely
    # importing this module does not require it.
    from plac import call as run_cli

    run_cli(main)
Add load_from_docbin example [ci skip] TODO: upload the file somewhere 2019-11-05 13:52:43 +03:00			`# coding: utf-8`
			`"""`
			`Example of loading previously parsed text using spaCy's DocBin class. The example`
			`performs an entity count to show that the annotations are available.`
			`For more details, see https://spacy.io/usage/saving-loading#docs`
			`Installation:`
			`python -m spacy download en_core_web_lg`
			`Usage:`
			`python examples/load_from_docbin.py en_core_web_lg RC_2015-03-9.spacy`
			`"""`
			`from __future__ import unicode_literals`

			`import spacy`
			`from spacy.tokens import DocBin`
			`from timeit import default_timer as timer`
			`from collections import Counter`

			`EXAMPLE_PARSES_PATH = "RC_2015-03-9.spacy"`


			`def main(model="en_core_web_lg", docbin_path=EXAMPLE_PARSES_PATH):`
			`nlp = spacy.load(model)`
			`print("Reading data from {}".format(docbin_path))`
			`with open(docbin_path, "rb") as file_:`
			`bytes_data = file_.read()`
			`nr_word = 0`
			`start_time = timer()`
			`entities = Counter()`
			`docbin = DocBin().from_bytes(bytes_data)`
			`for doc in docbin.get_docs(nlp.vocab):`
			`nr_word += len(doc)`
			`entities.update((e.label_, e.text) for e in doc.ents)`
			`end_time = timer()`
			`msg = "Loaded {nr_word} words in {seconds} seconds ({wps} words per second)"`
			`wps = nr_word / (end_time - start_time)`
			`print(msg.format(nr_word=nr_word, seconds=end_time - start_time, wps=wps))`
			`print("Most common entities:")`
			`for (label, entity), freq in entities.most_common(30):`
			`print(freq, entity, label)`


			`if __name__ == "__main__":`
			`import plac`

			`plac.call(main)`