spaCy/examples/load_from_docbin.py

# coding: utf-8
"""
Example of loading previously parsed text using spaCy's DocBin class. The example
performs an entity count to show that the annotations are available.
For more details, see https://spacy.io/usage/saving-loading#docs
Installation:
python -m spacy download en_core_web_lg
Usage:
python examples/load_from_docbin.py en_core_web_lg RC_2015-03-9.spacy
"""
from __future__ import unicode_literals

import spacy
from spacy.tokens import DocBin
from timeit import default_timer as timer
from collections import Counter

EXAMPLE_PARSES_PATH = "RC_2015-03-9.spacy"


def main(model="en_core_web_lg", docbin_path=EXAMPLE_PARSES_PATH):
    nlp = spacy.load(model)
    print("Reading data from {}".format(docbin_path))
    with open(docbin_path, "rb") as file_:
        bytes_data = file_.read()
    nr_word = 0
    start_time = timer()
    entities = Counter()
    docbin = DocBin().from_bytes(bytes_data)
    for doc in docbin.get_docs(nlp.vocab):
        nr_word += len(doc)
        entities.update((e.label_, e.text) for e in doc.ents)
    end_time = timer()
    msg = "Loaded {nr_word} words in {seconds} seconds ({wps} words per second)"
    wps = nr_word / (end_time - start_time)
    print(msg.format(nr_word=nr_word, seconds=end_time - start_time, wps=wps))
    print("Most common entities:")
    for (label, entity), freq in entities.most_common(30):
        print(freq, entity, label)


if __name__ == "__main__":
    import plac

    plac.call(main)