From 83381018d3b165008cf9678117a77ef40c66ce18 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 5 Nov 2019 11:52:43 +0100
Subject: [PATCH] Add load_from_docbin example [ci skip]

TODO: upload the file somewhere
---
 examples/load_from_docbin.py | 45 ++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 examples/load_from_docbin.py

diff --git a/examples/load_from_docbin.py b/examples/load_from_docbin.py
new file mode 100644
index 000000000..f26e7fc49
--- /dev/null
+++ b/examples/load_from_docbin.py
@@ -0,0 +1,45 @@
+# coding: utf-8
+"""
+Example of loading previously parsed text using spaCy's DocBin class. The example
+performs an entity count to show that the annotations are available.
+For more details, see https://spacy.io/usage/saving-loading#docs
+Installation:
+python -m spacy download en_core_web_lg
+Usage:
+python examples/load_from_docbin.py en_core_web_lg RC_2015-03-9.spacy
+"""
+from __future__ import unicode_literals
+
+import spacy
+from spacy.tokens import DocBin
+from timeit import default_timer as timer
+from collections import Counter
+
+EXAMPLE_PARSES_PATH = "RC_2015-03-9.spacy"  # default docbin_path used by main()
+
+
+def main(model="en_core_web_lg", docbin_path=EXAMPLE_PARSES_PATH):
+    """Load docs from a serialized DocBin; print load speed and top entities.
+
+    `model` is passed to spacy.load; `docbin_path` is the file of DocBin bytes."""
+    nlp = spacy.load(model)
+    print("Reading data from {}".format(docbin_path))
+    with open(docbin_path, "rb") as docbin_file:
+        serialized = docbin_file.read()
+    word_count, started, entity_counts = 0, timer(), Counter()
+    for parsed in DocBin().from_bytes(serialized).get_docs(nlp.vocab):
+        word_count += len(parsed)
+        entity_counts.update((ent.label_, ent.text) for ent in parsed.ents)
+    elapsed = timer() - started  # covers deserializing and iterating the docs
+    report = "Loaded {nr_word} words in {seconds} seconds ({wps} words per second)"
+    print(report.format(nr_word=word_count, seconds=elapsed, wps=word_count / elapsed))
+    print("Most common entities:")
+    for (label, entity), freq in entity_counts.most_common(30):
+        print(freq, entity, label)
+
+
+if __name__ == "__main__":
+    import plac  # only imported when this file is run as a script
+
+    plac.call(main)  # hands the command-line arguments to main (see Usage above)