From ef2c67cca52b95b621a2c801db39c69daf9045ea Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Fri, 7 Aug 2020 14:30:59 +0200
Subject: [PATCH] Add DocBin to/from_disk methods and update docs (#5892)

* Add DocBin to/from_disk methods and update docs

* Use DocBin.from_disk in Corpus
---
 spacy/gold/corpus.py             |  5 ++---
 spacy/tests/test_gold.py         | 12 ++++-------
 spacy/tokens/_serialize.py       | 28 ++++++++++++++++++++++++-
 website/docs/api/data-formats.md | 32 +++++++++++++---------------
 website/docs/api/docbin.md       | 36 +++++++++++++++++++++++++++++++-
 5 files changed, 83 insertions(+), 30 deletions(-)

diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py
index 47f9a3b53..745d52e0e 100644
--- a/spacy/gold/corpus.py
+++ b/spacy/gold/corpus.py
@@ -39,7 +39,7 @@ class Corpus:
 
     def __init__(
         self,
-        path,
+        path: Union[str, Path],
         *,
         limit: int = 0,
         gold_preproc: bool = False,
@@ -136,8 +136,7 @@ class Corpus:
         for loc in locs:
             loc = util.ensure_path(loc)
             if loc.parts[-1].endswith(".spacy"):
-                with loc.open("rb") as file_:
-                    doc_bin = DocBin().from_bytes(file_.read())
+                doc_bin = DocBin().from_disk(loc)
                 docs = doc_bin.get_docs(vocab)
                 for doc in docs:
                     if len(doc):
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index 16974a4c2..708c57837 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -484,12 +484,10 @@ def test_roundtrip_docs_to_docbin(doc):
         json_file = tmpdir / "roundtrip.json"
         srsly.write_json(json_file, [docs_to_json(doc)])
         output_file = tmpdir / "roundtrip.spacy"
-        data = DocBin(docs=[doc]).to_bytes()
-        with output_file.open("wb") as file_:
-            file_.write(data)
+        DocBin(docs=[doc]).to_disk(output_file)
         reader = Corpus(output_file)
         reloaded_examples = list(reader(reloaded_nlp))
-        assert len(doc) == sum(len(eg) for eg in reloaded_examples)
+    assert len(doc) == sum(len(eg) for eg in reloaded_examples)
     reloaded_example = reloaded_examples[0]
     assert text == reloaded_example.reference.text
     assert idx == [t.idx for t in reloaded_example.reference]
@@ -512,13 +510,11 @@ def test_make_orth_variants(doc):
     nlp = English()
     with make_tempdir() as tmpdir:
         output_file = tmpdir / "roundtrip.spacy"
-        data = DocBin(docs=[doc]).to_bytes()
-        with output_file.open("wb") as file_:
-            file_.write(data)
+        DocBin(docs=[doc]).to_disk(output_file)
         # due to randomness, test only that this runs with no errors for now
         reader = Corpus(output_file)
         train_example = next(reader(nlp))
-        make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)
+    make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)
 
 
 @pytest.mark.skip("Outdated")
diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py
index 192067ed4..9d17cec1c 100644
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/_serialize.py
@@ -1,4 +1,5 @@
-from typing import Iterable, Iterator
+from typing import Iterable, Iterator, Union
+from pathlib import Path
 import numpy
 import zlib
 import srsly
@@ -9,6 +10,7 @@ from ..vocab import Vocab
 from ..compat import copy_reg
 from ..attrs import SPACY, ORTH, intify_attr
 from ..errors import Errors
+from ..util import ensure_path
 
 # fmt: off
 ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")
@@ -204,6 +206,30 @@ class DocBin:
             assert len(tokens.shape) == 2, tokens.shape  # this should never happen
         return self
 
+    def to_disk(self, path: Union[str, Path]) -> None:
+        """Save the DocBin to a file (typically called .spacy).
+
+        path (str / Path): The file path.
+
+        DOCS: https://spacy.io/api/docbin#to_disk
+        """
+        path = ensure_path(path)
+        with path.open("wb") as file_:
+            file_.write(self.to_bytes())
+
+    def from_disk(self, path: Union[str, Path]) -> "DocBin":
+        """Load the DocBin from a file (typically called .spacy).
+
+        path (str / Path): The file path.
+        RETURNS (DocBin): The loaded DocBin.
+
+        DOCS: https://spacy.io/api/docbin#from_disk
+        """
+        path = ensure_path(path)
+        with path.open("rb") as file_:
+            self.from_bytes(file_.read())
+        return self
+
 
 def merge_bins(bins):
     merged = None
diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index e5b52a74a..641115505 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -177,28 +177,26 @@ run [`spacy pretrain`](/api/cli#pretrain).
 > #### Example
 >
 > ```python
-> from pathlib import Path
 > from spacy.tokens import DocBin
 > from spacy.gold import Corpus
-> output_file = Path(dir) / "output.spacy"
-> data = DocBin(docs=docs).to_bytes()
-> with output_file.open("wb") as file_:
->    file_.write(data)
-> reader = Corpus(output_file)
+>
+> doc_bin = DocBin(docs=docs)
+> doc_bin.to_disk("./data.spacy")
+> reader = Corpus("./data.spacy")
 > ```
 
-The main data format used in spaCy v3 is a binary format created by serializing
-a [`DocBin`](/api/docbin) object, which represents a collection of `Doc`
-objects. Typically, the extension for these binary files is `.spacy`, and they
-are used as input format for specifying a [training corpus](/api/corpus) and for
-spaCy's CLI [`train`](/api/cli#train) command.
+The main data format used in spaCy v3.0 is a **binary format** created by
+serializing a [`DocBin`](/api/docbin) object, which represents a collection of
+`Doc` objects. This means that you can train spaCy models using the same format
+it outputs: annotated `Doc` objects. The binary format is extremely **efficient
+in storage**, especially when packing multiple documents together.
 
-This binary format is extremely efficient in storage, especially when packing
-multiple documents together. 
-
-The built-in [`convert`](/api/cli#convert) command helps you convert spaCy's
-previous [JSON format](#json-input) to this new `DocBin` format. It also
-supports conversion of the `.conllu` format used by the
+Typically, the extension for these binary files is `.spacy`, and they are used
+as input format for specifying a [training corpus](/api/corpus) and for spaCy's
+CLI [`train`](/api/cli#train) command. The built-in
+[`convert`](/api/cli#convert) command helps you convert spaCy's previous
+[JSON format](#json-input) to the new binary format. It also supports
+conversion of the `.conllu` format used by the
 [Universal Dependencies corpora](https://github.com/UniversalDependencies).
 
 ### JSON training format {#json-input tag="deprecated"}
diff --git a/website/docs/api/docbin.md b/website/docs/api/docbin.md
index 65d1153d1..ced742045 100644
--- a/website/docs/api/docbin.md
+++ b/website/docs/api/docbin.md
@@ -125,7 +125,8 @@ Serialize the `DocBin`'s annotations to a bytestring.
 > #### Example
 >
 > ```python
-> doc_bin = DocBin(attrs=["DEP", "HEAD"])
+> docs = [nlp("Hello world!")]
+> doc_bin = DocBin(docs=docs)
 > doc_bin_bytes = doc_bin.to_bytes()
 > ```
 
@@ -148,3 +149,36 @@ Deserialize the `DocBin`'s annotations from a bytestring.
 | ------------ | -------- | ---------------------- |
 | `bytes_data` | bytes    | The data to load from. |
 | **RETURNS**  | `DocBin` | The loaded `DocBin`.   |
+
+## DocBin.to_disk {#to_disk tag="method" new="3"}
+
+Save the serialized `DocBin` to a file. Typically uses the `.spacy` extension
+and the result can be used as the input data for
+[`spacy train`](/api/cli#train).
+
+> #### Example
+>
+> ```python
+> docs = [nlp("Hello world!")]
+> doc_bin = DocBin(docs=docs)
+> doc_bin.to_disk("./data.spacy")
+> ```
+
+| Argument | Type         | Description                                           |
+| -------- | ------------ | ----------------------------------------------------- |
+| `path`   | str / `Path` | The file path, typically with the `.spacy` extension. |
+
+## DocBin.from_disk {#from_disk tag="method" new="3"}
+
+Load a serialized `DocBin` from a file. Typically uses the `.spacy` extension.
+
+> #### Example
+>
+> ```python
+> doc_bin = DocBin().from_disk("./data.spacy")
+> ```
+
+| Argument    | Type         | Description                                           |
+| ----------- | ------------ | ----------------------------------------------------- |
+| `path`      | str / `Path` | The file path, typically with the `.spacy` extension. |
+| **RETURNS** | `DocBin`     | The loaded `DocBin`.                                  |