Add DocBin to/from_disk methods and update docs (#5892)

* Add DocBin to/from_disk methods and update docs * Use DocBin.from_disk in Corpus
2025-10-15 08:16:36 +03:00 · 2020-08-07 14:30:59 +02:00 · 2020-08-07 14:30:59 +02:00 · ef2c67cca5
commit ef2c67cca5
parent 4ca08c6d5d
5 changed files with 83 additions and 30 deletions
--- a/spacy/gold/corpus.py
+++ b/spacy/gold/corpus.py
@ -39,7 +39,7 @@ class Corpus:
    def __init__(
        self,
-        path,
+        path: Union[str, Path],
        *,
        limit: int = 0,
        gold_preproc: bool = False,
@ -136,8 +136,7 @@ class Corpus:
        for loc in locs:
            loc = util.ensure_path(loc)
            if loc.parts[-1].endswith(".spacy"):
-                with loc.open("rb") as file_:
+                doc_bin = DocBin().from_disk(loc)
-                    doc_bin = DocBin().from_bytes(file_.read())
                docs = doc_bin.get_docs(vocab)
                for doc in docs:
                    if len(doc):
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@ -484,12 +484,10 @@ def test_roundtrip_docs_to_docbin(doc):
        json_file = tmpdir / "roundtrip.json"
        srsly.write_json(json_file, [docs_to_json(doc)])
        output_file = tmpdir / "roundtrip.spacy"
-        data = DocBin(docs=[doc]).to_bytes()
+        DocBin(docs=[doc]).to_disk(output_file)
-        with output_file.open("wb") as file_:
-            file_.write(data)
        reader = Corpus(output_file)
        reloaded_examples = list(reader(reloaded_nlp))
-        assert len(doc) == sum(len(eg) for eg in reloaded_examples)
+    assert len(doc) == sum(len(eg) for eg in reloaded_examples)
    reloaded_example = reloaded_examples[0]
    assert text == reloaded_example.reference.text
    assert idx == [t.idx for t in reloaded_example.reference]
@ -512,13 +510,11 @@ def test_make_orth_variants(doc):
    nlp = English()
    with make_tempdir() as tmpdir:
        output_file = tmpdir / "roundtrip.spacy"
-        data = DocBin(docs=[doc]).to_bytes()
+        DocBin(docs=[doc]).to_disk(output_file)
-        with output_file.open("wb") as file_:
-            file_.write(data)
        # due to randomness, test only that this runs with no errors for now
        reader = Corpus(output_file)
        train_example = next(reader(nlp))
-        make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)
+    make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)
@pytest.mark.skip("Outdated")
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/_serialize.py
@ -1,4 +1,5 @@
-from typing import Iterable, Iterator
+from typing import Iterable, Iterator, Union
 from pathlib import Path
 import numpy
 import zlib
 import srsly
@ -9,6 +10,7 @@ from ..vocab import Vocab
 from ..compat import copy_reg
 from ..attrs import SPACY, ORTH, intify_attr
 from ..errors import Errors
 from ..util import ensure_path
 # fmt: off
 ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")
@ -204,6 +206,30 @@ class DocBin:
            assert len(tokens.shape) == 2, tokens.shape  # this should never happen
        return self
    def to_disk(self, path: Union[str, Path]) -> None:
        """Save the DocBin to a file (typically called .spacy).

        The DocBin is serialized before the target file is opened, so a
        failure during serialization does not leave behind an empty or
        truncated file.

        path (str / Path): The file path.

        DOCS: https://spacy.io/api/docbin#to_disk
        """
        path = ensure_path(path)
        # Serialize first: opening with "wb" truncates any existing file, so
        # the bytes must be ready before the file is touched.
        data = self.to_bytes()
        with path.open("wb") as file_:
            file_.write(data)
    def from_disk(self, path: Union[str, Path]) -> "DocBin":
        """Load the DocBin from a file (typically called .spacy).

        path (str / Path): The file path.
        RETURNS (DocBin): The loaded DocBin.

        DOCS: https://spacy.io/api/docbin#from_disk
        """
        path = ensure_path(path)
        with path.open("rb") as file_:
            data = file_.read()
        # Deserialize once the file handle is closed; the result of
        # from_bytes is not used here, and this instance is what is returned.
        self.from_bytes(data)
        return self
 def merge_bins(bins):
    merged = None
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@ -177,28 +177,26 @@ run [`spacy pretrain`](/api/cli#pretrain).
 > #### Example
 >
 > ```python
 > from pathlib import Path
 > from spacy.tokens import DocBin
 > from spacy.gold import Corpus
-> output_file = Path(dir) / "output.spacy"
+>
-> data = DocBin(docs=docs).to_bytes()
+> doc_bin = DocBin(docs=docs)
-> with output_file.open("wb") as file_:
+> doc_bin.to_disk("./data.spacy")
->    file_.write(data)
+> reader = Corpus("./data.spacy")
-> reader = Corpus(output_file)
 > ```
-The main data format used in spaCy v3 is a binary format created by serializing
+The main data format used in spaCy v3.0 is a **binary format** created by
-a [`DocBin`](/api/docbin) object, which represents a collection of `Doc`
+serializing a [`DocBin`](/api/docbin) object, which represents a collection of
-objects. Typically, the extension for these binary files is `.spacy`, and they
+`Doc` objects. This means that you can train spaCy models using the same format
-are used as input format for specifying a [training corpus](/api/corpus) and for
+it outputs: annotated `Doc` objects. The binary format is extremely **efficient
-spaCy's CLI [`train`](/api/cli#train) command.
+in storage**, especially when packing multiple documents together.
-This binary format is extremely efficient in storage, especially when packing
+Typically, the extension for these binary files is `.spacy`, and they are used
-multiple documents together. 
+as input format for specifying a [training corpus](/api/corpus) and for spaCy's
-
+CLI [`train`](/api/cli#train) command. The built-in
-The built-in [`convert`](/api/cli#convert) command helps you convert spaCy's
+[`convert`](/api/cli#convert) command helps you convert spaCy's previous
-previous [JSON format](#json-input) to this new `DocBin` format. It also
+[JSON format](#json-input) to the new binary format. It also supports
-supports conversion of the `.conllu` format used by the
+conversion of the `.conllu` format used by the
 [Universal Dependencies corpora](https://github.com/UniversalDependencies).
 ### JSON training format {#json-input tag="deprecated"}
--- a/website/docs/api/docbin.md
+++ b/website/docs/api/docbin.md
@ -125,7 +125,8 @@ Serialize the `DocBin`'s annotations to a bytestring.
 > #### Example
 >
 > ```python
-> doc_bin = DocBin(attrs=["DEP", "HEAD"])
+> docs = [nlp("Hello world!")]
 > doc_bin = DocBin(docs=docs)
 > doc_bin_bytes = doc_bin.to_bytes()
 > ```
@ -148,3 +149,36 @@ Deserialize the `DocBin`'s annotations from a bytestring.
 | ------------ | -------- | ---------------------- |
 | `bytes_data` | bytes    | The data to load from. |
 | **RETURNS**  | `DocBin` | The loaded `DocBin`.   |
 ## DocBin.to_disk {#to_disk tag="method" new="3"}
 Save the serialized `DocBin` to a file. Typically uses the `.spacy` extension
 and the result can be used as the input data for
 [`spacy train`](/api/cli#train).
 > #### Example
 >
 > ```python
 > docs = [nlp("Hello world!")]
 > doc_bin = DocBin(docs=docs)
 > doc_bin.to_disk("./data.spacy")
 > ```
 | Argument | Type         | Description                                           |
 | -------- | ------------ | ----------------------------------------------------- |
 | `path`   | str / `Path` | The file path, typically with the `.spacy` extension. |
 ## DocBin.from_disk {#from_disk tag="method" new="3"}
 Load a serialized `DocBin` from a file. Typically uses the `.spacy` extension.
 > #### Example
 >
 > ```python
 > doc_bin = DocBin().from_disk("./data.spacy")
 > ```
 | Argument    | Type         | Description                                           |
 | ----------- | ------------ | ----------------------------------------------------- |
 | `path`      | str / `Path` | The file path, typically with the `.spacy` extension. |
 | **RETURNS** | `DocBin`     | The loaded `DocBin`.                                  |