From ef2c67cca52b95b621a2c801db39c69daf9045ea Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 7 Aug 2020 14:30:59 +0200 Subject: [PATCH] Add DocBin to/from_disk methods and update docs (#5892) * Add DocBin to/from_disk methods and update docs * Use DocBin.from_disk in Corpus --- spacy/gold/corpus.py | 5 ++--- spacy/tests/test_gold.py | 12 ++++------- spacy/tokens/_serialize.py | 28 ++++++++++++++++++++++++- website/docs/api/data-formats.md | 32 +++++++++++++--------------- website/docs/api/docbin.md | 36 +++++++++++++++++++++++++++++++- 5 files changed, 83 insertions(+), 30 deletions(-) diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index 47f9a3b53..745d52e0e 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -39,7 +39,7 @@ class Corpus: def __init__( self, - path, + path: Union[str, Path], *, limit: int = 0, gold_preproc: bool = False, @@ -136,8 +136,7 @@ class Corpus: for loc in locs: loc = util.ensure_path(loc) if loc.parts[-1].endswith(".spacy"): - with loc.open("rb") as file_: - doc_bin = DocBin().from_bytes(file_.read()) + doc_bin = DocBin().from_disk(loc) docs = doc_bin.get_docs(vocab) for doc in docs: if len(doc): diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 16974a4c2..708c57837 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -484,12 +484,10 @@ def test_roundtrip_docs_to_docbin(doc): json_file = tmpdir / "roundtrip.json" srsly.write_json(json_file, [docs_to_json(doc)]) output_file = tmpdir / "roundtrip.spacy" - data = DocBin(docs=[doc]).to_bytes() - with output_file.open("wb") as file_: - file_.write(data) + DocBin(docs=[doc]).to_disk(output_file) reader = Corpus(output_file) reloaded_examples = list(reader(reloaded_nlp)) - assert len(doc) == sum(len(eg) for eg in reloaded_examples) + assert len(doc) == sum(len(eg) for eg in reloaded_examples) reloaded_example = reloaded_examples[0] assert text == reloaded_example.reference.text assert idx == [t.idx for t in 
reloaded_example.reference] @@ -512,13 +510,11 @@ def test_make_orth_variants(doc): nlp = English() with make_tempdir() as tmpdir: output_file = tmpdir / "roundtrip.spacy" - data = DocBin(docs=[doc]).to_bytes() - with output_file.open("wb") as file_: - file_.write(data) + DocBin(docs=[doc]).to_disk(output_file) # due to randomness, test only that this runs with no errors for now reader = Corpus(output_file) train_example = next(reader(nlp)) - make_orth_variants_example(nlp, train_example, orth_variant_level=0.2) + make_orth_variants_example(nlp, train_example, orth_variant_level=0.2) @pytest.mark.skip("Outdated") diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 192067ed4..9d17cec1c 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -1,4 +1,5 @@ -from typing import Iterable, Iterator +from typing import Iterable, Iterator, Union +from pathlib import Path import numpy import zlib import srsly @@ -9,6 +10,7 @@ from ..vocab import Vocab from ..compat import copy_reg from ..attrs import SPACY, ORTH, intify_attr from ..errors import Errors +from ..util import ensure_path # fmt: off ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS") @@ -204,6 +206,30 @@ class DocBin: assert len(tokens.shape) == 2, tokens.shape # this should never happen return self + def to_disk(self, path: Union[str, Path]) -> None: + """Save the DocBin to a file (typically called .spacy). + + path (str / Path): The file path. + + DOCS: https://spacy.io/api/docbin#to_disk + """ + path = ensure_path(path) + with path.open("wb") as file_: + file_.write(self.to_bytes()) + + def from_disk(self, path: Union[str, Path]) -> "DocBin": + """Load the DocBin from a file (typically called .spacy). + + path (str / Path): The file path. + RETURNS (DocBin): The loaded DocBin. 
+ + DOCS: https://spacy.io/api/docbin#from_disk + """ + path = ensure_path(path) + with path.open("rb") as file_: + self.from_bytes(file_.read()) + return self + def merge_bins(bins): merged = None diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index e5b52a74a..641115505 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -177,28 +177,26 @@ run [`spacy pretrain`](/api/cli#pretrain). > #### Example > > ```python -> from pathlib import Path > from spacy.tokens import DocBin > from spacy.gold import Corpus -> output_file = Path(dir) / "output.spacy" -> data = DocBin(docs=docs).to_bytes() -> with output_file.open("wb") as file_: -> file_.write(data) -> reader = Corpus(output_file) +> +> doc_bin = DocBin(docs=docs) +> doc_bin.to_disk("./data.spacy") +> reader = Corpus("./data.spacy") > ``` -The main data format used in spaCy v3 is a binary format created by serializing -a [`DocBin`](/api/docbin) object, which represents a collection of `Doc` -objects. Typically, the extension for these binary files is `.spacy`, and they -are used as input format for specifying a [training corpus](/api/corpus) and for -spaCy's CLI [`train`](/api/cli#train) command. +The main data format used in spaCy v3.0 is a **binary format** created by +serializing a [`DocBin`](/api/docbin) object, which represents a collection of +`Doc` objects. This means that you can train spaCy models using the same format +it outputs: annotated `Doc` objects. The binary format is extremely **efficient +in storage**, especially when packing multiple documents together. -This binary format is extremely efficient in storage, especially when packing -multiple documents together. - -The built-in [`convert`](/api/cli#convert) command helps you convert spaCy's -previous [JSON format](#json-input) to this new `DocBin` format. 
It also -supports conversion of the `.conllu` format used by the +Typically, the extension for these binary files is `.spacy`, and they are used +as input format for specifying a [training corpus](/api/corpus) and for spaCy's +CLI [`train`](/api/cli#train) command. The built-in +[`convert`](/api/cli#convert) command helps you convert spaCy's previous +[JSON format](#json-input) to the new binary format. It also supports +conversion of the `.conllu` format used by the [Universal Dependencies corpora](https://github.com/UniversalDependencies). ### JSON training format {#json-input tag="deprecated"} diff --git a/website/docs/api/docbin.md b/website/docs/api/docbin.md index 65d1153d1..ced742045 100644 --- a/website/docs/api/docbin.md +++ b/website/docs/api/docbin.md @@ -125,7 +125,8 @@ Serialize the `DocBin`'s annotations to a bytestring. > #### Example > > ```python -> doc_bin = DocBin(attrs=["DEP", "HEAD"]) +> docs = [nlp("Hello world!")] +> doc_bin = DocBin(docs=docs) > doc_bin_bytes = doc_bin.to_bytes() > ``` @@ -148,3 +149,36 @@ Deserialize the `DocBin`'s annotations from a bytestring. | ------------ | -------- | ---------------------- | | `bytes_data` | bytes | The data to load from. | | **RETURNS** | `DocBin` | The loaded `DocBin`. | + +## DocBin.to_disk {#to_disk tag="method" new="3"} + +Save the serialized `DocBin` to a file. Typically uses the `.spacy` extension +and the result can be used as the input data for +[`spacy train`](/api/cli#train). + +> #### Example +> +> ```python +> docs = [nlp("Hello world!")] +> doc_bin = DocBin(docs=docs) +> doc_bin.to_disk("./data.spacy") +> ``` + +| Argument | Type | Description | +| -------- | ------------ | ----------------------------------------------------- | +| `path` | str / `Path` | The file path, typically with the `.spacy` extension. | + +## DocBin.from_disk {#from_disk tag="method" new="3"} + +Load a serialized `DocBin` from a file. Typically uses the `.spacy` extension. 
+ +> #### Example +> +> ```python +> doc_bin = DocBin().from_disk("./data.spacy") +> ``` + +| Argument | Type | Description | +| ----------- | ------------ | ----------------------------------------------------- | +| `path` | str / `Path` | The file path, typically with the `.spacy` extension. | +| **RETURNS** | `DocBin` | The loaded `DocBin`. |