Add DocBin to/from_disk methods and update docs (#5892)

* Add DocBin to/from_disk methods and update docs

* Use DocBin.from_disk in Corpus
This commit is contained in:
Ines Montani 2020-08-07 14:30:59 +02:00 committed by GitHub
parent 4ca08c6d5d
commit ef2c67cca5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 83 additions and 30 deletions

View File

@ -39,7 +39,7 @@ class Corpus:
def __init__(
self,
path,
path: Union[str, Path],
*,
limit: int = 0,
gold_preproc: bool = False,
@ -136,8 +136,7 @@ class Corpus:
for loc in locs:
loc = util.ensure_path(loc)
if loc.parts[-1].endswith(".spacy"):
with loc.open("rb") as file_:
doc_bin = DocBin().from_bytes(file_.read())
doc_bin = DocBin().from_disk(loc)
docs = doc_bin.get_docs(vocab)
for doc in docs:
if len(doc):

View File

@ -484,12 +484,10 @@ def test_roundtrip_docs_to_docbin(doc):
json_file = tmpdir / "roundtrip.json"
srsly.write_json(json_file, [docs_to_json(doc)])
output_file = tmpdir / "roundtrip.spacy"
data = DocBin(docs=[doc]).to_bytes()
with output_file.open("wb") as file_:
file_.write(data)
DocBin(docs=[doc]).to_disk(output_file)
reader = Corpus(output_file)
reloaded_examples = list(reader(reloaded_nlp))
assert len(doc) == sum(len(eg) for eg in reloaded_examples)
assert len(doc) == sum(len(eg) for eg in reloaded_examples)
reloaded_example = reloaded_examples[0]
assert text == reloaded_example.reference.text
assert idx == [t.idx for t in reloaded_example.reference]
@ -512,13 +510,11 @@ def test_make_orth_variants(doc):
nlp = English()
with make_tempdir() as tmpdir:
output_file = tmpdir / "roundtrip.spacy"
data = DocBin(docs=[doc]).to_bytes()
with output_file.open("wb") as file_:
file_.write(data)
DocBin(docs=[doc]).to_disk(output_file)
# due to randomness, test only that this runs with no errors for now
reader = Corpus(output_file)
train_example = next(reader(nlp))
make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)
make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)
@pytest.mark.skip("Outdated")

View File

@ -1,4 +1,5 @@
from typing import Iterable, Iterator
from typing import Iterable, Iterator, Union
from pathlib import Path
import numpy
import zlib
import srsly
@ -9,6 +10,7 @@ from ..vocab import Vocab
from ..compat import copy_reg
from ..attrs import SPACY, ORTH, intify_attr
from ..errors import Errors
from ..util import ensure_path
# fmt: off
ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")
@ -204,6 +206,30 @@ class DocBin:
assert len(tokens.shape) == 2, tokens.shape # this should never happen
return self
def to_disk(self, path: Union[str, Path]) -> None:
    """Write this DocBin's serialized bytes to a file on disk.

    path (str / Path): Location of the output file (typically named with
        the .spacy extension).

    DOCS: https://spacy.io/api/docbin#to_disk
    """
    target = ensure_path(path)
    # Binary mode: the payload written is the bytestring from to_bytes().
    with target.open("wb") as out_file:
        payload = self.to_bytes()
        out_file.write(payload)
def from_disk(self, path: Union[str, Path]) -> "DocBin":
    """Load the DocBin from a file (typically called .spacy).

    The file is read in full and its bytes are passed to from_bytes() on
    this instance.

    path (str / Path): The file path.
    RETURNS (DocBin): The loaded DocBin (this same instance).

    DOCS: https://spacy.io/api/docbin#from_disk
    """
    path = ensure_path(path)
    with path.open("rb") as file_:
        # The return value of from_bytes() is not used here; self is
        # returned below so calls can be chained, e.g.
        # DocBin().from_disk(path).
        self.from_bytes(file_.read())
    return self
def merge_bins(bins):
merged = None

View File

@ -177,28 +177,26 @@ run [`spacy pretrain`](/api/cli#pretrain).
> #### Example
>
> ```python
> from pathlib import Path
> from spacy.tokens import DocBin
> from spacy.gold import Corpus
> output_file = Path(dir) / "output.spacy"
> data = DocBin(docs=docs).to_bytes()
> with output_file.open("wb") as file_:
> file_.write(data)
> reader = Corpus(output_file)
>
> doc_bin = DocBin(docs=docs)
> doc_bin.to_disk("./data.spacy")
> reader = Corpus("./data.spacy")
> ```
The main data format used in spaCy v3 is a binary format created by serializing
a [`DocBin`](/api/docbin) object, which represents a collection of `Doc`
objects. Typically, the extension for these binary files is `.spacy`, and they
are used as input format for specifying a [training corpus](/api/corpus) and for
spaCy's CLI [`train`](/api/cli#train) command.
The main data format used in spaCy v3.0 is a **binary format** created by
serializing a [`DocBin`](/api/docbin) object, which represents a collection of
`Doc` objects. This means that you can train spaCy models using the same format
it outputs: annotated `Doc` objects. The binary format is extremely **efficient
in storage**, especially when packing multiple documents together.
This binary format is extremely efficient in storage, especially when packing
multiple documents together.
The built-in [`convert`](/api/cli#convert) command helps you convert spaCy's
previous [JSON format](#json-input) to this new `DocBin` format. It also
supports conversion of the `.conllu` format used by the
Typically, the extension for these binary files is `.spacy`, and they are used
as input format for specifying a [training corpus](/api/corpus) and for spaCy's
CLI [`train`](/api/cli#train) command. The built-in
[`convert`](/api/cli#convert) command helps you convert spaCy's previous
[JSON format](#json-input) to the new binary format. It also supports
conversion of the `.conllu` format used by the
[Universal Dependencies corpora](https://github.com/UniversalDependencies).
### JSON training format {#json-input tag="deprecated"}

View File

@ -125,7 +125,8 @@ Serialize the `DocBin`'s annotations to a bytestring.
> #### Example
>
> ```python
> doc_bin = DocBin(attrs=["DEP", "HEAD"])
> docs = [nlp("Hello world!")]
> doc_bin = DocBin(docs=docs)
> doc_bin_bytes = doc_bin.to_bytes()
> ```
@ -148,3 +149,36 @@ Deserialize the `DocBin`'s annotations from a bytestring.
| ------------ | -------- | ---------------------- |
| `bytes_data` | bytes | The data to load from. |
| **RETURNS** | `DocBin` | The loaded `DocBin`. |
## DocBin.to_disk {#to_disk tag="method" new="3"}
Save the serialized `DocBin` to a file. Typically uses the `.spacy` extension
and the result can be used as the input data for
[`spacy train`](/api/cli#train).
> #### Example
>
> ```python
> docs = [nlp("Hello world!")]
> doc_bin = DocBin(docs=docs)
> doc_bin.to_disk("./data.spacy")
> ```
| Argument | Type | Description |
| -------- | ------------ | ----------------------------------------------------- |
| `path` | str / `Path` | The file path, typically with the `.spacy` extension. |
## DocBin.from_disk {#from_disk tag="method" new="3"}
Load a serialized `DocBin` from a file. Typically uses the `.spacy` extension.
> #### Example
>
> ```python
> doc_bin = DocBin().from_disk("./data.spacy")
> ```
| Argument | Type | Description |
| ----------- | ------------ | ----------------------------------------------------- |
| `path` | str / `Path` | The file path, typically with the `.spacy` extension. |
| **RETURNS** | `DocBin` | The loaded `DocBin`. |