mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
Add DocBin to/from_disk methods and update docs (#5892)
* Add DocBin to/from_disk methods and update docs * Use DocBin.from_disk in Corpus
This commit is contained in:
parent
4ca08c6d5d
commit
ef2c67cca5
|
@ -39,7 +39,7 @@ class Corpus:
|
|||
|
||||
def __init__(
|
||||
self,
|
||||
path,
|
||||
path: Union[str, Path],
|
||||
*,
|
||||
limit: int = 0,
|
||||
gold_preproc: bool = False,
|
||||
|
@ -136,8 +136,7 @@ class Corpus:
|
|||
for loc in locs:
|
||||
loc = util.ensure_path(loc)
|
||||
if loc.parts[-1].endswith(".spacy"):
|
||||
with loc.open("rb") as file_:
|
||||
doc_bin = DocBin().from_bytes(file_.read())
|
||||
doc_bin = DocBin().from_disk(loc)
|
||||
docs = doc_bin.get_docs(vocab)
|
||||
for doc in docs:
|
||||
if len(doc):
|
||||
|
|
|
@ -484,9 +484,7 @@ def test_roundtrip_docs_to_docbin(doc):
|
|||
json_file = tmpdir / "roundtrip.json"
|
||||
srsly.write_json(json_file, [docs_to_json(doc)])
|
||||
output_file = tmpdir / "roundtrip.spacy"
|
||||
data = DocBin(docs=[doc]).to_bytes()
|
||||
with output_file.open("wb") as file_:
|
||||
file_.write(data)
|
||||
DocBin(docs=[doc]).to_disk(output_file)
|
||||
reader = Corpus(output_file)
|
||||
reloaded_examples = list(reader(reloaded_nlp))
|
||||
assert len(doc) == sum(len(eg) for eg in reloaded_examples)
|
||||
|
@ -512,9 +510,7 @@ def test_make_orth_variants(doc):
|
|||
nlp = English()
|
||||
with make_tempdir() as tmpdir:
|
||||
output_file = tmpdir / "roundtrip.spacy"
|
||||
data = DocBin(docs=[doc]).to_bytes()
|
||||
with output_file.open("wb") as file_:
|
||||
file_.write(data)
|
||||
DocBin(docs=[doc]).to_disk(output_file)
|
||||
# due to randomness, test only that this runs with no errors for now
|
||||
reader = Corpus(output_file)
|
||||
train_example = next(reader(nlp))
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
from typing import Iterable, Iterator
|
||||
from typing import Iterable, Iterator, Union
|
||||
from pathlib import Path
|
||||
import numpy
|
||||
import zlib
|
||||
import srsly
|
||||
|
@ -9,6 +10,7 @@ from ..vocab import Vocab
|
|||
from ..compat import copy_reg
|
||||
from ..attrs import SPACY, ORTH, intify_attr
|
||||
from ..errors import Errors
|
||||
from ..util import ensure_path
|
||||
|
||||
# fmt: off
|
||||
ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")
|
||||
|
@ -204,6 +206,30 @@ class DocBin:
|
|||
assert len(tokens.shape) == 2, tokens.shape # this should never happen
|
||||
return self
|
||||
|
||||
def to_disk(self, path: Union[str, Path]) -> None:
    """Save the DocBin to a file (typically called .spacy).

    The data is serialized before the file is opened for writing, so an
    error raised by to_bytes() does not truncate a file that already exists
    at the given path.

    path (str / Path): The file path.

    DOCS: https://spacy.io/api/docbin#to_disk
    """
    path = ensure_path(path)
    # Opening with "wb" truncates the target, so build the bytes first.
    data = self.to_bytes()
    with path.open("wb") as file_:
        file_.write(data)
|
||||
|
||||
def from_disk(self, path: Union[str, Path]) -> "DocBin":
    """Load the DocBin from a file (typically called .spacy).

    path (str / Path): The file path.
    RETURNS (DocBin): The loaded DocBin.

    DOCS: https://spacy.io/api/docbin#from_disk
    """
    path = ensure_path(path)
    with path.open("rb") as file_:
        # from_bytes is called for its effect on this instance; its return
        # value is not used and this object itself is returned below.
        self.from_bytes(file_.read())
    return self
|
||||
|
||||
|
||||
def merge_bins(bins):
|
||||
merged = None
|
||||
|
|
|
@ -177,28 +177,26 @@ run [`spacy pretrain`](/api/cli#pretrain).
|
|||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from pathlib import Path
|
||||
> from spacy.tokens import DocBin
|
||||
> from spacy.gold import Corpus
|
||||
> output_file = Path(dir) / "output.spacy"
|
||||
> data = DocBin(docs=docs).to_bytes()
|
||||
> with output_file.open("wb") as file_:
|
||||
> file_.write(data)
|
||||
> reader = Corpus(output_file)
|
||||
>
|
||||
> doc_bin = DocBin(docs=docs)
|
||||
> doc_bin.to_disk("./data.spacy")
|
||||
> reader = Corpus("./data.spacy")
|
||||
> ```
|
||||
|
||||
The main data format used in spaCy v3 is a binary format created by serializing
|
||||
a [`DocBin`](/api/docbin) object, which represents a collection of `Doc`
|
||||
objects. Typically, the extension for these binary files is `.spacy`, and they
|
||||
are used as input format for specifying a [training corpus](/api/corpus) and for
|
||||
spaCy's CLI [`train`](/api/cli#train) command.
|
||||
The main data format used in spaCy v3.0 is a **binary format** created by
|
||||
serializing a [`DocBin`](/api/docbin) object, which represents a collection of
|
||||
`Doc` objects. This means that you can train spaCy models using the same format
|
||||
it outputs: annotated `Doc` objects. The binary format is extremely **efficient
|
||||
in storage**, especially when packing multiple documents together.
|
||||
|
||||
This binary format is extremely efficient in storage, especially when packing
|
||||
multiple documents together.
|
||||
|
||||
The built-in [`convert`](/api/cli#convert) command helps you convert spaCy's
|
||||
previous [JSON format](#json-input) to this new `DocBin` format. It also
|
||||
supports conversion of the `.conllu` format used by the
|
||||
Typically, the extension for these binary files is `.spacy`, and they are used
|
||||
as input format for specifying a [training corpus](/api/corpus) and for spaCy's
|
||||
CLI [`train`](/api/cli#train) command. The built-in
|
||||
[`convert`](/api/cli#convert) command helps you convert spaCy's previous
|
||||
[JSON format](#json-input) to the new binary format. It also supports
|
||||
conversion of the `.conllu` format used by the
|
||||
[Universal Dependencies corpora](https://github.com/UniversalDependencies).
|
||||
|
||||
### JSON training format {#json-input tag="deprecated"}
|
||||
|
|
|
@ -125,7 +125,8 @@ Serialize the `DocBin`'s annotations to a bytestring.
|
|||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> doc_bin = DocBin(attrs=["DEP", "HEAD"])
|
||||
> docs = [nlp("Hello world!")]
|
||||
> doc_bin = DocBin(docs=docs)
|
||||
> doc_bin_bytes = doc_bin.to_bytes()
|
||||
> ```
|
||||
|
||||
|
@ -148,3 +149,36 @@ Deserialize the `DocBin`'s annotations from a bytestring.
|
|||
| ------------ | -------- | ---------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| **RETURNS** | `DocBin` | The loaded `DocBin`. |
|
||||
|
||||
## DocBin.to_disk {#to_disk tag="method" new="3"}
|
||||
|
||||
Save the serialized `DocBin` to a file. Typically uses the `.spacy` extension
|
||||
and the result can be used as the input data for
|
||||
[`spacy train`](/api/cli#train).
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> docs = [nlp("Hello world!")]
|
||||
> doc_bin = DocBin(docs=docs)
|
||||
> doc_bin.to_disk("./data.spacy")
|
||||
> ```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| -------- | ------------ | ----------------------------------------------------- |
|
||||
| `path` | str / `Path` | The file path, typically with the `.spacy` extension. |
|
||||
|
||||
## DocBin.from_disk {#from_disk tag="method" new="3"}
|
||||
|
||||
Load a serialized `DocBin` from a file. Typically uses the `.spacy` extension.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> doc_bin = DocBin().from_disk("./data.spacy")
|
||||
> ```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| ----------- | ------------ | ----------------------------------------------------- |
|
||||
| `path` | str / `Path` | The file path, typically with the `.spacy` extension. |
|
||||
| **RETURNS** | `DocBin` | The loaded `DocBin`. |
|
||||
|
|
Loading…
Reference in New Issue
Block a user