mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
Add DocBin to/from_disk methods and update docs (#5892)
* Add DocBin to/from_disk methods and update docs * Use DocBin.from_disk in Corpus
This commit is contained in:
parent
4ca08c6d5d
commit
ef2c67cca5
|
@ -39,7 +39,7 @@ class Corpus:
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
path,
|
path: Union[str, Path],
|
||||||
*,
|
*,
|
||||||
limit: int = 0,
|
limit: int = 0,
|
||||||
gold_preproc: bool = False,
|
gold_preproc: bool = False,
|
||||||
|
@ -136,8 +136,7 @@ class Corpus:
|
||||||
for loc in locs:
|
for loc in locs:
|
||||||
loc = util.ensure_path(loc)
|
loc = util.ensure_path(loc)
|
||||||
if loc.parts[-1].endswith(".spacy"):
|
if loc.parts[-1].endswith(".spacy"):
|
||||||
with loc.open("rb") as file_:
|
doc_bin = DocBin().from_disk(loc)
|
||||||
doc_bin = DocBin().from_bytes(file_.read())
|
|
||||||
docs = doc_bin.get_docs(vocab)
|
docs = doc_bin.get_docs(vocab)
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
if len(doc):
|
if len(doc):
|
||||||
|
|
|
@ -484,12 +484,10 @@ def test_roundtrip_docs_to_docbin(doc):
|
||||||
json_file = tmpdir / "roundtrip.json"
|
json_file = tmpdir / "roundtrip.json"
|
||||||
srsly.write_json(json_file, [docs_to_json(doc)])
|
srsly.write_json(json_file, [docs_to_json(doc)])
|
||||||
output_file = tmpdir / "roundtrip.spacy"
|
output_file = tmpdir / "roundtrip.spacy"
|
||||||
data = DocBin(docs=[doc]).to_bytes()
|
DocBin(docs=[doc]).to_disk(output_file)
|
||||||
with output_file.open("wb") as file_:
|
|
||||||
file_.write(data)
|
|
||||||
reader = Corpus(output_file)
|
reader = Corpus(output_file)
|
||||||
reloaded_examples = list(reader(reloaded_nlp))
|
reloaded_examples = list(reader(reloaded_nlp))
|
||||||
assert len(doc) == sum(len(eg) for eg in reloaded_examples)
|
assert len(doc) == sum(len(eg) for eg in reloaded_examples)
|
||||||
reloaded_example = reloaded_examples[0]
|
reloaded_example = reloaded_examples[0]
|
||||||
assert text == reloaded_example.reference.text
|
assert text == reloaded_example.reference.text
|
||||||
assert idx == [t.idx for t in reloaded_example.reference]
|
assert idx == [t.idx for t in reloaded_example.reference]
|
||||||
|
@ -512,13 +510,11 @@ def test_make_orth_variants(doc):
|
||||||
nlp = English()
|
nlp = English()
|
||||||
with make_tempdir() as tmpdir:
|
with make_tempdir() as tmpdir:
|
||||||
output_file = tmpdir / "roundtrip.spacy"
|
output_file = tmpdir / "roundtrip.spacy"
|
||||||
data = DocBin(docs=[doc]).to_bytes()
|
DocBin(docs=[doc]).to_disk(output_file)
|
||||||
with output_file.open("wb") as file_:
|
|
||||||
file_.write(data)
|
|
||||||
# due to randomness, test only that this runs with no errors for now
|
# due to randomness, test only that this runs with no errors for now
|
||||||
reader = Corpus(output_file)
|
reader = Corpus(output_file)
|
||||||
train_example = next(reader(nlp))
|
train_example = next(reader(nlp))
|
||||||
make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)
|
make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skip("Outdated")
|
@pytest.mark.skip("Outdated")
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
from typing import Iterable, Iterator
|
from typing import Iterable, Iterator, Union
|
||||||
|
from pathlib import Path
|
||||||
import numpy
|
import numpy
|
||||||
import zlib
|
import zlib
|
||||||
import srsly
|
import srsly
|
||||||
|
@ -9,6 +10,7 @@ from ..vocab import Vocab
|
||||||
from ..compat import copy_reg
|
from ..compat import copy_reg
|
||||||
from ..attrs import SPACY, ORTH, intify_attr
|
from ..attrs import SPACY, ORTH, intify_attr
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
|
from ..util import ensure_path
|
||||||
|
|
||||||
# fmt: off
|
# fmt: off
|
||||||
ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")
|
ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")
|
||||||
|
@ -204,6 +206,30 @@ class DocBin:
|
||||||
assert len(tokens.shape) == 2, tokens.shape # this should never happen
|
assert len(tokens.shape) == 2, tokens.shape # this should never happen
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
def to_disk(self, path: Union[str, Path]) -> None:
|
||||||
|
"""Save the DocBin to a file (typically called .spacy).
|
||||||
|
|
||||||
|
path (str / Path): The file path.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/docbin#to_disk
|
||||||
|
"""
|
||||||
|
path = ensure_path(path)
|
||||||
|
with path.open("wb") as file_:
|
||||||
|
file_.write(self.to_bytes())
|
||||||
|
|
||||||
|
def from_disk(self, path: Union[str, Path]) -> "DocBin":
|
||||||
|
"""Load the DocBin from a file (typically called .spacy).
|
||||||
|
|
||||||
|
path (str / Path): The file path.
|
||||||
|
RETURNS (DocBin): The loaded DocBin.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/docbin#from_disk
|
||||||
|
"""
|
||||||
|
path = ensure_path(path)
|
||||||
|
with path.open("rb") as file_:
|
||||||
|
self.from_bytes(file_.read())
|
||||||
|
return self
|
||||||
|
|
||||||
|
|
||||||
def merge_bins(bins):
|
def merge_bins(bins):
|
||||||
merged = None
|
merged = None
|
||||||
|
|
|
@ -177,28 +177,26 @@ run [`spacy pretrain`](/api/cli#pretrain).
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> from pathlib import Path
|
|
||||||
> from spacy.tokens import DocBin
|
> from spacy.tokens import DocBin
|
||||||
> from spacy.gold import Corpus
|
> from spacy.gold import Corpus
|
||||||
> output_file = Path(dir) / "output.spacy"
|
>
|
||||||
> data = DocBin(docs=docs).to_bytes()
|
> doc_bin = DocBin(docs=docs)
|
||||||
> with output_file.open("wb") as file_:
|
> doc_bin.to_disk("./data.spacy")
|
||||||
> file_.write(data)
|
> reader = Corpus("./data.spacy")
|
||||||
> reader = Corpus(output_file)
|
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
The main data format used in spaCy v3 is a binary format created by serializing
|
The main data format used in spaCy v3.0 is a **binary format** created by
|
||||||
a [`DocBin`](/api/docbin) object, which represents a collection of `Doc`
|
serializing a [`DocBin`](/api/docbin) object, which represents a collection of
|
||||||
objects. Typically, the extension for these binary files is `.spacy`, and they
|
`Doc` objects. This means that you can train spaCy models using the same format
|
||||||
are used as input format for specifying a [training corpus](/api/corpus) and for
|
it outputs: annotated `Doc` objects. The binary format is extremely **efficient
|
||||||
spaCy's CLI [`train`](/api/cli#train) command.
|
in storage**, especially when packing multiple documents together.
|
||||||
|
|
||||||
This binary format is extremely efficient in storage, especially when packing
|
Typically, the extension for these binary files is `.spacy`, and they are used
|
||||||
multiple documents together.
|
as input format for specifying a [training corpus](/api/corpus) and for spaCy's
|
||||||
|
CLI [`train`](/api/cli#train) command. The built-in
|
||||||
The built-in [`convert`](/api/cli#convert) command helps you convert spaCy's
|
[`convert`](/api/cli#convert) command helps you convert spaCy's previous
|
||||||
previous [JSON format](#json-input) to this new `DocBin` format. It also
|
[JSON format](#json-input) to the new binary format. It also supports
|
||||||
supports conversion of the `.conllu` format used by the
|
conversion of the `.conllu` format used by the
|
||||||
[Universal Dependencies corpora](https://github.com/UniversalDependencies).
|
[Universal Dependencies corpora](https://github.com/UniversalDependencies).
|
||||||
|
|
||||||
### JSON training format {#json-input tag="deprecated"}
|
### JSON training format {#json-input tag="deprecated"}
|
||||||
|
|
|
@ -125,7 +125,8 @@ Serialize the `DocBin`'s annotations to a bytestring.
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> doc_bin = DocBin(attrs=["DEP", "HEAD"])
|
> docs = [nlp("Hello world!")]
|
||||||
|
> doc_bin = DocBin(docs=docs)
|
||||||
> doc_bin_bytes = doc_bin.to_bytes()
|
> doc_bin_bytes = doc_bin.to_bytes()
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
|
@ -148,3 +149,36 @@ Deserialize the `DocBin`'s annotations from a bytestring.
|
||||||
| ------------ | -------- | ---------------------- |
|
| ------------ | -------- | ---------------------- |
|
||||||
| `bytes_data` | bytes | The data to load from. |
|
| `bytes_data` | bytes | The data to load from. |
|
||||||
| **RETURNS** | `DocBin` | The loaded `DocBin`. |
|
| **RETURNS** | `DocBin` | The loaded `DocBin`. |
|
||||||
|
|
||||||
|
## DocBin.to_disk {#to_disk tag="method" new="3"}
|
||||||
|
|
||||||
|
Save the serialized `DocBin` to a file. Typically uses the `.spacy` extension
|
||||||
|
and the result can be used as the input data for
|
||||||
|
[`spacy train`](/api/cli#train).
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> docs = [nlp("Hello world!")]
|
||||||
|
> doc_bin = DocBin(docs=docs)
|
||||||
|
> doc_bin.to_disk("./data.spacy")
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Argument | Type | Description |
|
||||||
|
| -------- | ------------ | ----------------------------------------------------- |
|
||||||
|
| `path` | str / `Path` | The file path, typically with the `.spacy` extension. |
|
||||||
|
|
||||||
|
## DocBin.from_disk {#from_disk tag="method" new="3"}
|
||||||
|
|
||||||
|
Load a serialized `DocBin` from a file. Typically uses the `.spacy` extension.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> doc_bin = DocBin().from_disk("./data.spacy")
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Argument | Type | Description |
|
||||||
|
| ----------- | ------------ | ----------------------------------------------------- |
|
||||||
|
| `path` | str / `Path` | The file path, typically with the `.spacy` extension. |
|
||||||
|
| **RETURNS** | `DocBin` | The loaded `DocBin`. |
|
||||||
|
|
Loading…
Reference in New Issue
Block a user