diff --git a/spacy/errors.py b/spacy/errors.py
index b2a201773..f3234a06b 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -467,6 +467,8 @@ class Errors(object):
     E164 = ("x is neither increasing nor decreasing: {}.")
     E165 = ("Only one class present in y_true. ROC AUC score is not defined in "
             "that case.")
+    E166 = ("Can only merge DocBins with the same pre-defined attributes.\n"
+            "Current DocBin: {current}\nOther DocBin: {other}")
 
 
 @add_codes
diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py
index 8e4e24d46..634d7450a 100644
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/_serialize.py
@@ -8,7 +8,8 @@ from thinc.neural.ops import NumpyOps
 
 from ..compat import copy_reg
 from ..tokens import Doc
-from ..attrs import SPACY, ORTH
+from ..attrs import SPACY, ORTH, intify_attrs
+from ..errors import Errors
 
 
 class DocBin(object):
@@ -38,33 +39,46 @@ class DocBin(object):
     documents together, because you have less duplication in the strings.
 
     A notable downside to this format is that you can't easily extract just one
-    document from the pallet.
+    document from the DocBin.
     """
 
     def __init__(self, attrs=None, store_user_data=False):
-        """Create a DocBin object, to hold serialized annotations.
+        """Create a DocBin object to hold serialized annotations.
 
         attrs (list): List of attributes to serialize. 'orth' and 'spacy' are
             always serialized, so they're not required. Defaults to None.
+        store_user_data (bool): Whether to include the `Doc.user_data`.
+        RETURNS (DocBin): The newly constructed object.
+
+        DOCS: https://spacy.io/api/docbin#init
         """
         attrs = attrs or []
-        # Ensure ORTH is always attrs[0]
+        attrs = sorted(intify_attrs(attrs))
         self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY]
-        self.attrs.insert(0, ORTH)
+        self.attrs.insert(0, ORTH)  # Ensure ORTH is always attrs[0]
         self.tokens = []
         self.spaces = []
         self.user_data = []
         self.strings = set()
         self.store_user_data = store_user_data
 
+    def __len__(self):
+        """RETURNS: The number of Doc objects added to the DocBin."""
+        return len(self.tokens)
+
     def add(self, doc):
-        """Add a doc's annotations to the DocBin for serialization."""
+        """Add a Doc's annotations to the DocBin for serialization.
+
+        doc (Doc): The Doc object to add.
+
+        DOCS: https://spacy.io/api/docbin#add
+        """
         array = doc.to_array(self.attrs)
         if len(array.shape) == 1:
             array = array.reshape((array.shape[0], 1))
         self.tokens.append(array)
         spaces = doc.to_array(SPACY)
-        assert array.shape[0] == spaces.shape[0]
+        assert array.shape[0] == spaces.shape[0]  # this should never happen
         spaces = spaces.reshape((spaces.shape[0], 1))
         self.spaces.append(numpy.asarray(spaces, dtype=bool))
         self.strings.update(w.text for w in doc)
@@ -72,7 +86,13 @@ class DocBin(object):
             self.user_data.append(srsly.msgpack_dumps(doc.user_data))
 
     def get_docs(self, vocab):
-        """Recover Doc objects from the annotations, using the given vocab."""
+        """Recover Doc objects from the annotations, using the given vocab.
+
+        vocab (Vocab): The shared vocab.
+        YIELDS (Doc): The Doc objects.
+
+        DOCS: https://spacy.io/api/docbin#get_docs
+        """
         for string in self.strings:
             vocab[string]
         orth_col = self.attrs.index(ORTH)
@@ -87,8 +107,16 @@
             yield doc
 
     def merge(self, other):
-        """Extend the annotations of this DocBin with the annotations from another."""
-        assert self.attrs == other.attrs
+        """Extend the annotations of this DocBin with the annotations from
+        another. Will raise an error if the pre-defined attrs of the two
+        DocBins don't match.
+
+        other (DocBin): The DocBin to merge into the current bin.
+
+        DOCS: https://spacy.io/api/docbin#merge
+        """
+        if self.attrs != other.attrs:
+            raise ValueError(Errors.E166.format(current=self.attrs, other=other.attrs))
         self.tokens.extend(other.tokens)
         self.spaces.extend(other.spaces)
         self.strings.update(other.strings)
@@ -96,9 +124,14 @@
             self.user_data.extend(other.user_data)
 
     def to_bytes(self):
-        """Serialize the DocBin's annotations into a byte string."""
+        """Serialize the DocBin's annotations to a bytestring.
+
+        RETURNS (bytes): The serialized DocBin.
+
+        DOCS: https://spacy.io/api/docbin#to_bytes
+        """
         for tokens in self.tokens:
-            assert len(tokens.shape) == 2, tokens.shape
+            assert len(tokens.shape) == 2, tokens.shape  # this should never happen
         lengths = [len(tokens) for tokens in self.tokens]
         msg = {
             "attrs": self.attrs,
@@ -111,9 +144,15 @@
             msg["user_data"] = self.user_data
         return gzip.compress(srsly.msgpack_dumps(msg))
 
-    def from_bytes(self, string):
-        """Deserialize the DocBin's annotations from a byte string."""
-        msg = srsly.msgpack_loads(gzip.decompress(string))
+    def from_bytes(self, bytes_data):
+        """Deserialize the DocBin's annotations from a bytestring.
+
+        bytes_data (bytes): The data to load from.
+        RETURNS (DocBin): The loaded DocBin.
+
+        DOCS: https://spacy.io/api/docbin#from_bytes
+        """
+        msg = srsly.msgpack_loads(gzip.decompress(bytes_data))
         self.attrs = msg["attrs"]
         self.strings = set(msg["strings"])
         lengths = numpy.fromstring(msg["lengths"], dtype="int32")
@@ -127,7 +166,7 @@
         if self.store_user_data and "user_data" in msg:
             self.user_data = list(msg["user_data"])
         for tokens in self.tokens:
-            assert len(tokens.shape) == 2, tokens.shape
+            assert len(tokens.shape) == 2, tokens.shape  # this should never happen
         return self
 
 
diff --git a/website/docs/api/docbin.md b/website/docs/api/docbin.md
new file mode 100644
index 000000000..a4525906e
--- /dev/null
+++ b/website/docs/api/docbin.md
@@ -0,0 +1,149 @@
+---
+title: DocBin
+tag: class
+new: 2.2
+teaser: Pack Doc objects for binary serialization
+source: spacy/tokens/_serialize.py
+---
+
+The `DocBin` class lets you efficiently serialize the information from a
+collection of `Doc` objects. You can control which information is serialized by
+passing a list of attribute IDs, and optionally also specify whether the user
+data is serialized. The `DocBin` is faster and produces smaller data sizes than
+pickle, and allows you to deserialize without executing arbitrary Python code. A
+notable downside to this format is that you can't easily extract just one
+document from the `DocBin`. The serialization format is gzipped msgpack, where
+the msgpack object has the following structure:
+
+```python
+### msgpack object structure
+{
+    "attrs": List[uint64], # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE]
+    "tokens": bytes, # Serialized numpy uint64 array with the token data
+    "spaces": bytes, # Serialized numpy boolean array with spaces data
+    "lengths": bytes, # Serialized numpy int32 array with the doc lengths
+    "strings": List[unicode] # List of unique strings in the token data
+}
+```
+
+Strings for the words, tags, labels etc. are represented by 64-bit hashes in the
+token data, and every string that occurs at least once is passed via the strings
+object. This means the storage is more efficient if you pack more documents
+together, because you have less duplication in the strings.
+For usage examples, see the docs on [serializing `Doc` objects](/usage/saving-loading#docs).
+
+## DocBin.\_\_init\_\_ {#init tag="method"}
+
+Create a `DocBin` object to hold serialized annotations.
+
+> #### Example
+>
+> ```python
+> from spacy.tokens import DocBin
+> doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
+> ```
+
+| Argument          | Type     | Description                                                                                                                                                                                 |
+| ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `attrs`           | list     | List of attributes to serialize. `orth` (hash of token text) and `spacy` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `None`. |
+| `store_user_data` | bool     | Whether to include the `Doc.user_data`. Defaults to `False`.                                                                                                                               |
+| **RETURNS**       | `DocBin` | The newly constructed object.                                                                                                                                                              |
+
+## DocBin.\_\_len\_\_ {#len tag="method"}
+
+Get the number of `Doc` objects that were added to the `DocBin`.
+
+> #### Example
+>
+> ```python
+> doc_bin = DocBin(attrs=["LEMMA"])
+> doc = nlp("This is a document to serialize.")
+> doc_bin.add(doc)
+> assert len(doc_bin) == 1
+> ```
+
+| Argument    | Type | Description                                 |
+| ----------- | ---- | ------------------------------------------- |
+| **RETURNS** | int  | The number of `Doc`s added to the `DocBin`. |
+
+## DocBin.add {#add tag="method"}
+
+Add a `Doc`'s annotations to the `DocBin` for serialization.
+
+> #### Example
+>
+> ```python
+> doc_bin = DocBin(attrs=["LEMMA"])
+> doc = nlp("This is a document to serialize.")
+> doc_bin.add(doc)
+> ```
+
+| Argument | Type  | Description              |
+| -------- | ----- | ------------------------ |
+| `doc`    | `Doc` | The `Doc` object to add. |
+
+## DocBin.get_docs {#get_docs tag="method"}
+
+Recover `Doc` objects from the annotations, using the given vocab.
+
+> #### Example
+>
+> ```python
+> docs = list(doc_bin.get_docs(nlp.vocab))
+> ```
+
+| Argument   | Type    | Description        |
+| ---------- | ------- | ------------------ |
+| `vocab`    | `Vocab` | The shared vocab.  |
+| **YIELDS** | `Doc`   | The `Doc` objects. |
+
+## DocBin.merge {#merge tag="method"}
+
+Extend the annotations of this `DocBin` with the annotations from another. Will
+raise an error if the pre-defined attrs of the two `DocBin`s don't match.
+
+> #### Example
+>
+> ```python
+> doc_bin1 = DocBin(attrs=["LEMMA", "POS"])
+> doc_bin1.add(nlp("Hello world"))
+> doc_bin2 = DocBin(attrs=["LEMMA", "POS"])
+> doc_bin2.add(nlp("This is a sentence"))
+> doc_bin1.merge(doc_bin2)
+> assert len(doc_bin1) == 2
+> ```
+
+| Argument | Type     | Description                                 |
+| -------- | -------- | ------------------------------------------- |
+| `other`  | `DocBin` | The `DocBin` to merge into the current bin. |
+
+## DocBin.to_bytes {#to_bytes tag="method"}
+
+Serialize the `DocBin`'s annotations to a bytestring.
+
+> #### Example
+>
+> ```python
+> doc_bin = DocBin(attrs=["DEP", "HEAD"])
+> doc_bin_bytes = doc_bin.to_bytes()
+> ```
+
+| Argument    | Type  | Description              |
+| ----------- | ----- | ------------------------ |
+| **RETURNS** | bytes | The serialized `DocBin`. |
+
+## DocBin.from_bytes {#from_bytes tag="method"}
+
+Deserialize the `DocBin`'s annotations from a bytestring.
+
+> #### Example
+>
+> ```python
+> doc_bin_bytes = doc_bin.to_bytes()
+> new_doc_bin = DocBin().from_bytes(doc_bin_bytes)
+> ```
+
+| Argument     | Type     | Description            |
+| ------------ | -------- | ---------------------- |
+| `bytes_data` | bytes    | The data to load from. |
+| **RETURNS**  | `DocBin` | The loaded `DocBin`.   |
diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md
index d592277aa..3d904f01a 100644
--- a/website/docs/usage/saving-loading.md
+++ b/website/docs/usage/saving-loading.md
@@ -59,6 +59,39 @@ initializes the language class, creates and adds the pipeline components and
 _then_ loads in the binary data. You can read more about this process
 [here](/usage/processing-pipelines#pipelines).
 
+### Serializing Doc objects efficiently {#docs new="2.2"}
+
+If you're working with lots of data, you'll probably need to pass analyses
+between machines, either to use something like [Dask](https://dask.org) or
+[Spark](https://spark.apache.org), or even just to save out work to disk. Often
+it's sufficient to use the [`Doc.to_array`](/api/doc#to_array) functionality for
+this, and just serialize the numpy arrays – but other times you want a more
+general way to save and restore `Doc` objects.
+
+The [`DocBin`](/api/docbin) class makes it easy to serialize and deserialize a
+collection of `Doc` objects together, and is much more efficient than calling
+[`Doc.to_bytes`](/api/doc#to_bytes) on each individual `Doc` object. You can
+also control what data gets saved, and you can merge multiple `DocBin`s
+together for easy map/reduce-style processing.
+
+```python
+### {highlight="4,8,9,13,14"}
+import spacy
+from spacy.tokens import DocBin
+
+doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
+texts = ["Some text", "Lots of texts...", "..."]
+nlp = spacy.load("en_core_web_sm")
+for doc in nlp.pipe(texts):
+    doc_bin.add(doc)
+bytes_data = doc_bin.to_bytes()
+
+# Deserialize later, e.g. in a new process
+nlp = spacy.blank("en")
+doc_bin = DocBin().from_bytes(bytes_data)
+docs = list(doc_bin.get_docs(nlp.vocab))
+```
+
 ### Using Pickle {#pickle}
 
 > #### Example
diff --git a/website/docs/usage/v2-2.md b/website/docs/usage/v2-2.md
index 868430908..376a9ae10 100644
--- a/website/docs/usage/v2-2.md
+++ b/website/docs/usage/v2-2.md
@@ -121,30 +121,38 @@ classification.
 
 ### New DocBin class to efficiently serialize Doc collections
 
 > #### Example
-> 
+>
 > ```python
 > from spacy.tokens import DocBin
-> doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=False)
+> doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
 > for doc in nlp.pipe(texts):
 >     doc_bin.add(doc)
-> byte_data = docbin.to_bytes()
+> bytes_data = doc_bin.to_bytes()
 > # Deserialize later, e.g. in a new process
 > nlp = spacy.blank("en")
-> doc_bin = DocBin()
+> doc_bin = DocBin().from_bytes(bytes_data)
 > docs = list(doc_bin.get_docs(nlp.vocab))
 > ```
 
 If you're working with lots of data, you'll probably need to pass analyses
-between machines, either to use something like Dask or Spark, or even just to
-save out work to disk. Often it's sufficient to use the doc.to_array()
-functionality for this, and just serialize the numpy arrays --- but other times
-you want a more general way to save and restore `Doc` objects.
+between machines, either to use something like [Dask](https://dask.org) or
+[Spark](https://spark.apache.org), or even just to save out work to disk.
+Often it's sufficient to use the `Doc.to_array` functionality for this, and
+just serialize the numpy arrays – but other times you want a more general way
+to save and restore `Doc` objects.
 
-The new `DocBin` class makes it easy to serialize and deserialize
-a collection of `Doc` objects together, and is much more efficient than
-calling `doc.to_bytes()` on each individual `Doc` object. You can also control
-what data gets saved, and you can merge pallets together for easy
-map/reduce-style processing.
+The new `DocBin` class makes it easy to serialize and deserialize a collection
+of `Doc` objects together, and is much more efficient than calling
+`Doc.to_bytes` on each individual `Doc` object. You can also control what data
+gets saved, and you can merge multiple `DocBin`s together for easy
+map/reduce-style processing.
+
+
+
+**API:** [`DocBin`](/api/docbin) **Usage:**
+[Serializing Doc objects](/usage/saving-loading#docs)
+
+
 
 ### CLI command to debug and validate training data {#debug-data}
 
diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json
index 7c6affe70..68d46605f 100644
--- a/website/meta/sidebars.json
+++ b/website/meta/sidebars.json
@@ -95,7 +95,8 @@
                 { "text": "KnowledgeBase", "url": "/api/kb" },
                 { "text": "GoldParse", "url": "/api/goldparse" },
                 { "text": "GoldCorpus", "url": "/api/goldcorpus" },
-                { "text": "Scorer", "url": "/api/scorer" }
+                { "text": "Scorer", "url": "/api/scorer" },
+                { "text": "DocBin", "url": "/api/docbin" }
             ]
         },
         {
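
Reviewer note, not part of the patch above: a minimal end-to-end sketch of the API this diff introduces, round-tripping a `DocBin` through a file on disk. It assumes spaCy 2.2 with the `en_core_web_sm` model installed; the `docs.spacy` file name is just an illustrative choice.

```python
# Round-trip sketch for the DocBin API added in this diff.
# Assumptions: spaCy 2.2+, en_core_web_sm is installed, and "docs.spacy"
# is an arbitrary, illustrative file name.
import spacy
from spacy.tokens import DocBin

nlp = spacy.load("en_core_web_sm")
texts = ["Apple is looking at buying a U.K. startup.", "This is another doc."]

# Serialize: only the listed attributes (plus ORTH/SPACY) are stored.
doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
for doc in nlp.pipe(texts):
    doc_bin.add(doc)
assert len(doc_bin) == len(texts)

with open("docs.spacy", "wb") as f:
    f.write(doc_bin.to_bytes())

# Deserialize later, e.g. in another process: a vocab is enough, no model needed.
nlp = spacy.blank("en")
with open("docs.spacy", "rb") as f:
    doc_bin = DocBin(store_user_data=True).from_bytes(f.read())
docs = list(doc_bin.get_docs(nlp.vocab))
assert [doc.text for doc in docs] == texts
```

Per the `merge` method added in this patch, a second bin created with the same `attrs` could also be folded into the first via `doc_bin.merge(other_bin)` for map/reduce-style workflows.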