Update DocBin and add docs
commit dd1810f05a (parent d62690b3ba)
spacy/errors.py
@@ -467,6 +467,8 @@ class Errors(object):
     E164 = ("x is neither increasing nor decreasing: {}.")
     E165 = ("Only one class present in y_true. ROC AUC score is not defined in "
             "that case.")
+    E166 = ("Can only merge DocBins with the same pre-defined attributes.\n"
+            "Current DocBin: {current}\nOther DocBin: {other}")


 @add_codes
spacy/tokens/_serialize.py
@@ -8,7 +8,8 @@ from thinc.neural.ops import NumpyOps

 from ..compat import copy_reg
 from ..tokens import Doc
-from ..attrs import SPACY, ORTH
+from ..attrs import SPACY, ORTH, intify_attrs
+from ..errors import Errors


 class DocBin(object):
@@ -38,33 +39,46 @@ class DocBin(object):
     documents together, because you have less duplication in the strings.

     A notable downside to this format is that you can't easily extract just one
-    document from the pallet.
+    document from the DocBin.
     """

     def __init__(self, attrs=None, store_user_data=False):
-        """Create a DocBin object, to hold serialized annotations.
+        """Create a DocBin object to hold serialized annotations.
+
+        attrs (list): List of attributes to serialize. 'orth' and 'spacy' are
+            always serialized, so they're not required. Defaults to None.
+        store_user_data (bool): Whether to include the `Doc.user_data`.
+        RETURNS (DocBin): The newly constructed object.
+
+        DOCS: https://spacy.io/api/docbin#init
         """
         attrs = attrs or []
-        # Ensure ORTH is always attrs[0]
+        attrs = sorted(intify_attrs(attrs))
         self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY]
-        self.attrs.insert(0, ORTH)
+        self.attrs.insert(0, ORTH)  # Ensure ORTH is always attrs[0]
         self.tokens = []
         self.spaces = []
         self.user_data = []
         self.strings = set()
         self.store_user_data = store_user_data

     def __len__(self):
+        """RETURNS: The number of Doc objects added to the DocBin."""
         return len(self.tokens)

     def add(self, doc):
-        """Add a doc's annotations to the DocBin for serialization."""
+        """Add a Doc's annotations to the DocBin for serialization.
+
+        doc (Doc): The Doc object to add.
+
+        DOCS: https://spacy.io/api/docbin#add
+        """
         array = doc.to_array(self.attrs)
         if len(array.shape) == 1:
             array = array.reshape((array.shape[0], 1))
         self.tokens.append(array)
         spaces = doc.to_array(SPACY)
-        assert array.shape[0] == spaces.shape[0]
+        assert array.shape[0] == spaces.shape[0]  # this should never happen
         spaces = spaces.reshape((spaces.shape[0], 1))
         self.spaces.append(numpy.asarray(spaces, dtype=bool))
         self.strings.update(w.text for w in doc)
@@ -72,7 +86,13 @@ class DocBin(object):
             self.user_data.append(srsly.msgpack_dumps(doc.user_data))

     def get_docs(self, vocab):
-        """Recover Doc objects from the annotations, using the given vocab."""
+        """Recover Doc objects from the annotations, using the given vocab.
+
+        vocab (Vocab): The shared vocab.
+        YIELDS (Doc): The Doc objects.
+
+        DOCS: https://spacy.io/api/docbin#get_docs
+        """
         for string in self.strings:
             vocab[string]
         orth_col = self.attrs.index(ORTH)
@@ -87,8 +107,16 @@ class DocBin(object):
             yield doc

     def merge(self, other):
-        """Extend the annotations of this DocBin with the annotations from another."""
-        assert self.attrs == other.attrs
+        """Extend the annotations of this DocBin with the annotations from
+        another. Will raise an error if the pre-defined attrs of the two
+        DocBins don't match.
+
+        other (DocBin): The DocBin to merge into the current bin.
+
+        DOCS: https://spacy.io/api/docbin#merge
+        """
+        if self.attrs != other.attrs:
+            raise ValueError(Errors.E166.format(current=self.attrs, other=other.attrs))
         self.tokens.extend(other.tokens)
         self.spaces.extend(other.spaces)
         self.strings.update(other.strings)
@@ -96,9 +124,14 @@ class DocBin(object):
             self.user_data.extend(other.user_data)

     def to_bytes(self):
-        """Serialize the DocBin's annotations into a byte string."""
+        """Serialize the DocBin's annotations to a bytestring.
+
+        RETURNS (bytes): The serialized DocBin.
+
+        DOCS: https://spacy.io/api/docbin#to_bytes
+        """
         for tokens in self.tokens:
-            assert len(tokens.shape) == 2, tokens.shape
+            assert len(tokens.shape) == 2, tokens.shape  # this should never happen
         lengths = [len(tokens) for tokens in self.tokens]
         msg = {
             "attrs": self.attrs,
@@ -111,9 +144,15 @@ class DocBin(object):
             msg["user_data"] = self.user_data
         return gzip.compress(srsly.msgpack_dumps(msg))

-    def from_bytes(self, string):
-        """Deserialize the DocBin's annotations from a byte string."""
-        msg = srsly.msgpack_loads(gzip.decompress(string))
+    def from_bytes(self, bytes_data):
+        """Deserialize the DocBin's annotations from a bytestring.
+
+        bytes_data (bytes): The data to load from.
+        RETURNS (DocBin): The loaded DocBin.
+
+        DOCS: https://spacy.io/api/docbin#from_bytes
+        """
+        msg = srsly.msgpack_loads(gzip.decompress(bytes_data))
         self.attrs = msg["attrs"]
         self.strings = set(msg["strings"])
         lengths = numpy.fromstring(msg["lengths"], dtype="int32")
@@ -127,7 +166,7 @@ class DocBin(object):
         if self.store_user_data and "user_data" in msg:
             self.user_data = list(msg["user_data"])
         for tokens in self.tokens:
-            assert len(tokens.shape) == 2, tokens.shape
+            assert len(tokens.shape) == 2, tokens.shape  # this should never happen
         return self
website/docs/api/docbin.md (new file, 149 lines)
@@ -0,0 +1,149 @@
---
title: DocBin
tag: class
new: 2.2
teaser: Pack Doc objects for binary serialization
source: spacy/tokens/_serialize.py
---

The `DocBin` class lets you efficiently serialize the information from a
collection of `Doc` objects. You can control which information is serialized by
passing a list of attribute IDs, and optionally also specify whether the user
data is serialized. The `DocBin` is faster and produces smaller data sizes than
pickle, and allows you to deserialize without executing arbitrary Python code. A
notable downside to this format is that you can't easily extract just one
document from the `DocBin`. The serialization format is gzipped msgpack, where
the msgpack object has the following structure:

```python
### msgpack object structure
{
    "attrs": List[uint64],   # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE]
    "tokens": bytes,         # Serialized numpy uint64 array with the token data
    "spaces": bytes,         # Serialized numpy boolean array with spaces data
    "lengths": bytes,        # Serialized numpy int32 array with the doc lengths
    "strings": List[unicode] # List of unique strings in the token data
}
```
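
Because the format is plain gzipped msgpack, you can inspect the serialized
data directly with `srsly`, mirroring what `from_bytes` does internally. A
minimal, illustrative sketch – `doc_bin_bytes` is assumed to hold the output of
`DocBin.to_bytes`:

```python
### Inspecting the serialized data (illustrative)
import gzip
import srsly

# doc_bin_bytes: output of DocBin.to_bytes() (assumed to exist)
msg = srsly.msgpack_loads(gzip.decompress(doc_bin_bytes))
print(msg["attrs"])         # attribute IDs serialized for each token
print(len(msg["strings"]))  # number of unique strings in the token data
```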

Strings for the words, tags, labels etc. are represented by 64-bit hashes in the
token data, and every string that occurs at least once is passed via the strings
object. This means the storage is more efficient if you pack more documents
together, because you have less duplication in the strings. For usage examples,
see the docs on [serializing `Doc` objects](/usage/saving-loading#docs).
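
To make the hash indirection concrete, here's a small sketch using spaCy's
`StringStore`, which provides this kind of string-to-hash mapping (for
illustration only – `DocBin` manages the strings for you):

```python
### Strings vs. 64-bit hashes (illustrative)
from spacy.strings import StringStore

stringstore = StringStore(["coffee"])
coffee_hash = stringstore["coffee"]          # uint64 hash stored in token data
assert stringstore[coffee_hash] == "coffee"  # the strings list recovers the text
```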

## DocBin.\_\_init\_\_ {#init tag="method"}

Create a `DocBin` object to hold serialized annotations.

> #### Example
>
> ```python
> from spacy.tokens import DocBin
> doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
> ```

| Argument          | Type     | Description                                                                                                                                                                                |
| ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `attrs`           | list     | List of attributes to serialize. `orth` (hash of token text) and `spacy` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `None`. |
| `store_user_data` | bool     | Whether to include the `Doc.user_data`. Defaults to `False`.                                                                                                                               |
| **RETURNS**       | `DocBin` | The newly constructed object.                                                                                                                                                              |
## DocBin.\_\_len\_\_ {#len tag="method"}

Get the number of `Doc` objects that were added to the `DocBin`.

> #### Example
>
> ```python
> doc_bin = DocBin(attrs=["LEMMA"])
> doc = nlp("This is a document to serialize.")
> doc_bin.add(doc)
> assert len(doc_bin) == 1
> ```

| Argument    | Type | Description                                 |
| ----------- | ---- | ------------------------------------------- |
| **RETURNS** | int  | The number of `Doc`s added to the `DocBin`. |

## DocBin.add {#add tag="method"}

Add a `Doc`'s annotations to the `DocBin` for serialization.

> #### Example
>
> ```python
> doc_bin = DocBin(attrs=["LEMMA"])
> doc = nlp("This is a document to serialize.")
> doc_bin.add(doc)
> ```

| Argument | Type  | Description              |
| -------- | ----- | ------------------------ |
| `doc`    | `Doc` | The `Doc` object to add. |

## DocBin.get_docs {#get_docs tag="method"}

Recover `Doc` objects from the annotations, using the given vocab.

> #### Example
>
> ```python
> docs = list(doc_bin.get_docs(nlp.vocab))
> ```

| Argument   | Type    | Description        |
| ---------- | ------- | ------------------ |
| `vocab`    | `Vocab` | The shared vocab.  |
| **YIELDS** | `Doc`   | The `Doc` objects. |

## DocBin.merge {#merge tag="method"}

Extend the annotations of this `DocBin` with the annotations from another. Will
raise an error if the pre-defined attrs of the two `DocBin`s don't match.

> #### Example
>
> ```python
> doc_bin1 = DocBin(attrs=["LEMMA", "POS"])
> doc_bin1.add(nlp("Hello world"))
> doc_bin2 = DocBin(attrs=["LEMMA", "POS"])
> doc_bin2.add(nlp("This is a sentence"))
> doc_bin1.merge(doc_bin2)
> assert len(doc_bin1) == 2
> ```

| Argument | Type     | Description                                 |
| -------- | -------- | ------------------------------------------- |
| `other`  | `DocBin` | The `DocBin` to merge into the current bin. |
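
If the attribute lists don't match, `merge` raises a `ValueError` using error
`E166`. A minimal sketch of the failure case:

```python
from spacy.tokens import DocBin

doc_bin1 = DocBin(attrs=["LEMMA"])
doc_bin2 = DocBin(attrs=["POS"])
try:
    doc_bin1.merge(doc_bin2)
except ValueError as err:
    print(err)  # E166: can only merge DocBins with the same pre-defined attributes
```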

## DocBin.to_bytes {#to_bytes tag="method"}

Serialize the `DocBin`'s annotations to a bytestring.

> #### Example
>
> ```python
> doc_bin = DocBin(attrs=["DEP", "HEAD"])
> doc_bin_bytes = doc_bin.to_bytes()
> ```

| Argument    | Type  | Description              |
| ----------- | ----- | ------------------------ |
| **RETURNS** | bytes | The serialized `DocBin`. |

## DocBin.from_bytes {#from_bytes tag="method"}

Deserialize the `DocBin`'s annotations from a bytestring.

> #### Example
>
> ```python
> doc_bin_bytes = doc_bin.to_bytes()
> new_doc_bin = DocBin().from_bytes(doc_bin_bytes)
> ```

| Argument     | Type     | Description            |
| ------------ | -------- | ---------------------- |
| `bytes_data` | bytes    | The data to load from. |
| **RETURNS**  | `DocBin` | The loaded `DocBin`.   |
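
A typical round trip writes the bytestring to disk and restores it later. A
minimal sketch (the `docs.spacy` file name is just an example, not a required
extension):

```python
from pathlib import Path
from spacy.tokens import DocBin

doc_bin = DocBin(attrs=["LEMMA"])
# ... add docs here ...
Path("docs.spacy").write_bytes(doc_bin.to_bytes())              # save
doc_bin = DocBin().from_bytes(Path("docs.spacy").read_bytes())  # load
```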

website/docs/usage/saving-loading.md
@@ -59,6 +59,39 @@ initializes the language class, creates and adds the pipeline components and
 _then_ loads in the binary data. You can read more about this process
 [here](/usage/processing-pipelines#pipelines).

+### Serializing Doc objects efficiently {#docs new="2.2"}
+
+If you're working with lots of data, you'll probably need to pass analyses
+between machines, either to use something like [Dask](https://dask.org) or
+[Spark](https://spark.apache.org), or even just to save out work to disk. Often
+it's sufficient to use the [`Doc.to_array`](/api/doc#to_array) functionality for
+this, and just serialize the numpy arrays – but other times you want a more
+general way to save and restore `Doc` objects.
+
+The [`DocBin`](/api/docbin) class makes it easy to serialize and deserialize a
+collection of `Doc` objects together, and is much more efficient than calling
+[`Doc.to_bytes`](/api/doc#to_bytes) on each individual `Doc` object. You can
+also control what data gets saved, and you can merge pallets together for easy
+map/reduce-style processing.
+
+```python
+### {highlight="4,8,9,13,14"}
+import spacy
+from spacy.tokens import DocBin
+
+doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
+texts = ["Some text", "Lots of texts...", "..."]
+nlp = spacy.load("en_core_web_sm")
+for doc in nlp.pipe(texts):
+    doc_bin.add(doc)
+bytes_data = doc_bin.to_bytes()
+
+# Deserialize later, e.g. in a new process
+nlp = spacy.blank("en")
+doc_bin = DocBin().from_bytes(bytes_data)
+docs = list(doc_bin.get_docs(nlp.vocab))
+```
+
 ### Using Pickle {#pickle}

 > #### Example
website/docs/usage/v2-2.md
@@ -121,30 +121,38 @@ classification.
 ### New DocBin class to efficiently serialize Doc collections

 > #### Example
 >
 > ```python
 > from spacy.tokens import DocBin
-> doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=False)
+> doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
 > for doc in nlp.pipe(texts):
 >     doc_bin.add(doc)
-> byte_data = docbin.to_bytes()
+> bytes_data = doc_bin.to_bytes()
 > # Deserialize later, e.g. in a new process
 > nlp = spacy.blank("en")
-> doc_bin = DocBin()
+> doc_bin = DocBin().from_bytes(bytes_data)
 > docs = list(doc_bin.get_docs(nlp.vocab))
 > ```

 If you're working with lots of data, you'll probably need to pass analyses
-between machines, either to use something like Dask or Spark, or even just to
-save out work to disk. Often it's sufficient to use the doc.to_array()
-functionality for this, and just serialize the numpy arrays --- but other times
-you want a more general way to save and restore `Doc` objects.
+between machines, either to use something like [Dask](https://dask.org) or
+[Spark](https://spark.apache.org), or even just to save out work to disk. Often
+it's sufficient to use the `Doc.to_array` functionality for this, and just
+serialize the numpy arrays – but other times you want a more general way to save
+and restore `Doc` objects.

-The new `DocBin` class makes it easy to serialize and deserialize
-a collection of `Doc` objects together, and is much more efficient than
-calling `doc.to_bytes()` on each individual `Doc` object. You can also control
-what data gets saved, and you can merge pallets together for easy
-map/reduce-style processing.
+The new `DocBin` class makes it easy to serialize and deserialize a collection
+of `Doc` objects together, and is much more efficient than calling
+`Doc.to_bytes` on each individual `Doc` object. You can also control what data
+gets saved, and you can merge pallets together for easy map/reduce-style
+processing.

+<Infobox>
+
+**API:** [`DocBin`](/api/docbin) **Usage:**
+[Serializing Doc objects](/usage/saving-loading#docs)
+
+</Infobox>
+
 ### CLI command to debug and validate training data {#debug-data}
website/meta/sidebar.json
@@ -95,7 +95,8 @@
         { "text": "KnowledgeBase", "url": "/api/kb" },
         { "text": "GoldParse", "url": "/api/goldparse" },
         { "text": "GoldCorpus", "url": "/api/goldcorpus" },
-        { "text": "Scorer", "url": "/api/scorer" }
+        { "text": "Scorer", "url": "/api/scorer" },
+        { "text": "DocBin", "url": "/api/docbin" }
       ]
     },
     {