DocBin: add version number, missing attributes and strings (#5685)

* Add version number to DocBin Add a version number to DocBin for future use. * Add POS to all attributes in DocBin * Add morph string to strings in DocBin * Update DocBin API * Add string for ENT_KB_ID in DocBin
2025-12-19 16:14:39 +03:00 · 2020-07-02 17:41:50 +02:00 · 2020-07-02 17:41:50 +02:00 · a723fa02a1
commit a723fa02a1
parent b5268955d7
2 changed files with 9 additions and 3 deletions
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/_serialize.py
@ -9,7 +9,7 @@ from ..attrs import SPACY, ORTH, intify_attr
 from ..errors import Errors
-ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH")
+ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")
 class DocBin(object):
@ -31,6 +31,7 @@ class DocBin(object):
        "spaces": bytes, # Serialized numpy boolean array with spaces data
        "lengths": bytes, # Serialized numpy int32 array with the doc lengths
        "strings": List[unicode], # List of unique strings in the token data
        "version": str, # DocBin version number
    }
    Strings for the words, tags, labels etc are represented by 64-bit hashes in
@ -53,6 +54,7 @@ class DocBin(object):
        DOCS: https://spacy.io/api/docbin#init
        """
        attrs = sorted([intify_attr(attr) for attr in attrs])
        self.version = "0.1"
        self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY]
        self.attrs.insert(0, ORTH)  # Ensure ORTH is always attrs[0]
        self.tokens = []
@ -87,8 +89,10 @@ class DocBin(object):
            self.strings.add(token.text)
            self.strings.add(token.tag_)
            self.strings.add(token.lemma_)
            self.strings.add(token.morph_)
            self.strings.add(token.dep_)
            self.strings.add(token.ent_type_)
            self.strings.add(token.ent_kb_id_)
        self.cats.append(doc.cats)
        if self.store_user_data:
            self.user_data.append(srsly.msgpack_dumps(doc.user_data))
@ -147,6 +151,7 @@ class DocBin(object):
        spaces = numpy.vstack(self.spaces) if self.spaces else numpy.asarray([])
        msg = {
            "version": self.version,
            "attrs": self.attrs,
            "tokens": tokens.tobytes("C"),
            "spaces": spaces.tobytes("C"),
--- a/website/docs/api/docbin.md
+++ b/website/docs/api/docbin.md
@ -16,8 +16,9 @@ document from the `DocBin`. The serialization format is gzipped msgpack, where
 the msgpack object has the following structure:
 ```python
-### msgpack object strcutrue
+### msgpack object structure
 {
    "version": str,           # DocBin version number
    "attrs": List[uint64],    # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE]
    "tokens": bytes,          # Serialized numpy uint64 array with the token data
    "spaces": bytes,          # Serialized numpy boolean array with spaces data
@ -45,7 +46,7 @@ Create a `DocBin` object to hold serialized annotations.
 | Argument          | Type     | Description                                                                                                                                                                                |
 | ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `attrs`           | list     | List of attributes to serialize. `orth` (hash of token text) and `spacy` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `None`. |
+| `attrs`           | list     | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. |
 | `store_user_data` | bool     | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`.                                                                                 |
 | **RETURNS**       | `DocBin` | The newly constructed object.                                                                                                                                                              |