diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index d16515a57..a56900988 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -9,7 +9,7 @@ from ..attrs import SPACY, ORTH, intify_attr from ..errors import Errors -ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH") +ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS") class DocBin(object): @@ -31,6 +31,7 @@ class DocBin(object): "spaces": bytes, # Serialized numpy boolean array with spaces data "lengths": bytes, # Serialized numpy int32 array with the doc lengths "strings": List[unicode] # List of unique strings in the token data + "version": str, # DocBin version number } Strings for the words, tags, labels etc are represented by 64-bit hashes in @@ -53,6 +54,7 @@ class DocBin(object): DOCS: https://spacy.io/api/docbin#init """ attrs = sorted([intify_attr(attr) for attr in attrs]) + self.version = "0.1" self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY] self.attrs.insert(0, ORTH) # Ensure ORTH is always attrs[0] self.tokens = [] @@ -87,8 +89,10 @@ class DocBin(object): self.strings.add(token.text) self.strings.add(token.tag_) self.strings.add(token.lemma_) + self.strings.add(token.morph_) self.strings.add(token.dep_) self.strings.add(token.ent_type_) + self.strings.add(token.ent_kb_id_) self.cats.append(doc.cats) if self.store_user_data: self.user_data.append(srsly.msgpack_dumps(doc.user_data)) @@ -147,6 +151,7 @@ class DocBin(object): spaces = numpy.vstack(self.spaces) if self.spaces else numpy.asarray([]) msg = { + "version": self.version, "attrs": self.attrs, "tokens": tokens.tobytes("C"), "spaces": spaces.tobytes("C"), diff --git a/website/docs/api/docbin.md b/website/docs/api/docbin.md index fe8c359f7..07f95f91d 100644 --- a/website/docs/api/docbin.md +++ b/website/docs/api/docbin.md @@ -16,8 +16,9 @@ document from the `DocBin`. 
The serialization format is gzipped msgpack, where the msgpack object has the following structure: ```python -### msgpack object strcutrue +### msgpack object structure { + "version": str, # DocBin version number "attrs": List[uint64], # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE] "tokens": bytes, # Serialized numpy uint64 array with the token data "spaces": bytes, # Serialized numpy boolean array with spaces data @@ -45,7 +46,7 @@ Create a `DocBin` object to hold serialized annotations. | Argument | Type | Description | | ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `attrs` | list | List of attributes to serialize. `orth` (hash of token text) and `spacy` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `None`. | +| `attrs` | list | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. | | `store_user_data` | bool | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. | | **RETURNS** | `DocBin` | The newly constructed object. |