DocBin: add version number, missing attributes and strings (#5685)

* Add version number to DocBin

Add a version number to DocBin for future use.

* Add POS to all attributes in DocBin

* Add morph string to strings in DocBin

* Update DocBin API

* Add string for ENT_KB_ID in DocBin
This commit is contained in:
Adriane Boyd 2020-07-02 17:41:50 +02:00 committed by GitHub
parent b5268955d7
commit a723fa02a1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 9 additions and 3 deletions

View File

@ -9,7 +9,7 @@ from ..attrs import SPACY, ORTH, intify_attr
from ..errors import Errors from ..errors import Errors
ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH") ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")
class DocBin(object): class DocBin(object):
@ -31,6 +31,7 @@ class DocBin(object):
"spaces": bytes, # Serialized numpy boolean array with spaces data "spaces": bytes, # Serialized numpy boolean array with spaces data
"lengths": bytes, # Serialized numpy int32 array with the doc lengths "lengths": bytes, # Serialized numpy int32 array with the doc lengths
"strings": List[unicode] # List of unique strings in the token data "strings": List[unicode] # List of unique strings in the token data
"version": str, # DocBin version number
} }
Strings for the words, tags, labels etc are represented by 64-bit hashes in Strings for the words, tags, labels etc are represented by 64-bit hashes in
@ -53,6 +54,7 @@ class DocBin(object):
DOCS: https://spacy.io/api/docbin#init DOCS: https://spacy.io/api/docbin#init
""" """
attrs = sorted([intify_attr(attr) for attr in attrs]) attrs = sorted([intify_attr(attr) for attr in attrs])
self.version = "0.1"
self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY] self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY]
self.attrs.insert(0, ORTH) # Ensure ORTH is always attrs[0] self.attrs.insert(0, ORTH) # Ensure ORTH is always attrs[0]
self.tokens = [] self.tokens = []
@ -87,8 +89,10 @@ class DocBin(object):
self.strings.add(token.text) self.strings.add(token.text)
self.strings.add(token.tag_) self.strings.add(token.tag_)
self.strings.add(token.lemma_) self.strings.add(token.lemma_)
self.strings.add(token.morph_)
self.strings.add(token.dep_) self.strings.add(token.dep_)
self.strings.add(token.ent_type_) self.strings.add(token.ent_type_)
self.strings.add(token.ent_kb_id_)
self.cats.append(doc.cats) self.cats.append(doc.cats)
if self.store_user_data: if self.store_user_data:
self.user_data.append(srsly.msgpack_dumps(doc.user_data)) self.user_data.append(srsly.msgpack_dumps(doc.user_data))
@ -147,6 +151,7 @@ class DocBin(object):
spaces = numpy.vstack(self.spaces) if self.spaces else numpy.asarray([]) spaces = numpy.vstack(self.spaces) if self.spaces else numpy.asarray([])
msg = { msg = {
"version": self.version,
"attrs": self.attrs, "attrs": self.attrs,
"tokens": tokens.tobytes("C"), "tokens": tokens.tobytes("C"),
"spaces": spaces.tobytes("C"), "spaces": spaces.tobytes("C"),

View File

@ -16,8 +16,9 @@ document from the `DocBin`. The serialization format is gzipped msgpack, where
the msgpack object has the following structure: the msgpack object has the following structure:
```python ```python
### msgpack object strcutrue ### msgpack object structrue
{ {
"version": str, # DocBin version number
"attrs": List[uint64], # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE] "attrs": List[uint64], # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE]
"tokens": bytes, # Serialized numpy uint64 array with the token data "tokens": bytes, # Serialized numpy uint64 array with the token data
"spaces": bytes, # Serialized numpy boolean array with spaces data "spaces": bytes, # Serialized numpy boolean array with spaces data
@ -45,7 +46,7 @@ Create a `DocBin` object to hold serialized annotations.
| Argument | Type | Description | | Argument | Type | Description |
| ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `attrs` | list | List of attributes to serialize. `orth` (hash of token text) and `spacy` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `None`. | | `attrs` | list | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. |
| `store_user_data` | bool | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. | | `store_user_data` | bool | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. |
| **RETURNS** | `DocBin` | The newly constructed object. | | **RETURNS** | `DocBin` | The newly constructed object. |