mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
DocBin: add version number, missing attributes and strings (#5685)
* Add version number to DocBin: add a version number to DocBin for future use. * Add POS to all attributes in DocBin. * Add morph string to strings in DocBin. * Update DocBin API. * Add string for ENT_KB_ID in DocBin.
This commit is contained in:
parent
b5268955d7
commit
a723fa02a1
|
@ -9,7 +9,7 @@ from ..attrs import SPACY, ORTH, intify_attr
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
|
|
||||||
|
|
||||||
ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH")
|
ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")
|
||||||
|
|
||||||
|
|
||||||
class DocBin(object):
|
class DocBin(object):
|
||||||
|
@ -31,6 +31,7 @@ class DocBin(object):
|
||||||
"spaces": bytes, # Serialized numpy boolean array with spaces data
|
"spaces": bytes, # Serialized numpy boolean array with spaces data
|
||||||
"lengths": bytes, # Serialized numpy int32 array with the doc lengths
|
"lengths": bytes, # Serialized numpy int32 array with the doc lengths
|
||||||
"strings": List[unicode] # List of unique strings in the token data
|
"strings": List[unicode] # List of unique strings in the token data
|
||||||
|
"version": str, # DocBin version number
|
||||||
}
|
}
|
||||||
|
|
||||||
Strings for the words, tags, labels etc are represented by 64-bit hashes in
|
Strings for the words, tags, labels etc are represented by 64-bit hashes in
|
||||||
|
@ -53,6 +54,7 @@ class DocBin(object):
|
||||||
DOCS: https://spacy.io/api/docbin#init
|
DOCS: https://spacy.io/api/docbin#init
|
||||||
"""
|
"""
|
||||||
attrs = sorted([intify_attr(attr) for attr in attrs])
|
attrs = sorted([intify_attr(attr) for attr in attrs])
|
||||||
|
self.version = "0.1"
|
||||||
self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY]
|
self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY]
|
||||||
self.attrs.insert(0, ORTH) # Ensure ORTH is always attrs[0]
|
self.attrs.insert(0, ORTH) # Ensure ORTH is always attrs[0]
|
||||||
self.tokens = []
|
self.tokens = []
|
||||||
|
@ -87,8 +89,10 @@ class DocBin(object):
|
||||||
self.strings.add(token.text)
|
self.strings.add(token.text)
|
||||||
self.strings.add(token.tag_)
|
self.strings.add(token.tag_)
|
||||||
self.strings.add(token.lemma_)
|
self.strings.add(token.lemma_)
|
||||||
|
self.strings.add(token.morph_)
|
||||||
self.strings.add(token.dep_)
|
self.strings.add(token.dep_)
|
||||||
self.strings.add(token.ent_type_)
|
self.strings.add(token.ent_type_)
|
||||||
|
self.strings.add(token.ent_kb_id_)
|
||||||
self.cats.append(doc.cats)
|
self.cats.append(doc.cats)
|
||||||
if self.store_user_data:
|
if self.store_user_data:
|
||||||
self.user_data.append(srsly.msgpack_dumps(doc.user_data))
|
self.user_data.append(srsly.msgpack_dumps(doc.user_data))
|
||||||
|
@ -147,6 +151,7 @@ class DocBin(object):
|
||||||
spaces = numpy.vstack(self.spaces) if self.spaces else numpy.asarray([])
|
spaces = numpy.vstack(self.spaces) if self.spaces else numpy.asarray([])
|
||||||
|
|
||||||
msg = {
|
msg = {
|
||||||
|
"version": self.version,
|
||||||
"attrs": self.attrs,
|
"attrs": self.attrs,
|
||||||
"tokens": tokens.tobytes("C"),
|
"tokens": tokens.tobytes("C"),
|
||||||
"spaces": spaces.tobytes("C"),
|
"spaces": spaces.tobytes("C"),
|
||||||
|
|
|
@ -16,8 +16,9 @@ document from the `DocBin`. The serialization format is gzipped msgpack, where
|
||||||
the msgpack object has the following structure:
|
the msgpack object has the following structure:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### msgpack object strcutrue
|
### msgpack object structure
|
||||||
{
|
{
|
||||||
|
"version": str, # DocBin version number
|
||||||
"attrs": List[uint64], # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE]
|
"attrs": List[uint64], # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE]
|
||||||
"tokens": bytes, # Serialized numpy uint64 array with the token data
|
"tokens": bytes, # Serialized numpy uint64 array with the token data
|
||||||
"spaces": bytes, # Serialized numpy boolean array with spaces data
|
"spaces": bytes, # Serialized numpy boolean array with spaces data
|
||||||
|
@ -45,7 +46,7 @@ Create a `DocBin` object to hold serialized annotations.
|
||||||
|
|
||||||
| Argument | Type | Description |
|
| Argument | Type | Description |
|
||||||
| ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `attrs` | list | List of attributes to serialize. `orth` (hash of token text) and `spacy` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `None`. |
|
| `attrs` | list | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. |
|
||||||
| `store_user_data` | bool | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. |
|
| `store_user_data` | bool | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. |
|
||||||
| **RETURNS** | `DocBin` | The newly constructed object. |
|
| **RETURNS** | `DocBin` | The newly constructed object. |
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user