mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
DocBin: add version number, missing attributes and strings (#5685)
* Add version number to DocBin Add a version number to DocBin for future use. * Add POS to all attributes in DocBin * Add morph string to strings in DocBin * Update DocBin API * Add string for ENT_KB_ID in DocBin
This commit is contained in:
parent
b5268955d7
commit
a723fa02a1
|
@ -9,7 +9,7 @@ from ..attrs import SPACY, ORTH, intify_attr
|
|||
from ..errors import Errors
|
||||
|
||||
|
||||
ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH")
|
||||
ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")
|
||||
|
||||
|
||||
class DocBin(object):
|
||||
|
@ -31,6 +31,7 @@ class DocBin(object):
|
|||
"spaces": bytes, # Serialized numpy boolean array with spaces data
|
||||
"lengths": bytes, # Serialized numpy int32 array with the doc lengths
|
||||
"strings": List[unicode], # List of unique strings in the token data
|
||||
"version": str, # DocBin version number
|
||||
}
|
||||
|
||||
Strings for the words, tags, labels etc are represented by 64-bit hashes in
|
||||
|
@ -53,6 +54,7 @@ class DocBin(object):
|
|||
DOCS: https://spacy.io/api/docbin#init
|
||||
"""
|
||||
attrs = sorted([intify_attr(attr) for attr in attrs])
|
||||
self.version = "0.1"
|
||||
self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY]
|
||||
self.attrs.insert(0, ORTH) # Ensure ORTH is always attrs[0]
|
||||
self.tokens = []
|
||||
|
@ -87,8 +89,10 @@ class DocBin(object):
|
|||
self.strings.add(token.text)
|
||||
self.strings.add(token.tag_)
|
||||
self.strings.add(token.lemma_)
|
||||
self.strings.add(token.morph_)
|
||||
self.strings.add(token.dep_)
|
||||
self.strings.add(token.ent_type_)
|
||||
self.strings.add(token.ent_kb_id_)
|
||||
self.cats.append(doc.cats)
|
||||
if self.store_user_data:
|
||||
self.user_data.append(srsly.msgpack_dumps(doc.user_data))
|
||||
|
@ -147,6 +151,7 @@ class DocBin(object):
|
|||
spaces = numpy.vstack(self.spaces) if self.spaces else numpy.asarray([])
|
||||
|
||||
msg = {
|
||||
"version": self.version,
|
||||
"attrs": self.attrs,
|
||||
"tokens": tokens.tobytes("C"),
|
||||
"spaces": spaces.tobytes("C"),
|
||||
|
|
|
@ -16,8 +16,9 @@ document from the `DocBin`. The serialization format is gzipped msgpack, where
|
|||
the msgpack object has the following structure:
|
||||
|
||||
```python
|
||||
### msgpack object strcutrue
|
||||
### msgpack object structure
|
||||
{
|
||||
"version": str, # DocBin version number
|
||||
"attrs": List[uint64], # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE]
|
||||
"tokens": bytes, # Serialized numpy uint64 array with the token data
|
||||
"spaces": bytes, # Serialized numpy boolean array with spaces data
|
||||
|
@ -45,7 +46,7 @@ Create a `DocBin` object to hold serialized annotations.
|
|||
|
||||
| Argument | Type | Description |
|
||||
| ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `attrs` | list | List of attributes to serialize. `orth` (hash of token text) and `spacy` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `None`. |
|
||||
| `attrs` | list | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. |
|
||||
| `store_user_data` | bool | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. |
|
||||
| **RETURNS** | `DocBin` | The newly constructed object. |
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user