mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Update DocBin and add docs
This commit is contained in:
parent
d62690b3ba
commit
dd1810f05a
|
@ -467,6 +467,8 @@ class Errors(object):
|
||||||
E164 = ("x is neither increasing nor decreasing: {}.")
|
E164 = ("x is neither increasing nor decreasing: {}.")
|
||||||
E165 = ("Only one class present in y_true. ROC AUC score is not defined in "
|
E165 = ("Only one class present in y_true. ROC AUC score is not defined in "
|
||||||
"that case.")
|
"that case.")
|
||||||
|
E166 = ("Can only merge DocBins with the same pre-defined attributes.\n"
|
||||||
|
"Current DocBin: {current}\nOther DocBin: {other}")
|
||||||
|
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
|
|
|
@ -8,7 +8,8 @@ from thinc.neural.ops import NumpyOps
|
||||||
|
|
||||||
from ..compat import copy_reg
|
from ..compat import copy_reg
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
from ..attrs import SPACY, ORTH
|
from ..attrs import SPACY, ORTH, intify_attrs
|
||||||
|
from ..errors import Errors
|
||||||
|
|
||||||
|
|
||||||
class DocBin(object):
|
class DocBin(object):
|
||||||
|
@ -38,33 +39,46 @@ class DocBin(object):
|
||||||
documents together, because you have less duplication in the strings.
|
documents together, because you have less duplication in the strings.
|
||||||
|
|
||||||
A notable downside to this format is that you can't easily extract just one
|
A notable downside to this format is that you can't easily extract just one
|
||||||
document from the pallet.
|
document from the DocBin.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, attrs=None, store_user_data=False):
|
def __init__(self, attrs=None, store_user_data=False):
|
||||||
"""Create a DocBin object, to hold serialized annotations.
|
"""Create a DocBin object to hold serialized annotations.
|
||||||
|
|
||||||
attrs (list): List of attributes to serialize. 'orth' and 'spacy' are
|
attrs (list): List of attributes to serialize. 'orth' and 'spacy' are
|
||||||
always serialized, so they're not required. Defaults to None.
|
always serialized, so they're not required. Defaults to None.
|
||||||
|
store_user_data (bool): Whether to include the `Doc.user_data`.
|
||||||
|
RETURNS (DocBin): The newly constructed object.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/docbin#init
|
||||||
"""
|
"""
|
||||||
attrs = attrs or []
|
attrs = attrs or []
|
||||||
# Ensure ORTH is always attrs[0]
|
attrs = sorted(intify_attrs(attrs))
|
||||||
self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY]
|
self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY]
|
||||||
self.attrs.insert(0, ORTH)
|
self.attrs.insert(0, ORTH) # Ensure ORTH is always attrs[0]
|
||||||
self.tokens = []
|
self.tokens = []
|
||||||
self.spaces = []
|
self.spaces = []
|
||||||
self.user_data = []
|
self.user_data = []
|
||||||
self.strings = set()
|
self.strings = set()
|
||||||
self.store_user_data = store_user_data
|
self.store_user_data = store_user_data
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
"""RETURNS: The number of Doc objects added to the DocBin."""
|
||||||
|
return len(self.tokens)
|
||||||
|
|
||||||
def add(self, doc):
|
def add(self, doc):
|
||||||
"""Add a doc's annotations to the DocBin for serialization."""
|
"""Add a Doc's annotations to the DocBin for serialization.
|
||||||
|
|
||||||
|
doc (Doc): The Doc object to add.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/docbin#add
|
||||||
|
"""
|
||||||
array = doc.to_array(self.attrs)
|
array = doc.to_array(self.attrs)
|
||||||
if len(array.shape) == 1:
|
if len(array.shape) == 1:
|
||||||
array = array.reshape((array.shape[0], 1))
|
array = array.reshape((array.shape[0], 1))
|
||||||
self.tokens.append(array)
|
self.tokens.append(array)
|
||||||
spaces = doc.to_array(SPACY)
|
spaces = doc.to_array(SPACY)
|
||||||
assert array.shape[0] == spaces.shape[0]
|
assert array.shape[0] == spaces.shape[0] # this should never happen
|
||||||
spaces = spaces.reshape((spaces.shape[0], 1))
|
spaces = spaces.reshape((spaces.shape[0], 1))
|
||||||
self.spaces.append(numpy.asarray(spaces, dtype=bool))
|
self.spaces.append(numpy.asarray(spaces, dtype=bool))
|
||||||
self.strings.update(w.text for w in doc)
|
self.strings.update(w.text for w in doc)
|
||||||
|
@ -72,7 +86,13 @@ class DocBin(object):
|
||||||
self.user_data.append(srsly.msgpack_dumps(doc.user_data))
|
self.user_data.append(srsly.msgpack_dumps(doc.user_data))
|
||||||
|
|
||||||
def get_docs(self, vocab):
|
def get_docs(self, vocab):
|
||||||
"""Recover Doc objects from the annotations, using the given vocab."""
|
"""Recover Doc objects from the annotations, using the given vocab.
|
||||||
|
|
||||||
|
vocab (Vocab): The shared vocab.
|
||||||
|
YIELDS (Doc): The Doc objects.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/docbin#get_docs
|
||||||
|
"""
|
||||||
for string in self.strings:
|
for string in self.strings:
|
||||||
vocab[string]
|
vocab[string]
|
||||||
orth_col = self.attrs.index(ORTH)
|
orth_col = self.attrs.index(ORTH)
|
||||||
|
@ -87,8 +107,16 @@ class DocBin(object):
|
||||||
yield doc
|
yield doc
|
||||||
|
|
||||||
def merge(self, other):
|
def merge(self, other):
|
||||||
"""Extend the annotations of this DocBin with the annotations from another."""
|
"""Extend the annotations of this DocBin with the annotations from
|
||||||
assert self.attrs == other.attrs
|
another. Will raise an error if the pre-defined attrs of the two
|
||||||
|
DocBins don't match.
|
||||||
|
|
||||||
|
other (DocBin): The DocBin to merge into the current bin.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/docbin#merge
|
||||||
|
"""
|
||||||
|
if self.attrs != other.attrs:
|
||||||
|
raise ValueError(Errors.E166.format(current=self.attrs, other=other.attrs))
|
||||||
self.tokens.extend(other.tokens)
|
self.tokens.extend(other.tokens)
|
||||||
self.spaces.extend(other.spaces)
|
self.spaces.extend(other.spaces)
|
||||||
self.strings.update(other.strings)
|
self.strings.update(other.strings)
|
||||||
|
@ -96,9 +124,14 @@ class DocBin(object):
|
||||||
self.user_data.extend(other.user_data)
|
self.user_data.extend(other.user_data)
|
||||||
|
|
||||||
def to_bytes(self):
|
def to_bytes(self):
|
||||||
"""Serialize the DocBin's annotations into a byte string."""
|
"""Serialize the DocBin's annotations to a bytestring.
|
||||||
|
|
||||||
|
RETURNS (bytes): The serialized DocBin.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/docbin#to_bytes
|
||||||
|
"""
|
||||||
for tokens in self.tokens:
|
for tokens in self.tokens:
|
||||||
assert len(tokens.shape) == 2, tokens.shape
|
assert len(tokens.shape) == 2, tokens.shape # this should never happen
|
||||||
lengths = [len(tokens) for tokens in self.tokens]
|
lengths = [len(tokens) for tokens in self.tokens]
|
||||||
msg = {
|
msg = {
|
||||||
"attrs": self.attrs,
|
"attrs": self.attrs,
|
||||||
|
@ -111,9 +144,15 @@ class DocBin(object):
|
||||||
msg["user_data"] = self.user_data
|
msg["user_data"] = self.user_data
|
||||||
return gzip.compress(srsly.msgpack_dumps(msg))
|
return gzip.compress(srsly.msgpack_dumps(msg))
|
||||||
|
|
||||||
def from_bytes(self, string):
|
def from_bytes(self, bytes_data):
|
||||||
"""Deserialize the DocBin's annotations from a byte string."""
|
"""Deserialize the DocBin's annotations from a bytestring.
|
||||||
msg = srsly.msgpack_loads(gzip.decompress(string))
|
|
||||||
|
bytes_data (bytes): The data to load from.
|
||||||
|
RETURNS (DocBin): The loaded DocBin.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/docbin#from_bytes
|
||||||
|
"""
|
||||||
|
msg = srsly.msgpack_loads(gzip.decompress(bytes_data))
|
||||||
self.attrs = msg["attrs"]
|
self.attrs = msg["attrs"]
|
||||||
self.strings = set(msg["strings"])
|
self.strings = set(msg["strings"])
|
||||||
lengths = numpy.fromstring(msg["lengths"], dtype="int32")
|
lengths = numpy.fromstring(msg["lengths"], dtype="int32")
|
||||||
|
@ -127,7 +166,7 @@ class DocBin(object):
|
||||||
if self.store_user_data and "user_data" in msg:
|
if self.store_user_data and "user_data" in msg:
|
||||||
self.user_data = list(msg["user_data"])
|
self.user_data = list(msg["user_data"])
|
||||||
for tokens in self.tokens:
|
for tokens in self.tokens:
|
||||||
assert len(tokens.shape) == 2, tokens.shape
|
assert len(tokens.shape) == 2, tokens.shape # this should never happen
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
||||||
|
|
149
website/docs/api/docbin.md
Normal file
149
website/docs/api/docbin.md
Normal file
|
@ -0,0 +1,149 @@
|
||||||
|
---
|
||||||
|
title: DocBin
|
||||||
|
tag: class
|
||||||
|
new: 2.2
|
||||||
|
teaser: Pack Doc objects for binary serialization
|
||||||
|
source: spacy/tokens/_serialize.py
|
||||||
|
---
|
||||||
|
|
||||||
|
The `DocBin` class lets you efficiently serialize the information from a
|
||||||
|
collection of `Doc` objects. You can control which information is serialized by
|
||||||
|
passing a list of attribute IDs, and optionally also specify whether the user
|
||||||
|
data is serialized. The `DocBin` is faster and produces smaller data sizes than
|
||||||
|
pickle, and allows you to deserialize without executing arbitrary Python code. A
|
||||||
|
notable downside to this format is that you can't easily extract just one
|
||||||
|
document from the `DocBin`. The serialization format is gzipped msgpack, where
|
||||||
|
the msgpack object has the following structure:
|
||||||
|
|
||||||
|
```python
|
||||||
|
### msgpack object structure
|
||||||
|
{
|
||||||
|
"attrs": List[uint64], # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE]
|
||||||
|
"tokens": bytes, # Serialized numpy uint64 array with the token data
|
||||||
|
"spaces": bytes, # Serialized numpy boolean array with spaces data
|
||||||
|
"lengths": bytes, # Serialized numpy int32 array with the doc lengths
|
||||||
|
"strings": List[unicode] # List of unique strings in the token data
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Strings for the words, tags, labels etc are represented by 64-bit hashes in the
|
||||||
|
token data, and every string that occurs at least once is passed via the strings
|
||||||
|
object. This means the storage is more efficient if you pack more documents
|
||||||
|
together, because you have less duplication in the strings. For usage examples,
|
||||||
|
see the docs on [serializing `Doc` objects](/usage/saving-loading#docs).
|
||||||
|
|
||||||
|
## DocBin.\_\_init\_\_ {#init tag="method"}
|
||||||
|
|
||||||
|
Create a `DocBin` object to hold serialized annotations.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> from spacy.tokens import DocBin
|
||||||
|
> doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Argument | Type | Description |
|
||||||
|
| ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
|
| `attrs` | list | List of attributes to serialize. `orth` (hash of token text) and `spacy` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `None`. |
|
||||||
|
| `store_user_data` | bool | Whether to include the `Doc.user_data`. Defaults to `False`. |
|
||||||
|
| **RETURNS** | `DocBin` | The newly constructed object. |
|
||||||
|
|
||||||
|
## DocBin.\_\_len\_\_ {#len tag="method"}
|
||||||
|
|
||||||
|
Get the number of `Doc` objects that were added to the `DocBin`.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> doc_bin = DocBin(attrs=["LEMMA"])
|
||||||
|
> doc = nlp("This is a document to serialize.")
|
||||||
|
> doc_bin.add(doc)
|
||||||
|
> assert len(doc_bin) == 1
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Argument | Type | Description |
|
||||||
|
| ----------- | ---- | ------------------------------------------- |
|
||||||
|
| **RETURNS** | int | The number of `Doc`s added to the `DocBin`. |
|
||||||
|
|
||||||
|
## DocBin.add {#add tag="method"}
|
||||||
|
|
||||||
|
Add a `Doc`'s annotations to the `DocBin` for serialization.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> doc_bin = DocBin(attrs=["LEMMA"])
|
||||||
|
> doc = nlp("This is a document to serialize.")
|
||||||
|
> doc_bin.add(doc)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Argument | Type | Description |
|
||||||
|
| -------- | ----- | ------------------------ |
|
||||||
|
| `doc` | `Doc` | The `Doc` object to add. |
|
||||||
|
|
||||||
|
## DocBin.get_docs {#get_docs tag="method"}
|
||||||
|
|
||||||
|
Recover `Doc` objects from the annotations, using the given vocab.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> docs = list(doc_bin.get_docs(nlp.vocab))
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Argument | Type | Description |
|
||||||
|
| ---------- | ------- | ------------------ |
|
||||||
|
| `vocab` | `Vocab` | The shared vocab. |
|
||||||
|
| **YIELDS** | `Doc` | The `Doc` objects. |
|
||||||
|
|
||||||
|
## DocBin.merge {#merge tag="method"}
|
||||||
|
|
||||||
|
Extend the annotations of this `DocBin` with the annotations from another. Will
|
||||||
|
raise an error if the pre-defined attrs of the two `DocBin`s don't match.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> doc_bin1 = DocBin(attrs=["LEMMA", "POS"])
|
||||||
|
> doc_bin1.add(nlp("Hello world"))
|
||||||
|
> doc_bin2 = DocBin(attrs=["LEMMA", "POS"])
|
||||||
|
> doc_bin2.add(nlp("This is a sentence"))
|
||||||
|
> doc_bin1.merge(doc_bin2)
|
||||||
|
> assert len(doc_bin1) == 2
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Argument | Type | Description |
|
||||||
|
| -------- | -------- | ------------------------------------------- |
|
||||||
|
| `other` | `DocBin` | The `DocBin` to merge into the current bin. |
|
||||||
|
|
||||||
|
## DocBin.to_bytes {#to_bytes tag="method"}
|
||||||
|
|
||||||
|
Serialize the `DocBin`'s annotations to a bytestring.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> doc_bin = DocBin(attrs=["DEP", "HEAD"])
|
||||||
|
> doc_bin_bytes = doc_bin.to_bytes()
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Argument | Type | Description |
|
||||||
|
| ----------- | ----- | ------------------------ |
|
||||||
|
| **RETURNS** | bytes | The serialized `DocBin`. |
|
||||||
|
|
||||||
|
## DocBin.from_bytes {#from_bytes tag="method"}
|
||||||
|
|
||||||
|
Deserialize the `DocBin`'s annotations from a bytestring.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> doc_bin_bytes = doc_bin.to_bytes()
|
||||||
|
> new_doc_bin = DocBin().from_bytes(doc_bin_bytes)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Argument | Type | Description |
|
||||||
|
| ------------ | -------- | ---------------------- |
|
||||||
|
| `bytes_data` | bytes | The data to load from. |
|
||||||
|
| **RETURNS** | `DocBin` | The loaded `DocBin`. |
|
|
@ -59,6 +59,39 @@ initializes the language class, creates and adds the pipeline components and
|
||||||
_then_ loads in the binary data. You can read more about this process
|
_then_ loads in the binary data. You can read more about this process
|
||||||
[here](/usage/processing-pipelines#pipelines).
|
[here](/usage/processing-pipelines#pipelines).
|
||||||
|
|
||||||
|
### Serializing Doc objects efficiently {#docs new="2.2"}
|
||||||
|
|
||||||
|
If you're working with lots of data, you'll probably need to pass analyses
|
||||||
|
between machines, either to use something like [Dask](https://dask.org) or
|
||||||
|
[Spark](https://spark.apache.org), or even just to save out work to disk. Often
|
||||||
|
it's sufficient to use the [`Doc.to_array`](/api/doc#to_array) functionality for
|
||||||
|
this, and just serialize the numpy arrays – but other times you want a more
|
||||||
|
general way to save and restore `Doc` objects.
|
||||||
|
|
||||||
|
The [`DocBin`](/api/docbin) class makes it easy to serialize and deserialize a
|
||||||
|
collection of `Doc` objects together, and is much more efficient than calling
|
||||||
|
[`Doc.to_bytes`](/api/doc#to_bytes) on each individual `Doc` object. You can
|
||||||
|
also control what data gets saved, and you can merge `DocBin`s together for easy
|
||||||
|
map/reduce-style processing.
|
||||||
|
|
||||||
|
```python
|
||||||
|
### {highlight="4,8,9,13,14"}
|
||||||
|
import spacy
|
||||||
|
from spacy.tokens import DocBin
|
||||||
|
|
||||||
|
doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
|
||||||
|
texts = ["Some text", "Lots of texts...", "..."]
|
||||||
|
nlp = spacy.load("en_core_web_sm")
|
||||||
|
for doc in nlp.pipe(texts):
|
||||||
|
doc_bin.add(doc)
|
||||||
|
bytes_data = doc_bin.to_bytes()
|
||||||
|
|
||||||
|
# Deserialize later, e.g. in a new process
|
||||||
|
nlp = spacy.blank("en")
|
||||||
|
doc_bin = DocBin().from_bytes(bytes_data)
|
||||||
|
docs = list(doc_bin.get_docs(nlp.vocab))
|
||||||
|
```
|
||||||
|
|
||||||
### Using Pickle {#pickle}
|
### Using Pickle {#pickle}
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
|
|
|
@ -124,27 +124,35 @@ classification.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> from spacy.tokens import DocBin
|
> from spacy.tokens import DocBin
|
||||||
> doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=False)
|
> doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
|
||||||
> for doc in nlp.pipe(texts):
|
> for doc in nlp.pipe(texts):
|
||||||
> doc_bin.add(doc)
|
> doc_bin.add(doc)
|
||||||
> byte_data = docbin.to_bytes()
|
> bytes_data = doc_bin.to_bytes()
|
||||||
> # Deserialize later, e.g. in a new process
|
> # Deserialize later, e.g. in a new process
|
||||||
> nlp = spacy.blank("en")
|
> nlp = spacy.blank("en")
|
||||||
> doc_bin = DocBin()
|
> doc_bin = DocBin().from_bytes(bytes_data)
|
||||||
> docs = list(doc_bin.get_docs(nlp.vocab))
|
> docs = list(doc_bin.get_docs(nlp.vocab))
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
If you're working with lots of data, you'll probably need to pass analyses
|
If you're working with lots of data, you'll probably need to pass analyses
|
||||||
between machines, either to use something like Dask or Spark, or even just to
|
between machines, either to use something like [Dask](https://dask.org) or
|
||||||
save out work to disk. Often it's sufficient to use the doc.to_array()
|
[Spark](https://spark.apache.org), or even just to save out work to disk. Often
|
||||||
functionality for this, and just serialize the numpy arrays --- but other times
|
it's sufficient to use the `Doc.to_array` functionality for this, and just
|
||||||
you want a more general way to save and restore `Doc` objects.
|
serialize the numpy arrays – but other times you want a more general way to save
|
||||||
|
and restore `Doc` objects.
|
||||||
|
|
||||||
The new `DocBin` class makes it easy to serialize and deserialize
|
The new `DocBin` class makes it easy to serialize and deserialize a collection
|
||||||
a collection of `Doc` objects together, and is much more efficient than
|
of `Doc` objects together, and is much more efficient than calling
|
||||||
calling `doc.to_bytes()` on each individual `Doc` object. You can also control
|
`Doc.to_bytes` on each individual `Doc` object. You can also control what data
|
||||||
what data gets saved, and you can merge pallets together for easy
|
gets saved, and you can merge `DocBin`s together for easy map/reduce-style
|
||||||
map/reduce-style processing.
|
processing.
|
||||||
|
|
||||||
|
<Infobox>
|
||||||
|
|
||||||
|
**API:** [`DocBin`](/api/docbin) **Usage:**
|
||||||
|
[Serializing Doc objects](/usage/saving-loading#docs)
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
### CLI command to debug and validate training data {#debug-data}
|
### CLI command to debug and validate training data {#debug-data}
|
||||||
|
|
||||||
|
|
|
@ -95,7 +95,8 @@
|
||||||
{ "text": "KnowledgeBase", "url": "/api/kb" },
|
{ "text": "KnowledgeBase", "url": "/api/kb" },
|
||||||
{ "text": "GoldParse", "url": "/api/goldparse" },
|
{ "text": "GoldParse", "url": "/api/goldparse" },
|
||||||
{ "text": "GoldCorpus", "url": "/api/goldcorpus" },
|
{ "text": "GoldCorpus", "url": "/api/goldcorpus" },
|
||||||
{ "text": "Scorer", "url": "/api/scorer" }
|
{ "text": "Scorer", "url": "/api/scorer" },
|
||||||
|
{ "text": "DocBin", "url": "/api/docbin" }
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
Loading…
Reference in New Issue
Block a user