Update DocBin and add docs

This commit is contained in:
Ines Montani 2019-09-18 20:23:21 +02:00
parent d62690b3ba
commit dd1810f05a
6 changed files with 262 additions and 30 deletions

View File

@ -467,6 +467,8 @@ class Errors(object):
E164 = ("x is neither increasing nor decreasing: {}.") E164 = ("x is neither increasing nor decreasing: {}.")
E165 = ("Only one class present in y_true. ROC AUC score is not defined in " E165 = ("Only one class present in y_true. ROC AUC score is not defined in "
"that case.") "that case.")
E166 = ("Can only merge DocBins with the same pre-defined attributes.\n"
"Current DocBin: {current}\nOther DocBin: {other}")
@add_codes @add_codes

View File

@ -8,7 +8,8 @@ from thinc.neural.ops import NumpyOps
from ..compat import copy_reg from ..compat import copy_reg
from ..tokens import Doc from ..tokens import Doc
from ..attrs import SPACY, ORTH from ..attrs import SPACY, ORTH, intify_attrs
from ..errors import Errors
class DocBin(object): class DocBin(object):
@ -38,33 +39,46 @@ class DocBin(object):
documents together, because you have less duplication in the strings. documents together, because you have less duplication in the strings.
A notable downside to this format is that you can't easily extract just one A notable downside to this format is that you can't easily extract just one
document from the pallet. document from the DocBin.
""" """
def __init__(self, attrs=None, store_user_data=False): def __init__(self, attrs=None, store_user_data=False):
"""Create a DocBin object, to hold serialized annotations. """Create a DocBin object to hold serialized annotations.
attrs (list): List of attributes to serialize. 'orth' and 'spacy' are attrs (list): List of attributes to serialize. 'orth' and 'spacy' are
always serialized, so they're not required. Defaults to None. always serialized, so they're not required. Defaults to None.
store_user_data (bool): Whether to include the `Doc.user_data`.
RETURNS (DocBin): The newly constructed object.
DOCS: https://spacy.io/api/docbin#init
""" """
attrs = attrs or [] attrs = attrs or []
# Ensure ORTH is always attrs[0] attrs = sorted(intify_attrs(attrs))
self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY] self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY]
self.attrs.insert(0, ORTH) self.attrs.insert(0, ORTH) # Ensure ORTH is always attrs[0]
self.tokens = [] self.tokens = []
self.spaces = [] self.spaces = []
self.user_data = [] self.user_data = []
self.strings = set() self.strings = set()
self.store_user_data = store_user_data self.store_user_data = store_user_data
def __len__(self):
"""RETURNS: The number of Doc objects added to the DocBin."""
return len(self.tokens)
def add(self, doc): def add(self, doc):
"""Add a doc's annotations to the DocBin for serialization.""" """Add a Doc's annotations to the DocBin for serialization.
doc (Doc): The Doc object to add.
DOCS: https://spacy.io/api/docbin#add
"""
array = doc.to_array(self.attrs) array = doc.to_array(self.attrs)
if len(array.shape) == 1: if len(array.shape) == 1:
array = array.reshape((array.shape[0], 1)) array = array.reshape((array.shape[0], 1))
self.tokens.append(array) self.tokens.append(array)
spaces = doc.to_array(SPACY) spaces = doc.to_array(SPACY)
assert array.shape[0] == spaces.shape[0] assert array.shape[0] == spaces.shape[0] # this should never happen
spaces = spaces.reshape((spaces.shape[0], 1)) spaces = spaces.reshape((spaces.shape[0], 1))
self.spaces.append(numpy.asarray(spaces, dtype=bool)) self.spaces.append(numpy.asarray(spaces, dtype=bool))
self.strings.update(w.text for w in doc) self.strings.update(w.text for w in doc)
@ -72,7 +86,13 @@ class DocBin(object):
self.user_data.append(srsly.msgpack_dumps(doc.user_data)) self.user_data.append(srsly.msgpack_dumps(doc.user_data))
def get_docs(self, vocab): def get_docs(self, vocab):
"""Recover Doc objects from the annotations, using the given vocab.""" """Recover Doc objects from the annotations, using the given vocab.
vocab (Vocab): The shared vocab.
YIELDS (Doc): The Doc objects.
DOCS: https://spacy.io/api/docbin#get_docs
"""
for string in self.strings: for string in self.strings:
vocab[string] vocab[string]
orth_col = self.attrs.index(ORTH) orth_col = self.attrs.index(ORTH)
@ -87,8 +107,16 @@ class DocBin(object):
yield doc yield doc
def merge(self, other): def merge(self, other):
"""Extend the annotations of this DocBin with the annotations from another.""" """Extend the annotations of this DocBin with the annotations from
assert self.attrs == other.attrs another. Will raise an error if the pre-defined attrs of the two
DocBins don't match.
other (DocBin): The DocBin to merge into the current bin.
DOCS: https://spacy.io/api/docbin#merge
"""
if self.attrs != other.attrs:
raise ValueError(Errors.E166.format(current=self.attrs, other=other.attrs))
self.tokens.extend(other.tokens) self.tokens.extend(other.tokens)
self.spaces.extend(other.spaces) self.spaces.extend(other.spaces)
self.strings.update(other.strings) self.strings.update(other.strings)
@ -96,9 +124,14 @@ class DocBin(object):
self.user_data.extend(other.user_data) self.user_data.extend(other.user_data)
def to_bytes(self): def to_bytes(self):
"""Serialize the DocBin's annotations into a byte string.""" """Serialize the DocBin's annotations to a bytestring.
RETURNS (bytes): The serialized DocBin.
DOCS: https://spacy.io/api/docbin#to_bytes
"""
for tokens in self.tokens: for tokens in self.tokens:
assert len(tokens.shape) == 2, tokens.shape assert len(tokens.shape) == 2, tokens.shape # this should never happen
lengths = [len(tokens) for tokens in self.tokens] lengths = [len(tokens) for tokens in self.tokens]
msg = { msg = {
"attrs": self.attrs, "attrs": self.attrs,
@ -111,9 +144,15 @@ class DocBin(object):
msg["user_data"] = self.user_data msg["user_data"] = self.user_data
return gzip.compress(srsly.msgpack_dumps(msg)) return gzip.compress(srsly.msgpack_dumps(msg))
def from_bytes(self, string): def from_bytes(self, bytes_data):
"""Deserialize the DocBin's annotations from a byte string.""" """Deserialize the DocBin's annotations from a bytestring.
msg = srsly.msgpack_loads(gzip.decompress(string))
bytes_data (bytes): The data to load from.
RETURNS (DocBin): The loaded DocBin.
DOCS: https://spacy.io/api/docbin#from_bytes
"""
msg = srsly.msgpack_loads(gzip.decompress(bytes_data))
self.attrs = msg["attrs"] self.attrs = msg["attrs"]
self.strings = set(msg["strings"]) self.strings = set(msg["strings"])
lengths = numpy.fromstring(msg["lengths"], dtype="int32") lengths = numpy.fromstring(msg["lengths"], dtype="int32")
@ -127,7 +166,7 @@ class DocBin(object):
if self.store_user_data and "user_data" in msg: if self.store_user_data and "user_data" in msg:
self.user_data = list(msg["user_data"]) self.user_data = list(msg["user_data"])
for tokens in self.tokens: for tokens in self.tokens:
assert len(tokens.shape) == 2, tokens.shape assert len(tokens.shape) == 2, tokens.shape # this should never happen
return self return self

149
website/docs/api/docbin.md Normal file
View File

@ -0,0 +1,149 @@
---
title: DocBin
tag: class
new: 2.2
teaser: Pack Doc objects for binary serialization
source: spacy/tokens/_serialize.py
---
The `DocBin` class lets you efficiently serialize the information from a
collection of `Doc` objects. You can control which information is serialized by
passing a list of attribute IDs, and optionally also specify whether the user
data is serialized. The `DocBin` is faster and produces smaller data sizes than
pickle, and allows you to deserialize without executing arbitrary Python code. A
notable downside to this format is that you can't easily extract just one
document from the `DocBin`. The serialization format is gzipped msgpack, where
the msgpack object has the following structure:
```python
### msgpack object structure
{
"attrs": List[uint64], # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE]
"tokens": bytes, # Serialized numpy uint64 array with the token data
"spaces": bytes, # Serialized numpy boolean array with spaces data
"lengths": bytes, # Serialized numpy int32 array with the doc lengths
"strings": List[unicode] # List of unique strings in the token data
}
```
Strings for the words, tags, labels etc. are represented by 64-bit hashes in the
token data, and every string that occurs at least once is passed via the strings
object. This means the storage is more efficient if you pack more documents
together, because you have less duplication in the strings. For usage examples,
see the docs on [serializing `Doc` objects](/usage/saving-loading#docs).
## DocBin.\_\_init\_\_ {#init tag="method"}
Create a `DocBin` object to hold serialized annotations.
> #### Example
>
> ```python
> from spacy.tokens import DocBin
> doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
> ```
| Argument | Type | Description |
| ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `attrs` | list | List of attributes to serialize. `orth` (hash of token text) and `spacy` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `None`. |
| `store_user_data` | bool | Whether to include the `Doc.user_data`. Defaults to `False`. |
| **RETURNS** | `DocBin` | The newly constructed object. |
## DocBin.\_\_len\_\_ {#len tag="method"}
Get the number of `Doc` objects that were added to the `DocBin`.
> #### Example
>
> ```python
> doc_bin = DocBin(attrs=["LEMMA"])
> doc = nlp("This is a document to serialize.")
> doc_bin.add(doc)
> assert len(doc_bin) == 1
> ```
| Argument | Type | Description |
| ----------- | ---- | ------------------------------------------- |
| **RETURNS** | int | The number of `Doc`s added to the `DocBin`. |
## DocBin.add {#add tag="method"}
Add a `Doc`'s annotations to the `DocBin` for serialization.
> #### Example
>
> ```python
> doc_bin = DocBin(attrs=["LEMMA"])
> doc = nlp("This is a document to serialize.")
> doc_bin.add(doc)
> ```
| Argument | Type | Description |
| -------- | ----- | ------------------------ |
| `doc` | `Doc` | The `Doc` object to add. |
## DocBin.get_docs {#get_docs tag="method"}
Recover `Doc` objects from the annotations, using the given vocab.
> #### Example
>
> ```python
> docs = list(doc_bin.get_docs(nlp.vocab))
> ```
| Argument | Type | Description |
| ---------- | ------- | ------------------ |
| `vocab` | `Vocab` | The shared vocab. |
| **YIELDS** | `Doc` | The `Doc` objects. |
## DocBin.merge {#merge tag="method"}
Extend the annotations of this `DocBin` with the annotations from another. Will
raise an error if the pre-defined attrs of the two `DocBin`s don't match.
> #### Example
>
> ```python
> doc_bin1 = DocBin(attrs=["LEMMA", "POS"])
> doc_bin1.add(nlp("Hello world"))
> doc_bin2 = DocBin(attrs=["LEMMA", "POS"])
> doc_bin2.add(nlp("This is a sentence"))
> doc_bin1.merge(doc_bin2)
> assert len(doc_bin1) == 2
> ```
| Argument | Type | Description |
| -------- | -------- | ------------------------------------------- |
| `other` | `DocBin` | The `DocBin` to merge into the current bin. |
## DocBin.to_bytes {#to_bytes tag="method"}
Serialize the `DocBin`'s annotations to a bytestring.
> #### Example
>
> ```python
> doc_bin = DocBin(attrs=["DEP", "HEAD"])
> doc_bin_bytes = doc_bin.to_bytes()
> ```
| Argument | Type | Description |
| ----------- | ----- | ------------------------ |
| **RETURNS** | bytes | The serialized `DocBin`. |
## DocBin.from_bytes {#from_bytes tag="method"}
Deserialize the `DocBin`'s annotations from a bytestring.
> #### Example
>
> ```python
> doc_bin_bytes = doc_bin.to_bytes()
> new_doc_bin = DocBin().from_bytes(doc_bin_bytes)
> ```
| Argument | Type | Description |
| ------------ | -------- | ---------------------- |
| `bytes_data` | bytes | The data to load from. |
| **RETURNS** | `DocBin` | The loaded `DocBin`. |

View File

@ -59,6 +59,39 @@ initializes the language class, creates and adds the pipeline components and
_then_ loads in the binary data. You can read more about this process _then_ loads in the binary data. You can read more about this process
[here](/usage/processing-pipelines#pipelines). [here](/usage/processing-pipelines#pipelines).
### Serializing Doc objects efficiently {#docs new="2.2"}
If you're working with lots of data, you'll probably need to pass analyses
between machines, either to use something like [Dask](https://dask.org) or
[Spark](https://spark.apache.org), or even just to save out work to disk. Often
it's sufficient to use the [`Doc.to_array`](/api/doc#to_array) functionality for
this, and just serialize the numpy arrays — but other times you want a more
general way to save and restore `Doc` objects.
The [`DocBin`](/api/docbin) class makes it easy to serialize and deserialize a
collection of `Doc` objects together, and is much more efficient than calling
[`Doc.to_bytes`](/api/doc#to_bytes) on each individual `Doc` object. You can
also control what data gets saved, and you can merge `DocBin`s together for easy
map/reduce-style processing.
```python
### {highlight="4,8,9,13,14"}
import spacy
from spacy.tokens import DocBin
doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
texts = ["Some text", "Lots of texts...", "..."]
nlp = spacy.load("en_core_web_sm")
for doc in nlp.pipe(texts):
doc_bin.add(doc)
bytes_data = doc_bin.to_bytes()
# Deserialize later, e.g. in a new process
nlp = spacy.blank("en")
doc_bin = DocBin().from_bytes(bytes_data)
docs = list(doc_bin.get_docs(nlp.vocab))
```
### Using Pickle {#pickle} ### Using Pickle {#pickle}
> #### Example > #### Example

View File

@ -124,27 +124,35 @@ classification.
> >
> ```python > ```python
> from spacy.tokens import DocBin > from spacy.tokens import DocBin
> doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=False) > doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
> for doc in nlp.pipe(texts): > for doc in nlp.pipe(texts):
> doc_bin.add(doc) > doc_bin.add(doc)
> byte_data = docbin.to_bytes() > bytes_data = doc_bin.to_bytes()
> # Deserialize later, e.g. in a new process > # Deserialize later, e.g. in a new process
> nlp = spacy.blank("en") > nlp = spacy.blank("en")
> doc_bin = DocBin() > doc_bin = DocBin().from_bytes(bytes_data)
> docs = list(doc_bin.get_docs(nlp.vocab)) > docs = list(doc_bin.get_docs(nlp.vocab))
> ``` > ```
If you're working with lots of data, you'll probably need to pass analyses If you're working with lots of data, you'll probably need to pass analyses
between machines, either to use something like Dask or Spark, or even just to between machines, either to use something like [Dask](https://dask.org) or
save out work to disk. Often it's sufficient to use the doc.to_array() [Spark](https://spark.apache.org), or even just to save out work to disk. Often
functionality for this, and just serialize the numpy arrays --- but other times it's sufficient to use the `Doc.to_array` functionality for this, and just
you want a more general way to save and restore `Doc` objects. serialize the numpy arrays but other times you want a more general way to save
and restore `Doc` objects.
The new `DocBin` class makes it easy to serialize and deserialize The new `DocBin` class makes it easy to serialize and deserialize a collection
a collection of `Doc` objects together, and is much more efficient than of `Doc` objects together, and is much more efficient than calling
calling `doc.to_bytes()` on each individual `Doc` object. You can also control `Doc.to_bytes` on each individual `Doc` object. You can also control what data
what data gets saved, and you can merge pallets together for easy gets saved, and you can merge pallets together for easy map/reduce-style
map/reduce-style processing. processing.
<Infobox>
**API:** [`DocBin`](/api/docbin) **Usage:**
[Serializing Doc objects](/usage/saving-loading#docs)
</Infobox>
### CLI command to debug and validate training data {#debug-data} ### CLI command to debug and validate training data {#debug-data}

View File

@ -95,7 +95,8 @@
{ "text": "KnowledgeBase", "url": "/api/kb" }, { "text": "KnowledgeBase", "url": "/api/kb" },
{ "text": "GoldParse", "url": "/api/goldparse" }, { "text": "GoldParse", "url": "/api/goldparse" },
{ "text": "GoldCorpus", "url": "/api/goldcorpus" }, { "text": "GoldCorpus", "url": "/api/goldcorpus" },
{ "text": "Scorer", "url": "/api/scorer" } { "text": "Scorer", "url": "/api/scorer" },
{ "text": "DocBin", "url": "/api/docbin" }
] ]
}, },
{ {