Update DocBin and add docs
commit dd1810f05a (parent d62690b3ba)
spacy/errors.py
@@ -467,6 +467,8 @@ class Errors(object):
     E164 = ("x is neither increasing nor decreasing: {}.")
     E165 = ("Only one class present in y_true. ROC AUC score is not defined in "
             "that case.")
+    E166 = ("Can only merge DocBins with the same pre-defined attributes.\n"
+            "Current DocBin: {current}\nOther DocBin: {other}")


 @add_codes
spacy/tokens/_serialize.py
@@ -8,7 +8,8 @@ from thinc.neural.ops import NumpyOps

 from ..compat import copy_reg
 from ..tokens import Doc
-from ..attrs import SPACY, ORTH
+from ..attrs import SPACY, ORTH, intify_attrs
+from ..errors import Errors


 class DocBin(object):
@@ -38,33 +39,46 @@ class DocBin(object):
     documents together, because you have less duplication in the strings.

     A notable downside to this format is that you can't easily extract just one
-    document from the pallet.
+    document from the DocBin.
     """

     def __init__(self, attrs=None, store_user_data=False):
-        """Create a DocBin object, to hold serialized annotations.
+        """Create a DocBin object to hold serialized annotations.
+
+        attrs (list): List of attributes to serialize. 'orth' and 'spacy' are
+            always serialized, so they're not required. Defaults to None.
+        store_user_data (bool): Whether to include the `Doc.user_data`.
+        RETURNS (DocBin): The newly constructed object.
+
+        DOCS: https://spacy.io/api/docbin#init
         """
         attrs = attrs or []
-        # Ensure ORTH is always attrs[0]
+        attrs = sorted(intify_attrs(attrs))
         self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY]
-        self.attrs.insert(0, ORTH)
+        self.attrs.insert(0, ORTH)  # Ensure ORTH is always attrs[0]
         self.tokens = []
         self.spaces = []
         self.user_data = []
         self.strings = set()
         self.store_user_data = store_user_data

     def __len__(self):
+        """RETURNS: The number of Doc objects added to the DocBin."""
         return len(self.tokens)

     def add(self, doc):
-        """Add a doc's annotations to the DocBin for serialization."""
+        """Add a Doc's annotations to the DocBin for serialization.
+
+        doc (Doc): The Doc object to add.
+
+        DOCS: https://spacy.io/api/docbin#add
+        """
         array = doc.to_array(self.attrs)
         if len(array.shape) == 1:
             array = array.reshape((array.shape[0], 1))
         self.tokens.append(array)
         spaces = doc.to_array(SPACY)
-        assert array.shape[0] == spaces.shape[0]
+        assert array.shape[0] == spaces.shape[0]  # this should never happen
         spaces = spaces.reshape((spaces.shape[0], 1))
         self.spaces.append(numpy.asarray(spaces, dtype=bool))
         self.strings.update(w.text for w in doc)
@@ -72,7 +86,13 @@ class DocBin(object):
             self.user_data.append(srsly.msgpack_dumps(doc.user_data))

     def get_docs(self, vocab):
-        """Recover Doc objects from the annotations, using the given vocab."""
+        """Recover Doc objects from the annotations, using the given vocab.
+
+        vocab (Vocab): The shared vocab.
+        YIELDS (Doc): The Doc objects.
+
+        DOCS: https://spacy.io/api/docbin#get_docs
+        """
         for string in self.strings:
             vocab[string]
         orth_col = self.attrs.index(ORTH)
@@ -87,8 +107,16 @@ class DocBin(object):
             yield doc

     def merge(self, other):
-        """Extend the annotations of this DocBin with the annotations from another."""
-        assert self.attrs == other.attrs
+        """Extend the annotations of this DocBin with the annotations from
+        another. Will raise an error if the pre-defined attrs of the two
+        DocBins don't match.
+
+        other (DocBin): The DocBin to merge into the current bin.
+
+        DOCS: https://spacy.io/api/docbin#merge
+        """
+        if self.attrs != other.attrs:
+            raise ValueError(Errors.E166.format(current=self.attrs, other=other.attrs))
         self.tokens.extend(other.tokens)
         self.spaces.extend(other.spaces)
         self.strings.update(other.strings)
@@ -96,9 +124,14 @@ class DocBin(object):
             self.user_data.extend(other.user_data)

     def to_bytes(self):
-        """Serialize the DocBin's annotations into a byte string."""
+        """Serialize the DocBin's annotations to a bytestring.
+
+        RETURNS (bytes): The serialized DocBin.
+
+        DOCS: https://spacy.io/api/docbin#to_bytes
+        """
         for tokens in self.tokens:
-            assert len(tokens.shape) == 2, tokens.shape
+            assert len(tokens.shape) == 2, tokens.shape  # this should never happen
         lengths = [len(tokens) for tokens in self.tokens]
         msg = {
             "attrs": self.attrs,
@@ -111,9 +144,15 @@ class DocBin(object):
             msg["user_data"] = self.user_data
         return gzip.compress(srsly.msgpack_dumps(msg))

-    def from_bytes(self, string):
-        """Deserialize the DocBin's annotations from a byte string."""
-        msg = srsly.msgpack_loads(gzip.decompress(string))
+    def from_bytes(self, bytes_data):
+        """Deserialize the DocBin's annotations from a bytestring.
+
+        bytes_data (bytes): The data to load from.
+        RETURNS (DocBin): The loaded DocBin.
+
+        DOCS: https://spacy.io/api/docbin#from_bytes
+        """
+        msg = srsly.msgpack_loads(gzip.decompress(bytes_data))
         self.attrs = msg["attrs"]
         self.strings = set(msg["strings"])
         lengths = numpy.fromstring(msg["lengths"], dtype="int32")
@@ -127,7 +166,7 @@ class DocBin(object):
         if self.store_user_data and "user_data" in msg:
             self.user_data = list(msg["user_data"])
         for tokens in self.tokens:
-            assert len(tokens.shape) == 2, tokens.shape
+            assert len(tokens.shape) == 2, tokens.shape  # this should never happen
         return self
website/docs/api/docbin.md (new file, 149 lines)
@@ -0,0 +1,149 @@
---
title: DocBin
tag: class
new: 2.2
teaser: Pack Doc objects for binary serialization
source: spacy/tokens/_serialize.py
---

The `DocBin` class lets you efficiently serialize the information from a
collection of `Doc` objects. You can control which information is serialized by
passing a list of attribute IDs, and optionally also specify whether the user
data is serialized. The `DocBin` is faster and produces smaller data sizes than
pickle, and allows you to deserialize without executing arbitrary Python code. A
notable downside to this format is that you can't easily extract just one
document from the `DocBin`. The serialization format is gzipped msgpack, where
the msgpack object has the following structure:

```python
### msgpack object structure
{
    "attrs": List[uint64],   # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE]
    "tokens": bytes,         # Serialized numpy uint64 array with the token data
    "spaces": bytes,         # Serialized numpy boolean array with spaces data
    "lengths": bytes,        # Serialized numpy int32 array with the doc lengths
    "strings": List[unicode] # List of unique strings in the token data
}
```
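
Because the format is plain gzipped msgpack, you can inspect the serialized
data directly with `srsly`, mirroring what `from_bytes` does internally. A
minimal, illustrative sketch – `doc_bin_bytes` is assumed to hold the output of
`DocBin.to_bytes`:

```python
### Inspecting the serialized data (illustrative)
import gzip
import srsly

# doc_bin_bytes: output of DocBin.to_bytes() (assumed to exist)
msg = srsly.msgpack_loads(gzip.decompress(doc_bin_bytes))
print(msg["attrs"])         # attribute IDs serialized for each token
print(len(msg["strings"]))  # number of unique strings in the token data
```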

Strings for the words, tags, labels etc. are represented by 64-bit hashes in the
token data, and every string that occurs at least once is passed via the strings
object. This means the storage is more efficient if you pack more documents
together, because you have less duplication in the strings. For usage examples,
see the docs on [serializing `Doc` objects](/usage/saving-loading#docs).
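
To make the hash indirection concrete, here's a small sketch using spaCy's
`StringStore`, which provides this kind of string-to-hash mapping (for
illustration only – `DocBin` manages the strings for you):

```python
### Strings vs. 64-bit hashes (illustrative)
from spacy.strings import StringStore

stringstore = StringStore(["coffee"])
coffee_hash = stringstore["coffee"]          # uint64 hash stored in token data
assert stringstore[coffee_hash] == "coffee"  # the strings list recovers the text
```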

## DocBin.\_\_init\_\_ {#init tag="method"}

Create a `DocBin` object to hold serialized annotations.

> #### Example
>
> ```python
> from spacy.tokens import DocBin
> doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
> ```

| Argument          | Type     | Description                                                                                                                                                                                |
| ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `attrs`           | list     | List of attributes to serialize. `orth` (hash of token text) and `spacy` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `None`. |
| `store_user_data` | bool     | Whether to include the `Doc.user_data`. Defaults to `False`.                                                                                                                               |
| **RETURNS**       | `DocBin` | The newly constructed object.                                                                                                                                                              |
## DocBin.\_\_len\_\_ {#len tag="method"}

Get the number of `Doc` objects that were added to the `DocBin`.

> #### Example
>
> ```python
> doc_bin = DocBin(attrs=["LEMMA"])
> doc = nlp("This is a document to serialize.")
> doc_bin.add(doc)
> assert len(doc_bin) == 1
> ```

| Argument    | Type | Description                                 |
| ----------- | ---- | ------------------------------------------- |
| **RETURNS** | int  | The number of `Doc`s added to the `DocBin`. |

## DocBin.add {#add tag="method"}

Add a `Doc`'s annotations to the `DocBin` for serialization.

> #### Example
>
> ```python
> doc_bin = DocBin(attrs=["LEMMA"])
> doc = nlp("This is a document to serialize.")
> doc_bin.add(doc)
> ```

| Argument | Type  | Description              |
| -------- | ----- | ------------------------ |
| `doc`    | `Doc` | The `Doc` object to add. |

## DocBin.get_docs {#get_docs tag="method"}

Recover `Doc` objects from the annotations, using the given vocab.

> #### Example
>
> ```python
> docs = list(doc_bin.get_docs(nlp.vocab))
> ```

| Argument   | Type    | Description        |
| ---------- | ------- | ------------------ |
| `vocab`    | `Vocab` | The shared vocab.  |
| **YIELDS** | `Doc`   | The `Doc` objects. |

## DocBin.merge {#merge tag="method"}

Extend the annotations of this `DocBin` with the annotations from another. Will
raise an error if the pre-defined attrs of the two `DocBin`s don't match.

> #### Example
>
> ```python
> doc_bin1 = DocBin(attrs=["LEMMA", "POS"])
> doc_bin1.add(nlp("Hello world"))
> doc_bin2 = DocBin(attrs=["LEMMA", "POS"])
> doc_bin2.add(nlp("This is a sentence"))
> doc_bin1.merge(doc_bin2)
> assert len(doc_bin1) == 2
> ```

| Argument | Type     | Description                                 |
| -------- | -------- | ------------------------------------------- |
| `other`  | `DocBin` | The `DocBin` to merge into the current bin. |
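
If the attribute lists don't match, `merge` raises a `ValueError` using error
`E166`. A minimal sketch of the failure case:

```python
from spacy.tokens import DocBin

doc_bin1 = DocBin(attrs=["LEMMA"])
doc_bin2 = DocBin(attrs=["POS"])
try:
    doc_bin1.merge(doc_bin2)
except ValueError as err:
    print(err)  # E166: can only merge DocBins with the same pre-defined attributes
```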

## DocBin.to_bytes {#to_bytes tag="method"}

Serialize the `DocBin`'s annotations to a bytestring.

> #### Example
>
> ```python
> doc_bin = DocBin(attrs=["DEP", "HEAD"])
> doc_bin_bytes = doc_bin.to_bytes()
> ```

| Argument    | Type  | Description              |
| ----------- | ----- | ------------------------ |
| **RETURNS** | bytes | The serialized `DocBin`. |

## DocBin.from_bytes {#from_bytes tag="method"}

Deserialize the `DocBin`'s annotations from a bytestring.

> #### Example
>
> ```python
> doc_bin_bytes = doc_bin.to_bytes()
> new_doc_bin = DocBin().from_bytes(doc_bin_bytes)
> ```

| Argument     | Type     | Description            |
| ------------ | -------- | ---------------------- |
| `bytes_data` | bytes    | The data to load from. |
| **RETURNS**  | `DocBin` | The loaded `DocBin`.   |
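
A typical round trip writes the bytestring to disk and restores it later. A
minimal sketch (the `docs.spacy` file name is just an example, not a required
extension):

```python
from pathlib import Path
from spacy.tokens import DocBin

doc_bin = DocBin(attrs=["LEMMA"])
# ... add docs here ...
Path("docs.spacy").write_bytes(doc_bin.to_bytes())              # save
doc_bin = DocBin().from_bytes(Path("docs.spacy").read_bytes())  # load
```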

website/docs/usage/saving-loading.md
@@ -59,6 +59,39 @@ initializes the language class, creates and adds the pipeline components and
 _then_ loads in the binary data. You can read more about this process
 [here](/usage/processing-pipelines#pipelines).

+### Serializing Doc objects efficiently {#docs new="2.2"}
+
+If you're working with lots of data, you'll probably need to pass analyses
+between machines, either to use something like [Dask](https://dask.org) or
+[Spark](https://spark.apache.org), or even just to save out work to disk. Often
+it's sufficient to use the [`Doc.to_array`](/api/doc#to_array) functionality for
+this, and just serialize the numpy arrays – but other times you want a more
+general way to save and restore `Doc` objects.
+
+The [`DocBin`](/api/docbin) class makes it easy to serialize and deserialize a
+collection of `Doc` objects together, and is much more efficient than calling
+[`Doc.to_bytes`](/api/doc#to_bytes) on each individual `Doc` object. You can
+also control what data gets saved, and you can merge pallets together for easy
+map/reduce-style processing.
+
+```python
+### {highlight="4,8,9,13,14"}
+import spacy
+from spacy.tokens import DocBin
+
+doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
+texts = ["Some text", "Lots of texts...", "..."]
+nlp = spacy.load("en_core_web_sm")
+for doc in nlp.pipe(texts):
+    doc_bin.add(doc)
+bytes_data = doc_bin.to_bytes()
+
+# Deserialize later, e.g. in a new process
+nlp = spacy.blank("en")
+doc_bin = DocBin().from_bytes(bytes_data)
+docs = list(doc_bin.get_docs(nlp.vocab))
+```
+
 ### Using Pickle {#pickle}

 > #### Example
website/docs/usage/v2-2.md
@@ -121,30 +121,38 @@ classification.
 ### New DocBin class to efficiently serialize Doc collections

 > #### Example
 >
 > ```python
 > from spacy.tokens import DocBin
-> doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=False)
+> doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
 > for doc in nlp.pipe(texts):
 >     doc_bin.add(doc)
-> byte_data = docbin.to_bytes()
+> bytes_data = doc_bin.to_bytes()
 > # Deserialize later, e.g. in a new process
 > nlp = spacy.blank("en")
-> doc_bin = DocBin()
+> doc_bin = DocBin().from_bytes(bytes_data)
 > docs = list(doc_bin.get_docs(nlp.vocab))
 > ```

 If you're working with lots of data, you'll probably need to pass analyses
-between machines, either to use something like Dask or Spark, or even just to
-save out work to disk. Often it's sufficient to use the doc.to_array()
-functionality for this, and just serialize the numpy arrays --- but other times
-you want a more general way to save and restore `Doc` objects.
+between machines, either to use something like [Dask](https://dask.org) or
+[Spark](https://spark.apache.org), or even just to save out work to disk. Often
+it's sufficient to use the `Doc.to_array` functionality for this, and just
+serialize the numpy arrays – but other times you want a more general way to save
+and restore `Doc` objects.

-The new `DocBin` class makes it easy to serialize and deserialize
-a collection of `Doc` objects together, and is much more efficient than
-calling `doc.to_bytes()` on each individual `Doc` object. You can also control
-what data gets saved, and you can merge pallets together for easy
-map/reduce-style processing.
+The new `DocBin` class makes it easy to serialize and deserialize a collection
+of `Doc` objects together, and is much more efficient than calling
+`Doc.to_bytes` on each individual `Doc` object. You can also control what data
+gets saved, and you can merge pallets together for easy map/reduce-style
+processing.

+<Infobox>
+
+**API:** [`DocBin`](/api/docbin) **Usage:**
+[Serializing Doc objects](/usage/saving-loading#docs)
+
+</Infobox>
+
 ### CLI command to debug and validate training data {#debug-data}
website/meta/sidebar.json
@@ -95,7 +95,8 @@
         { "text": "KnowledgeBase", "url": "/api/kb" },
         { "text": "GoldParse", "url": "/api/goldparse" },
         { "text": "GoldCorpus", "url": "/api/goldcorpus" },
-        { "text": "Scorer", "url": "/api/scorer" }
+        { "text": "Scorer", "url": "/api/scorer" },
+        { "text": "DocBin", "url": "/api/docbin" }
       ]
     },
     {