From 3507943b15e27cad91e55d2c3ff24ad0d959c683 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 18 Sep 2019 13:25:47 +0200
Subject: [PATCH] Add docstring for DocPallet

---
 spacy/tokens/_serialize.py | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py
index 222806545..473d941b4 100644
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/_serialize.py
@@ -12,8 +12,34 @@ from ..attrs import SPACY, ORTH
 
 
 class DocPallet(object):
-    """Serialize analyses from a collection of doc objects."""
+    """Pack Doc objects for export.
+
+    The DocPallet class lets you efficiently serialize the information from a
+    collection of Doc objects. You can control which information is serialized
+    by passing a list of attribute IDs, and optionally also specify whether the
+    user data is serialized. The DocPallet is faster and produces smaller data
+    sizes than pickle, and allows you to deserialize without executing arbitrary
+    Python code.
+    The serialization format is gzipped msgpack, where the msgpack object has
+    the following structure:
+
+        {
+            "attrs": List[uint64], # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE]
+            "tokens": bytes, # Serialized numpy uint64 array with the token data
+            "spaces": bytes, # Serialized numpy boolean array with spaces data
+            "lengths": bytes, # Serialized numpy int32 array with the doc lengths
+            "strings": List[unicode] # List of unique strings in the token data
+        }
+
+    Strings for the words, tags, labels etc. are represented by 64-bit hashes in
+    the token data, and every string that occurs at least once is passed via the
+    strings object. This means the storage is more efficient if you pack more
+    documents together, because you have less duplication in the strings.
+
+    A notable downside to this format is that you can't easily extract just one
+    document from the pallet.
+    """
 
     def __init__(self, attrs=None, store_user_data=False):
         """Create a DocBox object, to hold serialized annotations.
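
A minimal sketch of consuming the format the new docstring describes: gunzip
and msgpack-decode a pallet's bytes, rebuild the numpy arrays, and split the
flat token table back into per-document pieces. The helper name unpack_pallet
is hypothetical, and the sketch assumes details the docstring does not state:
that the arrays were written with numpy's tobytes() in document order, that
the token table is row-major with one column per attribute ID, and that the
msgpack-python and numpy packages are available.

    import gzip

    import msgpack
    import numpy

    def unpack_pallet(data):
        """Decode gzipped-msgpack bytes into per-document token arrays.

        Hypothetical helper; it only relies on the structure documented
        in the DocPallet docstring above.
        """
        msg = msgpack.loads(gzip.decompress(data), raw=False)
        attrs = msg["attrs"]
        # Token table: one row per token, one column per attribute ID
        # (assumed row-major layout).
        tokens = numpy.frombuffer(msg["tokens"], dtype="uint64")
        tokens = tokens.reshape((-1, len(attrs)))
        spaces = numpy.frombuffer(msg["spaces"], dtype=bool)
        lengths = numpy.frombuffer(msg["lengths"], dtype="int32")
        # The 64-bit hashes in the token table resolve against this list
        # of unique strings.
        strings = msg["strings"]
        # Split the flat arrays into one (tokens, spaces) pair per Doc,
        # using the per-document lengths.
        docs, start = [], 0
        for length in lengths:
            docs.append((tokens[start:start + length],
                         spaces[start:start + length]))
            start += length
        return attrs, strings, docs

Because each document's rows are recovered only by offsets into the shared
arrays, extracting a single document still means decompressing and decoding
the whole pallet, which is the downside the docstring notes.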