mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-26 21:51:24 +03:00 
			
		
		
		
	Update DocBin and add docs
This commit is contained in:
		
							parent
							
								
									d62690b3ba
								
							
						
					
					
						commit
						dd1810f05a
					
				|  | @ -467,6 +467,8 @@ class Errors(object): | |||
|     E164 = ("x is neither increasing nor decreasing: {}.") | ||||
|     E165 = ("Only one class present in y_true. ROC AUC score is not defined in " | ||||
|             "that case.") | ||||
|     E166 = ("Can only merge DocBins with the same pre-defined attributes.\n" | ||||
|             "Current DocBin: {current}\nOther DocBin: {other}") | ||||
| 
 | ||||
| 
 | ||||
| @add_codes | ||||
|  |  | |||
|  | @ -8,7 +8,8 @@ from thinc.neural.ops import NumpyOps | |||
| 
 | ||||
| from ..compat import copy_reg | ||||
| from ..tokens import Doc | ||||
| from ..attrs import SPACY, ORTH | ||||
| from ..attrs import SPACY, ORTH, intify_attrs | ||||
| from ..errors import Errors | ||||
| 
 | ||||
| 
 | ||||
| class DocBin(object): | ||||
|  | @ -38,33 +39,46 @@ class DocBin(object): | |||
|     documents together, because you have less duplication in the strings. | ||||
| 
 | ||||
|     A notable downside to this format is that you can't easily extract just one | ||||
|     document from the pallet. | ||||
|     document from the DocBin. | ||||
|     """ | ||||
| 
 | ||||
|     def __init__(self, attrs=None, store_user_data=False): | ||||
|         """Create a DocBin object, to hold serialized annotations. | ||||
|         """Create a DocBin object to hold serialized annotations. | ||||
| 
 | ||||
|         attrs (list): List of attributes to serialize. 'orth' and 'spacy' are | ||||
|             always serialized, so they're not required. Defaults to None. | ||||
|         store_user_data (bool): Whether to include the `Doc.user_data`. | ||||
|         RETURNS (DocBin): The newly constructed object. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/docbin#init | ||||
|         """ | ||||
|         attrs = attrs or [] | ||||
|         # Ensure ORTH is always attrs[0] | ||||
|         attrs = sorted(intify_attrs(attrs)) | ||||
|         self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY] | ||||
|         self.attrs.insert(0, ORTH) | ||||
|         self.attrs.insert(0, ORTH)  # Ensure ORTH is always attrs[0] | ||||
|         self.tokens = [] | ||||
|         self.spaces = [] | ||||
|         self.user_data = [] | ||||
|         self.strings = set() | ||||
|         self.store_user_data = store_user_data | ||||
| 
 | ||||
|     def __len__(self): | ||||
|         """RETURNS: The number of Doc objects added to the DocBin.""" | ||||
|         return len(self.tokens) | ||||
| 
 | ||||
|     def add(self, doc): | ||||
|         """Add a doc's annotations to the DocBin for serialization.""" | ||||
|         """Add a Doc's annotations to the DocBin for serialization. | ||||
| 
 | ||||
|         doc (Doc): The Doc object to add. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/docbin#add | ||||
|         """ | ||||
|         array = doc.to_array(self.attrs) | ||||
|         if len(array.shape) == 1: | ||||
|             array = array.reshape((array.shape[0], 1)) | ||||
|         self.tokens.append(array) | ||||
|         spaces = doc.to_array(SPACY) | ||||
|         assert array.shape[0] == spaces.shape[0] | ||||
|         assert array.shape[0] == spaces.shape[0]  # this should never happen | ||||
|         spaces = spaces.reshape((spaces.shape[0], 1)) | ||||
|         self.spaces.append(numpy.asarray(spaces, dtype=bool)) | ||||
|         self.strings.update(w.text for w in doc) | ||||
|  | @ -72,7 +86,13 @@ class DocBin(object): | |||
|             self.user_data.append(srsly.msgpack_dumps(doc.user_data)) | ||||
| 
 | ||||
|     def get_docs(self, vocab): | ||||
|         """Recover Doc objects from the annotations, using the given vocab.""" | ||||
|         """Recover Doc objects from the annotations, using the given vocab. | ||||
| 
 | ||||
|         vocab (Vocab): The shared vocab. | ||||
|         YIELDS (Doc): The Doc objects. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/docbin#get_docs | ||||
|         """ | ||||
|         for string in self.strings: | ||||
|             vocab[string] | ||||
|         orth_col = self.attrs.index(ORTH) | ||||
|  | @ -87,8 +107,16 @@ class DocBin(object): | |||
|             yield doc | ||||
| 
 | ||||
|     def merge(self, other): | ||||
|         """Extend the annotations of this DocBin with the annotations from another.""" | ||||
|         assert self.attrs == other.attrs | ||||
|         """Extend the annotations of this DocBin with the annotations from | ||||
|         another. Will raise an error if the pre-defined attrs of the two | ||||
|         DocBins don't match. | ||||
| 
 | ||||
|         other (DocBin): The DocBin to merge into the current bin. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/docbin#merge | ||||
|         """ | ||||
|         if self.attrs != other.attrs: | ||||
|             raise ValueError(Errors.E166.format(current=self.attrs, other=other.attrs)) | ||||
|         self.tokens.extend(other.tokens) | ||||
|         self.spaces.extend(other.spaces) | ||||
|         self.strings.update(other.strings) | ||||
|  | @ -96,9 +124,14 @@ class DocBin(object): | |||
|             self.user_data.extend(other.user_data) | ||||
| 
 | ||||
|     def to_bytes(self): | ||||
|         """Serialize the DocBin's annotations into a byte string.""" | ||||
|         """Serialize the DocBin's annotations to a bytestring. | ||||
| 
 | ||||
|         RETURNS (bytes): The serialized DocBin. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/docbin#to_bytes | ||||
|         """ | ||||
|         for tokens in self.tokens: | ||||
|             assert len(tokens.shape) == 2, tokens.shape | ||||
|             assert len(tokens.shape) == 2, tokens.shape  # this should never happen | ||||
|         lengths = [len(tokens) for tokens in self.tokens] | ||||
|         msg = { | ||||
|             "attrs": self.attrs, | ||||
|  | @ -111,9 +144,15 @@ class DocBin(object): | |||
|             msg["user_data"] = self.user_data | ||||
|         return gzip.compress(srsly.msgpack_dumps(msg)) | ||||
| 
 | ||||
|     def from_bytes(self, string): | ||||
|         """Deserialize the DocBin's annotations from a byte string.""" | ||||
|         msg = srsly.msgpack_loads(gzip.decompress(string)) | ||||
|     def from_bytes(self, bytes_data): | ||||
|         """Deserialize the DocBin's annotations from a bytestring. | ||||
| 
 | ||||
|         bytes_data (bytes): The data to load from. | ||||
|         RETURNS (DocBin): The loaded DocBin. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/docbin#from_bytes | ||||
|         """ | ||||
|         msg = srsly.msgpack_loads(gzip.decompress(bytes_data)) | ||||
|         self.attrs = msg["attrs"] | ||||
|         self.strings = set(msg["strings"]) | ||||
|         lengths = numpy.fromstring(msg["lengths"], dtype="int32") | ||||
|  | @ -127,7 +166,7 @@ class DocBin(object): | |||
|         if self.store_user_data and "user_data" in msg: | ||||
|             self.user_data = list(msg["user_data"]) | ||||
|         for tokens in self.tokens: | ||||
|             assert len(tokens.shape) == 2, tokens.shape | ||||
|             assert len(tokens.shape) == 2, tokens.shape  # this should never happen | ||||
|         return self | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
										149
									
								
								website/docs/api/docbin.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										149
									
								
								website/docs/api/docbin.md
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,149 @@ | |||
| --- | ||||
| title: DocBin | ||||
| tag: class | ||||
| new: 2.2 | ||||
| teaser: Pack Doc objects for binary serialization | ||||
| source: spacy/tokens/_serialize.py | ||||
| --- | ||||
| 
 | ||||
| The `DocBin` class lets you efficiently serialize the information from a | ||||
| collection of `Doc` objects. You can control which information is serialized by | ||||
| passing a list of attribute IDs, and optionally also specify whether the user | ||||
| data is serialized. The `DocBin` is faster and produces smaller data sizes than | ||||
| pickle, and allows you to deserialize without executing arbitrary Python code. A | ||||
| notable downside to this format is that you can't easily extract just one | ||||
| document from the `DocBin`. The serialization format is gzipped msgpack, where | ||||
| the msgpack object has the following structure: | ||||
| 
 | ||||
| ```python | ||||
| ### msgpack object structure | ||||
| { | ||||
|     "attrs": List[uint64],    # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE] | ||||
|     "tokens": bytes,          # Serialized numpy uint64 array with the token data | ||||
|     "spaces": bytes,          # Serialized numpy boolean array with spaces data | ||||
|     "lengths": bytes,         # Serialized numpy int32 array with the doc lengths | ||||
|     "strings": List[unicode]  # List of unique strings in the token data | ||||
| } | ||||
| ``` | ||||
| 
 | ||||
| Strings for the words, tags, labels etc. are represented by 64-bit hashes in the | ||||
| token data, and every string that occurs at least once is passed via the strings | ||||
| object. This means the storage is more efficient if you pack more documents | ||||
| together, because you have less duplication in the strings. For usage examples, | ||||
| see the docs on [serializing `Doc` objects](/usage/saving-loading#docs). | ||||
| 
 | ||||
| ## DocBin.\_\_init\_\_ {#init tag="method"} | ||||
| 
 | ||||
| Create a `DocBin` object to hold serialized annotations. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > from spacy.tokens import DocBin | ||||
| > doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"]) | ||||
| > ``` | ||||
| 
 | ||||
| | Argument          | Type     | Description                                                                                                                                                                                | | ||||
| | ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ||||
| | `attrs`           | list     | List of attributes to serialize. `orth` (hash of token text) and `spacy` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `None`. | | ||||
| | `store_user_data` | bool     | Whether to include the `Doc.user_data`. Defaults to `False`.                                                                                                                               | | ||||
| | **RETURNS**       | `DocBin` | The newly constructed object.                                                                                                                                                              | | ||||
| 
 | ||||
| ## DocBin.\_\_len\_\_ {#len tag="method"} | ||||
| 
 | ||||
| Get the number of `Doc` objects that were added to the `DocBin`. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > doc_bin = DocBin(attrs=["LEMMA"]) | ||||
| > doc = nlp("This is a document to serialize.") | ||||
| > doc_bin.add(doc) | ||||
| > assert len(doc_bin) == 1 | ||||
| > ``` | ||||
| 
 | ||||
| | Argument    | Type | Description                                 | | ||||
| | ----------- | ---- | ------------------------------------------- | | ||||
| | **RETURNS** | int  | The number of `Doc`s added to the `DocBin`. | | ||||
| 
 | ||||
| ## DocBin.add {#add tag="method"} | ||||
| 
 | ||||
| Add a `Doc`'s annotations to the `DocBin` for serialization. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > doc_bin = DocBin(attrs=["LEMMA"]) | ||||
| > doc = nlp("This is a document to serialize.") | ||||
| > doc_bin.add(doc) | ||||
| > ``` | ||||
| 
 | ||||
| | Argument | Type  | Description              | | ||||
| | -------- | ----- | ------------------------ | | ||||
| | `doc`    | `Doc` | The `Doc` object to add. | | ||||
| 
 | ||||
| ## DocBin.get_docs {#get_docs tag="method"} | ||||
| 
 | ||||
| Recover `Doc` objects from the annotations, using the given vocab. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > docs = list(doc_bin.get_docs(nlp.vocab)) | ||||
| > ``` | ||||
| 
 | ||||
| | Argument   | Type    | Description        | | ||||
| | ---------- | ------- | ------------------ | | ||||
| | `vocab`    | `Vocab` | The shared vocab.  | | ||||
| | **YIELDS** | `Doc`   | The `Doc` objects. | | ||||
| 
 | ||||
| ## DocBin.merge {#merge tag="method"} | ||||
| 
 | ||||
| Extend the annotations of this `DocBin` with the annotations from another. Will | ||||
| raise an error if the pre-defined attrs of the two `DocBin`s don't match. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > doc_bin1 = DocBin(attrs=["LEMMA", "POS"]) | ||||
| > doc_bin1.add(nlp("Hello world")) | ||||
| > doc_bin2 = DocBin(attrs=["LEMMA", "POS"]) | ||||
| > doc_bin2.add(nlp("This is a sentence")) | ||||
| > doc_bin1.merge(doc_bin2) | ||||
| > assert len(doc_bin1) == 2 | ||||
| > ``` | ||||
| 
 | ||||
| | Argument | Type     | Description                                 | | ||||
| | -------- | -------- | ------------------------------------------- | | ||||
| | `other`  | `DocBin` | The `DocBin` to merge into the current bin. | | ||||
| 
 | ||||
| ## DocBin.to_bytes {#to_bytes tag="method"} | ||||
| 
 | ||||
| Serialize the `DocBin`'s annotations to a bytestring. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > doc_bin = DocBin(attrs=["DEP", "HEAD"]) | ||||
| > doc_bin_bytes = doc_bin.to_bytes() | ||||
| > ``` | ||||
| 
 | ||||
| | Argument    | Type  | Description              | | ||||
| | ----------- | ----- | ------------------------ | | ||||
| | **RETURNS** | bytes | The serialized `DocBin`. | | ||||
| 
 | ||||
| ## DocBin.from_bytes {#from_bytes tag="method"} | ||||
| 
 | ||||
| Deserialize the `DocBin`'s annotations from a bytestring. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > doc_bin_bytes = doc_bin.to_bytes() | ||||
| > new_doc_bin = DocBin().from_bytes(doc_bin_bytes) | ||||
| > ``` | ||||
| 
 | ||||
| | Argument     | Type     | Description            | | ||||
| | ------------ | -------- | ---------------------- | | ||||
| | `bytes_data` | bytes    | The data to load from. | | ||||
| | **RETURNS**  | `DocBin` | The loaded `DocBin`.   | | ||||
|  | @ -59,6 +59,39 @@ initializes the language class, creates and adds the pipeline components and | |||
| _then_ loads in the binary data. You can read more about this process | ||||
| [here](/usage/processing-pipelines#pipelines). | ||||
| 
 | ||||
| ### Serializing Doc objects efficiently {#docs new="2.2"} | ||||
| 
 | ||||
| If you're working with lots of data, you'll probably need to pass analyses | ||||
| between machines, either to use something like [Dask](https://dask.org) or | ||||
| [Spark](https://spark.apache.org), or even just to save out work to disk. Often | ||||
| it's sufficient to use the [`Doc.to_array`](/api/doc#to_array) functionality for | ||||
| this, and just serialize the numpy arrays – but other times you want a more | ||||
| general way to save and restore `Doc` objects. | ||||
| 
 | ||||
| The [`DocBin`](/api/docbin) class makes it easy to serialize and deserialize a | ||||
| collection of `Doc` objects together, and is much more efficient than calling | ||||
| [`Doc.to_bytes`](/api/doc#to_bytes) on each individual `Doc` object. You can | ||||
| also control what data gets saved, and you can merge `DocBin`s together for easy | ||||
| map/reduce-style processing. | ||||
| 
 | ||||
| ```python | ||||
| ### {highlight="4,8,9,13,14"} | ||||
| import spacy | ||||
| from spacy.tokens import DocBin | ||||
| 
 | ||||
| doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True) | ||||
| texts = ["Some text", "Lots of texts...", "..."] | ||||
| nlp = spacy.load("en_core_web_sm") | ||||
| for doc in nlp.pipe(texts): | ||||
|     doc_bin.add(doc) | ||||
| bytes_data = doc_bin.to_bytes() | ||||
| 
 | ||||
| # Deserialize later, e.g. in a new process | ||||
| nlp = spacy.blank("en") | ||||
| doc_bin = DocBin().from_bytes(bytes_data) | ||||
| docs = list(doc_bin.get_docs(nlp.vocab)) | ||||
| ``` | ||||
| 
 | ||||
| ### Using Pickle {#pickle} | ||||
| 
 | ||||
| > #### Example | ||||
|  |  | |||
|  | @ -124,27 +124,35 @@ classification. | |||
| > | ||||
| > ```python | ||||
| > from spacy.tokens import DocBin | ||||
| > doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=False) | ||||
| > doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True) | ||||
| > for doc in nlp.pipe(texts): | ||||
| >     doc_bin.add(doc) | ||||
| > byte_data = docbin.to_bytes() | ||||
| > bytes_data = doc_bin.to_bytes() | ||||
| > # Deserialize later, e.g. in a new process | ||||
| > nlp = spacy.blank("en") | ||||
| > doc_bin = DocBin() | ||||
| > doc_bin = DocBin().from_bytes(bytes_data) | ||||
| > docs = list(doc_bin.get_docs(nlp.vocab)) | ||||
| > ``` | ||||
| 
 | ||||
| If you're working with lots of data, you'll probably need to pass analyses | ||||
| between machines, either to use something like Dask or Spark, or even just to | ||||
| save out work to disk. Often it's sufficient to use the doc.to_array() | ||||
| functionality for this, and just serialize the numpy arrays --- but other times | ||||
| you want a more general way to save and restore `Doc` objects. | ||||
| between machines, either to use something like [Dask](https://dask.org) or | ||||
| [Spark](https://spark.apache.org), or even just to save out work to disk. Often | ||||
| it's sufficient to use the `Doc.to_array` functionality for this, and just | ||||
| serialize the numpy arrays – but other times you want a more general way to save | ||||
| and restore `Doc` objects. | ||||
| 
 | ||||
| The new `DocBin` class makes it easy to serialize and deserialize | ||||
| a collection of `Doc` objects together, and is much more efficient than | ||||
| calling `doc.to_bytes()` on each individual `Doc` object. You can also control | ||||
| what data gets saved, and you can merge pallets together for easy | ||||
| map/reduce-style processing. | ||||
| The new `DocBin` class makes it easy to serialize and deserialize a collection | ||||
| of `Doc` objects together, and is much more efficient than calling | ||||
| `Doc.to_bytes` on each individual `Doc` object. You can also control what data | ||||
| gets saved, and you can merge `DocBin`s together for easy map/reduce-style | ||||
| processing. | ||||
| 
 | ||||
| <Infobox> | ||||
| 
 | ||||
| **API:** [`DocBin`](/api/docbin) **Usage:** | ||||
| [Serializing Doc objects](/usage/saving-loading#docs) | ||||
| 
 | ||||
| </Infobox> | ||||
| 
 | ||||
| ### CLI command to debug and validate training data {#debug-data} | ||||
| 
 | ||||
|  |  | |||
|  | @ -95,7 +95,8 @@ | |||
|                     { "text": "KnowledgeBase", "url": "/api/kb" }, | ||||
|                     { "text": "GoldParse", "url": "/api/goldparse" }, | ||||
|                     { "text": "GoldCorpus", "url": "/api/goldcorpus" }, | ||||
|                     { "text": "Scorer", "url": "/api/scorer" } | ||||
|                     { "text": "Scorer", "url": "/api/scorer" }, | ||||
|                     { "text": "DocBin", "url": "/api/docbin" } | ||||
|                 ] | ||||
|             }, | ||||
|             { | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user