mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	DocBin: add version number, missing attributes and strings (#5685)
* Add version number to DocBin Add a version number to DocBin for future use. * Add POS to all attributes in DocBin * Add morph string to strings in DocBin * Update DocBin API * Add string for ENT_KB_ID in DocBin
This commit is contained in:
		
							parent
							
								
									b5268955d7
								
							
						
					
					
						commit
						a723fa02a1
					
				|  | @ -9,7 +9,7 @@ from ..attrs import SPACY, ORTH, intify_attr | |||
| from ..errors import Errors | ||||
| 
 | ||||
| 
 | ||||
| ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH") | ||||
| ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS") | ||||
| 
 | ||||
| 
 | ||||
| class DocBin(object): | ||||
|  | @ -31,6 +31,7 @@ class DocBin(object): | |||
|         "spaces": bytes, # Serialized numpy boolean array with spaces data | ||||
|         "lengths": bytes, # Serialized numpy int32 array with the doc lengths | ||||
|         "strings": List[unicode], # List of unique strings in the token data | ||||
|         "version": str, # DocBin version number | ||||
|     } | ||||
| 
 | ||||
|     Strings for the words, tags, labels etc are represented by 64-bit hashes in | ||||
|  | @ -53,6 +54,7 @@ class DocBin(object): | |||
|         DOCS: https://spacy.io/api/docbin#init | ||||
|         """ | ||||
|         attrs = sorted([intify_attr(attr) for attr in attrs]) | ||||
|         self.version = "0.1" | ||||
|         self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY] | ||||
|         self.attrs.insert(0, ORTH)  # Ensure ORTH is always attrs[0] | ||||
|         self.tokens = [] | ||||
|  | @ -87,8 +89,10 @@ class DocBin(object): | |||
|             self.strings.add(token.text) | ||||
|             self.strings.add(token.tag_) | ||||
|             self.strings.add(token.lemma_) | ||||
|             self.strings.add(token.morph_) | ||||
|             self.strings.add(token.dep_) | ||||
|             self.strings.add(token.ent_type_) | ||||
|             self.strings.add(token.ent_kb_id_) | ||||
|         self.cats.append(doc.cats) | ||||
|         if self.store_user_data: | ||||
|             self.user_data.append(srsly.msgpack_dumps(doc.user_data)) | ||||
|  | @ -147,6 +151,7 @@ class DocBin(object): | |||
|         spaces = numpy.vstack(self.spaces) if self.spaces else numpy.asarray([]) | ||||
| 
 | ||||
|         msg = { | ||||
|             "version": self.version, | ||||
|             "attrs": self.attrs, | ||||
|             "tokens": tokens.tobytes("C"), | ||||
|             "spaces": spaces.tobytes("C"), | ||||
|  |  | |||
|  | @ -16,8 +16,9 @@ document from the `DocBin`. The serialization format is gzipped msgpack, where | |||
| the msgpack object has the following structure: | ||||
| 
 | ||||
| ```python | ||||
| ### msgpack object strcutrue | ||||
| ### msgpack object structure | ||||
| { | ||||
|     "version": str,           # DocBin version number | ||||
|     "attrs": List[uint64],    # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE] | ||||
|     "tokens": bytes,          # Serialized numpy uint64 array with the token data | ||||
|     "spaces": bytes,          # Serialized numpy boolean array with spaces data | ||||
|  | @ -45,7 +46,7 @@ Create a `DocBin` object to hold serialized annotations. | |||
| 
 | ||||
| | Argument          | Type     | Description                                                                                                                                                                                | | ||||
| | ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ||||
| | `attrs`           | list     | List of attributes to serialize. `orth` (hash of token text) and `spacy` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `None`. | | ||||
| | `attrs`           | list     | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. | | ||||
| | `store_user_data` | bool     | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`.                                                                                 | | ||||
| | **RETURNS**       | `DocBin` | The newly constructed object.                                                                                                                                                              | | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user