	Add strings and ENT_KB_ID to Doc serialization (#5691)
* Add strings for all writeable Token attributes to `Doc.to/from_bytes()`.
* Add ENT_KB_ID to default attributes.
This commit is contained in:
parent 971826a96d
commit a77c4c3465
@@ -106,10 +106,16 @@ def test_doc_api_getitem(en_tokenizer):
 )
 def test_doc_api_serialize(en_tokenizer, text):
     tokens = en_tokenizer(text)
+    tokens[0].lemma_ = "lemma"
+    tokens[0].norm_ = "norm"
+    tokens[0].ent_kb_id_ = "ent_kb_id"
     new_tokens = Doc(tokens.vocab).from_bytes(tokens.to_bytes())
     assert tokens.text == new_tokens.text
     assert [t.text for t in tokens] == [t.text for t in new_tokens]
     assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
+    assert new_tokens[0].lemma_ == "lemma"
+    assert new_tokens[0].norm_ == "norm"
+    assert new_tokens[0].ent_kb_id_ == "ent_kb_id"
 
     new_tokens = Doc(tokens.vocab).from_bytes(
         tokens.to_bytes(exclude=["tensor"]), exclude=["tensor"]
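The test above round-trips through the same Vocab. A minimal sketch (not part of this commit, assuming only spaCy's public Doc/Vocab API) of the behaviour the serialized strings enable, namely deserializing into a Doc backed by a completely fresh Vocab; the KB identifier value is purely illustrative:

    import spacy
    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    nlp = spacy.blank("en")
    doc = nlp("Apple is looking at buying a startup")
    # Setting writeable string attributes also interns the strings in doc.vocab.
    doc[0].lemma_ = "apple"
    doc[0].norm_ = "apple"
    doc[0].ent_kb_id_ = "Q312"  # illustrative knowledge-base identifier

    data = doc.to_bytes()
    # Deserialize into a Doc with a brand-new Vocab: because the payload now
    # carries the raw strings, the hash-valued attributes resolve back to text.
    new_doc = Doc(Vocab()).from_bytes(data)
    assert new_doc[0].lemma_ == "apple"
    assert new_doc[0].ent_kb_id_ == "Q312"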
@@ -892,7 +892,7 @@ cdef class Doc:
 
         DOCS: https://spacy.io/api/doc#to_bytes
         """
-        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, NORM]  # TODO: ENT_KB_ID ?
+        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, NORM, ENT_KB_ID]
         if self.is_tagged:
             array_head.extend([TAG, POS])
         # If doc parsed add head and dep attribute
@@ -901,6 +901,14 @@ cdef class Doc:
         # Otherwise add sent_start
         else:
             array_head.append(SENT_START)
+        strings = set()
+        for token in self:
+            strings.add(token.tag_)
+            strings.add(token.lemma_)
+            strings.add(token.dep_)
+            strings.add(token.ent_type_)
+            strings.add(token.ent_kb_id_)
+            strings.add(token.norm_)
         # Msgpack doesn't distinguish between lists and tuples, which is
         # vexing for user data. As a best guess, we *know* that within
         # keys, we must have tuples. In values we just have to hope
@@ -912,6 +920,7 @@ cdef class Doc:
             "sentiment": lambda: self.sentiment,
             "tensor": lambda: self.tensor,
             "cats": lambda: self.cats,
+            "strings": lambda: list(strings),
         }
         for key in kwargs:
             if key in serializers or key in ("user_data", "user_data_keys", "user_data_values"):
@@ -942,6 +951,7 @@ cdef class Doc:
             "sentiment": lambda b: None,
             "tensor": lambda b: None,
             "cats": lambda b: None,
+            "strings": lambda b: None,
             "user_data_keys": lambda b: None,
             "user_data_values": lambda b: None,
         }
@@ -965,6 +975,9 @@ cdef class Doc:
             self.tensor = msg["tensor"]
         if "cats" not in exclude and "cats" in msg:
             self.cats = msg["cats"]
+        if "strings" not in exclude and "strings" in msg:
+            for s in msg["strings"]:
+                self.vocab.strings.add(s)
         start = 0
         cdef const LexemeC* lex
         cdef unicode orth_
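For context on the hunks above: the `to_bytes` change also adds ENT_KB_ID to the default attribute array, so the knowledge-base id travels with the other per-token attributes, while `from_bytes` re-adds each serialized string to the receiving vocab's StringStore before the arrays are loaded. Token attributes are stored internally as 64-bit hashes, so the text is only recoverable if the receiving side has interned the same strings. A standalone sketch of that mechanism, assuming only spaCy's public StringStore API (hashes are deterministic, so interning the same text on the receiving side makes the sender's key resolvable):

    from spacy.strings import StringStore

    sender = StringStore()
    key = sender.add("ent_kb_id")  # the hash that would sit in the attribute array
    assert sender[key] == "ent_kb_id"

    receiver = StringStore()
    # The receiver only sees the hash; without the raw string the lookup fails.
    # Re-adding the shipped string (what from_bytes now does) restores the mapping.
    receiver.add("ent_kb_id")
    assert receiver[key] == "ent_kb_id"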