mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	adding aliases per entity in the KB
This commit is contained in:
		
							parent
							
								
									3945fd21b0
								
							
						
					
					
						commit
						5ac7edf53c
					
				
							
								
								
									
										53
									
								
								spacy/kb.pxd
									
									
									
									
									
								
							
							
						
						
									
										53
									
								
								spacy/kb.pxd
									
									
									
									
									
								
							|  | @ -27,15 +27,25 @@ cdef struct _EntryC: | |||
|     float prob | ||||
| 
 | ||||
| 
 | ||||
| # Each alias struct stores a list of Entry pointers with their prior probabilities | ||||
| # for this specific mention/alias. | ||||
| cdef struct _AliasC: | ||||
| 
 | ||||
|     # All entry candidates for this alias | ||||
|     const vector[int64_t] entry_indices | ||||
| 
 | ||||
|     # Prior probability P(entity|alias) - should sum up to (at most) 1. | ||||
|     const vector[float] probs | ||||
| 
 | ||||
| 
 | ||||
| cdef class KnowledgeBase: | ||||
|     cdef Pool mem | ||||
| 
 | ||||
|     # This maps 64bit keys to 64bit values. Here the key would be a hash of | ||||
|     # a unique string name for the entity, and the value would be the position | ||||
|     # of the _EntryC struct in our vector. | ||||
|     # This maps 64bit keys (hash of unique entity string) | ||||
|     # to 64bit values (position of the _EntryC struct in the _entries vector). | ||||
|     # The PreshMap is pretty space efficient, as it uses open addressing. So | ||||
|     # the only overhead is the vacancy rate, which is approximately 30%. | ||||
|     cdef PreshMap _index | ||||
|     cdef PreshMap _entry_index | ||||
| 
 | ||||
|     # Each entry takes 128 bits, and again we'll have a 30% or so overhead for | ||||
|     # over allocation. | ||||
|  | @ -43,6 +53,16 @@ cdef class KnowledgeBase: | |||
|     # Storing 1m entries would take 41.6mb under this scheme. | ||||
|     cdef vector[_EntryC] _entries | ||||
| 
 | ||||
|     # This maps 64bit keys (hash of unique alias string) | ||||
|     # to 64bit values (position of the _AliasC struct in the _aliases_table vector). | ||||
|     cdef PreshMap _alias_index | ||||
| 
 | ||||
|     # This should map mention hashes to (entry_id, prob) tuples. The probability | ||||
|     # should be P(entity | mention), which is pretty important to know. | ||||
|     # We can pack both pieces of information into a 64-bit value, to keep things | ||||
|     # efficient. | ||||
|     cdef vector[_AliasC] _aliases_table | ||||
| 
 | ||||
|     # This is the part which might take more space: storing various | ||||
|     # categorical features for the entries, and storing vectors for disambiguation | ||||
|     # and possibly usage. | ||||
|  | @ -61,23 +81,30 @@ cdef class KnowledgeBase: | |||
|     # optional data, we can let users configure a DB as the backend for this. | ||||
|     cdef object _features_table | ||||
| 
 | ||||
|     # This should map mention hashes to (entry_id, prob) tuples. The probability | ||||
|     # should be P(entity | mention), which is pretty important to know. | ||||
|     # We can pack both pieces of information into a 64-bit value, to keep things | ||||
|     # efficient. | ||||
|     cdef object _aliases_table | ||||
| 
 | ||||
|     cdef inline int64_t c_add_entity(self, hash_t key, float prob, const int32_t* vector_rows, | ||||
|     cdef inline int64_t c_add_entity(self, hash_t entity_key, float prob, const int32_t* vector_rows, | ||||
|                     int feats_row): | ||||
|         """Add an entry to the knowledge base.""" | ||||
|         # This is what we'll map the hash key to. It's where the entry will sit | ||||
|         # in the vector of entries, so we can get it later. | ||||
|         cdef int64_t index = self._entries.size() | ||||
|         cdef int64_t entity_index = self._entries.size() | ||||
|         self._entries.push_back( | ||||
|             _EntryC( | ||||
|                 vector_rows=vector_rows, | ||||
|                 feats_row=feats_row, | ||||
|                 prob=prob | ||||
|             )) | ||||
|         self._index[key] = index | ||||
|         return index | ||||
|         self._index[entity_key] = entity_index | ||||
|         return entity_index | ||||
| 
 | ||||
|     cdef inline int64_t c_add_aliases(self, hash_t alias_key, vector[int64_t] entry_indices, vector[float] probs): | ||||
|         """Connect a mention to a list of potential entities with their prior probabilities.""" | ||||
|         cdef int64_t alias_index = self._aliases_table.size() | ||||
| 
 | ||||
|         self._aliases_table.push_back( | ||||
|             _AliasC( | ||||
|                 entry_indices=entry_indices, | ||||
|                 probs=probs | ||||
|             )) | ||||
|         self._alias_index[alias_key] = alias_index | ||||
|         return alias_index | ||||
							
								
								
									
										25
									
								
								spacy/kb.pyx
									
									
									
									
									
								
							
							
						
						
									
										25
									
								
								spacy/kb.pyx
									
									
									
									
									
								
							|  | @ -5,16 +5,16 @@ cdef class KnowledgeBase: | |||
|     def __len__(self): | ||||
|         return self._entries.size() | ||||
| 
 | ||||
|     def add_entity(self, name, float prob, vectors=None, features=None, aliases=None): | ||||
|     def add_entity(self, entity_id: str, float prob, vectors=None, features=None): | ||||
|         # TODO: more friendly check for non-unique name | ||||
|         if name in self: | ||||
|         if entity_id in self: | ||||
|             return | ||||
| 
 | ||||
|         cdef hash_t name_hash = hash_string(name) | ||||
|         cdef hash_t id_hash = hash_string(entity_id) | ||||
|         cdef int32_t dummy_value = 342 | ||||
|         self.c_add_entity(name_hash, prob, &dummy_value, dummy_value) | ||||
|         self.c_add_entity(entity_key=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value) | ||||
|         # TODO self._vectors_table.get_pointer(vectors), | ||||
|         #  self._features_table.get(features)) | ||||
|         # self._features_table.get(features)) | ||||
| 
 | ||||
|     def add_alias(self, alias, entities, probabilities): | ||||
|         """For a given alias, add its potential entities and prior probabilities to the KB.""" | ||||
|  | @ -22,10 +22,13 @@ cdef class KnowledgeBase: | |||
|         cdef hash_t entity_hash = 0 | ||||
|         cdef int64_t entity_index = 0 | ||||
| 
 | ||||
|         # TODO: check len(entities) == len(probabilities) | ||||
|         for entity, prob in zip(entities, probabilities): | ||||
|             entity_hash = hash_string(entity) | ||||
|             entity_index = self._index[entity_hash] | ||||
|             # TODO: check that entity is already in this KB (entity_index is OK) | ||||
|             self._aliases_table.add(alias_hash, entity_index, prob) | ||||
|         cdef vector[int64_t] entry_indices = [self._entry_index[hash_string(entity)] for entity in entities] | ||||
| 
 | ||||
|         self.c_add_aliases(alias_key=alias_hash, entry_indices=entry_indices, probs=probabilities) | ||||
| 
 | ||||
|         # TODO: check that alias hadn't been defined before | ||||
|         # TODO: check that entity is already in this KB (entity_index is OK) | ||||
|         # TODO: check sum(probabilities) <= 1 | ||||
|         # TODO: check len(entities) == len(probabilities) | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user