mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	very minimal KB functionality working
This commit is contained in:
		
							parent
							
								
									5ac7edf53c
								
							
						
					
					
						commit
						a14fb54b17
					
				
							
								
								
									
										1
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										1
									
								
								setup.py
									
									
									
									
									
								
							| 
						 | 
					@ -40,6 +40,7 @@ MOD_NAMES = [
 | 
				
			||||||
    "spacy.lexeme",
 | 
					    "spacy.lexeme",
 | 
				
			||||||
    "spacy.vocab",
 | 
					    "spacy.vocab",
 | 
				
			||||||
    "spacy.attrs",
 | 
					    "spacy.attrs",
 | 
				
			||||||
 | 
					    "spacy.kb",
 | 
				
			||||||
    "spacy.morphology",
 | 
					    "spacy.morphology",
 | 
				
			||||||
    "spacy.pipeline.pipes",
 | 
					    "spacy.pipeline.pipes",
 | 
				
			||||||
    "spacy.syntax.stateclass",
 | 
					    "spacy.syntax.stateclass",
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										17
									
								
								spacy/kb.pxd
									
									
									
									
									
								
							
							
						
						
									
										17
									
								
								spacy/kb.pxd
									
									
									
									
									
								
							| 
						 | 
					@ -4,6 +4,7 @@ from preshed.maps cimport PreshMap
 | 
				
			||||||
from libcpp.vector cimport vector
 | 
					from libcpp.vector cimport vector
 | 
				
			||||||
from libc.stdint cimport int32_t, int64_t
 | 
					from libc.stdint cimport int32_t, int64_t
 | 
				
			||||||
from .typedefs cimport hash_t
 | 
					from .typedefs cimport hash_t
 | 
				
			||||||
 | 
					from .strings cimport hash_string
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Internal struct, for storage and disambiguation. This isn't what we return
 | 
					# Internal struct, for storage and disambiguation. This isn't what we return
 | 
				
			||||||
| 
						 | 
					@ -32,10 +33,10 @@ cdef struct _EntryC:
 | 
				
			||||||
cdef struct _AliasC:
 | 
					cdef struct _AliasC:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # All entry candidates for this alias
 | 
					    # All entry candidates for this alias
 | 
				
			||||||
    const vector[int64_t] entry_indices
 | 
					    vector[int64_t] entry_indices
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Prior probability P(entity|alias) - should sum up to (at most) 1.
 | 
					    # Prior probability P(entity|alias) - should sum up to (at most) 1.
 | 
				
			||||||
    const vector[float] probs
 | 
					    vector[float] probs
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
cdef class KnowledgeBase:
 | 
					cdef class KnowledgeBase:
 | 
				
			||||||
| 
						 | 
					@ -94,13 +95,21 @@ cdef class KnowledgeBase:
 | 
				
			||||||
                feats_row=feats_row,
 | 
					                feats_row=feats_row,
 | 
				
			||||||
                prob=prob
 | 
					                prob=prob
 | 
				
			||||||
            ))
 | 
					            ))
 | 
				
			||||||
        self._index[entity_key] = entity_index
 | 
					        self._entry_index[entity_key] = entity_index
 | 
				
			||||||
        return entity_index
 | 
					        return entity_index
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    cdef inline int64_t c_add_aliases(self, hash_t alias_key, vector[int64_t] entry_indices, vector[float] probs):
 | 
					    cdef inline int64_t c_add_aliases(self, hash_t alias_key, entities, probabilities):
 | 
				
			||||||
        """Connect a mention to a list of potential entities with their prior probabilities."""
 | 
					        """Connect a mention to a list of potential entities with their prior probabilities."""
 | 
				
			||||||
        cdef int64_t alias_index = self._aliases_table.size()
 | 
					        cdef int64_t alias_index = self._aliases_table.size()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        cdef vector[int64_t] entry_indices
 | 
				
			||||||
 | 
					        cdef vector[float] probs
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        for entity, prob in zip(entities, probs):
 | 
				
			||||||
 | 
					            entry_index = self._entry_index[hash_string(entity)]
 | 
				
			||||||
 | 
					            entry_indices.push_back(entry_index)
 | 
				
			||||||
 | 
					            probs.push_back(prob)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self._aliases_table.push_back(
 | 
					        self._aliases_table.push_back(
 | 
				
			||||||
            _AliasC(
 | 
					            _AliasC(
 | 
				
			||||||
                entry_indices=entry_indices,
 | 
					                entry_indices=entry_indices,
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										30
									
								
								spacy/kb.pyx
									
									
									
									
									
								
							
							
						
						
									
										30
									
								
								spacy/kb.pyx
									
									
									
									
									
								
							| 
						 | 
					@ -1,34 +1,42 @@
 | 
				
			||||||
from .strings cimport hash_string
 | 
					# cython: profile=True
 | 
				
			||||||
 | 
					# coding: utf8
 | 
				
			||||||
 | 
					from preshed.maps import PreshMap
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
cdef class KnowledgeBase:
 | 
					cdef class KnowledgeBase:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def __init__(self):
 | 
				
			||||||
 | 
					        self._entry_index = PreshMap()
 | 
				
			||||||
 | 
					        self._alias_index = PreshMap()
 | 
				
			||||||
 | 
					        self.mem = Pool()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __len__(self):
 | 
					    def __len__(self):
 | 
				
			||||||
        return self._entries.size()
 | 
					        return self._entries.size()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def add_entity(self, entity_id: str, float prob, vectors=None, features=None):
 | 
					    def add_entity(self, unicode entity_id, float prob, vectors=None, features=None):
 | 
				
			||||||
 | 
					        cdef hash_t id_hash = hash_string(entity_id)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # TODO: more friendly check for non-unique name
 | 
					        # TODO: more friendly check for non-unique name
 | 
				
			||||||
        if entity_id in self:
 | 
					        if id_hash in self._entry_index:
 | 
				
			||||||
            return
 | 
					            return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        cdef hash_t id_hash = hash_string(entity_id)
 | 
					
 | 
				
			||||||
        cdef int32_t dummy_value = 342
 | 
					        cdef int32_t dummy_value = 342
 | 
				
			||||||
        self.c_add_entity(entity_key=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value)
 | 
					        self.c_add_entity(entity_key=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value)
 | 
				
			||||||
        # TODO self._vectors_table.get_pointer(vectors),
 | 
					        # TODO self._vectors_table.get_pointer(vectors),
 | 
				
			||||||
        # self._features_table.get(features))
 | 
					        # self._features_table.get(features))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def add_alias(self, alias, entities, probabilities):
 | 
					    def add_alias(self, unicode alias, entities, probabilities):
 | 
				
			||||||
        """For a given alias, add its potential entities and prior probabilities to the KB."""
 | 
					        """For a given alias, add its potential entities and prior probabilities to the KB."""
 | 
				
			||||||
        cdef hash_t alias_hash = hash_string(alias)
 | 
					        cdef hash_t alias_hash = hash_string(alias)
 | 
				
			||||||
        cdef hash_t entity_hash = 0
 | 
					 | 
				
			||||||
        cdef int64_t entity_index = 0
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        cdef vector[int64_t] entry_indices = [self._entry_index[hash_string(entity)] for entity in entities]
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        self.c_add_aliases(alias_key=alias_hash, entry_indices=entry_indices, probs=probabilities)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # TODO: check that alias hadn't been defined before
 | 
					        # TODO: check that alias hadn't been defined before
 | 
				
			||||||
        # TODO: check that entity is already in this KB (entity_index is OK)
 | 
					        # TODO: check that entity is already in this KB (entity_index is OK)
 | 
				
			||||||
        # TODO: check sum(probabilities) <= 1
 | 
					        # TODO: check sum(probabilities) <= 1
 | 
				
			||||||
        # TODO: check len(entities) == len(probabilities)
 | 
					        # TODO: check len(entities) == len(probabilities)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        self.c_add_aliases(alias_key=alias_hash, entities=entities, probabilities=probabilities)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,4 +1,16 @@
 | 
				
			||||||
import spacy
 | 
					import spacy
 | 
				
			||||||
 | 
					from spacy.kb import KnowledgeBase
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def create_kb():
 | 
				
			||||||
 | 
					    mykb = KnowledgeBase()
 | 
				
			||||||
 | 
					    print("kb size", len(mykb))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    entity_id = "Q42"
 | 
				
			||||||
 | 
					    mykb.add_entity(entity_id=entity_id, prob=0.5)
 | 
				
			||||||
 | 
					    print("adding entity", entity_id)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    print("kb size", len(mykb))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def add_el():
 | 
					def add_el():
 | 
				
			||||||
| 
						 | 
					@ -23,4 +35,5 @@ def add_el():
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
if __name__ == "__main__":
 | 
					if __name__ == "__main__":
 | 
				
			||||||
    add_el()
 | 
					    # add_el()
 | 
				
			||||||
 | 
					    create_kb()
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user