mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-25 00:34:20 +03:00
add pyx and separate method to add aliases
This commit is contained in:
parent
feb71e15fd
commit
27483f9080
21
spacy/kb.pxd
21
spacy/kb.pxd
|
@ -3,8 +3,7 @@ from cymem.cymem cimport Pool
|
||||||
from preshed.maps cimport PreshMap
|
from preshed.maps cimport PreshMap
|
||||||
from libcpp.vector cimport vector
|
from libcpp.vector cimport vector
|
||||||
from libc.stdint cimport int32_t, int64_t
|
from libc.stdint cimport int32_t, int64_t
|
||||||
from .typedefs cimport attr_t, hash_t
|
from .typedefs cimport hash_t
|
||||||
from .strings cimport hash_string
|
|
||||||
|
|
||||||
|
|
||||||
# Internal struct, for storage and disambiguation. This isn't what we return
|
# Internal struct, for storage and disambiguation. This isn't what we return
|
||||||
|
@ -68,26 +67,10 @@ cdef class KnowledgeBase:
|
||||||
# efficient.
|
# efficient.
|
||||||
cdef object _aliases_table
|
cdef object _aliases_table
|
||||||
|
|
||||||
def __len__(self):
|
|
||||||
return self._entries.size()
|
|
||||||
|
|
||||||
def add_entity(self, name, float prob, vectors=None, features=None, aliases=None):
|
|
||||||
# TODO: more friendly check for non-unique name
|
|
||||||
if name in self:
|
|
||||||
return
|
|
||||||
|
|
||||||
cdef hash_t key = hash_string(name)
|
|
||||||
self.c_add_entity(key, prob, self._vectors_table.get_pointer(vectors),
|
|
||||||
self._features_table.get(features))
|
|
||||||
|
|
||||||
# TODO: hash the aliases?
|
|
||||||
for alias, prob_alias in aliases:
|
|
||||||
self._aliases_table.add(alias, key, prob_alias)
|
|
||||||
|
|
||||||
cdef void c_add_entity(self, hash_t key, float prob, const int32_t* vector_rows,
|
cdef void c_add_entity(self, hash_t key, float prob, const int32_t* vector_rows,
|
||||||
int feats_row) nogil:
|
int feats_row) nogil:
|
||||||
"""Add an entry to the knowledge base."""
|
"""Add an entry to the knowledge base."""
|
||||||
# This is what we'll map the orth to. It's where the entry will sit
|
# This is what we'll map the hash key to. It's where the entry will sit
|
||||||
# in the vector of entries, so we can get it later.
|
# in the vector of entries, so we can get it later.
|
||||||
cdef int64_t index = self._entries.size()
|
cdef int64_t index = self._entries.size()
|
||||||
self._entries.push_back(
|
self._entries.push_back(
|
||||||
|
|
27
spacy/kb.pyx
Normal file
27
spacy/kb.pyx
Normal file
|
@ -0,0 +1,27 @@
|
||||||
|
from .strings cimport hash_string
|
||||||
|
|
||||||
|
|
||||||
|
cdef class KnowledgeBase:
    """Python-visible methods for registering entities and aliases in the KB."""

    def __len__(self):
        """Return the number of entries currently stored in the KB."""
        return self._entries.size()

    def add_entity(self, name, float prob, vectors=None, features=None, aliases=None):
        """Add an entity to the KB, keyed by the hash of its name.

        Returns without adding anything when `name in self` is already true.
        The `aliases` argument is not used by this method; aliases are
        registered separately through `add_alias`.
        """
        # TODO: more friendly check for non-unique name
        if name in self:
            return

        cdef hash_t name_hash = hash_string(name)
        self.c_add_entity(name_hash, prob, self._vectors_table.get_pointer(vectors),
                          self._features_table.get(features))

    def add_alias(self, alias, entities, probabilities):
        """For a given alias, add its potential entities and prior probabilities to the KB."""
        # Cython only accepts cdef declarations at function scope, never inside
        # a loop body, so the typed locals are declared up front and merely
        # assigned within the loop.
        cdef hash_t alias_hash = hash_string(alias)
        cdef hash_t entity_hash
        cdef int64_t entity_index

        # TODO: check len(entities) == len(probabilities)
        for entity, prob in zip(entities, probabilities):
            entity_hash = hash_string(entity)
            entity_index = self._index[entity_hash]
            # TODO: check that entity is already in this KB (entity_index is OK)
            self._aliases_table.add(alias_hash, entity_index, prob)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user