From 61a33f55d2eec93a335dfecc9c9a5e85c339e00a Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 10 Apr 2019 16:06:09 +0200 Subject: [PATCH] little fixes --- spacy/kb.pxd | 12 +++++++++--- spacy/kb.pyx | 13 +++++-------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index e34a0a9ba..e57c162fc 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -13,7 +13,7 @@ from .typedefs cimport hash_t # of bits we need to keep track of the answers. cdef struct _EntryC: - # The hash of this entry's unique ID and name in the kB + # The hash of this entry's unique ID/name in the KB hash_t entity_hash # Allows retrieval of one or more vectors. @@ -99,7 +99,7 @@ cdef class KnowledgeBase: cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob, int32_t* vector_rows, int feats_row): """Add an entry to the knowledge base.""" - # This is what we'll map the hash key to. It's where the entry will sit + # This is what we'll map the entity hash key to. It's where the entry will sit # in the vector of entries, so we can get it later. cdef int64_t new_index = self._entries.size() self._entries.push_back( @@ -114,6 +114,8 @@ cdef class KnowledgeBase: cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs): """Connect a mention to a list of potential entities with their prior probabilities .""" + # This is what we'll map the alias hash key to. It's where the alias will be defined + # in the vector of aliases. cdef int64_t new_index = self._aliases_table.size() self._aliases_table.push_back( @@ -126,12 +128,14 @@ cdef class KnowledgeBase: cdef inline _create_empty_vectors(self): """ - Making sure the first element of each vector is a dummy, + Initializing the vectors and making sure the first element of each vector is a dummy, because the PreshMap maps pointing to indices in these vectors can not contain 0 as value cf. 
https://github.com/explosion/preshed/issues/17 """ cdef int32_t dummy_value = 0 self.vocab.strings.add("") + + self._entry_index = PreshMap() self._entries.push_back( _EntryC( entity_hash=self.vocab.strings[""], @@ -139,6 +143,8 @@ cdef class KnowledgeBase: feats_row=dummy_value, prob=dummy_value )) + + self._alias_index = PreshMap() self._aliases_table.push_back( _AliasC( entry_indices=[dummy_value], diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 3a0a8b918..38c393355 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -1,3 +1,4 @@ +# cython: infer_types=True # cython: profile=True # coding: utf8 from spacy.errors import Errors, Warnings, user_warning @@ -19,7 +20,7 @@ cdef class Candidate: @property def entity_(self): """RETURNS (unicode): ID/name of this entity in the KB""" - return self.kb.vocab.strings[self.entity] + return self.kb.vocab.strings[self.entity_hash] @property def alias(self): @@ -29,7 +30,7 @@ cdef class Candidate: @property def alias_(self): """RETURNS (unicode): ID of the original alias""" - return self.kb.vocab.strings[self.alias] + return self.kb.vocab.strings[self.alias_hash] @property def prior_prob(self): @@ -40,8 +41,6 @@ cdef class KnowledgeBase: def __init__(self, Vocab vocab): self.vocab = vocab - self._entry_index = PreshMap() - self._alias_index = PreshMap() self.mem = Pool() self._create_empty_vectors() @@ -56,8 +55,8 @@ cdef class KnowledgeBase: def add_entity(self, unicode entity, float prob=0.5, vectors=None, features=None): """ - Add an entity to the KB. - Return the hash of the entity ID at the end + Add an entity to the KB, optionally specifying its log probability based on corpus frequency. + Return the hash of the entity ID/name at the end. """ cdef hash_t entity_hash = self.vocab.strings.add(entity) @@ -98,8 +97,6 @@ cdef class KnowledgeBase: user_warning(Warnings.W017.format(alias=alias)) return - cdef hash_t entity_hash - cdef vector[int64_t] entry_indices cdef vector[float] probs