diff --git a/spacy/kb/candidate.py b/spacy/kb/candidate.py
index b8d26832a..663b68168 100644
--- a/spacy/kb/candidate.py
+++ b/spacy/kb/candidate.py
@@ -14,41 +14,46 @@ class Candidate(abc.ABC):
     def __init__(
         self,
         mention: str,
-        entity_id: int,
-        entity_name: str,
+        entity_id: Union[str, int],
         entity_vector: List[float],
         prior_prob: float,
     ):
         """Initializes properties of `Candidate` instance.
         mention (str): Mention text for this candidate.
-        entity_id (int): Unique entity ID.
-        entity_name (str): Entity name.
+        entity_id (Union[str, int]): Unique entity ID.
         entity_vector (List[float]): Entity embedding.
         prior_prob (float): Prior probability of entity for this mention - i.e. the probability that, independent of
             the context, this mention resolves to this entity_id in the corpus used to build the knowledge base. In
             cases in which this isn't always possible (e.g.: the corpus to analyse contains mentions that the KB
             corpus doesn't) it might be better to eschew this information and always supply the same value.
         """
-        self._mention_ = mention
-        self._entity = entity_id
-        self._entity_ = entity_name
+        self._mention = mention
+        self._entity_id = entity_id
+        # Note that hashing an int value yields the same int value.
+        self._entity_id_hash = hash(entity_id)
         self._entity_vector = entity_vector
         self._prior_prob = prior_prob
 
     @property
-    def entity(self) -> int:
-        """RETURNS (int): Unique entity ID."""
-        return self._entity
+    def entity_id(self) -> Union[str, int]:
+        """RETURNS (Union[str, int]): Unique entity ID."""
+        return self._entity_id
 
     @property
-    def entity_(self) -> str:
-        """RETURNS (int): Entity name."""
-        return self._entity_
+    def entity_id_int(self) -> int:
+        """RETURNS (int): Numerical representation of entity ID (if entity ID is numerical, this is just the entity
+        ID, otherwise the hash of the entity ID string)."""
+        return self._entity_id_hash
 
     @property
-    def mention_(self) -> str:
+    def entity_id_str(self) -> str:
+        """RETURNS (str): String representation of entity ID."""
+        return str(self._entity_id)
+
+    @property
+    def mention(self) -> str:
         """RETURNS (str): Mention."""
-        return self._mention_
+        return self._mention
 
     @property
     def entity_vector(self) -> List[float]:
@@ -66,49 +71,40 @@ class InMemoryCandidate(Candidate):
 
     def __init__(
         self,
-        retrieve_string_from_hash: Callable[[int], str],
-        entity_hash: int,
-        entity_freq: int,
+        hash_to_str: Callable[[int], str],
+        entity_id: int,
+        mention: str,
         entity_vector: List[float],
-        mention_hash: int,
         prior_prob: float,
+        entity_freq: int
     ):
         """
-        retrieve_string_from_hash (Callable[[int], str]): Callable retrieving entity name from provided entity/vocab
-            hash.
-        entity_hash (str): Hashed entity name /ID.
+        hash_to_str (Callable[[int], str]): Callable retrieving entity name from provided entity/vocab hash.
+        entity_id (int): Entity ID as hash that can be looked up with InMemoryLookupKB.vocab.strings.__getitem__().
         entity_freq (int): Entity frequency in KB corpus.
         entity_vector (List[float]): Entity embedding.
-        mention_hash (int): Hashed mention.
+        mention (str): Mention.
         prior_prob (float): Prior probability of entity for this mention - i.e. the probability that, independent of
             the context, this mention resolves to this entity_id in the corpus used to build the knowledge base. In
             cases in which this isn't always possible (e.g.: the corpus to analyse contains mentions that the KB
             corpus doesn't) it might be better to eschew this information and always supply the same value.
""" super().__init__( - mention=retrieve_string_from_hash(mention_hash), - entity_id=entity_hash, - entity_name=retrieve_string_from_hash(entity_hash), + mention=mention, + entity_id=entity_id, entity_vector=entity_vector, prior_prob=prior_prob, ) - self._retrieve_string_from_hash = retrieve_string_from_hash - self._entity = entity_hash + self._hash_to_str = hash_to_str self._entity_freq = entity_freq - self._mention = mention_hash - self._prior_prob = prior_prob - - @property - def entity(self) -> int: - """RETURNS (int): hash of the entity_id's KB ID/name""" - return self._entity - - @property - def mention(self) -> int: - """RETURNS (int): Mention hash.""" - return self._mention + self._entity_id_str = self._hash_to_str(self._entity_id) @property def entity_freq(self) -> float: """RETURNS (float): Relative entity frequency.""" return self._entity_freq + + @property + def entity_id_str(self) -> str: + """RETURNS (str): String representation of entity ID.""" + return self._entity_id_str diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx index d7a986320..ecdc148f4 100644 --- a/spacy/kb/kb_in_memory.pyx +++ b/spacy/kb/kb_in_memory.pyx @@ -240,12 +240,12 @@ cdef class InMemoryLookupKB(KnowledgeBase): return [ InMemoryCandidate( - retrieve_string_from_hash=self.vocab.strings.__getitem__, - entity_hash=self._entries[entry_index].entity_hash, - entity_freq=self._entries[entry_index].freq, + hash_to_str=self.vocab.strings.__getitem__, + entity_id=self._entries[entry_index].entity_hash, + mention=alias, entity_vector=self._vectors_table[self._entries[entry_index].vector_index], - mention_hash=alias_hash, - prior_prob=prior_prob + prior_prob=prior_prob, + entity_freq=self._entries[entry_index].freq ) for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs) if entry_index != 0 diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 907307056..e892141cc 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -535,12 +535,12 @@ class EntityLinker(TrainablePipe): ) elif len(candidates) == 1 and self.threshold is None: # shortcut for efficiency reasons: take the 1 candidate - final_kb_ids.append(candidates[0].entity_) + final_kb_ids.append(candidates[0].entity_id_str) self._add_activations( doc_scores=doc_scores, doc_ents=doc_ents, scores=[1.0], - ents=[candidates[0].entity], + ents=[candidates[0].entity_id_int], ) else: random.shuffle(candidates) @@ -570,7 +570,7 @@ class EntityLinker(TrainablePipe): raise ValueError(Errors.E161) scores = prior_probs + sims - (prior_probs * sims) final_kb_ids.append( - candidates[scores.argmax().item()].entity_ + candidates[scores.argmax().item()].entity_id_str if self.threshold is None or scores.max() >= self.threshold else EntityLinker.NIL @@ -579,7 +579,7 @@ class EntityLinker(TrainablePipe): doc_scores=doc_scores, doc_ents=doc_ents, scores=scores, - ents=[c.entity for c in candidates], + ents=[c.entity_id_int for c in candidates], ) self._add_doc_activations( docs_scores=docs_scores, diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 23eb5e205..7c82db3c7 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -468,8 +468,8 @@ def test_candidate_generation(nlp): assert len(get_candidates(mykb, shrubbery_ent)) == 0 # test the content of the candidates - assert get_candidates(mykb, adam_ent)[0].entity_ == "Q2" - assert get_candidates(mykb, adam_ent)[0].mention_ 
== "adam" + assert get_candidates(mykb, adam_ent)[0].entity_id_str == "Q2" + assert get_candidates(mykb, adam_ent)[0].mention == "adam" assert_almost_equal(get_candidates(mykb, adam_ent)[0].entity_freq, 12) assert_almost_equal(get_candidates(mykb, adam_ent)[0].prior_prob, 0.9) @@ -560,10 +560,9 @@ def test_vocab_serialization(nlp): candidates = mykb._get_alias_candidates("adam") assert len(candidates) == 1 - assert candidates[0].entity == q2_hash - assert candidates[0].entity_ == "Q2" - assert candidates[0].mention == adam_hash - assert candidates[0].mention_ == "adam" + assert candidates[0].entity_id_int == q2_hash + assert candidates[0].entity_id_str == "Q2" + assert candidates[0].mention == "adam" with make_tempdir() as d: mykb.to_disk(d / "kb") @@ -572,10 +571,9 @@ def test_vocab_serialization(nlp): candidates = kb_new_vocab._get_alias_candidates("adam") assert len(candidates) == 1 - assert candidates[0].entity == q2_hash - assert candidates[0].entity_ == "Q2" - assert candidates[0].mention == adam_hash - assert candidates[0].mention_ == "adam" + assert candidates[0].entity_id_int == q2_hash + assert candidates[0].entity_id_str == "Q2" + assert candidates[0].mention == "adam" assert kb_new_vocab.get_vector("Q2") == [2] assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4) diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index 9e501c32f..860b9f8c7 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -63,19 +63,19 @@ def _check_kb(kb): assert alias_string not in kb.get_alias_strings() # check candidates & probabilities - candidates = sorted(kb._get_alias_candidates("double07"), key=lambda x: x.entity_) + candidates = sorted(kb._get_alias_candidates("double07"), key=lambda x: x.entity_id_str) assert len(candidates) == 2 - assert candidates[0].entity_ == "Q007" + assert candidates[0].entity_id_str == "Q007" assert 6.999 < candidates[0].entity_freq < 7.01 assert candidates[0].entity_vector == [0, 0, 7] - assert candidates[0].mention_ == "double07" + assert candidates[0].mention == "double07" assert 0.899 < candidates[0].prior_prob < 0.901 - assert candidates[1].entity_ == "Q17" + assert candidates[1].entity_id_str == "Q17" assert 1.99 < candidates[1].entity_freq < 2.01 assert candidates[1].entity_vector == [7, 1, 0] - assert candidates[1].mention_ == "double07" + assert candidates[1].mention == "double07" assert 0.099 < candidates[1].prior_prob < 0.101