From 61bacf81bd97f6ee671428cae861ceb2600ff50c Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Fri, 3 Mar 2023 09:54:28 +0100 Subject: [PATCH 01/39] Update website/docs/api/kb.mdx Co-authored-by: Sofie Van Landeghem --- website/docs/api/kb.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/kb.mdx b/website/docs/api/kb.mdx index 5c5abaef9..3a1cefe8d 100644 --- a/website/docs/api/kb.mdx +++ b/website/docs/api/kb.mdx @@ -200,7 +200,7 @@ to a certain prior probability. ### InMemoryCandidate.\_\_init\_\_ {id="candidate-init",tag="method"} -Construct a `InMemoryCandidate` object. Usually this constructor is not called +Construct an `InMemoryCandidate` object. Usually this constructor is not called directly, but instead these objects are returned by the `get_candidates` method of the [`entity_linker`](/api/entitylinker) pipe. From 46fe069f8731c3d591963fe4a1e3e1c1a4b1eef9 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Fri, 3 Mar 2023 10:29:53 +0100 Subject: [PATCH 02/39] Rename alias -> mention. --- spacy/kb/candidate.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/spacy/kb/candidate.py b/spacy/kb/candidate.py index 3cc3a6c59..af691b415 100644 --- a/spacy/kb/candidate.py +++ b/spacy/kb/candidate.py @@ -3,10 +3,10 @@ from typing import List, Union, Callable class Candidate(abc.ABC): - """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved + """A `Candidate` object refers to a textual mention that may or may not be resolved to a specific `entity_id` from a Knowledge Base. This will be used as input for the entity_id linking algorithm which will disambiguate the various candidates to the correct one. - Each candidate (alias, entity_id) pair is assigned a certain prior probability. + Each candidate (mention, entity_id) pair is assigned a certain prior probability. 
DOCS: https://spacy.io/api/kb/#candidate-init """ @@ -70,7 +70,7 @@ class InMemoryCandidate(Candidate): entity_hash: int, entity_freq: int, entity_vector: List[float], - alias_hash: int, + mention_hash: int, prior_prob: float, ): """ @@ -79,14 +79,14 @@ class InMemoryCandidate(Candidate): entity_hash (str): Hashed entity name /ID. entity_freq (int): Entity frequency in KB corpus. entity_vector (List[float]): Entity embedding. - alias_hash (int): Hashed alias. + mention_hash (int): Hashed mention. prior_prob (float): Prior probability of entity for this mention - i.e. the probability that, independent of the context, this mention resolves to this entity_id in the corpus used to build the knowledge base. In cases in which this isn't always possible (e.g.: the corpus to analyse contains mentions that the KB corpus doesn't) it might be better to eschew this information and always supply the same value. """ super().__init__( - mention=retrieve_string_from_hash(alias_hash), + mention=retrieve_string_from_hash(mention_hash), entity_id=entity_hash, entity_name=retrieve_string_from_hash(entity_hash), entity_vector=entity_vector, @@ -95,7 +95,7 @@ class InMemoryCandidate(Candidate): self._retrieve_string_from_hash = retrieve_string_from_hash self._entity_hash = entity_hash self._entity_freq = entity_freq - self._alias_hash = alias_hash + self._mention_hash = mention_hash self._prior_prob = prior_prob @property @@ -104,15 +104,11 @@ class InMemoryCandidate(Candidate): return self._entity_hash @property - def alias(self) -> int: - """RETURNS (int): hash of the alias""" - return self._alias_hash - - @property - def alias_(self) -> str: - """RETURNS (str): ID of the original alias""" - return self._retrieve_string_from_hash(self._alias_hash) + def mention_hash(self) -> int: + """RETURNS (int): Mention hash.""" + return self._mention_hash @property def entity_freq(self) -> float: + """RETURNS (float): Relative entity frequency.""" return self._entity_freq From 
94e57d0ed5fe1981cf5ac2b54964d0e3f14533a4 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Fri, 3 Mar 2023 11:08:17 +0100 Subject: [PATCH 03/39] Refactor Candidate attribute names. Update docs and tests accordingly. --- spacy/kb/candidate.py | 24 ++++++++-------- spacy/kb/kb_in_memory.pyx | 6 ++-- spacy/tests/pipeline/test_entity_linker.py | 30 ++++++++++---------- spacy/tests/serialize/test_serialize_kb.py | 6 ++-- website/docs/api/kb.mdx | 33 ++++++---------------- 5 files changed, 41 insertions(+), 58 deletions(-) diff --git a/spacy/kb/candidate.py b/spacy/kb/candidate.py index af691b415..b8d26832a 100644 --- a/spacy/kb/candidate.py +++ b/spacy/kb/candidate.py @@ -29,26 +29,26 @@ class Candidate(abc.ABC): cases in which this isn't always possible (e.g.: the corpus to analyse contains mentions that the KB corpus doesn't) it might be better to eschew this information and always supply the same value. """ - self._mention = mention - self._entity_id = entity_id - self._entity_name = entity_name + self._mention_ = mention + self._entity = entity_id + self._entity_ = entity_name self._entity_vector = entity_vector self._prior_prob = prior_prob @property def entity(self) -> int: """RETURNS (int): Unique entity ID.""" - return self._entity_id + return self._entity @property def entity_(self) -> str: """RETURNS (int): Entity name.""" - return self._entity_name + return self._entity_ @property - def mention(self) -> str: + def mention_(self) -> str: """RETURNS (str): Mention.""" - return self._mention + return self._mention_ @property def entity_vector(self) -> List[float]: @@ -93,20 +93,20 @@ class InMemoryCandidate(Candidate): prior_prob=prior_prob, ) self._retrieve_string_from_hash = retrieve_string_from_hash - self._entity_hash = entity_hash + self._entity = entity_hash self._entity_freq = entity_freq - self._mention_hash = mention_hash + self._mention = mention_hash self._prior_prob = prior_prob @property def entity(self) -> int: """RETURNS (int): hash of the 
entity_id's KB ID/name""" - return self._entity_hash + return self._entity @property - def mention_hash(self) -> int: + def mention(self) -> int: """RETURNS (int): Mention hash.""" - return self._mention_hash + return self._mention @property def entity_freq(self) -> float: diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx index f39432f5e..d7a986320 100644 --- a/spacy/kb/kb_in_memory.pyx +++ b/spacy/kb/kb_in_memory.pyx @@ -224,9 +224,9 @@ cdef class InMemoryLookupKB(KnowledgeBase): self._aliases_table[alias_index] = alias_entry def get_candidates(self, mention: Span) -> Iterable[InMemoryCandidate]: - return self.get_alias_candidates(mention.text) # type: ignore + return self._get_alias_candidates(mention.text) # type: ignore - def get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]: + def _get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]: """ Return candidate entities for an alias. Each candidate defines the entity, the original alias, and the prior probability of that alias resolving to that entity. 
@@ -244,7 +244,7 @@ cdef class InMemoryLookupKB(KnowledgeBase): entity_hash=self._entries[entry_index].entity_hash, entity_freq=self._entries[entry_index].freq, entity_vector=self._vectors_table[self._entries[entry_index].vector_index], - alias_hash=alias_hash, + mention_hash=alias_hash, prior_prob=prior_prob ) for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs) diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index cb1e4a733..23eb5e205 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -469,7 +469,7 @@ def test_candidate_generation(nlp): # test the content of the candidates assert get_candidates(mykb, adam_ent)[0].entity_ == "Q2" - assert get_candidates(mykb, adam_ent)[0].alias_ == "adam" + assert get_candidates(mykb, adam_ent)[0].mention_ == "adam" assert_almost_equal(get_candidates(mykb, adam_ent)[0].entity_freq, 12) assert_almost_equal(get_candidates(mykb, adam_ent)[0].prior_prob, 0.9) @@ -499,7 +499,7 @@ def test_el_pipe_configuration(nlp): assert doc[2].ent_kb_id_ == "Q2" def get_lowercased_candidates(kb, span): - return kb.get_alias_candidates(span.text.lower()) + return kb._get_alias_candidates(span.text.lower()) def get_lowercased_candidates_batch(kb, spans): return [get_lowercased_candidates(kb, span) for span in spans] @@ -558,24 +558,24 @@ def test_vocab_serialization(nlp): mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1]) adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) - candidates = mykb.get_alias_candidates("adam") + candidates = mykb._get_alias_candidates("adam") assert len(candidates) == 1 assert candidates[0].entity == q2_hash assert candidates[0].entity_ == "Q2" - assert candidates[0].alias == adam_hash - assert candidates[0].alias_ == "adam" + assert candidates[0].mention == adam_hash + assert candidates[0].mention_ == "adam" with make_tempdir() 
as d: mykb.to_disk(d / "kb") kb_new_vocab = InMemoryLookupKB(Vocab(), entity_vector_length=1) kb_new_vocab.from_disk(d / "kb") - candidates = kb_new_vocab.get_alias_candidates("adam") + candidates = kb_new_vocab._get_alias_candidates("adam") assert len(candidates) == 1 assert candidates[0].entity == q2_hash assert candidates[0].entity_ == "Q2" - assert candidates[0].alias == adam_hash - assert candidates[0].alias_ == "adam" + assert candidates[0].mention == adam_hash + assert candidates[0].mention_ == "adam" assert kb_new_vocab.get_vector("Q2") == [2] assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4) @@ -595,20 +595,20 @@ def test_append_alias(nlp): mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) # test the size of the relevant candidates - assert len(mykb.get_alias_candidates("douglas")) == 2 + assert len(mykb._get_alias_candidates("douglas")) == 2 # append an alias mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2) # test the size of the relevant candidates has been incremented - assert len(mykb.get_alias_candidates("douglas")) == 3 + assert len(mykb._get_alias_candidates("douglas")) == 3 # append the same alias-entity pair again should not work (will throw a warning) with pytest.warns(UserWarning): mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3) # test the size of the relevant candidates remained unchanged - assert len(mykb.get_alias_candidates("douglas")) == 3 + assert len(mykb._get_alias_candidates("douglas")) == 3 @pytest.mark.filterwarnings("ignore:\\[W036") @@ -905,11 +905,11 @@ def test_kb_to_bytes(): assert kb_2.contains_alias("Russ Cochran") assert kb_1.get_size_aliases() == kb_2.get_size_aliases() assert kb_1.get_alias_strings() == kb_2.get_alias_strings() - assert len(kb_1.get_alias_candidates("Russ Cochran")) == len( - kb_2.get_alias_candidates("Russ Cochran") + assert len(kb_1._get_alias_candidates("Russ Cochran")) == len( + kb_2._get_alias_candidates("Russ Cochran") ) - assert 
len(kb_1.get_alias_candidates("Randomness")) == len( - kb_2.get_alias_candidates("Randomness") + assert len(kb_1._get_alias_candidates("Randomness")) == len( + kb_2._get_alias_candidates("Randomness") ) diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index 8d3653ab1..9e501c32f 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -63,19 +63,19 @@ def _check_kb(kb): assert alias_string not in kb.get_alias_strings() # check candidates & probabilities - candidates = sorted(kb.get_alias_candidates("double07"), key=lambda x: x.entity_) + candidates = sorted(kb._get_alias_candidates("double07"), key=lambda x: x.entity_) assert len(candidates) == 2 assert candidates[0].entity_ == "Q007" assert 6.999 < candidates[0].entity_freq < 7.01 assert candidates[0].entity_vector == [0, 0, 7] - assert candidates[0].alias_ == "double07" + assert candidates[0].mention_ == "double07" assert 0.899 < candidates[0].prior_prob < 0.901 assert candidates[1].entity_ == "Q17" assert 1.99 < candidates[1].entity_freq < 2.01 assert candidates[1].entity_vector == [7, 1, 0] - assert candidates[1].alias_ == "double07" + assert candidates[1].mention_ == "double07" assert 0.099 < candidates[1].prior_prob < 0.101 diff --git a/website/docs/api/kb.mdx b/website/docs/api/kb.mdx index 3a1cefe8d..4d51dbc16 100644 --- a/website/docs/api/kb.mdx +++ b/website/docs/api/kb.mdx @@ -103,23 +103,6 @@ to you. | `mentions` | The textual mention or alias. ~~Iterable[Span]~~ | | **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ | -## KnowledgeBase.get_alias_candidates {id="get_alias_candidates",tag="method"} - - - This method is _not_ available from spaCy 3.5 onwards. 
- - -From spaCy 3.5 on `KnowledgeBase` is an abstract class (with -[`InMemoryLookupKB`](/api/inmemorylookupkb) being a drop-in replacement) to -allow more flexibility in customizing knowledge bases. Some of its methods were -moved to [`InMemoryLookupKB`](/api/inmemorylookupkb) during this refactoring, -one of those being `get_alias_candidates()`. This method is now available as -[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates). -Note: -[`InMemoryLookupKB.get_candidates()`](/api/inmemorylookupkb#get_candidates) -defaults to -[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates). - ## KnowledgeBase.get_vector {id="get_vector",tag="method"} Given a certain entity ID, retrieve its pretrained entity vector. @@ -207,19 +190,19 @@ of the [`entity_linker`](/api/entitylinker) pipe. > #### Example```python > > from spacy.kb import InMemoryCandidate candidate = InMemoryCandidate(kb, -> entity_hash, entity_freq, entity_vector, alias_hash, prior_prob) +> entity_hash, entity_freq, entity_vector, mention_hash, prior_prob) > > ``` > > ``` -| Name | Description | -| ------------- | ------------------------------------------------------------------------- | -| `kb` | The knowledge base that defined this candidate. ~~KnowledgeBase~~ | -| `entity_hash` | The hash of the entity's KB ID. ~~int~~ | -| `entity_freq` | The entity frequency as recorded in the KB. ~~float~~ | -| `alias_hash` | The hash of the textual mention or alias. ~~int~~ | -| `prior_prob` | The prior probability of the `alias` referring to the `entity`. ~~float~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------- | +| `kb` | The knowledge base that defined this candidate. ~~KnowledgeBase~~ | +| `entity_hash` | The hash of the entity's KB ID. ~~int~~ | +| `entity_freq` | The entity frequency as recorded in the KB. ~~float~~ | +| `mention_hash` | The hash of the textual mention. 
~~int~~ | +| `prior_prob` | The prior probability of the `alias` referring to the `entity`. ~~float~~ | ## InMemoryCandidate attributes {id="candidate-attributes"} From 38dce966e55dbeb61d1d085ff97ce50f5095dea8 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Sun, 5 Mar 2023 13:49:13 +0100 Subject: [PATCH 04/39] Refacor Candidate attributes and their usage. --- spacy/kb/candidate.py | 76 ++++++++++------------ spacy/kb/kb_in_memory.pyx | 10 +-- spacy/pipeline/entity_linker.py | 8 +-- spacy/tests/pipeline/test_entity_linker.py | 18 +++-- spacy/tests/serialize/test_serialize_kb.py | 10 +-- 5 files changed, 58 insertions(+), 64 deletions(-) diff --git a/spacy/kb/candidate.py b/spacy/kb/candidate.py index b8d26832a..663b68168 100644 --- a/spacy/kb/candidate.py +++ b/spacy/kb/candidate.py @@ -14,41 +14,46 @@ class Candidate(abc.ABC): def __init__( self, mention: str, - entity_id: int, - entity_name: str, + entity_id: Union[str, int], entity_vector: List[float], prior_prob: float, ): """Initializes properties of `Candidate` instance. mention (str): Mention text for this candidate. - entity_id (int): Unique entity ID. - entity_name (str): Entity name. + entity_id (Union[str, int]): Unique entity ID. entity_vector (List[float]): Entity embedding. prior_prob (float): Prior probability of entity for this mention - i.e. the probability that, independent of the context, this mention resolves to this entity_id in the corpus used to build the knowledge base. In cases in which this isn't always possible (e.g.: the corpus to analyse contains mentions that the KB corpus doesn't) it might be better to eschew this information and always supply the same value. """ - self._mention_ = mention - self._entity = entity_id - self._entity_ = entity_name + self._mention = mention + self._entity_id = entity_id + # Note that hashing an int value yields the same int value. 
+ self._entity_id_hash = hash(entity_id) self._entity_vector = entity_vector self._prior_prob = prior_prob @property - def entity(self) -> int: - """RETURNS (int): Unique entity ID.""" - return self._entity + def entity_id(self) -> Union[str, int]: + """RETURNS (Union[str, int]): Unique entity ID.""" + return self._entity_id @property - def entity_(self) -> str: - """RETURNS (int): Entity name.""" - return self._entity_ + def entity_id_int(self) -> int: + """RETURNS (int): Numerical representation of entity ID (if entity ID is numerical, this is just the entity ID, + otherwise the hash of the entity ID string).""" + return self._entity_id_hash @property - def mention_(self) -> str: + def entity_id_str(self) -> str: + """RETURNS (str): String representation of entity ID.""" + return str(self._entity_id) + + @property + def mention(self) -> str: """RETURNS (str): Mention.""" - return self._mention_ + return self._mention @property def entity_vector(self) -> List[float]: @@ -66,49 +71,40 @@ class InMemoryCandidate(Candidate): def __init__( self, - retrieve_string_from_hash: Callable[[int], str], - entity_hash: int, - entity_freq: int, + hash_to_str: Callable[[int], str], + entity_id: int, + mention: str, entity_vector: List[float], - mention_hash: int, prior_prob: float, + entity_freq: int ): """ - retrieve_string_from_hash (Callable[[int], str]): Callable retrieving entity name from provided entity/vocab - hash. - entity_hash (str): Hashed entity name /ID. + hash_to_str (Callable[[int], str]): Callable retrieving entity name from provided entity/vocab hash. + entity_id (str): Entity ID as hash that can be looked up with InMemoryKB.vocab.strings.__getitem__(). entity_freq (int): Entity frequency in KB corpus. entity_vector (List[float]): Entity embedding. - mention_hash (int): Hashed mention. + mention (str): Mention. prior_prob (float): Prior probability of entity for this mention - i.e. 
the probability that, independent of the context, this mention resolves to this entity_id in the corpus used to build the knowledge base. In cases in which this isn't always possible (e.g.: the corpus to analyse contains mentions that the KB corpus doesn't) it might be better to eschew this information and always supply the same value. """ super().__init__( - mention=retrieve_string_from_hash(mention_hash), - entity_id=entity_hash, - entity_name=retrieve_string_from_hash(entity_hash), + mention=mention, + entity_id=entity_id, entity_vector=entity_vector, prior_prob=prior_prob, ) - self._retrieve_string_from_hash = retrieve_string_from_hash - self._entity = entity_hash + self._hash_to_str = hash_to_str self._entity_freq = entity_freq - self._mention = mention_hash - self._prior_prob = prior_prob - - @property - def entity(self) -> int: - """RETURNS (int): hash of the entity_id's KB ID/name""" - return self._entity - - @property - def mention(self) -> int: - """RETURNS (int): Mention hash.""" - return self._mention + self._entity_id_str = self._hash_to_str(self._entity_id) @property def entity_freq(self) -> float: """RETURNS (float): Relative entity frequency.""" return self._entity_freq + + @property + def entity_id_str(self) -> str: + """RETURNS (str): String representation of entity ID.""" + return self._entity_id_str diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx index d7a986320..ecdc148f4 100644 --- a/spacy/kb/kb_in_memory.pyx +++ b/spacy/kb/kb_in_memory.pyx @@ -240,12 +240,12 @@ cdef class InMemoryLookupKB(KnowledgeBase): return [ InMemoryCandidate( - retrieve_string_from_hash=self.vocab.strings.__getitem__, - entity_hash=self._entries[entry_index].entity_hash, - entity_freq=self._entries[entry_index].freq, + hash_to_str=self.vocab.strings.__getitem__, + entity_id=self._entries[entry_index].entity_hash, + mention=alias, entity_vector=self._vectors_table[self._entries[entry_index].vector_index], - mention_hash=alias_hash, - 
prior_prob=prior_prob + prior_prob=prior_prob, + entity_freq=self._entries[entry_index].freq ) for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs) if entry_index != 0 diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 907307056..e892141cc 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -535,12 +535,12 @@ class EntityLinker(TrainablePipe): ) elif len(candidates) == 1 and self.threshold is None: # shortcut for efficiency reasons: take the 1 candidate - final_kb_ids.append(candidates[0].entity_) + final_kb_ids.append(candidates[0].entity_id_str) self._add_activations( doc_scores=doc_scores, doc_ents=doc_ents, scores=[1.0], - ents=[candidates[0].entity], + ents=[candidates[0].entity_id_int], ) else: random.shuffle(candidates) @@ -570,7 +570,7 @@ class EntityLinker(TrainablePipe): raise ValueError(Errors.E161) scores = prior_probs + sims - (prior_probs * sims) final_kb_ids.append( - candidates[scores.argmax().item()].entity_ + candidates[scores.argmax().item()].entity_id_str if self.threshold is None or scores.max() >= self.threshold else EntityLinker.NIL @@ -579,7 +579,7 @@ class EntityLinker(TrainablePipe): doc_scores=doc_scores, doc_ents=doc_ents, scores=scores, - ents=[c.entity for c in candidates], + ents=[c.entity_id_int for c in candidates], ) self._add_doc_activations( docs_scores=docs_scores, diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 23eb5e205..7c82db3c7 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -468,8 +468,8 @@ def test_candidate_generation(nlp): assert len(get_candidates(mykb, shrubbery_ent)) == 0 # test the content of the candidates - assert get_candidates(mykb, adam_ent)[0].entity_ == "Q2" - assert get_candidates(mykb, adam_ent)[0].mention_ == "adam" + assert get_candidates(mykb, adam_ent)[0].entity_id_str == "Q2" + assert 
get_candidates(mykb, adam_ent)[0].mention == "adam" assert_almost_equal(get_candidates(mykb, adam_ent)[0].entity_freq, 12) assert_almost_equal(get_candidates(mykb, adam_ent)[0].prior_prob, 0.9) @@ -560,10 +560,9 @@ def test_vocab_serialization(nlp): candidates = mykb._get_alias_candidates("adam") assert len(candidates) == 1 - assert candidates[0].entity == q2_hash - assert candidates[0].entity_ == "Q2" - assert candidates[0].mention == adam_hash - assert candidates[0].mention_ == "adam" + assert candidates[0].entity_id_int == q2_hash + assert candidates[0].entity_id_str == "Q2" + assert candidates[0].mention == "adam" with make_tempdir() as d: mykb.to_disk(d / "kb") @@ -572,10 +571,9 @@ def test_vocab_serialization(nlp): candidates = kb_new_vocab._get_alias_candidates("adam") assert len(candidates) == 1 - assert candidates[0].entity == q2_hash - assert candidates[0].entity_ == "Q2" - assert candidates[0].mention == adam_hash - assert candidates[0].mention_ == "adam" + assert candidates[0].entity_id_int == q2_hash + assert candidates[0].entity_id_str == "Q2" + assert candidates[0].mention == "adam" assert kb_new_vocab.get_vector("Q2") == [2] assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4) diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index 9e501c32f..860b9f8c7 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -63,19 +63,19 @@ def _check_kb(kb): assert alias_string not in kb.get_alias_strings() # check candidates & probabilities - candidates = sorted(kb._get_alias_candidates("double07"), key=lambda x: x.entity_) + candidates = sorted(kb._get_alias_candidates("double07"), key=lambda x: x.entity_id_str) assert len(candidates) == 2 - assert candidates[0].entity_ == "Q007" + assert candidates[0].entity_id_str == "Q007" assert 6.999 < candidates[0].entity_freq < 7.01 assert candidates[0].entity_vector == [0, 0, 7] - assert 
candidates[0].mention_ == "double07" + assert candidates[0].mention == "double07" assert 0.899 < candidates[0].prior_prob < 0.901 - assert candidates[1].entity_ == "Q17" + assert candidates[1].entity_id_str == "Q17" assert 1.99 < candidates[1].entity_freq < 2.01 assert candidates[1].entity_vector == [7, 1, 0] - assert candidates[1].mention_ == "double07" + assert candidates[1].mention == "double07" assert 0.099 < candidates[1].prior_prob < 0.101 From 5f40b3e5231ef686ecc7682444d23757a2ea6b3a Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Sun, 5 Mar 2023 14:14:16 +0100 Subject: [PATCH 05/39] Format. --- spacy/kb/candidate.py | 2 +- spacy/tests/serialize/test_serialize_kb.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/kb/candidate.py b/spacy/kb/candidate.py index 663b68168..b32b72dd8 100644 --- a/spacy/kb/candidate.py +++ b/spacy/kb/candidate.py @@ -76,7 +76,7 @@ class InMemoryCandidate(Candidate): mention: str, entity_vector: List[float], prior_prob: float, - entity_freq: int + entity_freq: int, ): """ hash_to_str (Callable[[int], str]): Callable retrieving entity name from provided entity/vocab hash. diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index 860b9f8c7..3d75862c6 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -63,7 +63,9 @@ def _check_kb(kb): assert alias_string not in kb.get_alias_strings() # check candidates & probabilities - candidates = sorted(kb._get_alias_candidates("double07"), key=lambda x: x.entity_id_str) + candidates = sorted( + kb._get_alias_candidates("double07"), key=lambda x: x.entity_id_str + ) assert len(candidates) == 2 assert candidates[0].entity_id_str == "Q007" From 670e1ca7c5a1dfc29f97e13c71b54c2485ba0353 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Sun, 5 Mar 2023 14:33:32 +0100 Subject: [PATCH 06/39] Fix mypy error. 
--- spacy/errors.py | 1 + spacy/kb/candidate.py | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/spacy/errors.py b/spacy/errors.py index eadbf63d6..9473f1cf7 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -952,6 +952,7 @@ class Errors(metaclass=ErrorsWithCodes): "with `displacy.serve(doc, port=port)`") E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` " "or use `auto_switch_port=True` to pick an available port automatically.") + E1051 = ("Expected `entity_id` to be of type {should_type}, but is of type {is_type}.") # v4 error strings E4000 = ("Expected a Doc as input, but got: '{type}'") diff --git a/spacy/kb/candidate.py b/spacy/kb/candidate.py index b32b72dd8..a481d88f5 100644 --- a/spacy/kb/candidate.py +++ b/spacy/kb/candidate.py @@ -1,6 +1,8 @@ import abc from typing import List, Union, Callable +from ..errors import Errors + class Candidate(abc.ABC): """A `Candidate` object refers to a textual mention that may or may not be resolved @@ -97,6 +99,10 @@ class InMemoryCandidate(Candidate): ) self._hash_to_str = hash_to_str self._entity_freq = entity_freq + if not isinstance(self._entity_id, int): + raise ValueError( + Errors.E1051.format(should_type="int", is_type=str(type(entity_id))) + ) self._entity_id_str = self._hash_to_str(self._entity_id) @property From 2ac586fdb5681119a4a7842577764fd7940977cc Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Sun, 5 Mar 2023 14:43:32 +0100 Subject: [PATCH 07/39] Update error code in line with v4 convention. --- spacy/errors.py | 2 +- spacy/kb/candidate.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 9473f1cf7..74111e48d 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -952,7 +952,6 @@ class Errors(metaclass=ErrorsWithCodes): "with `displacy.serve(doc, port=port)`") E1050 = ("Port {port} is already in use. 
Please specify an available port with `displacy.serve(doc, port=port)` " "or use `auto_switch_port=True` to pick an available port automatically.") - E1051 = ("Expected `entity_id` to be of type {should_type}, but is of type {is_type}.") # v4 error strings E4000 = ("Expected a Doc as input, but got: '{type}'") @@ -962,6 +961,7 @@ class Errors(metaclass=ErrorsWithCodes): E4003 = ("Training examples for distillation must have the exact same tokens in the " "reference and predicted docs.") E4004 = ("Backprop is not supported when is_train is not set.") + E4005 = ("Expected `entity_id` to be of type {should_type}, but is of type {is_type}.") RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"} diff --git a/spacy/kb/candidate.py b/spacy/kb/candidate.py index a481d88f5..9778b2880 100644 --- a/spacy/kb/candidate.py +++ b/spacy/kb/candidate.py @@ -101,7 +101,7 @@ class InMemoryCandidate(Candidate): self._entity_freq = entity_freq if not isinstance(self._entity_id, int): raise ValueError( - Errors.E1051.format(should_type="int", is_type=str(type(entity_id))) + Errors.E4005.format(should_type="int", is_type=str(type(entity_id))) ) self._entity_id_str = self._hash_to_str(self._entity_id) From 41b3a0d932aafb4db9db02ae2e03b560305e0d53 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Tue, 7 Mar 2023 13:10:45 +0100 Subject: [PATCH 08/39] Drop support for EntityLinker_v1. 
(#12377) --- spacy/errors.py | 1 + spacy/pipeline/entity_linker.py | 23 ++-------------------- spacy/tests/pipeline/test_entity_linker.py | 7 +------ 3 files changed, 4 insertions(+), 27 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 5049100d8..390de126e 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -960,6 +960,7 @@ class Errors(metaclass=ErrorsWithCodes): E4003 = ("Training examples for distillation must have the exact same tokens in the " "reference and predicted docs.") E4004 = ("Backprop is not supported when is_train is not set.") + E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.") RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"} diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index cd13a4b21..6a187b6c3 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -117,28 +117,9 @@ def make_entity_linker( prediction is discarded. If None, predictions are not filtered by any threshold. save_activations (bool): save model activations in Doc when annotating. """ - if not model.attrs.get("include_span_maker", False): - try: - from spacy_legacy.components.entity_linker import EntityLinker_v1 - except: - raise ImportError( - "In order to use v1 of the EntityLinker, you must use spacy-legacy>=3.0.12." - ) - # The only difference in arguments here is that use_gold_ents and threshold aren't available. 
- return EntityLinker_v1( - nlp.vocab, - model, - name, - labels_discard=labels_discard, - n_sents=n_sents, - incl_prior=incl_prior, - incl_context=incl_context, - entity_vector_length=entity_vector_length, - get_candidates=get_candidates, - overwrite=overwrite, - scorer=scorer, - ) + raise ValueError(Errors.E4005) + return EntityLinker( nlp.vocab, model, diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index ed84ce674..87cacfc9d 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -993,13 +993,11 @@ def test_scorer_links(): @pytest.mark.parametrize( "name,config", [ - ("entity_linker", {"@architectures": "spacy.EntityLinker.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL}), ("entity_linker", {"@architectures": "spacy.EntityLinker.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL}), ], ) # fmt: on def test_legacy_architectures(name, config): - from spacy_legacy.components.entity_linker import EntityLinker_v1 # Ensure that the legacy architectures still work vector_length = 3 @@ -1022,10 +1020,7 @@ def test_legacy_architectures(name, config): return mykb entity_linker = nlp.add_pipe(name, config={"model": config}) - if config["@architectures"] == "spacy.EntityLinker.v1": - assert isinstance(entity_linker, EntityLinker_v1) - else: - assert isinstance(entity_linker, EntityLinker) + assert isinstance(entity_linker, EntityLinker) entity_linker.set_kb(create_kb) optimizer = nlp.initialize(get_examples=lambda: train_examples) From 082992aebb45773b5334b41b125886b1bb03bb7d Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Tue, 7 Mar 2023 13:54:11 +0100 Subject: [PATCH 09/39] Update spacy/kb/candidate.py Co-authored-by: Sofie Van Landeghem --- spacy/kb/candidate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/kb/candidate.py b/spacy/kb/candidate.py index 9778b2880..2a18b753c 100644 --- a/spacy/kb/candidate.py +++ b/spacy/kb/candidate.py @@ -82,7 +82,7 @@ class 
InMemoryCandidate(Candidate): ): """ hash_to_str (Callable[[int], str]): Callable retrieving entity name from provided entity/vocab hash. - entity_id (str): Entity ID as hash that can be looked up with InMemoryKB.vocab.strings.__getitem__(). + entity_id (int): Entity ID as hash that can be looked up with InMemoryKB.vocab.strings.__getitem__(). entity_freq (int): Entity frequency in KB corpus. entity_vector (List[float]): Entity embedding. mention (str): Mention. From f8a02f7fef740ac8ed4e5db4dc6f0f9f8a86b4c4 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Tue, 7 Mar 2023 13:58:42 +0100 Subject: [PATCH 10/39] Updated error code. --- spacy/errors.py | 2 +- spacy/kb/candidate.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 58d2c81a0..73d867792 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -960,7 +960,7 @@ class Errors(metaclass=ErrorsWithCodes): E4003 = ("Training examples for distillation must have the exact same tokens in the " "reference and predicted docs.") E4004 = ("Backprop is not supported when is_train is not set.") - E4005 = ("Expected `entity_id` to be of type {should_type}, but is of type {is_type}.") + E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.") RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"} diff --git a/spacy/kb/candidate.py b/spacy/kb/candidate.py index 2a18b753c..911af1127 100644 --- a/spacy/kb/candidate.py +++ b/spacy/kb/candidate.py @@ -101,7 +101,7 @@ class InMemoryCandidate(Candidate): self._entity_freq = entity_freq if not isinstance(self._entity_id, int): raise ValueError( - Errors.E4005.format(should_type="int", is_type=str(type(entity_id))) + Errors.E4006.format(exp_type="int", found_type=str(type(entity_id))) ) self._entity_id_str = self._hash_to_str(self._entity_id) From cea58ade8969592266483848feae3fd06f537e6b Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Tue, 7 Mar 2023 14:35:38 +0100 Subject: [PATCH 11/39] 
Simplify interface for int/str representations. --- spacy/kb/candidate.py | 11 +++-------- spacy/pipeline/entity_linker.py | 8 ++++---- spacy/tests/pipeline/test_entity_linker.py | 10 +++++----- spacy/tests/serialize/test_serialize_kb.py | 6 +++--- 4 files changed, 15 insertions(+), 20 deletions(-) diff --git a/spacy/kb/candidate.py b/spacy/kb/candidate.py index 911af1127..b1c188e09 100644 --- a/spacy/kb/candidate.py +++ b/spacy/kb/candidate.py @@ -37,18 +37,13 @@ class Candidate(abc.ABC): self._prior_prob = prior_prob @property - def entity_id(self) -> Union[str, int]: - """RETURNS (Union[str, int]): Unique entity ID.""" - return self._entity_id - - @property - def entity_id_int(self) -> int: + def entity_id(self) -> int: """RETURNS (int): Numerical representation of entity ID (if entity ID is numerical, this is just the entity ID, otherwise the hash of the entity ID string).""" return self._entity_id_hash @property - def entity_id_str(self) -> str: + def entity_id_(self) -> str: """RETURNS (str): String representation of entity ID.""" return str(self._entity_id) @@ -111,6 +106,6 @@ class InMemoryCandidate(Candidate): return self._entity_freq @property - def entity_id_str(self) -> str: + def entity_id_(self) -> str: """RETURNS (str): String representation of entity ID.""" return self._entity_id_str diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 4d8370fd3..39cff218a 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -522,12 +522,12 @@ class EntityLinker(TrainablePipe): ) elif len(candidates) == 1 and self.threshold is None: # shortcut for efficiency reasons: take the 1 candidate - final_kb_ids.append(candidates[0].entity_id_str) + final_kb_ids.append(candidates[0].entity_id_) self._add_activations( doc_scores=doc_scores, doc_ents=doc_ents, scores=[1.0], - ents=[candidates[0].entity_id_int], + ents=[candidates[0].entity_id], ) else: random.shuffle(candidates) @@ -557,7 +557,7 @@ class 
EntityLinker(TrainablePipe): raise ValueError(Errors.E161) scores = prior_probs + sims - (prior_probs * sims) final_kb_ids.append( - candidates[scores.argmax().item()].entity_id_str + candidates[scores.argmax().item()].entity_id_ if self.threshold is None or scores.max() >= self.threshold else EntityLinker.NIL @@ -566,7 +566,7 @@ class EntityLinker(TrainablePipe): doc_scores=doc_scores, doc_ents=doc_ents, scores=scores, - ents=[c.entity_id_int for c in candidates], + ents=[c.entity_id for c in candidates], ) self._add_doc_activations( docs_scores=docs_scores, diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 1b5117947..d0cfdd3c7 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -471,7 +471,7 @@ def test_candidate_generation(nlp): assert len(get_candidates(mykb, shrubbery_ent)) == 0 # test the content of the candidates - assert get_candidates(mykb, adam_ent)[0].entity_id_str == "Q2" + assert get_candidates(mykb, adam_ent)[0].entity_id_ == "Q2" assert get_candidates(mykb, adam_ent)[0].mention == "adam" assert_almost_equal(get_candidates(mykb, adam_ent)[0].entity_freq, 12) assert_almost_equal(get_candidates(mykb, adam_ent)[0].prior_prob, 0.9) @@ -563,8 +563,8 @@ def test_vocab_serialization(nlp): candidates = mykb._get_alias_candidates("adam") assert len(candidates) == 1 - assert candidates[0].entity_id_int == q2_hash - assert candidates[0].entity_id_str == "Q2" + assert candidates[0].entity_id == q2_hash + assert candidates[0].entity_id_ == "Q2" assert candidates[0].mention == "adam" with make_tempdir() as d: @@ -574,8 +574,8 @@ def test_vocab_serialization(nlp): candidates = kb_new_vocab._get_alias_candidates("adam") assert len(candidates) == 1 - assert candidates[0].entity_id_int == q2_hash - assert candidates[0].entity_id_str == "Q2" + assert candidates[0].entity_id == q2_hash + assert candidates[0].entity_id_ == "Q2" assert candidates[0].mention == 
"adam" assert kb_new_vocab.get_vector("Q2") == [2] diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index 7365765cb..336fd16fe 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -67,17 +67,17 @@ def _check_kb(kb): # check candidates & probabilities candidates = sorted( - kb._get_alias_candidates("double07"), key=lambda x: x.entity_id_str + kb._get_alias_candidates("double07"), key=lambda x: x.entity_id_ ) assert len(candidates) == 2 - assert candidates[0].entity_id_str == "Q007" + assert candidates[0].entity_id_ == "Q007" assert 6.999 < candidates[0].entity_freq < 7.01 assert candidates[0].entity_vector == [0, 0, 7] assert candidates[0].mention == "double07" assert 0.899 < candidates[0].prior_prob < 0.901 - assert candidates[1].entity_id_str == "Q17" + assert candidates[1].entity_id_ == "Q17" assert 1.99 < candidates[1].entity_freq < 2.01 assert candidates[1].entity_vector == [7, 1, 0] assert candidates[1].mention == "double07" From 1ba2fc42070f2065b766deb8a582ad10c84615ac Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 9 Mar 2023 12:01:42 +0100 Subject: [PATCH 12/39] Update website/docs/api/kb.mdx Co-authored-by: Sofie Van Landeghem --- website/docs/api/kb.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/kb.mdx b/website/docs/api/kb.mdx index 4d51dbc16..e3b699140 100644 --- a/website/docs/api/kb.mdx +++ b/website/docs/api/kb.mdx @@ -175,7 +175,7 @@ Restore the state of the knowledge base from a given directory. Note that the ## InMemoryCandidate {id="candidate",tag="class"} -A `InMemoryCandidate` object refers to a textual mention (alias) that may or may +An `InMemoryCandidate` object refers to a textual mention (alias) that may or may not be resolved to a specific entity from a `KnowledgeBase`. 
This will be used as input for the entity linking algorithm which will disambiguate the various candidates to the correct one. Each candidate `(alias, entity)` pair is assigned From 1c937db3af6a66fa36fbb0538dc975450254d301 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 9 Mar 2023 12:06:15 +0100 Subject: [PATCH 13/39] Rename 'alias' to 'mention'. --- spacy/errors.py | 2 +- spacy/kb/kb.pyx | 10 +++++----- spacy/kb/kb_in_memory.pyx | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 51a10be17..92770b8a8 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -82,7 +82,7 @@ class Warnings(metaclass=ErrorsWithCodes): "ignoring the duplicate entry.") W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be " "incorrect. Modify PhraseMatcher._terminal_hash to fix.") - W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in " + W024 = ("Entity '{entity}' - mention '{mention}' combination already exists in " "the Knowledge Base.") W026 = ("Unable to set all sentence boundaries from dependency parses. If " "you are constructing a parse tree incrementally by setting " diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx index ce4bc0138..158c3304f 100644 --- a/spacy/kb/kb.pyx +++ b/spacy/kb/kb.pyx @@ -11,7 +11,7 @@ from ..errors import Errors cdef class KnowledgeBase: - """A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases, + """A `KnowledgeBase` instance stores unique identifiers for entities and their textual mentions, to support entity linking of named entities to real-world concepts. This is an abstract class and requires its operations to be implemented. @@ -32,8 +32,8 @@ cdef class KnowledgeBase: def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]: """ - Return candidate entities for specified texts. 
Each candidate defines the entity, the original alias, - and the prior probability of that alias resolving to that entity. + Return candidate entities for specified texts. Each candidate defines the entity, the original mention, + and the prior probability of this mention resolving to that entity. If no candidate is found for a given text, an empty list is returned. mentions (Iterable[Span]): Mentions for which to get candidates. RETURNS (Iterable[Iterable[Candidate]]): Identified candidates. @@ -42,8 +42,8 @@ cdef class KnowledgeBase: def get_candidates(self, mention: Span) -> Iterable[Candidate]: """ - Return candidate entities for specified text. Each candidate defines the entity, the original alias, - and the prior probability of that alias resolving to that entity. + Return candidate entities for specified text. Each candidate defines the entity, the original mention, + and the prior probability of that mention resolving to that entity. If the no candidate is found for a given text, an empty list is returned. mention (Span): Mention for which to get candidates. RETURNS (Iterable[Candidate]): Identified candidates. diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx index d456f729b..692390f10 100644 --- a/spacy/kb/kb_in_memory.pyx +++ b/spacy/kb/kb_in_memory.pyx @@ -22,7 +22,7 @@ from .candidate import InMemoryCandidate cdef class InMemoryLookupKB(KnowledgeBase): - """An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases, + """An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual mentions, to support entity linking of named entities to real-world concepts. 
DOCS: https://spacy.io/api/inmemorylookupkb @@ -216,7 +216,7 @@ cdef class InMemoryLookupKB(KnowledgeBase): if is_present: if not ignore_warnings: - warnings.warn(Warnings.W024.format(entity=entity, alias=alias)) + warnings.warn(Warnings.W024.format(entity=entity, mention=alias)) else: entry_indices.push_back(int(entry_index)) alias_entry.entry_indices = entry_indices From b4760414173ed03799559fc5713f6c5e17943998 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 9 Mar 2023 14:44:41 +0100 Subject: [PATCH 14/39] Port Candidate and InMemoryCandidate to Cython. --- setup.py | 2 + spacy/kb/__init__.py | 1 + spacy/kb/candidate.pxd | 17 ++++++ spacy/kb/{candidate.py => candidate.pyx} | 62 +++++++++++----------- spacy/kb/kb_in_memory.pyx | 4 +- spacy/tests/pipeline/test_entity_linker.py | 11 ++-- 6 files changed, 58 insertions(+), 39 deletions(-) create mode 100644 spacy/kb/candidate.pxd rename spacy/kb/{candidate.py => candidate.pyx} (73%) diff --git a/setup.py b/setup.py index 9b8897233..2768455b9 100755 --- a/setup.py +++ b/setup.py @@ -30,6 +30,8 @@ MOD_NAMES = [ "spacy.lexeme", "spacy.vocab", "spacy.attrs", + "spacy.kb.candidate", + # "spacy.kb.inmemorycandidate", "spacy.kb.kb", "spacy.kb.kb_in_memory", "spacy.ml.tb_framework", diff --git a/spacy/kb/__init__.py b/spacy/kb/__init__.py index c8a657d62..ff0e209e3 100644 --- a/spacy/kb/__init__.py +++ b/spacy/kb/__init__.py @@ -2,4 +2,5 @@ from .kb import KnowledgeBase from .kb_in_memory import InMemoryLookupKB from .candidate import Candidate, InMemoryCandidate + __all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"] diff --git a/spacy/kb/candidate.pxd b/spacy/kb/candidate.pxd new file mode 100644 index 000000000..17056f615 --- /dev/null +++ b/spacy/kb/candidate.pxd @@ -0,0 +1,17 @@ +from libcpp.vector cimport vector +from .kb_in_memory cimport InMemoryLookupKB +from ..typedefs cimport hash_t + +cdef class Candidate: + cdef readonly str _entity_id_ + cdef readonly hash_t _entity_id + 
cdef readonly str _mention + cpdef vector[float] _entity_vector + cdef float _prior_prob + + +cdef class InMemoryCandidate(Candidate): + cdef readonly InMemoryLookupKB _kb + cdef hash_t _entity_hash + cdef float _entity_freq + cdef hash_t _alias_hash \ No newline at end of file diff --git a/spacy/kb/candidate.py b/spacy/kb/candidate.pyx similarity index 73% rename from spacy/kb/candidate.py rename to spacy/kb/candidate.pyx index b1c188e09..3e61c4444 100644 --- a/spacy/kb/candidate.py +++ b/spacy/kb/candidate.pyx @@ -1,10 +1,12 @@ -import abc -from typing import List, Union, Callable +# cython: infer_types=True, profile=True -from ..errors import Errors +from ..typedefs cimport hash_t + +from .kb cimport KnowledgeBase +from .kb_in_memory cimport InMemoryLookupKB -class Candidate(abc.ABC): +cdef class Candidate: """A `Candidate` object refers to a textual mention that may or may not be resolved to a specific `entity_id` from a Knowledge Base. This will be used as input for the entity_id linking algorithm which will disambiguate the various candidates to the correct one. @@ -16,8 +18,8 @@ class Candidate(abc.ABC): def __init__( self, mention: str, - entity_id: Union[str, int], - entity_vector: List[float], + entity_id: str, + entity_vector: vector[float], prior_prob: float, ): """Initializes properties of `Candidate` instance. @@ -30,22 +32,23 @@ class Candidate(abc.ABC): doesn't) it might be better to eschew this information and always supply the same value. """ self._mention = mention - self._entity_id = entity_id + self._entity_id_ = entity_id # Note that hashing an int value yields the same int value. 
- self._entity_id_hash = hash(entity_id) + self._entity_id = hash(entity_id) self._entity_vector = entity_vector self._prior_prob = prior_prob + # todo raise exception if this is instantiated class @property def entity_id(self) -> int: """RETURNS (int): Numerical representation of entity ID (if entity ID is numerical, this is just the entity ID, otherwise the hash of the entity ID string).""" - return self._entity_id_hash + return self._entity_id @property def entity_id_(self) -> str: """RETURNS (str): String representation of entity ID.""" - return str(self._entity_id) + return self._entity_id_ @property def mention(self) -> str: @@ -53,8 +56,8 @@ class Candidate(abc.ABC): return self._mention @property - def entity_vector(self) -> List[float]: - """RETURNS (List[float]): Entity vector.""" + def entity_vector(self) -> vector[float]: + """RETURNS (vector[float]): Entity vector.""" return self._entity_vector @property @@ -63,20 +66,20 @@ class Candidate(abc.ABC): return self._prior_prob -class InMemoryCandidate(Candidate): +cdef class InMemoryCandidate(Candidate): """Candidate for InMemoryLookupKB.""" def __init__( self, - hash_to_str: Callable[[int], str], - entity_id: int, + kb: InMemoryLookupKB, + entity_hash: int, mention: str, - entity_vector: List[float], + entity_vector: vector[float], prior_prob: float, - entity_freq: int, + entity_freq: float ): """ - hash_to_str (Callable[[int], str]): Callable retrieving entity name from provided entity/vocab hash. + kb (InMemoryLookupKB]): InMemoryLookupKB instance. entity_id (int): Entity ID as hash that can be looked up with InMemoryKB.vocab.strings.__getitem__(). entity_freq (int): Entity frequency in KB corpus. entity_vector (List[float]): Entity embedding. 
@@ -88,24 +91,19 @@ class InMemoryCandidate(Candidate): """ super().__init__( mention=mention, - entity_id=entity_id, + entity_id=kb.vocab.strings[entity_hash], entity_vector=entity_vector, prior_prob=prior_prob, ) - self._hash_to_str = hash_to_str + self._kb = kb + self._entity_id = entity_hash self._entity_freq = entity_freq - if not isinstance(self._entity_id, int): - raise ValueError( - Errors.E4006.format(exp_type="int", found_type=str(type(entity_id))) - ) - self._entity_id_str = self._hash_to_str(self._entity_id) - - @property - def entity_freq(self) -> float: - """RETURNS (float): Relative entity frequency.""" - return self._entity_freq @property def entity_id_(self) -> str: - """RETURNS (str): String representation of entity ID.""" - return self._entity_id_str + """RETURNS (str): ID/name of this entity in the KB""" + return self._kb.vocab.strings[self._entity_id] + + @property + def entity_freq(self) -> float: + return self._entity_freq diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx index 692390f10..3e9001da9 100644 --- a/spacy/kb/kb_in_memory.pyx +++ b/spacy/kb/kb_in_memory.pyx @@ -243,8 +243,8 @@ cdef class InMemoryLookupKB(KnowledgeBase): return [ InMemoryCandidate( - hash_to_str=self.vocab.strings.__getitem__, - entity_id=self._entries[entry_index].entity_hash, + kb=self, + entity_hash=self._entries[entry_index].entity_hash, mention=alias, entity_vector=self._vectors_table[self._entries[entry_index].vector_index], prior_prob=prior_prob, diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index d0cfdd3c7..cd1dc90e4 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -465,16 +465,17 @@ def test_candidate_generation(nlp): mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) # test the size of the relevant candidates + adam_ent_cands = get_candidates(mykb, adam_ent) assert len(get_candidates(mykb, douglas_ent)) == 2 - 
assert len(get_candidates(mykb, adam_ent)) == 1 + assert len(adam_ent_cands) == 1 assert len(get_candidates(mykb, Adam_ent)) == 0 # default case sensitive assert len(get_candidates(mykb, shrubbery_ent)) == 0 # test the content of the candidates - assert get_candidates(mykb, adam_ent)[0].entity_id_ == "Q2" - assert get_candidates(mykb, adam_ent)[0].mention == "adam" - assert_almost_equal(get_candidates(mykb, adam_ent)[0].entity_freq, 12) - assert_almost_equal(get_candidates(mykb, adam_ent)[0].prior_prob, 0.9) + assert adam_ent_cands[0].entity_id_ == "Q2" + assert adam_ent_cands[0].mention == "adam" + assert_almost_equal(adam_ent_cands[0].entity_freq, 12) + assert_almost_equal(adam_ent_cands[0].prior_prob, 0.9) def test_el_pipe_configuration(nlp): From 845864beb4ab16e8a9e96621e3a3e5227032220b Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 9 Mar 2023 14:55:10 +0100 Subject: [PATCH 15/39] Remove redundant entry in setup.py. --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 2768455b9..d5b82ec68 100755 --- a/setup.py +++ b/setup.py @@ -31,7 +31,6 @@ MOD_NAMES = [ "spacy.vocab", "spacy.attrs", "spacy.kb.candidate", - # "spacy.kb.inmemorycandidate", "spacy.kb.kb", "spacy.kb.kb_in_memory", "spacy.ml.tb_framework", From b0ee34185da4877f8045f0016316a7120a255d61 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 9 Mar 2023 14:56:44 +0100 Subject: [PATCH 16/39] Add abstract class check. 
--- spacy/kb/candidate.pyx | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/spacy/kb/candidate.pyx b/spacy/kb/candidate.pyx index 3e61c4444..4f75ac687 100644 --- a/spacy/kb/candidate.pyx +++ b/spacy/kb/candidate.pyx @@ -4,7 +4,7 @@ from ..typedefs cimport hash_t from .kb cimport KnowledgeBase from .kb_in_memory cimport InMemoryLookupKB - +from ..errors import Errors cdef class Candidate: """A `Candidate` object refers to a textual mention that may or may not be resolved @@ -22,7 +22,7 @@ cdef class Candidate: entity_vector: vector[float], prior_prob: float, ): - """Initializes properties of `Candidate` instance. + """Initializes properties of abstract base class `Candidate`. mention (str): Mention text for this candidate. entity_id (Union[str, int]): Unique entity ID. entity_vector (List[float]): Entity embedding. @@ -31,13 +31,18 @@ cdef class Candidate: cases in which this isn't always possible (e.g.: the corpus to analyse contains mentions that the KB corpus doesn't) it might be better to eschew this information and always supply the same value. """ + # Make sure abstract KB is not instantiated. + if self.__class__ == Candidate: + raise TypeError( + Errors.E1046.format(cls_name=self.__class__.__name__) + ) + self._mention = mention self._entity_id_ = entity_id # Note that hashing an int value yields the same int value. self._entity_id = hash(entity_id) self._entity_vector = entity_vector self._prior_prob = prior_prob - # todo raise exception if this is instantiated class @property def entity_id(self) -> int: From c61654eef8cd7afa3b1d00f9c90f3f11b6528c08 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 9 Mar 2023 15:04:10 +0100 Subject: [PATCH 17/39] Drop storing mention. 
--- spacy/kb/candidate.pxd | 3 +-- spacy/kb/candidate.pyx | 16 +++++++++------- spacy/kb/kb_in_memory.pyx | 2 +- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/spacy/kb/candidate.pxd b/spacy/kb/candidate.pxd index 17056f615..23da038d3 100644 --- a/spacy/kb/candidate.pxd +++ b/spacy/kb/candidate.pxd @@ -5,7 +5,6 @@ from ..typedefs cimport hash_t cdef class Candidate: cdef readonly str _entity_id_ cdef readonly hash_t _entity_id - cdef readonly str _mention cpdef vector[float] _entity_vector cdef float _prior_prob @@ -14,4 +13,4 @@ cdef class InMemoryCandidate(Candidate): cdef readonly InMemoryLookupKB _kb cdef hash_t _entity_hash cdef float _entity_freq - cdef hash_t _alias_hash \ No newline at end of file + cdef hash_t _mention \ No newline at end of file diff --git a/spacy/kb/candidate.pyx b/spacy/kb/candidate.pyx index 4f75ac687..dcf302112 100644 --- a/spacy/kb/candidate.pyx +++ b/spacy/kb/candidate.pyx @@ -17,13 +17,11 @@ cdef class Candidate: def __init__( self, - mention: str, entity_id: str, entity_vector: vector[float], prior_prob: float, ): """Initializes properties of abstract base class `Candidate`. - mention (str): Mention text for this candidate. entity_id (Union[str, int]): Unique entity ID. entity_vector (List[float]): Entity embedding. prior_prob (float): Prior probability of entity for this mention - i.e. the probability that, independent of @@ -37,7 +35,6 @@ cdef class Candidate: Errors.E1046.format(cls_name=self.__class__.__name__) ) - self._mention = mention self._entity_id_ = entity_id # Note that hashing an int value yields the same int value. 
self._entity_id = hash(entity_id) @@ -58,7 +55,7 @@ cdef class Candidate: @property def mention(self) -> str: """RETURNS (str): Mention.""" - return self._mention + raise NotImplementedError @property def entity_vector(self) -> vector[float]: @@ -78,7 +75,7 @@ cdef class InMemoryCandidate(Candidate): self, kb: InMemoryLookupKB, entity_hash: int, - mention: str, + mention_hash: int, entity_vector: vector[float], prior_prob: float, entity_freq: float @@ -88,22 +85,27 @@ cdef class InMemoryCandidate(Candidate): entity_id (int): Entity ID as hash that can be looked up with InMemoryKB.vocab.strings.__getitem__(). entity_freq (int): Entity frequency in KB corpus. entity_vector (List[float]): Entity embedding. - mention (str): Mention. + mention_hash (int): Mention hash. prior_prob (float): Prior probability of entity for this mention - i.e. the probability that, independent of the context, this mention resolves to this entity_id in the corpus used to build the knowledge base. In cases in which this isn't always possible (e.g.: the corpus to analyse contains mentions that the KB corpus doesn't) it might be better to eschew this information and always supply the same value. 
""" super().__init__( - mention=mention, entity_id=kb.vocab.strings[entity_hash], entity_vector=entity_vector, prior_prob=prior_prob, ) self._kb = kb + self._mention = mention_hash self._entity_id = entity_hash self._entity_freq = entity_freq + @property + def mention(self) -> str: + """RETURNS (str): ID/name of this entity in the KB""" + return self._kb.vocab.strings[self._mention] + @property def entity_id_(self) -> str: """RETURNS (str): ID/name of this entity in the KB""" diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx index 3e9001da9..059f3a140 100644 --- a/spacy/kb/kb_in_memory.pyx +++ b/spacy/kb/kb_in_memory.pyx @@ -245,7 +245,7 @@ cdef class InMemoryLookupKB(KnowledgeBase): InMemoryCandidate( kb=self, entity_hash=self._entries[entry_index].entity_hash, - mention=alias, + mention_hash=alias_hash, entity_vector=self._vectors_table[self._entries[entry_index].vector_index], prior_prob=prior_prob, entity_freq=self._entries[entry_index].freq From 34e092e4e522600c264499bfcc678995616b295f Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 9 Mar 2023 16:15:39 +0100 Subject: [PATCH 18/39] Update spacy/kb/candidate.pxd Co-authored-by: Sofie Van Landeghem --- spacy/kb/candidate.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/kb/candidate.pxd b/spacy/kb/candidate.pxd index 23da038d3..920580d33 100644 --- a/spacy/kb/candidate.pxd +++ b/spacy/kb/candidate.pxd @@ -13,4 +13,4 @@ cdef class InMemoryCandidate(Candidate): cdef readonly InMemoryLookupKB _kb cdef hash_t _entity_hash cdef float _entity_freq - cdef hash_t _mention \ No newline at end of file + cdef hash_t _mention From 6fc7997c06ca8aded44194f4d9f9e81b9d112139 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Fri, 10 Mar 2023 08:55:32 +0100 Subject: [PATCH 19/39] Fix entity_id refactoring problems in docstrings. 
--- spacy/kb/candidate.pyx | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/spacy/kb/candidate.pyx b/spacy/kb/candidate.pyx index dcf302112..6f0bd061e 100644 --- a/spacy/kb/candidate.pyx +++ b/spacy/kb/candidate.pyx @@ -1,14 +1,11 @@ # cython: infer_types=True, profile=True -from ..typedefs cimport hash_t - -from .kb cimport KnowledgeBase from .kb_in_memory cimport InMemoryLookupKB from ..errors import Errors cdef class Candidate: """A `Candidate` object refers to a textual mention that may or may not be resolved - to a specific `entity_id` from a Knowledge Base. This will be used as input for the entity_id linking + to a specific entity from a Knowledge Base. This will be used as input for the entity linking algorithm which will disambiguate the various candidates to the correct one. Each candidate (mention, entity_id) pair is assigned a certain prior probability. @@ -87,7 +84,7 @@ cdef class InMemoryCandidate(Candidate): entity_vector (List[float]): Entity embedding. mention_hash (int): Mention hash. prior_prob (float): Prior probability of entity for this mention - i.e. the probability that, independent of - the context, this mention resolves to this entity_id in the corpus used to build the knowledge base. In + the context, this mention resolves to this entity in the corpus used to build the knowledge base. In cases in which this isn't always possible (e.g.: the corpus to analyse contains mentions that the KB corpus doesn't) it might be better to eschew this information and always supply the same value. """ From 27053912da0fbaae216e2d862371298228346063 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Fri, 10 Mar 2023 09:00:30 +0100 Subject: [PATCH 20/39] Drop unused InMemoryCandidate._entity_hash. 
--- spacy/kb/candidate.pxd | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/kb/candidate.pxd b/spacy/kb/candidate.pxd index 23da038d3..924357ad0 100644 --- a/spacy/kb/candidate.pxd +++ b/spacy/kb/candidate.pxd @@ -11,6 +11,5 @@ cdef class Candidate: cdef class InMemoryCandidate(Candidate): cdef readonly InMemoryLookupKB _kb - cdef hash_t _entity_hash cdef float _entity_freq cdef hash_t _mention \ No newline at end of file From 348dd1c87ec853f837907ef8b804b103be1ebfcc Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Fri, 10 Mar 2023 09:03:41 +0100 Subject: [PATCH 21/39] Update docstrings. --- spacy/kb/candidate.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/kb/candidate.pyx b/spacy/kb/candidate.pyx index 6f0bd061e..d45d130c5 100644 --- a/spacy/kb/candidate.pyx +++ b/spacy/kb/candidate.pyx @@ -100,14 +100,15 @@ cdef class InMemoryCandidate(Candidate): @property def mention(self) -> str: - """RETURNS (str): ID/name of this entity in the KB""" + """RETURNS (str): Mention.""" return self._kb.vocab.strings[self._mention] @property def entity_id_(self) -> str: - """RETURNS (str): ID/name of this entity in the KB""" + """RETURNS (str): ID/name of this entity in the KB.""" return self._kb.vocab.strings[self._entity_id] @property def entity_freq(self) -> float: + """RETURNS (float): Entity frequence of this candidate's entity in the KB.""" return self._entity_freq From 649c146e2c60d71dbdabe4e7532bc985da7b038c Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Mon, 13 Mar 2023 09:21:08 +0100 Subject: [PATCH 22/39] Move attributes out of Candidate. 
--- spacy/kb/candidate.pxd | 8 +++--- spacy/kb/candidate.pyx | 56 +++++++++++++++++------------------------- 2 files changed, 27 insertions(+), 37 deletions(-) diff --git a/spacy/kb/candidate.pxd b/spacy/kb/candidate.pxd index ddb3dbca8..0e753bf99 100644 --- a/spacy/kb/candidate.pxd +++ b/spacy/kb/candidate.pxd @@ -3,13 +3,13 @@ from .kb_in_memory cimport InMemoryLookupKB from ..typedefs cimport hash_t cdef class Candidate: - cdef readonly str _entity_id_ - cdef readonly hash_t _entity_id - cpdef vector[float] _entity_vector - cdef float _prior_prob + pass cdef class InMemoryCandidate(Candidate): + cdef readonly hash_t _entity_hash + cpdef vector[float] _entity_vector + cdef float _prior_prob cdef readonly InMemoryLookupKB _kb cdef float _entity_freq cdef hash_t _mention diff --git a/spacy/kb/candidate.pyx b/spacy/kb/candidate.pyx index d45d130c5..9d3a05ec8 100644 --- a/spacy/kb/candidate.pyx +++ b/spacy/kb/candidate.pyx @@ -12,42 +12,23 @@ cdef class Candidate: DOCS: https://spacy.io/api/kb/#candidate-init """ - def __init__( - self, - entity_id: str, - entity_vector: vector[float], - prior_prob: float, - ): - """Initializes properties of abstract base class `Candidate`. - entity_id (Union[str, int]): Unique entity ID. - entity_vector (List[float]): Entity embedding. - prior_prob (float): Prior probability of entity for this mention - i.e. the probability that, independent of - the context, this mention resolves to this entity_id in the corpus used to build the knowledge base. In - cases in which this isn't always possible (e.g.: the corpus to analyse contains mentions that the KB corpus - doesn't) it might be better to eschew this information and always supply the same value. - """ - # Make sure abstract KB is not instantiated. + def __init__(self): + # Make sure abstract Candidate is not instantiated. 
if self.__class__ == Candidate: raise TypeError( Errors.E1046.format(cls_name=self.__class__.__name__) ) - self._entity_id_ = entity_id - # Note that hashing an int value yields the same int value. - self._entity_id = hash(entity_id) - self._entity_vector = entity_vector - self._prior_prob = prior_prob - @property def entity_id(self) -> int: """RETURNS (int): Numerical representation of entity ID (if entity ID is numerical, this is just the entity ID, otherwise the hash of the entity ID string).""" - return self._entity_id + raise NotImplementedError @property def entity_id_(self) -> str: """RETURNS (str): String representation of entity ID.""" - return self._entity_id_ + raise NotImplementedError @property def mention(self) -> str: @@ -57,12 +38,12 @@ cdef class Candidate: @property def entity_vector(self) -> vector[float]: """RETURNS (vector[float]): Entity vector.""" - return self._entity_vector + raise NotImplementedError @property def prior_prob(self) -> float: """RETURNS (List[float]): Entity vector.""" - return self._prior_prob + raise NotImplementedError cdef class InMemoryCandidate(Candidate): @@ -88,27 +69,36 @@ cdef class InMemoryCandidate(Candidate): cases in which this isn't always possible (e.g.: the corpus to analyse contains mentions that the KB corpus doesn't) it might be better to eschew this information and always supply the same value. 
""" - super().__init__( - entity_id=kb.vocab.strings[entity_hash], - entity_vector=entity_vector, - prior_prob=prior_prob, - ) + super().__init__() + + self._entity_hash = entity_hash + self._entity_vector = entity_vector + self._prior_prob = prior_prob self._kb = kb self._mention = mention_hash self._entity_id = entity_hash self._entity_freq = entity_freq + @property + def entity_id(self) -> int: + return self._entity_hash + + @property + def entity_vector(self) -> vector[float]: + return self._entity_vector + + @property + def prior_prob(self) -> float: + return self._prior_prob + @property def mention(self) -> str: - """RETURNS (str): Mention.""" return self._kb.vocab.strings[self._mention] @property def entity_id_(self) -> str: - """RETURNS (str): ID/name of this entity in the KB.""" return self._kb.vocab.strings[self._entity_id] @property def entity_freq(self) -> float: - """RETURNS (float): Entity frequence of this candidate's entity in the KB.""" return self._entity_freq From 6adc15178f74529aa1c01a390ab0c09c682e2329 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Mon, 13 Mar 2023 14:26:14 +0100 Subject: [PATCH 23/39] Partially fix alias/mention terminology usage. Convert Candidate to interface. --- spacy/errors.py | 2 +- spacy/kb/candidate.pyx | 6 +++--- spacy/kb/kb.pyx | 2 +- spacy/kb/kb_in_memory.pyx | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 92770b8a8..30446e7ea 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -82,7 +82,7 @@ class Warnings(metaclass=ErrorsWithCodes): "ignoring the duplicate entry.") W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be " "incorrect. Modify PhraseMatcher._terminal_hash to fix.") - W024 = ("Entity '{entity}' - mention '{mention}' combination already exists in " + W024 = ("Entity '{entity}' - alias '{alias}' combination already exists in " "the Knowledge Base.") W026 = ("Unable to set all sentence boundaries from dependency parses. 
If " "you are constructing a parse tree incrementally by setting " diff --git a/spacy/kb/candidate.pyx b/spacy/kb/candidate.pyx index 9d3a05ec8..ac19df671 100644 --- a/spacy/kb/candidate.pyx +++ b/spacy/kb/candidate.pyx @@ -7,7 +7,8 @@ cdef class Candidate: """A `Candidate` object refers to a textual mention that may or may not be resolved to a specific entity from a Knowledge Base. This will be used as input for the entity linking algorithm which will disambiguate the various candidates to the correct one. - Each candidate (mention, entity_id) pair is assigned a certain prior probability. + Each candidate, which represents a possible link between one textual mention and one entity in the knowledge base, + is assigned a certain prior probability. DOCS: https://spacy.io/api/kb/#candidate-init """ @@ -76,7 +77,6 @@ cdef class InMemoryCandidate(Candidate): self._prior_prob = prior_prob self._kb = kb self._mention = mention_hash - self._entity_id = entity_hash self._entity_freq = entity_freq @property @@ -97,7 +97,7 @@ cdef class InMemoryCandidate(Candidate): @property def entity_id_(self) -> str: - return self._kb.vocab.strings[self._entity_id] + return self._kb.vocab.strings[self._entity_hash] @property def entity_freq(self) -> float: diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx index 158c3304f..7da312863 100644 --- a/spacy/kb/kb.pyx +++ b/spacy/kb/kb.pyx @@ -11,7 +11,7 @@ from ..errors import Errors cdef class KnowledgeBase: - """A `KnowledgeBase` instance stores unique identifiers for entities and their textual mentions, + """A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases, to support entity linking of named entities to real-world concepts. This is an abstract class and requires its operations to be implemented. 
diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx index 059f3a140..4ceb87888 100644 --- a/spacy/kb/kb_in_memory.pyx +++ b/spacy/kb/kb_in_memory.pyx @@ -22,7 +22,7 @@ from .candidate import InMemoryCandidate cdef class InMemoryLookupKB(KnowledgeBase): - """An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual mentions, + """An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases, to support entity linking of named entities to real-world concepts. DOCS: https://spacy.io/api/inmemorylookupkb @@ -216,7 +216,7 @@ cdef class InMemoryLookupKB(KnowledgeBase): if is_present: if not ignore_warnings: - warnings.warn(Warnings.W024.format(entity=entity, mention=alias)) + warnings.warn(Warnings.W024.format(entity=entity, alias=alias)) else: entry_indices.push_back(int(entry_index)) alias_entry.entry_indices = entry_indices From 4a921766f128755fb733e899e6701599008184a5 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Mon, 13 Mar 2023 16:54:38 +0100 Subject: [PATCH 24/39] Remove prior_prob from supported properties in Candidate. Introduce KnowledgeBase.supports_prior_probs(). --- spacy/errors.py | 3 +++ spacy/kb/candidate.pyx | 7 ++----- spacy/kb/kb.pyx | 7 +++++++ spacy/kb/kb_in_memory.pyx | 3 +++ spacy/pipeline/entity_linker.py | 14 ++++++++------ 5 files changed, 23 insertions(+), 11 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 30446e7ea..0f8091e3a 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -209,7 +209,10 @@ class Warnings(metaclass=ErrorsWithCodes): "`enabled` ({enabled}). 
Be aware that this might affect other components in your pipeline.") W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.") + # v4 warning strings W400 = ("`use_upper=False` is ignored, the upper layer is always enabled") + W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability " + "lookups.") class Errors(metaclass=ErrorsWithCodes): diff --git a/spacy/kb/candidate.pyx b/spacy/kb/candidate.pyx index ac19df671..9e4e9f321 100644 --- a/spacy/kb/candidate.pyx +++ b/spacy/kb/candidate.pyx @@ -41,11 +41,6 @@ cdef class Candidate: """RETURNS (vector[float]): Entity vector.""" raise NotImplementedError - @property - def prior_prob(self) -> float: - """RETURNS (List[float]): Entity vector.""" - raise NotImplementedError - cdef class InMemoryCandidate(Candidate): """Candidate for InMemoryLookupKB.""" @@ -89,6 +84,7 @@ cdef class InMemoryCandidate(Candidate): @property def prior_prob(self) -> float: + """RETURNS (float): Prior probability that this mention resolves to this entity.""" return self._prior_prob @property @@ -101,4 +97,5 @@ cdef class InMemoryCandidate(Candidate): @property def entity_freq(self) -> float: + """RETURNS (float): Entity frequency in KB corpus.""" return self._entity_freq diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx index 7da312863..d10123e37 100644 --- a/spacy/kb/kb.pyx +++ b/spacy/kb/kb.pyx @@ -106,3 +106,10 @@ cdef class KnowledgeBase: raise NotImplementedError( Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__) ) + + @property + def supports_prior_probs(self) -> bool: + """RETURNS (bool): Whether this KB type supports looking up prior probabilities for entity mentions.""" + raise NotImplementedError( + Errors.E1045.format(parent="KnowledgeBase", method="supports_prior_probs", name=self.__name__) + ) diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx index 4ceb87888..e3b9dfcb3 100644 --- 
a/spacy/kb/kb_in_memory.pyx +++ b/spacy/kb/kb_in_memory.pyx @@ -283,6 +283,9 @@ cdef class InMemoryLookupKB(KnowledgeBase): return 0.0 + def supports_prior_probs(self) -> bool: + return True + def to_bytes(self, **kwargs): """Serialize the current state to a binary string. """ diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 39cff218a..caced9cfd 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -1,5 +1,5 @@ -from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any -from typing import cast +import warnings +from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any, cast from numpy import dtype from thinc.types import Floats1d, Floats2d, Ints1d, Ragged from pathlib import Path @@ -10,14 +10,13 @@ from thinc.api import CosineDistance, Model, Optimizer, Config from thinc.api import set_dropout_rate from ..kb import KnowledgeBase, Candidate -from ..ml import empty_kb from ..tokens import Doc, Span from .pipe import deserialize_config from .trainable_pipe import TrainablePipe from ..language import Language from ..vocab import Vocab from ..training import Example, validate_examples, validate_get_examples -from ..errors import Errors +from ..errors import Errors, Warnings from ..util import SimpleFrozenList, registry from .. 
import util from ..scorer import Scorer @@ -240,6 +239,8 @@ class EntityLinker(TrainablePipe): if candidates_batch_size < 1: raise ValueError(Errors.E1044) + if self.incl_prior and not self.kb.supports_prior_probs: + warnings.warn(Warnings.W401) def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]): """Define the KB of this pipe by providing a function that will @@ -532,8 +533,9 @@ class EntityLinker(TrainablePipe): else: random.shuffle(candidates) # set all prior probabilities to 0 if incl_prior=False - prior_probs = xp.asarray([c.prior_prob for c in candidates]) - if not self.incl_prior: + if self.incl_prior and self.kb.supports_prior_probs: + prior_probs = xp.asarray([c.prior_prob for c in candidates]) # type: ignore + else: prior_probs = xp.asarray([0.0 for _ in candidates]) scores = prior_probs # add in similarity from the context From be858981e6984a31b04a4b9068603a6cb8a07412 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Mon, 13 Mar 2023 17:01:20 +0100 Subject: [PATCH 25/39] Update docstrings related to prior_prob. --- spacy/kb/candidate.pyx | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/spacy/kb/candidate.pyx b/spacy/kb/candidate.pyx index 9e4e9f321..6707d32ed 100644 --- a/spacy/kb/candidate.pyx +++ b/spacy/kb/candidate.pyx @@ -60,10 +60,8 @@ cdef class InMemoryCandidate(Candidate): entity_freq (int): Entity frequency in KB corpus. entity_vector (List[float]): Entity embedding. mention_hash (int): Mention hash. - prior_prob (float): Prior probability of entity for this mention - i.e. the probability that, independent of - the context, this mention resolves to this entity in the corpus used to build the knowledge base. In - cases in which this isn't always possible (e.g.: the corpus to analyse contains mentions that the KB corpus - doesn't) it might be better to eschew this information and always supply the same value. + prior_prob (float): Prior probability of entity for this mention. I. e. 
the probability that, independent of + the context, this mention - which matches one of this entity's aliases - resolves to one this entity. """ super().__init__() @@ -84,7 +82,8 @@ cdef class InMemoryCandidate(Candidate): @property def prior_prob(self) -> float: - """RETURNS (float): Prior probability that this mention resolves to this entity.""" + """RETURNS (float): Prior probability that this mention, which matches one of this entity's aliases, resolves to + this entity.""" return self._prior_prob @property From 28dbed64cbd6f21041691890d4b5b9a348d6ebfa Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Tue, 14 Mar 2023 13:33:05 +0100 Subject: [PATCH 26/39] Update alias/mention usage in doc(strings). --- spacy/kb/candidate.pyx | 12 ++++++------ spacy/kb/kb_in_memory.pyx | 2 +- website/docs/api/kb.mdx | 26 +++++++++++++------------- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/spacy/kb/candidate.pyx b/spacy/kb/candidate.pyx index 6707d32ed..94445f27d 100644 --- a/spacy/kb/candidate.pyx +++ b/spacy/kb/candidate.pyx @@ -49,7 +49,7 @@ cdef class InMemoryCandidate(Candidate): self, kb: InMemoryLookupKB, entity_hash: int, - mention_hash: int, + alias_hash: int, entity_vector: vector[float], prior_prob: float, entity_freq: float @@ -59,9 +59,9 @@ cdef class InMemoryCandidate(Candidate): entity_id (int): Entity ID as hash that can be looked up with InMemoryKB.vocab.strings.__getitem__(). entity_freq (int): Entity frequency in KB corpus. entity_vector (List[float]): Entity embedding. - mention_hash (int): Mention hash. - prior_prob (float): Prior probability of entity for this mention. I. e. the probability that, independent of - the context, this mention - which matches one of this entity's aliases - resolves to one this entity. + alias_hash (int): Alias hash. + prior_prob (float): Prior probability of entity for this alias. I. e. 
the probability that, independent of + the context, this alias - which matches one of this entity's aliases - resolves to one this entity. """ super().__init__() @@ -69,7 +69,7 @@ cdef class InMemoryCandidate(Candidate): self._entity_vector = entity_vector self._prior_prob = prior_prob self._kb = kb - self._mention = mention_hash + self._mention = alias_hash self._entity_freq = entity_freq @property @@ -82,7 +82,7 @@ cdef class InMemoryCandidate(Candidate): @property def prior_prob(self) -> float: - """RETURNS (float): Prior probability that this mention, which matches one of this entity's aliases, resolves to + """RETURNS (float): Prior probability that this alias, which matches one of this entity's synonyms, resolves to this entity.""" return self._prior_prob diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx index e3b9dfcb3..c9ced8309 100644 --- a/spacy/kb/kb_in_memory.pyx +++ b/spacy/kb/kb_in_memory.pyx @@ -245,7 +245,7 @@ cdef class InMemoryLookupKB(KnowledgeBase): InMemoryCandidate( kb=self, entity_hash=self._entries[entry_index].entity_hash, - mention_hash=alias_hash, + alias_hash=alias_hash, entity_vector=self._vectors_table[self._entries[entry_index].vector_index], prior_prob=prior_prob, entity_freq=self._entries[entry_index].freq diff --git a/website/docs/api/kb.mdx b/website/docs/api/kb.mdx index e3b699140..9536a3fe3 100644 --- a/website/docs/api/kb.mdx +++ b/website/docs/api/kb.mdx @@ -175,11 +175,11 @@ Restore the state of the knowledge base from a given directory. Note that the ## InMemoryCandidate {id="candidate",tag="class"} -An `InMemoryCandidate` object refers to a textual mention (alias) that may or may -not be resolved to a specific entity from a `KnowledgeBase`. This will be used -as input for the entity linking algorithm which will disambiguate the various -candidates to the correct one. Each candidate `(alias, entity)` pair is assigned -to a certain prior probability. 
+An `InMemoryCandidate` object refers to a textual mention (alias) that may or +may not be resolved to a specific entity from a `KnowledgeBase`. This will be +used as input for the entity linking algorithm which will disambiguate the +various candidates to the correct one. Each candidate `(alias, entity)` pair is +assigned to a certain prior probability. ### InMemoryCandidate.\_\_init\_\_ {id="candidate-init",tag="method"} @@ -190,19 +190,19 @@ of the [`entity_linker`](/api/entitylinker) pipe. > #### Example```python > > from spacy.kb import InMemoryCandidate candidate = InMemoryCandidate(kb, -> entity_hash, entity_freq, entity_vector, mention_hash, prior_prob) +> entity_hash, entity_freq, entity_vector, alias_hash, prior_prob) > > ``` > > ``` -| Name | Description | -| -------------- | ------------------------------------------------------------------------- | -| `kb` | The knowledge base that defined this candidate. ~~KnowledgeBase~~ | -| `entity_hash` | The hash of the entity's KB ID. ~~int~~ | -| `entity_freq` | The entity frequency as recorded in the KB. ~~float~~ | -| `mention_hash` | The hash of the textual mention. ~~int~~ | -| `prior_prob` | The prior probability of the `alias` referring to the `entity`. ~~float~~ | +| Name | Description | +| ------------- | ------------------------------------------------------------------------- | +| `kb` | The knowledge base that defined this candidate. ~~KnowledgeBase~~ | +| `entity_hash` | The hash of the entity's KB ID. ~~int~~ | +| `entity_freq` | The entity frequency as recorded in the KB. ~~float~~ | +| `alias_hash` | The hash of the entity alias. ~~int~~ | +| `prior_prob` | The prior probability of the `alias` referring to the `entity`. 
~~float~~ | ## InMemoryCandidate attributes {id="candidate-attributes"} From b7b4282821fb2b14035773de03f1891363b733ec Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 15 Mar 2023 09:20:07 +0100 Subject: [PATCH 27/39] Update spacy/ml/models/entity_linker.py Co-authored-by: Sofie Van Landeghem --- spacy/ml/models/entity_linker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index 50ad4daba..5da0544a9 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -124,7 +124,7 @@ def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]: Return candidate entities for a given mention and fetching appropriate entries from the index. kb (KnowledgeBase): Knowledge base to query. mention (Span): Entity mention for which to identify candidates. - RETURNS (Iterable[InMemoryCandidate]): Identified candidates. + RETURNS (Iterable[Candidate]): Identified candidates. """ return kb.get_candidates(mention) From 961795d9f17d676f0ff46491c7b789106dbfaf0e Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 15 Mar 2023 09:20:25 +0100 Subject: [PATCH 28/39] Update spacy/ml/models/entity_linker.py Co-authored-by: Sofie Van Landeghem --- spacy/ml/models/entity_linker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index 5da0544a9..ea8882430 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -136,6 +136,6 @@ def get_candidates_batch( Return candidate entities for the given mentions and fetching appropriate entries from the index. kb (KnowledgeBase): Knowledge base to query. mention (Iterable[Span]): Entity mentions for which to identify candidates. - RETURNS (Iterable[Iterable[InMemoryCandidate]]): Identified candidates. + RETURNS (Iterable[Iterable[Candidate]]): Identified candidates. 
""" return kb.get_candidates_batch(mentions) From 3cfc1c6accb4f75b141b075aa53d8d9ae12166f0 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 15 Mar 2023 09:23:31 +0100 Subject: [PATCH 29/39] Mention -> alias renaming. Drop Candidate.mentions(). Drop InMemoryLookupKB.get_alias_candidates() from docs. --- spacy/kb/candidate.pyx | 12 ++++-------- spacy/kb/kb.pyx | 5 +++-- website/docs/api/inmemorylookupkb.mdx | 16 ---------------- 3 files changed, 7 insertions(+), 26 deletions(-) diff --git a/spacy/kb/candidate.pyx b/spacy/kb/candidate.pyx index 94445f27d..9f141b20a 100644 --- a/spacy/kb/candidate.pyx +++ b/spacy/kb/candidate.pyx @@ -31,11 +31,6 @@ cdef class Candidate: """RETURNS (str): String representation of entity ID.""" raise NotImplementedError - @property - def mention(self) -> str: - """RETURNS (str): Mention.""" - raise NotImplementedError - @property def entity_vector(self) -> vector[float]: """RETURNS (vector[float]): Entity vector.""" @@ -69,7 +64,7 @@ cdef class InMemoryCandidate(Candidate): self._entity_vector = entity_vector self._prior_prob = prior_prob self._kb = kb - self._mention = alias_hash + self._alias = alias_hash self._entity_freq = entity_freq @property @@ -87,8 +82,9 @@ cdef class InMemoryCandidate(Candidate): return self._prior_prob @property - def mention(self) -> str: - return self._kb.vocab.strings[self._mention] + def alias(self) -> str: + """RETURNS (str): Alias.""" + return self._kb.vocab.strings[self._alias] @property def entity_id_(self) -> str: diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx index d10123e37..e4165301e 100644 --- a/spacy/kb/kb.pyx +++ b/spacy/kb/kb.pyx @@ -42,8 +42,9 @@ cdef class KnowledgeBase: def get_candidates(self, mention: Span) -> Iterable[Candidate]: """ - Return candidate entities for specified text. Each candidate defines the entity, the original mention, - and the prior probability of that mention resolving to that entity. + Return candidate entities for specified text. 
Each candidate defines at least the entity and the entity's + embedding vector. Depending on the KB implementation, further properties - such as the prior probability of the + specified mention text resolving to that entity - might be included. If the no candidate is found for a given text, an empty list is returned. mention (Span): Mention for which to get candidates. RETURNS (Iterable[Candidate]): Identified candidates. diff --git a/website/docs/api/inmemorylookupkb.mdx b/website/docs/api/inmemorylookupkb.mdx index e88e4a500..9063939a3 100644 --- a/website/docs/api/inmemorylookupkb.mdx +++ b/website/docs/api/inmemorylookupkb.mdx @@ -199,22 +199,6 @@ to you. | `mentions` | The textual mention or alias. ~~Iterable[Span]~~ | | **RETURNS** | An iterable of iterable with relevant `InMemoryCandidate` objects. ~~Iterable[Iterable[InMemoryCandidate]]~~ | -## InMemoryLookupKB.get_alias_candidates {id="get_alias_candidates",tag="method"} - -Given a certain textual mention as input, retrieve a list of candidate entities -of type [`InMemoryCandidate`](/api/kb#candidate). - -> #### Example -> -> ```python -> candidates = kb.get_alias_candidates("Douglas") -> ``` - -| Name | Description | -| ----------- | ----------------------------------------------------------------------------- | -| `alias` | The textual mention or alias. ~~str~~ | -| **RETURNS** | The list of relevant `InMemoryCandidate` objects. ~~List[InMemoryCandidate]~~ | - ## InMemoryLookupKB.get_vector {id="get_vector",tag="method"} Given a certain entity ID, retrieve its pretrained entity vector. From 80fb0666b98ff0bd96c36daf5f7d0e5e00ecdb25 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 15 Mar 2023 09:25:41 +0100 Subject: [PATCH 30/39] Update docstrings. 
--- spacy/kb/kb.pyx | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx index e4165301e..30694dc2a 100644 --- a/spacy/kb/kb.pyx +++ b/spacy/kb/kb.pyx @@ -32,9 +32,10 @@ cdef class KnowledgeBase: def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]: """ - Return candidate entities for specified texts. Each candidate defines the entity, the original mention, - and the prior probability of this mention resolving to that entity. - If no candidate is found for a given text, an empty list is returned. + Return candidate entities for specified mention texts. Each candidate defines at least the entity and the + entity's embedding vector. Depending on the KB implementation, further properties - such as the prior + probability of the specified mention text resolving to that entity - might be included. + If the no candidates are found for a given mention text, an empty list is returned. mentions (Iterable[Span]): Mentions for which to get candidates. RETURNS (Iterable[Iterable[Candidate]]): Identified candidates. """ @@ -42,10 +43,10 @@ cdef class KnowledgeBase: def get_candidates(self, mention: Span) -> Iterable[Candidate]: """ - Return candidate entities for specified text. Each candidate defines at least the entity and the entity's - embedding vector. Depending on the KB implementation, further properties - such as the prior probability of the - specified mention text resolving to that entity - might be included. - If the no candidate is found for a given text, an empty list is returned. + Return candidate entities for specified mention text. Each candidate defines at least the entity and the + entity's embedding vector. Depending on the KB implementation, further properties - such as the prior + probability of the specified mention text resolving to that entity - might be included. + If the no candidate is found for the given mention text, an empty list is returned. 
mention (Span): Mention for which to get candidates. RETURNS (Iterable[Candidate]): Identified candidates. """ From 830939ee648e32ca5c5dd05e342b8c22ea9ffc96 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 15 Mar 2023 10:51:34 +0100 Subject: [PATCH 31/39] Fix InMemoryCandidate attribute names. --- spacy/kb/candidate.pxd | 2 +- spacy/kb/candidate.pyx | 4 ++-- spacy/tests/pipeline/test_entity_linker.py | 6 +++--- spacy/tests/serialize/test_serialize_kb.py | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/spacy/kb/candidate.pxd b/spacy/kb/candidate.pxd index 0e753bf99..f21f423e4 100644 --- a/spacy/kb/candidate.pxd +++ b/spacy/kb/candidate.pxd @@ -8,8 +8,8 @@ cdef class Candidate: cdef class InMemoryCandidate(Candidate): cdef readonly hash_t _entity_hash + cdef readonly hash_t _alias_hash cpdef vector[float] _entity_vector cdef float _prior_prob cdef readonly InMemoryLookupKB _kb cdef float _entity_freq - cdef hash_t _mention diff --git a/spacy/kb/candidate.pyx b/spacy/kb/candidate.pyx index 9f141b20a..3d8da4b95 100644 --- a/spacy/kb/candidate.pyx +++ b/spacy/kb/candidate.pyx @@ -64,7 +64,7 @@ cdef class InMemoryCandidate(Candidate): self._entity_vector = entity_vector self._prior_prob = prior_prob self._kb = kb - self._alias = alias_hash + self._alias_hash = alias_hash self._entity_freq = entity_freq @property @@ -84,7 +84,7 @@ cdef class InMemoryCandidate(Candidate): @property def alias(self) -> str: """RETURNS (str): Alias.""" - return self._kb.vocab.strings[self._alias] + return self._kb.vocab.strings[self._alias_hash] @property def entity_id_(self) -> str: diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index cd1dc90e4..e29e3920b 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -473,7 +473,7 @@ def test_candidate_generation(nlp): # test the content of the candidates assert adam_ent_cands[0].entity_id_ == "Q2" - assert 
adam_ent_cands[0].mention == "adam" + assert adam_ent_cands[0].alias == "adam" assert_almost_equal(adam_ent_cands[0].entity_freq, 12) assert_almost_equal(adam_ent_cands[0].prior_prob, 0.9) @@ -566,7 +566,7 @@ def test_vocab_serialization(nlp): assert len(candidates) == 1 assert candidates[0].entity_id == q2_hash assert candidates[0].entity_id_ == "Q2" - assert candidates[0].mention == "adam" + assert candidates[0].alias == "adam" with make_tempdir() as d: mykb.to_disk(d / "kb") @@ -577,7 +577,7 @@ def test_vocab_serialization(nlp): assert len(candidates) == 1 assert candidates[0].entity_id == q2_hash assert candidates[0].entity_id_ == "Q2" - assert candidates[0].mention == "adam" + assert candidates[0].alias == "adam" assert kb_new_vocab.get_vector("Q2") == [2] assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4) diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index 336fd16fe..eb4254d31 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -74,13 +74,13 @@ def _check_kb(kb): assert candidates[0].entity_id_ == "Q007" assert 6.999 < candidates[0].entity_freq < 7.01 assert candidates[0].entity_vector == [0, 0, 7] - assert candidates[0].mention == "double07" + assert candidates[0].alias == "double07" assert 0.899 < candidates[0].prior_prob < 0.901 assert candidates[1].entity_id_ == "Q17" assert 1.99 < candidates[1].entity_freq < 2.01 assert candidates[1].entity_vector == [7, 1, 0] - assert candidates[1].mention == "double07" + assert candidates[1].alias == "double07" assert 0.099 < candidates[1].prior_prob < 0.101 From 978fbdcee1b45f5a88c21da5a219125f711e2b9f Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Fri, 17 Mar 2023 08:58:17 +0100 Subject: [PATCH 32/39] Update spacy/kb/kb.pyx Co-authored-by: Sofie Van Landeghem --- spacy/kb/kb.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx 
index 30694dc2a..6260dfb72 100644 --- a/spacy/kb/kb.pyx +++ b/spacy/kb/kb.pyx @@ -32,10 +32,10 @@ cdef class KnowledgeBase: def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]: """ - Return candidate entities for specified mention texts. Each candidate defines at least the entity and the + Return candidate entities for a specified Span mention. Each candidate defines at least the entity and the entity's embedding vector. Depending on the KB implementation, further properties - such as the prior probability of the specified mention text resolving to that entity - might be included. - If the no candidates are found for a given mention text, an empty list is returned. + If no candidates are found for a given mention, an empty list is returned. mentions (Iterable[Span]): Mentions for which to get candidates. RETURNS (Iterable[Iterable[Candidate]]): Identified candidates. """ From 307bbab285bb2aa7ab5a696ed3638e6559ba3633 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Fri, 17 Mar 2023 08:58:28 +0100 Subject: [PATCH 33/39] Update spacy/ml/models/entity_linker.py Co-authored-by: Sofie Van Landeghem --- spacy/ml/models/entity_linker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index ea8882430..7fe0b4741 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -135,7 +135,7 @@ def get_candidates_batch( """ Return candidate entities for the given mentions and fetching appropriate entries from the index. kb (KnowledgeBase): Knowledge base to query. - mention (Iterable[Span]): Entity mentions for which to identify candidates. + mentions (Iterable[Span]): Entity mentions for which to identify candidates. RETURNS (Iterable[Iterable[Candidate]]): Identified candidates. 
""" return kb.get_candidates_batch(mentions) From 2377b67f81d5707f46ba6ca085591dcc9256f334 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Fri, 17 Mar 2023 08:59:52 +0100 Subject: [PATCH 34/39] Update W401 test. --- spacy/errors.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/errors.py b/spacy/errors.py index 0f8091e3a..e7f59c091 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -212,7 +212,8 @@ class Warnings(metaclass=ErrorsWithCodes): # v4 warning strings W400 = ("`use_upper=False` is ignored, the upper layer is always enabled") W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability " - "lookups.") + "lookups (`.supports_prior_probs is False`). If your KB does support prior probability lookups, make sure " + "to return True in `.supports_prior_probs`.") class Errors(metaclass=ErrorsWithCodes): From 4d8dce5ba2a74faf699b9fd78a3b773a871682e7 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Fri, 17 Mar 2023 11:28:18 +0100 Subject: [PATCH 35/39] Update spacy/errors.py Co-authored-by: Sofie Van Landeghem --- spacy/errors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index e7f59c091..e1f7e7400 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -212,8 +212,8 @@ class Warnings(metaclass=ErrorsWithCodes): # v4 warning strings W400 = ("`use_upper=False` is ignored, the upper layer is always enabled") W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability " - "lookups (`.supports_prior_probs is False`). If your KB does support prior probability lookups, make sure " - "to return True in `.supports_prior_probs`.") + "lookups so this setting will be ignored. 
If your KB does support prior probability lookups, make sure " + "to return `True` in `.supports_prior_probs`.") class Errors(metaclass=ErrorsWithCodes): From faede7155ccb553ee04c409d873eaf193f9ad86e Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Fri, 17 Mar 2023 11:32:41 +0100 Subject: [PATCH 36/39] Update spacy/kb/kb.pyx Co-authored-by: Sofie Van Landeghem --- spacy/kb/kb.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx index 6260dfb72..1cb08f488 100644 --- a/spacy/kb/kb.pyx +++ b/spacy/kb/kb.pyx @@ -43,10 +43,10 @@ cdef class KnowledgeBase: def get_candidates(self, mention: Span) -> Iterable[Candidate]: """ - Return candidate entities for specified mention text. Each candidate defines at least the entity and the + Return candidate entities for a specific mention. Each candidate defines at least the entity and the entity's embedding vector. Depending on the KB implementation, further properties - such as the prior probability of the specified mention text resolving to that entity - might be included. - If the no candidate is found for the given mention text, an empty list is returned. + If no candidate is found for the given mention, an empty list is returned. mention (Span): Mention for which to get candidates. RETURNS (Iterable[Candidate]): Identified candidates. 
""" From 9e71adc0743d123fdc19688865dd468dbdf02776 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Sun, 19 Mar 2023 23:27:20 +0100 Subject: [PATCH 37/39] Use Candidate output type for toy generators in the test suite to mimic best practices --- spacy/tests/pipeline/test_entity_linker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index e29e3920b..5d2f0c430 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -510,13 +510,13 @@ def test_el_pipe_configuration(nlp): @registry.misc("spacy.LowercaseCandidateGenerator.v1") def create_candidates() -> Callable[ - [InMemoryLookupKB, "Span"], Iterable[InMemoryCandidate] + [InMemoryLookupKB, "Span"], Iterable[Candidate] ]: return get_lowercased_candidates @registry.misc("spacy.LowercaseCandidateBatchGenerator.v1") def create_candidates_batch() -> Callable[ - [InMemoryLookupKB, Iterable["Span"]], Iterable[Iterable[InMemoryCandidate]] + [InMemoryLookupKB, Iterable["Span"]], Iterable[Iterable[Candidate]] ]: return get_lowercased_candidates_batch From 0365d3d2e2ff7e22b928b7bf1a54485a7565a5a6 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Sun, 19 Mar 2023 23:31:02 +0100 Subject: [PATCH 38/39] fix docs --- website/docs/api/inmemorylookupkb.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/inmemorylookupkb.mdx b/website/docs/api/inmemorylookupkb.mdx index 9063939a3..6fa6cb235 100644 --- a/website/docs/api/inmemorylookupkb.mdx +++ b/website/docs/api/inmemorylookupkb.mdx @@ -196,7 +196,7 @@ to you. | Name | Description | | ----------- | ------------------------------------------------------------------------------------------------------------ | -| `mentions` | The textual mention or alias. ~~Iterable[Span]~~ | +| `mentions` | The textual mentions.
~~Iterable[Span]~~ | | **RETURNS** | An iterable of iterable with relevant `InMemoryCandidate` objects. ~~Iterable[Iterable[InMemoryCandidate]]~~ | ## InMemoryLookupKB.get_vector {id="get_vector",tag="method"} From b83407388af7bf5b6ae7065b5416ba707d283641 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Sun, 19 Mar 2023 23:34:00 +0100 Subject: [PATCH 39/39] fix import --- spacy/tests/pipeline/test_entity_linker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 5d2f0c430..65406a36e 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -7,7 +7,7 @@ from thinc.types import Ragged from spacy import registry, util from spacy.attrs import ENT_KB_ID from spacy.compat import pickle -from spacy.kb import InMemoryCandidate, InMemoryLookupKB, KnowledgeBase +from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase from spacy.lang.en import English from spacy.ml import load_kb from spacy.ml.models.entity_linker import build_span_maker, get_candidates