Refactor Candidate attribute names. Update docs and tests accordingly.

This commit is contained in:
Raphael Mitsch 2023-03-03 11:08:17 +01:00
parent 46fe069f87
commit 94e57d0ed5
5 changed files with 41 additions and 58 deletions

View File

@ -29,26 +29,26 @@ class Candidate(abc.ABC):
cases in which this isn't always possible (e.g.: the corpus to analyse contains mentions that the KB corpus cases in which this isn't always possible (e.g.: the corpus to analyse contains mentions that the KB corpus
doesn't) it might be better to eschew this information and always supply the same value. doesn't) it might be better to eschew this information and always supply the same value.
""" """
self._mention = mention self._mention_ = mention
self._entity_id = entity_id self._entity = entity_id
self._entity_name = entity_name self._entity_ = entity_name
self._entity_vector = entity_vector self._entity_vector = entity_vector
self._prior_prob = prior_prob self._prior_prob = prior_prob
@property @property
def entity(self) -> int: def entity(self) -> int:
"""RETURNS (int): Unique entity ID.""" """RETURNS (int): Unique entity ID."""
return self._entity_id return self._entity
@property @property
def entity_(self) -> str: def entity_(self) -> str:
"""RETURNS (str): Entity name.""" """RETURNS (str): Entity name."""
return self._entity_name return self._entity_
@property @property
def mention(self) -> str: def mention_(self) -> str:
"""RETURNS (str): Mention.""" """RETURNS (str): Mention."""
return self._mention return self._mention_
@property @property
def entity_vector(self) -> List[float]: def entity_vector(self) -> List[float]:
@ -93,20 +93,20 @@ class InMemoryCandidate(Candidate):
prior_prob=prior_prob, prior_prob=prior_prob,
) )
self._retrieve_string_from_hash = retrieve_string_from_hash self._retrieve_string_from_hash = retrieve_string_from_hash
self._entity_hash = entity_hash self._entity = entity_hash
self._entity_freq = entity_freq self._entity_freq = entity_freq
self._mention_hash = mention_hash self._mention = mention_hash
self._prior_prob = prior_prob self._prior_prob = prior_prob
@property @property
def entity(self) -> int: def entity(self) -> int:
"""RETURNS (int): hash of the entity_id's KB ID/name""" """RETURNS (int): hash of the entity_id's KB ID/name"""
return self._entity_hash return self._entity
@property @property
def mention_hash(self) -> int: def mention(self) -> int:
"""RETURNS (int): Mention hash.""" """RETURNS (int): Mention hash."""
return self._mention_hash return self._mention
@property @property
def entity_freq(self) -> float: def entity_freq(self) -> float:

View File

@ -224,9 +224,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
self._aliases_table[alias_index] = alias_entry self._aliases_table[alias_index] = alias_entry
def get_candidates(self, mention: Span) -> Iterable[InMemoryCandidate]: def get_candidates(self, mention: Span) -> Iterable[InMemoryCandidate]:
return self.get_alias_candidates(mention.text) # type: ignore return self._get_alias_candidates(mention.text) # type: ignore
def get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]: def _get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]:
""" """
Return candidate entities for an alias. Each candidate defines the entity, the original alias, Return candidate entities for an alias. Each candidate defines the entity, the original alias,
and the prior probability of that alias resolving to that entity. and the prior probability of that alias resolving to that entity.
@ -244,7 +244,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
entity_hash=self._entries[entry_index].entity_hash, entity_hash=self._entries[entry_index].entity_hash,
entity_freq=self._entries[entry_index].freq, entity_freq=self._entries[entry_index].freq,
entity_vector=self._vectors_table[self._entries[entry_index].vector_index], entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
alias_hash=alias_hash, mention_hash=alias_hash,
prior_prob=prior_prob prior_prob=prior_prob
) )
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs) for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)

View File

@ -469,7 +469,7 @@ def test_candidate_generation(nlp):
# test the content of the candidates # test the content of the candidates
assert get_candidates(mykb, adam_ent)[0].entity_ == "Q2" assert get_candidates(mykb, adam_ent)[0].entity_ == "Q2"
assert get_candidates(mykb, adam_ent)[0].alias_ == "adam" assert get_candidates(mykb, adam_ent)[0].mention_ == "adam"
assert_almost_equal(get_candidates(mykb, adam_ent)[0].entity_freq, 12) assert_almost_equal(get_candidates(mykb, adam_ent)[0].entity_freq, 12)
assert_almost_equal(get_candidates(mykb, adam_ent)[0].prior_prob, 0.9) assert_almost_equal(get_candidates(mykb, adam_ent)[0].prior_prob, 0.9)
@ -499,7 +499,7 @@ def test_el_pipe_configuration(nlp):
assert doc[2].ent_kb_id_ == "Q2" assert doc[2].ent_kb_id_ == "Q2"
def get_lowercased_candidates(kb, span): def get_lowercased_candidates(kb, span):
return kb.get_alias_candidates(span.text.lower()) return kb._get_alias_candidates(span.text.lower())
def get_lowercased_candidates_batch(kb, spans): def get_lowercased_candidates_batch(kb, spans):
return [get_lowercased_candidates(kb, span) for span in spans] return [get_lowercased_candidates(kb, span) for span in spans]
@ -558,24 +558,24 @@ def test_vocab_serialization(nlp):
mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1]) mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
candidates = mykb.get_alias_candidates("adam") candidates = mykb._get_alias_candidates("adam")
assert len(candidates) == 1 assert len(candidates) == 1
assert candidates[0].entity == q2_hash assert candidates[0].entity == q2_hash
assert candidates[0].entity_ == "Q2" assert candidates[0].entity_ == "Q2"
assert candidates[0].alias == adam_hash assert candidates[0].mention == adam_hash
assert candidates[0].alias_ == "adam" assert candidates[0].mention_ == "adam"
with make_tempdir() as d: with make_tempdir() as d:
mykb.to_disk(d / "kb") mykb.to_disk(d / "kb")
kb_new_vocab = InMemoryLookupKB(Vocab(), entity_vector_length=1) kb_new_vocab = InMemoryLookupKB(Vocab(), entity_vector_length=1)
kb_new_vocab.from_disk(d / "kb") kb_new_vocab.from_disk(d / "kb")
candidates = kb_new_vocab.get_alias_candidates("adam") candidates = kb_new_vocab._get_alias_candidates("adam")
assert len(candidates) == 1 assert len(candidates) == 1
assert candidates[0].entity == q2_hash assert candidates[0].entity == q2_hash
assert candidates[0].entity_ == "Q2" assert candidates[0].entity_ == "Q2"
assert candidates[0].alias == adam_hash assert candidates[0].mention == adam_hash
assert candidates[0].alias_ == "adam" assert candidates[0].mention_ == "adam"
assert kb_new_vocab.get_vector("Q2") == [2] assert kb_new_vocab.get_vector("Q2") == [2]
assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4) assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4)
@ -595,20 +595,20 @@ def test_append_alias(nlp):
mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
# test the size of the relevant candidates # test the size of the relevant candidates
assert len(mykb.get_alias_candidates("douglas")) == 2 assert len(mykb._get_alias_candidates("douglas")) == 2
# append an alias # append an alias
mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2) mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)
# test the size of the relevant candidates has been incremented # test the size of the relevant candidates has been incremented
assert len(mykb.get_alias_candidates("douglas")) == 3 assert len(mykb._get_alias_candidates("douglas")) == 3
# appending the same alias-entity pair again should not work (will throw a warning) # appending the same alias-entity pair again should not work (will throw a warning)
with pytest.warns(UserWarning): with pytest.warns(UserWarning):
mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3) mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3)
# test the size of the relevant candidates remained unchanged # test the size of the relevant candidates remained unchanged
assert len(mykb.get_alias_candidates("douglas")) == 3 assert len(mykb._get_alias_candidates("douglas")) == 3
@pytest.mark.filterwarnings("ignore:\\[W036") @pytest.mark.filterwarnings("ignore:\\[W036")
@ -905,11 +905,11 @@ def test_kb_to_bytes():
assert kb_2.contains_alias("Russ Cochran") assert kb_2.contains_alias("Russ Cochran")
assert kb_1.get_size_aliases() == kb_2.get_size_aliases() assert kb_1.get_size_aliases() == kb_2.get_size_aliases()
assert kb_1.get_alias_strings() == kb_2.get_alias_strings() assert kb_1.get_alias_strings() == kb_2.get_alias_strings()
assert len(kb_1.get_alias_candidates("Russ Cochran")) == len( assert len(kb_1._get_alias_candidates("Russ Cochran")) == len(
kb_2.get_alias_candidates("Russ Cochran") kb_2._get_alias_candidates("Russ Cochran")
) )
assert len(kb_1.get_alias_candidates("Randomness")) == len( assert len(kb_1._get_alias_candidates("Randomness")) == len(
kb_2.get_alias_candidates("Randomness") kb_2._get_alias_candidates("Randomness")
) )

View File

@ -63,19 +63,19 @@ def _check_kb(kb):
assert alias_string not in kb.get_alias_strings() assert alias_string not in kb.get_alias_strings()
# check candidates & probabilities # check candidates & probabilities
candidates = sorted(kb.get_alias_candidates("double07"), key=lambda x: x.entity_) candidates = sorted(kb._get_alias_candidates("double07"), key=lambda x: x.entity_)
assert len(candidates) == 2 assert len(candidates) == 2
assert candidates[0].entity_ == "Q007" assert candidates[0].entity_ == "Q007"
assert 6.999 < candidates[0].entity_freq < 7.01 assert 6.999 < candidates[0].entity_freq < 7.01
assert candidates[0].entity_vector == [0, 0, 7] assert candidates[0].entity_vector == [0, 0, 7]
assert candidates[0].alias_ == "double07" assert candidates[0].mention_ == "double07"
assert 0.899 < candidates[0].prior_prob < 0.901 assert 0.899 < candidates[0].prior_prob < 0.901
assert candidates[1].entity_ == "Q17" assert candidates[1].entity_ == "Q17"
assert 1.99 < candidates[1].entity_freq < 2.01 assert 1.99 < candidates[1].entity_freq < 2.01
assert candidates[1].entity_vector == [7, 1, 0] assert candidates[1].entity_vector == [7, 1, 0]
assert candidates[1].alias_ == "double07" assert candidates[1].mention_ == "double07"
assert 0.099 < candidates[1].prior_prob < 0.101 assert 0.099 < candidates[1].prior_prob < 0.101

View File

@ -103,23 +103,6 @@ to you.
| `mentions` | The textual mentions or aliases. ~~Iterable[Span]~~ | | `mentions` | The textual mentions or aliases. ~~Iterable[Span]~~ |
| **RETURNS** | An iterable of iterables with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ | | **RETURNS** | An iterable of iterables with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
## KnowledgeBase.get_alias_candidates {id="get_alias_candidates",tag="method"}
<Infobox variant="warning">
This method is _not_ available from spaCy 3.5 onwards.
</Infobox>
From spaCy 3.5 on `KnowledgeBase` is an abstract class (with
[`InMemoryLookupKB`](/api/inmemorylookupkb) being a drop-in replacement) to
allow more flexibility in customizing knowledge bases. Some of its methods were
moved to [`InMemoryLookupKB`](/api/inmemorylookupkb) during this refactoring,
one of those being `get_alias_candidates()`. This method is now available as
[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
Note:
[`InMemoryLookupKB.get_candidates()`](/api/inmemorylookupkb#get_candidates)
defaults to
[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
## KnowledgeBase.get_vector {id="get_vector",tag="method"} ## KnowledgeBase.get_vector {id="get_vector",tag="method"}
Given a certain entity ID, retrieve its pretrained entity vector. Given a certain entity ID, retrieve its pretrained entity vector.
@ -207,18 +190,18 @@ of the [`entity_linker`](/api/entitylinker) pipe.
> #### Example > #### Example
> >
> ```python > ```python
> from spacy.kb import InMemoryCandidate > from spacy.kb import InMemoryCandidate
> candidate = InMemoryCandidate(kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob) > candidate = InMemoryCandidate(kb, entity_hash, entity_freq, entity_vector, mention_hash, prior_prob)
> ``` > ```
| Name | Description | | Name | Description |
| ------------- | ------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------- |
| `kb` | The knowledge base that defined this candidate. ~~KnowledgeBase~~ | | `kb` | The knowledge base that defined this candidate. ~~KnowledgeBase~~ |
| `entity_hash` | The hash of the entity's KB ID. ~~int~~ | | `entity_hash` | The hash of the entity's KB ID. ~~int~~ |
| `entity_freq` | The entity frequency as recorded in the KB. ~~float~~ | | `entity_freq` | The entity frequency as recorded in the KB. ~~float~~ |
| `alias_hash` | The hash of the textual mention or alias. ~~int~~ | | `mention_hash` | The hash of the textual mention. ~~int~~ |
| `prior_prob` | The prior probability of the `alias` referring to the `entity`. ~~float~~ | | `prior_prob` | The prior probability of the `mention` referring to the `entity`. ~~float~~ |
## InMemoryCandidate attributes {id="candidate-attributes"} ## InMemoryCandidate attributes {id="candidate-attributes"}