Refactor Candidate attribute names. Update docs and tests accordingly.

This commit is contained in:
Raphael Mitsch 2023-03-03 11:08:17 +01:00
parent 46fe069f87
commit 94e57d0ed5
5 changed files with 41 additions and 58 deletions

View File

@ -29,26 +29,26 @@ class Candidate(abc.ABC):
cases in which this isn't always possible (e.g.: the corpus to analyse contains mentions that the KB corpus
doesn't) it might be better to eschew this information and always supply the same value.
"""
self._mention = mention
self._entity_id = entity_id
self._entity_name = entity_name
self._mention_ = mention
self._entity = entity_id
self._entity_ = entity_name
self._entity_vector = entity_vector
self._prior_prob = prior_prob
@property
def entity(self) -> int:
"""RETURNS (int): Unique entity ID."""
return self._entity_id
return self._entity
@property
def entity_(self) -> str:
"""RETURNS (str): Entity name."""
return self._entity_name
return self._entity_
@property
def mention(self) -> str:
def mention_(self) -> str:
"""RETURNS (str): Mention."""
return self._mention
return self._mention_
@property
def entity_vector(self) -> List[float]:
@ -93,20 +93,20 @@ class InMemoryCandidate(Candidate):
prior_prob=prior_prob,
)
self._retrieve_string_from_hash = retrieve_string_from_hash
self._entity_hash = entity_hash
self._entity = entity_hash
self._entity_freq = entity_freq
self._mention_hash = mention_hash
self._mention = mention_hash
self._prior_prob = prior_prob
@property
def entity(self) -> int:
"""RETURNS (int): hash of the entity_id's KB ID/name"""
return self._entity_hash
return self._entity
@property
def mention_hash(self) -> int:
def mention(self) -> int:
"""RETURNS (int): Mention hash."""
return self._mention_hash
return self._mention
@property
def entity_freq(self) -> float:

View File

@ -224,9 +224,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
self._aliases_table[alias_index] = alias_entry
def get_candidates(self, mention: Span) -> Iterable[InMemoryCandidate]:
return self.get_alias_candidates(mention.text) # type: ignore
return self._get_alias_candidates(mention.text) # type: ignore
def get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]:
def _get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]:
"""
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
and the prior probability of that alias resolving to that entity.
@ -244,7 +244,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
entity_hash=self._entries[entry_index].entity_hash,
entity_freq=self._entries[entry_index].freq,
entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
alias_hash=alias_hash,
mention_hash=alias_hash,
prior_prob=prior_prob
)
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)

View File

@ -469,7 +469,7 @@ def test_candidate_generation(nlp):
# test the content of the candidates
assert get_candidates(mykb, adam_ent)[0].entity_ == "Q2"
assert get_candidates(mykb, adam_ent)[0].alias_ == "adam"
assert get_candidates(mykb, adam_ent)[0].mention_ == "adam"
assert_almost_equal(get_candidates(mykb, adam_ent)[0].entity_freq, 12)
assert_almost_equal(get_candidates(mykb, adam_ent)[0].prior_prob, 0.9)
@ -499,7 +499,7 @@ def test_el_pipe_configuration(nlp):
assert doc[2].ent_kb_id_ == "Q2"
def get_lowercased_candidates(kb, span):
return kb.get_alias_candidates(span.text.lower())
return kb._get_alias_candidates(span.text.lower())
def get_lowercased_candidates_batch(kb, spans):
return [get_lowercased_candidates(kb, span) for span in spans]
@ -558,24 +558,24 @@ def test_vocab_serialization(nlp):
mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
candidates = mykb.get_alias_candidates("adam")
candidates = mykb._get_alias_candidates("adam")
assert len(candidates) == 1
assert candidates[0].entity == q2_hash
assert candidates[0].entity_ == "Q2"
assert candidates[0].alias == adam_hash
assert candidates[0].alias_ == "adam"
assert candidates[0].mention == adam_hash
assert candidates[0].mention_ == "adam"
with make_tempdir() as d:
mykb.to_disk(d / "kb")
kb_new_vocab = InMemoryLookupKB(Vocab(), entity_vector_length=1)
kb_new_vocab.from_disk(d / "kb")
candidates = kb_new_vocab.get_alias_candidates("adam")
candidates = kb_new_vocab._get_alias_candidates("adam")
assert len(candidates) == 1
assert candidates[0].entity == q2_hash
assert candidates[0].entity_ == "Q2"
assert candidates[0].alias == adam_hash
assert candidates[0].alias_ == "adam"
assert candidates[0].mention == adam_hash
assert candidates[0].mention_ == "adam"
assert kb_new_vocab.get_vector("Q2") == [2]
assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4)
@ -595,20 +595,20 @@ def test_append_alias(nlp):
mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
# test the size of the relevant candidates
assert len(mykb.get_alias_candidates("douglas")) == 2
assert len(mykb._get_alias_candidates("douglas")) == 2
# append an alias
mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)
# test the size of the relevant candidates has been incremented
assert len(mykb.get_alias_candidates("douglas")) == 3
assert len(mykb._get_alias_candidates("douglas")) == 3
# appending the same alias-entity pair again should not work (will raise a warning)
with pytest.warns(UserWarning):
mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3)
# test the size of the relevant candidates remained unchanged
assert len(mykb.get_alias_candidates("douglas")) == 3
assert len(mykb._get_alias_candidates("douglas")) == 3
@pytest.mark.filterwarnings("ignore:\\[W036")
@ -905,11 +905,11 @@ def test_kb_to_bytes():
assert kb_2.contains_alias("Russ Cochran")
assert kb_1.get_size_aliases() == kb_2.get_size_aliases()
assert kb_1.get_alias_strings() == kb_2.get_alias_strings()
assert len(kb_1.get_alias_candidates("Russ Cochran")) == len(
kb_2.get_alias_candidates("Russ Cochran")
assert len(kb_1._get_alias_candidates("Russ Cochran")) == len(
kb_2._get_alias_candidates("Russ Cochran")
)
assert len(kb_1.get_alias_candidates("Randomness")) == len(
kb_2.get_alias_candidates("Randomness")
assert len(kb_1._get_alias_candidates("Randomness")) == len(
kb_2._get_alias_candidates("Randomness")
)

View File

@ -63,19 +63,19 @@ def _check_kb(kb):
assert alias_string not in kb.get_alias_strings()
# check candidates & probabilities
candidates = sorted(kb.get_alias_candidates("double07"), key=lambda x: x.entity_)
candidates = sorted(kb._get_alias_candidates("double07"), key=lambda x: x.entity_)
assert len(candidates) == 2
assert candidates[0].entity_ == "Q007"
assert 6.999 < candidates[0].entity_freq < 7.01
assert candidates[0].entity_vector == [0, 0, 7]
assert candidates[0].alias_ == "double07"
assert candidates[0].mention_ == "double07"
assert 0.899 < candidates[0].prior_prob < 0.901
assert candidates[1].entity_ == "Q17"
assert 1.99 < candidates[1].entity_freq < 2.01
assert candidates[1].entity_vector == [7, 1, 0]
assert candidates[1].alias_ == "double07"
assert candidates[1].mention_ == "double07"
assert 0.099 < candidates[1].prior_prob < 0.101

View File

@ -103,23 +103,6 @@ to you.
| `mentions` | The textual mention or alias. ~~Iterable[Span]~~ |
| **RETURNS** | An iterable of iterables with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
## KnowledgeBase.get_alias_candidates {id="get_alias_candidates",tag="method"}
<Infobox variant="warning">
This method is _not_ available from spaCy 3.5 onwards.
</Infobox>
From spaCy 3.5 on `KnowledgeBase` is an abstract class (with
[`InMemoryLookupKB`](/api/inmemorylookupkb) being a drop-in replacement) to
allow more flexibility in customizing knowledge bases. Some of its methods were
moved to [`InMemoryLookupKB`](/api/inmemorylookupkb) during this refactoring,
one of those being `get_alias_candidates()`. This method is now available as
[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
Note:
[`InMemoryLookupKB.get_candidates()`](/api/inmemorylookupkb#get_candidates)
defaults to
[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
## KnowledgeBase.get_vector {id="get_vector",tag="method"}
Given a certain entity ID, retrieve its pretrained entity vector.
@ -207,19 +190,19 @@ of the [`entity_linker`](/api/entitylinker) pipe.
> #### Example```python
>
> from spacy.kb import InMemoryCandidate candidate = InMemoryCandidate(kb,
> entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
> entity_hash, entity_freq, entity_vector, mention_hash, prior_prob)
>
> ```
>
> ```
| Name | Description |
| ------------- | ------------------------------------------------------------------------- |
| `kb` | The knowledge base that defined this candidate. ~~KnowledgeBase~~ |
| `entity_hash` | The hash of the entity's KB ID. ~~int~~ |
| `entity_freq` | The entity frequency as recorded in the KB. ~~float~~ |
| `alias_hash` | The hash of the textual mention or alias. ~~int~~ |
| `prior_prob` | The prior probability of the `alias` referring to the `entity`. ~~float~~ |
| Name | Description |
| -------------- | ------------------------------------------------------------------------- |
| `kb` | The knowledge base that defined this candidate. ~~KnowledgeBase~~ |
| `entity_hash` | The hash of the entity's KB ID. ~~int~~ |
| `entity_freq` | The entity frequency as recorded in the KB. ~~float~~ |
| `mention_hash` | The hash of the textual mention. ~~int~~ |
| `prior_prob` | The prior probability of the `mention` referring to the `entity`. ~~float~~ |
## InMemoryCandidate attributes {id="candidate-attributes"}