Refactor Candidate attribute names. Update docs and tests accordingly.

2025-05-29 10:13:19 +03:00 · 2023-03-03 11:08:17 +01:00 · 2023-03-03 11:08:17 +01:00 · 94e57d0ed5
commit 94e57d0ed5
parent 46fe069f87
5 changed files with 41 additions and 58 deletions
--- a/spacy/kb/candidate.py
+++ b/spacy/kb/candidate.py
@ -29,26 +29,26 @@ class Candidate(abc.ABC):
            cases in which this isn't always possible (e.g.: the corpus to analyse contains mentions that the KB corpus
            doesn't) it might be better to eschew this information and always supply the same value.
        """
-        self._mention = mention
-        self._entity_id = entity_id
-        self._entity_name = entity_name
+        self._mention_ = mention
+        self._entity = entity_id
+        self._entity_ = entity_name
        self._entity_vector = entity_vector
        self._prior_prob = prior_prob

    @property
    def entity(self) -> int:
        """RETURNS (int): Unique entity ID."""
-        return self._entity_id
+        return self._entity

    @property
    def entity_(self) -> str:
        """RETURNS (str): Entity name."""
-        return self._entity_name
+        return self._entity_

    @property
-    def mention(self) -> str:
+    def mention_(self) -> str:
        """RETURNS (str): Mention."""
-        return self._mention
+        return self._mention_

    @property
    def entity_vector(self) -> List[float]:
@ -93,20 +93,20 @@ class InMemoryCandidate(Candidate):
            prior_prob=prior_prob,
        )
        self._retrieve_string_from_hash = retrieve_string_from_hash
-        self._entity_hash = entity_hash
+        self._entity = entity_hash
        self._entity_freq = entity_freq
-        self._mention_hash = mention_hash
+        self._mention = mention_hash
        self._prior_prob = prior_prob

    @property
    def entity(self) -> int:
        """RETURNS (int): hash of the entity_id's KB ID/name"""
-        return self._entity_hash
+        return self._entity

    @property
-    def mention_hash(self) -> int:
+    def mention(self) -> int:
        """RETURNS (int): Mention hash."""
-        return self._mention_hash
+        return self._mention

    @property
    def entity_freq(self) -> float:
--- a/spacy/kb/kb_in_memory.pyx
+++ b/spacy/kb/kb_in_memory.pyx
@ -224,9 +224,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
            self._aliases_table[alias_index] = alias_entry

    def get_candidates(self, mention: Span) -> Iterable[InMemoryCandidate]:
-        return self.get_alias_candidates(mention.text)  # type: ignore
+        return self._get_alias_candidates(mention.text)  # type: ignore

-    def get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]:
+    def _get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]:
        """
        Return candidate entities for an alias. Each candidate defines the entity, the original alias,
        and the prior probability of that alias resolving to that entity.
@ -244,7 +244,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
                entity_hash=self._entries[entry_index].entity_hash,
                entity_freq=self._entries[entry_index].freq,
                entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
-                alias_hash=alias_hash,
+                mention_hash=alias_hash,
                prior_prob=prior_prob
            )
            for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@ -469,7 +469,7 @@ def test_candidate_generation(nlp):

    # test the content of the candidates
    assert get_candidates(mykb, adam_ent)[0].entity_ == "Q2"
-    assert get_candidates(mykb, adam_ent)[0].alias_ == "adam"
+    assert get_candidates(mykb, adam_ent)[0].mention_ == "adam"
    assert_almost_equal(get_candidates(mykb, adam_ent)[0].entity_freq, 12)
    assert_almost_equal(get_candidates(mykb, adam_ent)[0].prior_prob, 0.9)

@ -499,7 +499,7 @@ def test_el_pipe_configuration(nlp):
    assert doc[2].ent_kb_id_ == "Q2"

    def get_lowercased_candidates(kb, span):
-        return kb.get_alias_candidates(span.text.lower())
+        return kb._get_alias_candidates(span.text.lower())

    def get_lowercased_candidates_batch(kb, spans):
        return [get_lowercased_candidates(kb, span) for span in spans]
@ -558,24 +558,24 @@ def test_vocab_serialization(nlp):
    mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
    adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])

-    candidates = mykb.get_alias_candidates("adam")
+    candidates = mykb._get_alias_candidates("adam")
    assert len(candidates) == 1
    assert candidates[0].entity == q2_hash
    assert candidates[0].entity_ == "Q2"
-    assert candidates[0].alias == adam_hash
-    assert candidates[0].alias_ == "adam"
+    assert candidates[0].mention == adam_hash
+    assert candidates[0].mention_ == "adam"

    with make_tempdir() as d:
        mykb.to_disk(d / "kb")
        kb_new_vocab = InMemoryLookupKB(Vocab(), entity_vector_length=1)
        kb_new_vocab.from_disk(d / "kb")

-        candidates = kb_new_vocab.get_alias_candidates("adam")
+        candidates = kb_new_vocab._get_alias_candidates("adam")
        assert len(candidates) == 1
        assert candidates[0].entity == q2_hash
        assert candidates[0].entity_ == "Q2"
-        assert candidates[0].alias == adam_hash
-        assert candidates[0].alias_ == "adam"
+        assert candidates[0].mention == adam_hash
+        assert candidates[0].mention_ == "adam"

        assert kb_new_vocab.get_vector("Q2") == [2]
        assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4)
@ -595,20 +595,20 @@ def test_append_alias(nlp):
    mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])

    # test the size of the relevant candidates
-    assert len(mykb.get_alias_candidates("douglas")) == 2
+    assert len(mykb._get_alias_candidates("douglas")) == 2

    # append an alias
    mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)

    # test the size of the relevant candidates has been incremented
-    assert len(mykb.get_alias_candidates("douglas")) == 3
+    assert len(mykb._get_alias_candidates("douglas")) == 3

    # append the same alias-entity pair again should not work (will throw a warning)
    with pytest.warns(UserWarning):
        mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3)

    # test the size of the relevant candidates remained unchanged
-    assert len(mykb.get_alias_candidates("douglas")) == 3
+    assert len(mykb._get_alias_candidates("douglas")) == 3


@pytest.mark.filterwarnings("ignore:\\[W036")
@ -905,11 +905,11 @@ def test_kb_to_bytes():
    assert kb_2.contains_alias("Russ Cochran")
    assert kb_1.get_size_aliases() == kb_2.get_size_aliases()
    assert kb_1.get_alias_strings() == kb_2.get_alias_strings()
-    assert len(kb_1.get_alias_candidates("Russ Cochran")) == len(
-        kb_2.get_alias_candidates("Russ Cochran")
+    assert len(kb_1._get_alias_candidates("Russ Cochran")) == len(
+        kb_2._get_alias_candidates("Russ Cochran")
    )
-    assert len(kb_1.get_alias_candidates("Randomness")) == len(
-        kb_2.get_alias_candidates("Randomness")
+    assert len(kb_1._get_alias_candidates("Randomness")) == len(
+        kb_2._get_alias_candidates("Randomness")
    )


--- a/spacy/tests/serialize/test_serialize_kb.py
+++ b/spacy/tests/serialize/test_serialize_kb.py
@ -63,19 +63,19 @@ def _check_kb(kb):
        assert alias_string not in kb.get_alias_strings()

    # check candidates & probabilities
-    candidates = sorted(kb.get_alias_candidates("double07"), key=lambda x: x.entity_)
+    candidates = sorted(kb._get_alias_candidates("double07"), key=lambda x: x.entity_)
    assert len(candidates) == 2

    assert candidates[0].entity_ == "Q007"
    assert 6.999 < candidates[0].entity_freq < 7.01
    assert candidates[0].entity_vector == [0, 0, 7]
-    assert candidates[0].alias_ == "double07"
+    assert candidates[0].mention_ == "double07"
    assert 0.899 < candidates[0].prior_prob < 0.901

    assert candidates[1].entity_ == "Q17"
    assert 1.99 < candidates[1].entity_freq < 2.01
    assert candidates[1].entity_vector == [7, 1, 0]
-    assert candidates[1].alias_ == "double07"
+    assert candidates[1].mention_ == "double07"
    assert 0.099 < candidates[1].prior_prob < 0.101


--- a/website/docs/api/kb.mdx
+++ b/website/docs/api/kb.mdx
@ -103,23 +103,6 @@ to you.
 | `mentions`  | The textual mention or alias. ~~Iterable[Span]~~                                             |
 | **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |

-## KnowledgeBase.get_alias_candidates {id="get_alias_candidates",tag="method"}
-
-<Infobox variant="warning">
-  This method is _not_ available from spaCy 3.5 onwards.
-</Infobox>
-
-From spaCy 3.5 on `KnowledgeBase` is an abstract class (with
-[`InMemoryLookupKB`](/api/inmemorylookupkb) being a drop-in replacement) to
-allow more flexibility in customizing knowledge bases. Some of its methods were
-moved to [`InMemoryLookupKB`](/api/inmemorylookupkb) during this refactoring,
-one of those being `get_alias_candidates()`. This method is now available as
-[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
-Note:
-[`InMemoryLookupKB.get_candidates()`](/api/inmemorylookupkb#get_candidates)
-defaults to
-[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
-
 ## KnowledgeBase.get_vector {id="get_vector",tag="method"}

 Given a certain entity ID, retrieve its pretrained entity vector.
@ -207,19 +190,19 @@ of the [`entity_linker`](/api/entitylinker) pipe.
 > #### Example```python
 >
 > from spacy.kb import InMemoryCandidate candidate = InMemoryCandidate(kb,
-> entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
+> entity_hash, entity_freq, entity_vector, mention_hash, prior_prob)
 >
 > ```
 >
 > ```

-| Name          | Description                                                               |
-| ------------- | ------------------------------------------------------------------------- |
-| `kb`          | The knowledge base that defined this candidate. ~~KnowledgeBase~~         |
-| `entity_hash` | The hash of the entity's KB ID. ~~int~~                                   |
-| `entity_freq` | The entity frequency as recorded in the KB. ~~float~~                     |
-| `alias_hash`  | The hash of the textual mention or alias. ~~int~~                         |
-| `prior_prob`  | The prior probability of the `alias` referring to the `entity`. ~~float~~ |
+| Name           | Description                                                                 |
+| -------------- | --------------------------------------------------------------------------- |
+| `kb`           | The knowledge base that defined this candidate. ~~KnowledgeBase~~           |
+| `entity_hash`  | The hash of the entity's KB ID. ~~int~~                                     |
+| `entity_freq`  | The entity frequency as recorded in the KB. ~~float~~                       |
+| `mention_hash` | The hash of the textual mention. ~~int~~                                    |
+| `prior_prob`   | The prior probability of the `mention` referring to the `entity`. ~~float~~ |

 ## InMemoryCandidate attributes {id="candidate-attributes"}