From 7c28424f478c14f5e1dac523ae57ee6d4b207835 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Tue, 18 Oct 2022 15:31:15 +0200 Subject: [PATCH 01/35] Convert batched into doc-wise batched candidate generation. --- spacy/errors.py | 5 +- spacy/kb/__init__.py | 2 +- spacy/kb/candidate.pyx | 12 +- spacy/kb/kb.pyx | 26 +-- spacy/ml/models/entity_linker.py | 12 +- spacy/pipeline/entity_linker.py | 222 +++++++++++---------- spacy/tests/pipeline/test_entity_linker.py | 58 ++++-- 7 files changed, 182 insertions(+), 155 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index e0628819d..958859569 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -946,11 +946,10 @@ class Errors(metaclass=ErrorsWithCodes): "case pass an empty list for the previously not specified argument to avoid this error.") E1043 = ("Expected None or a value in range [{range_start}, {range_end}] for entity linker threshold, but got " "{value}.") - E1044 = ("Expected `candidates_batch_size` to be >= 1, but got: {value}") - E1045 = ("Encountered {parent} subclass without `{parent}.{method}` " + E1044 = ("Encountered {parent} subclass without `{parent}.{method}` " "method in '{name}'. If you want to use this method, make " "sure it's overwritten on the subclass.") - E1046 = ("{cls_name} is an abstract class and cannot be instantiated. If you are looking for spaCy's default " + E1045 = ("{cls_name} is an abstract class and cannot be instantiated. If you are looking for spaCy's default " "knowledge base, use `InMemoryLookupKB`.") diff --git a/spacy/kb/__init__.py b/spacy/kb/__init__.py index 1d70a9b34..b61cb5447 100644 --- a/spacy/kb/__init__.py +++ b/spacy/kb/__init__.py @@ -1,3 +1,3 @@ from .kb import KnowledgeBase from .kb_in_memory import InMemoryLookupKB -from .candidate import Candidate, get_candidates, get_candidates_batch +from .candidate import Candidate, get_candidates, get_candidates_all diff --git a/spacy/kb/candidate.pyx b/spacy/kb/candidate.pyx index c89efeb03..5ad52618a 100644 --- a/spacy/kb/candidate.pyx +++ b/spacy/kb/candidate.pyx @@ -1,6 +1,6 @@ # cython: infer_types=True, profile=True -from typing import Iterable +from typing import Iterable, Generator from .kb cimport KnowledgeBase from ..tokens import Span @@ -64,11 +64,13 @@ def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]: return kb.get_candidates(mention) -def get_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]: +def get_candidates_all( + kb: KnowledgeBase, mentions: Generator[Iterable[Span], None, None] +) -> Generator[Iterable[Iterable[Candidate]], None, None]: """ Return candidate entities for the given mentions and fetching appropriate entries from the index. kb (KnowledgeBase): Knowledge base to query. - mention (Iterable[Span]): Entity mentions for which to identify candidates. - RETURNS (Iterable[Iterable[Candidate]]): Identified candidates. + mention (Generator[Iterable[Span]]): Entity mentions per document for which to identify candidates. + RETURNS (Generator[Iterable[Iterable[Candidate]]]): Identified candidates per document. """ - return kb.get_candidates_batch(mentions) + return kb.get_candidates_all(mentions) diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx index ce4bc0138..2e99ea493 100644 --- a/spacy/kb/kb.pyx +++ b/spacy/kb/kb.pyx @@ -1,7 +1,7 @@ # cython: infer_types=True, profile=True from pathlib import Path -from typing import Iterable, Tuple, Union +from typing import Iterable, Tuple, Union, Generator from cymem.cymem cimport Pool from .candidate import Candidate @@ -23,22 +23,24 @@ cdef class KnowledgeBase: # Make sure abstract KB is not instantiated. if self.__class__ == KnowledgeBase: raise TypeError( - Errors.E1046.format(cls_name=self.__class__.__name__) + Errors.E1045.format(cls_name=self.__class__.__name__) ) self.vocab = vocab self.entity_vector_length = entity_vector_length self.mem = Pool() - def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]: + def get_candidates_all(self, mentions: Generator[Iterable[Span]]) -> Generator[Iterable[Iterable[Candidate]]]: """ Return candidate entities for specified texts. Each candidate defines the entity, the original alias, and the prior probability of that alias resolving to that entity. If no candidate is found for a given text, an empty list is returned. - mentions (Iterable[Span]): Mentions for which to get candidates. - RETURNS (Iterable[Iterable[Candidate]]): Identified candidates. + mentions (Generator[Iterable[Span]]): Mentions per documents for which to get candidates. + RETURNS (Generator[Iterable[Iterable[Candidate]]]): Identified candidates per document. """ - return [self.get_candidates(span) for span in mentions] + + for doc_mentions in mentions: + yield [self.get_candidates(span) for span in doc_mentions] def get_candidates(self, mention: Span) -> Iterable[Candidate]: """ @@ -49,7 +51,7 @@ cdef class KnowledgeBase: RETURNS (Iterable[Candidate]): Identified candidates. """ raise NotImplementedError( - Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__) + Errors.E1044.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__) ) def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]: @@ -67,7 +69,7 @@ cdef class KnowledgeBase: RETURNS (Iterable[float]): Vector for specified entity. """ raise NotImplementedError( - Errors.E1045.format(parent="KnowledgeBase", method="get_vector", name=self.__name__) + Errors.E1044.format(parent="KnowledgeBase", method="get_vector", name=self.__name__) ) def to_bytes(self, **kwargs) -> bytes: @@ -75,7 +77,7 @@ cdef class KnowledgeBase: RETURNS (bytes): Current state as binary string. """ raise NotImplementedError( - Errors.E1045.format(parent="KnowledgeBase", method="to_bytes", name=self.__name__) + Errors.E1044.format(parent="KnowledgeBase", method="to_bytes", name=self.__name__) ) def from_bytes(self, bytes_data: bytes, *, exclude: Tuple[str] = tuple()): @@ -84,7 +86,7 @@ cdef class KnowledgeBase: exclude (Tuple[str]): Properties to exclude when restoring KB. """ raise NotImplementedError( - Errors.E1045.format(parent="KnowledgeBase", method="from_bytes", name=self.__name__) + Errors.E1044.format(parent="KnowledgeBase", method="from_bytes", name=self.__name__) ) def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None: @@ -94,7 +96,7 @@ cdef class KnowledgeBase: exclude (Iterable[str]): List of components to exclude. """ raise NotImplementedError( - Errors.E1045.format(parent="KnowledgeBase", method="to_disk", name=self.__name__) + Errors.E1044.format(parent="KnowledgeBase", method="to_disk", name=self.__name__) ) def from_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None: @@ -104,5 +106,5 @@ cdef class KnowledgeBase: exclude (Iterable[str]): List of components to exclude. """ raise NotImplementedError( - Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__) + Errors.E1044.format(parent="KnowledgeBase", method="from_disk", name=self.__name__) ) diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index 4d18d216a..9aac71d40 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -1,12 +1,12 @@ from pathlib import Path -from typing import Optional, Callable, Iterable, List, Tuple +from typing import Optional, Callable, Iterable, List, Tuple, Generator from thinc.types import Floats2d from thinc.api import chain, list2ragged, reduce_mean, residual from thinc.api import Model, Maxout, Linear, tuplify, Ragged from ...util import registry from ...kb import KnowledgeBase, InMemoryLookupKB -from ...kb import Candidate, get_candidates, get_candidates_batch +from ...kb import Candidate, get_candidates, get_candidates_all from ...vocab import Vocab from ...tokens import Span, Doc from ..extract_spans import extract_spans @@ -105,8 +105,8 @@ def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]: return get_candidates -@registry.misc("spacy.CandidateBatchGenerator.v1") -def create_candidates_batch() -> Callable[ - [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] +@registry.misc("spacy.CandidateAllGenerator.v1") +def create_candidates_all() -> Callable[ + [KnowledgeBase, Generator[Iterable[Span], None, None]], Generator[Iterable[Iterable[Candidate]], None, None] ]: - return get_candidates_batch + return get_candidates_all diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 62845287b..4d3baf2f3 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -1,4 +1,4 @@ -from typing import Optional, Iterable, Callable, Dict, Union, List, Any +from typing import Optional, Iterable, Callable, Dict, Union, List, Any, Generator from thinc.types import Floats2d from pathlib import Path from itertools import islice @@ -53,11 +53,11 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"] "incl_context": True, "entity_vector_length": 64, "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, - "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"}, + "get_candidates_all": {"@misc": "spacy.CandidateAllGenerator.v1"}, "overwrite": True, "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"}, "use_gold_ents": True, - "candidates_batch_size": 1, + "candidates_doc_mode": False, "threshold": None, }, default_score_weights={ @@ -77,13 +77,14 @@ def make_entity_linker( incl_context: bool, entity_vector_length: int, get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], - get_candidates_batch: Callable[ - [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] + get_candidates_all: Callable[ + [KnowledgeBase, Generator[Iterable[Span], None, None]], + Generator[Iterable[Iterable[Candidate]], None, None] ], overwrite: bool, scorer: Optional[Callable], use_gold_ents: bool, - candidates_batch_size: int, + candidates_doc_mode: bool, threshold: Optional[float] = None, ): """Construct an EntityLinker component. @@ -98,13 +99,18 @@ def make_entity_linker( entity_vector_length (int): Size of encoding vectors in the KB. get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that produces a list of candidates, given a certain knowledge base and a textual mention. - get_candidates_batch ( - Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]] - ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions. + get_candidates_all ( + Callable[ + [KnowledgeBase, Generator[Iterable[Span], None, None]], + Generator[Iterable[Iterable[Candidate]], None, None] + ]): Function that produces a list of candidates per document, given a certain knowledge base and several textual + documents with textual mentions. scorer (Optional[Callable]): The scoring method. use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another component must provide entity annotations. - candidates_batch_size (int): Size of batches for entity candidate generation. + candidates_doc_mode (bool): Whether or not to operate candidate generation in doc mode, i.e. to provide a generator + yielding entities per document (candidate generator callable is called only once in this case). If False, + the candidate generator is called once per entity. threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold, prediction is discarded. If None, predictions are not filtered by any threshold. """ @@ -134,11 +140,11 @@ def make_entity_linker( incl_context=incl_context, entity_vector_length=entity_vector_length, get_candidates=get_candidates, - get_candidates_batch=get_candidates_batch, + get_candidates_all=get_candidates_all, overwrite=overwrite, scorer=scorer, use_gold_ents=use_gold_ents, - candidates_batch_size=candidates_batch_size, + candidates_doc_mode=candidates_doc_mode, threshold=threshold, ) @@ -172,13 +178,14 @@ class EntityLinker(TrainablePipe): incl_context: bool, entity_vector_length: int, get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], - get_candidates_batch: Callable[ - [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] + get_candidates_all: Callable[ + [KnowledgeBase, Generator[Iterable[Span], None, None]], + Generator[Iterable[Iterable[Candidate]], None, None] ], overwrite: bool = BACKWARD_OVERWRITE, scorer: Optional[Callable] = entity_linker_score, use_gold_ents: bool, - candidates_batch_size: int, + candidates_doc_mode: bool, threshold: Optional[float] = None, ) -> None: """Initialize an entity linker. @@ -194,14 +201,18 @@ class EntityLinker(TrainablePipe): entity_vector_length (int): Size of encoding vectors in the KB. get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that produces a list of candidates, given a certain knowledge base and a textual mention. - get_candidates_batch ( - Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], - Iterable[Candidate]] - ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions. + get_candidates_all ( + Callable[ + [KnowledgeBase, Generator[Iterable[Span], None, None]], + Generator[Iterable[Iterable[Candidate]], None, None] + ]): Function that produces a list of candidates per document, given a certain knowledge base and several textual + documents with textual mentions. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links. use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another component must provide entity annotations. - candidates_batch_size (int): Size of batches for entity candidate generation. + candidates_doc_mode (bool): Whether or not to operate candidate generation in doc mode, i.e. to provide a generator + yielding entities per document (candidate generator callable is called only once in this case). If False, + the candidate generator is called once per entity. threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold, prediction is discarded. If None, predictions are not filtered by any threshold. DOCS: https://spacy.io/api/entitylinker#init @@ -224,7 +235,7 @@ class EntityLinker(TrainablePipe): self.incl_prior = incl_prior self.incl_context = incl_context self.get_candidates = get_candidates - self.get_candidates_batch = get_candidates_batch + self.get_candidates_all = get_candidates_all self.cfg: Dict[str, Any] = {"overwrite": overwrite} self.distance = CosineDistance(normalize=False) # how many neighbour sentences to take into account @@ -232,12 +243,9 @@ class EntityLinker(TrainablePipe): self.kb = empty_kb(entity_vector_length)(self.vocab) self.scorer = scorer self.use_gold_ents = use_gold_ents - self.candidates_batch_size = candidates_batch_size + self.candidates_doc_mode = candidates_doc_mode self.threshold = threshold - if candidates_batch_size < 1: - raise ValueError(Errors.E1044) - def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]): """Define the KB of this pipe by providing a function that will create it using this object's vocab.""" @@ -440,96 +448,98 @@ class EntityLinker(TrainablePipe): return final_kb_ids if isinstance(docs, Doc): docs = [docs] - for i, doc in enumerate(docs): + + # Determine which entities are to be ignored due to labels_discard. + valid_ent_idx_per_doc = ( + [ + idx + for idx in range(len(doc.ents)) + if doc.ents[idx].label_ not in self.labels_discard + ] + for doc in docs if len(doc.ents) + ) + # Call candidate generator. + if self.candidates_doc_mode: + all_ent_cands = self.get_candidates_all( + self.kb, + ([doc.ents[idx] for idx in next(valid_ent_idx_per_doc)] for doc in docs if len(doc.ents)) + ) + else: + # Alternative: collect entities the old-fashioned way - by retrieving entities individually. + all_ent_cands = ( + [self.get_candidates(self.kb, doc.ents[idx]) for idx in next(valid_ent_idx_per_doc)] + for doc in docs if len(doc.ents) + ) + + for doc_idx, doc in enumerate(docs): if len(doc) == 0: continue sentences = [s for s in doc.sents] + doc_ent_cands = list(next(all_ent_cands)) if len(doc.ents) else [] - # Loop over entities in batches. - for ent_idx in range(0, len(doc.ents), self.candidates_batch_size): - ent_batch = doc.ents[ent_idx : ent_idx + self.candidates_batch_size] + # Looping over candidate entities for this doc. (TODO: rewrite) + for ent_cand_idx, ent in enumerate(doc.ents): + sent_index = sentences.index(ent.sent) + assert sent_index >= 0 - # Look up candidate entities. - valid_ent_idx = [ - idx - for idx in range(len(ent_batch)) - if ent_batch[idx].label_ not in self.labels_discard - ] - - batch_candidates = list( - self.get_candidates_batch( - self.kb, [ent_batch[idx] for idx in valid_ent_idx] + if self.incl_context: + # get n_neighbour sentences, clipped to the length of the document + start_sentence = max(0, sent_index - self.n_sents) + end_sentence = min( + len(sentences) - 1, sent_index + self.n_sents ) - if self.candidates_batch_size > 1 - else [ - self.get_candidates(self.kb, ent_batch[idx]) - for idx in valid_ent_idx - ] - ) - - # Looping through each entity in batch (TODO: rewrite) - for j, ent in enumerate(ent_batch): - sent_index = sentences.index(ent.sent) - assert sent_index >= 0 - - if self.incl_context: - # get n_neighbour sentences, clipped to the length of the document - start_sentence = max(0, sent_index - self.n_sents) - end_sentence = min( - len(sentences) - 1, sent_index + self.n_sents - ) - start_token = sentences[start_sentence].start - end_token = sentences[end_sentence].end - sent_doc = doc[start_token:end_token].as_doc() - # currently, the context is the same for each entity in a sentence (should be refined) - sentence_encoding = self.model.predict([sent_doc])[0] - sentence_encoding_t = sentence_encoding.T - sentence_norm = xp.linalg.norm(sentence_encoding_t) - entity_count += 1 - if ent.label_ in self.labels_discard: - # ignoring this entity - setting to NIL + start_token = sentences[start_sentence].start + end_token = sentences[end_sentence].end + sent_doc = doc[start_token:end_token].as_doc() + # currently, the context is the same for each entity in a sentence (should be refined) + sentence_encoding = self.model.predict([sent_doc])[0] + sentence_encoding_t = sentence_encoding.T + sentence_norm = xp.linalg.norm(sentence_encoding_t) + entity_count += 1 + if ent.label_ in self.labels_discard: + # ignoring this entity - setting to NIL + final_kb_ids.append(self.NIL) + else: + candidates = list(doc_ent_cands[ent_cand_idx]) + if not candidates: + # no prediction possible for this entity - setting to NIL final_kb_ids.append(self.NIL) + elif len(candidates) == 1 and self.threshold is None: + # shortcut for efficiency reasons: take the 1 candidate + final_kb_ids.append(candidates[0].entity_) else: - candidates = list(batch_candidates[j]) - if not candidates: - # no prediction possible for this entity - setting to NIL - final_kb_ids.append(self.NIL) - elif len(candidates) == 1 and self.threshold is None: - # shortcut for efficiency reasons: take the 1 candidate - final_kb_ids.append(candidates[0].entity_) - else: - random.shuffle(candidates) - # set all prior probabilities to 0 if incl_prior=False - prior_probs = xp.asarray([c.prior_prob for c in candidates]) - if not self.incl_prior: - prior_probs = xp.asarray([0.0 for _ in candidates]) - scores = prior_probs - # add in similarity from the context - if self.incl_context: - entity_encodings = xp.asarray( - [c.entity_vector for c in candidates] - ) - entity_norm = xp.linalg.norm(entity_encodings, axis=1) - if len(entity_encodings) != len(prior_probs): - raise RuntimeError( - Errors.E147.format( - method="predict", - msg="vectors not of equal length", - ) - ) - # cosine similarity - sims = xp.dot(entity_encodings, sentence_encoding_t) / ( - sentence_norm * entity_norm - ) - if sims.shape != prior_probs.shape: - raise ValueError(Errors.E161) - scores = prior_probs + sims - (prior_probs * sims) - final_kb_ids.append( - candidates[scores.argmax().item()].entity_ - if self.threshold is None - or scores.max() >= self.threshold - else EntityLinker.NIL + random.shuffle(candidates) + # set all prior probabilities to 0 if incl_prior=False + prior_probs = xp.asarray([c.prior_prob for c in candidates]) + if not self.incl_prior: + prior_probs = xp.asarray([0.0 for _ in candidates]) + scores = prior_probs + # add in similarity from the context + if self.incl_context: + entity_encodings = xp.asarray( + [c.entity_vector for c in candidates] ) + entity_norm = xp.linalg.norm(entity_encodings, axis=1) + if len(entity_encodings) != len(prior_probs): + raise RuntimeError( + Errors.E147.format( + method="predict", + msg="vectors not of equal length", + ) + ) + # cosine similarity + sims = xp.dot(entity_encodings, sentence_encoding_t) / ( + sentence_norm * entity_norm + ) + if sims.shape != prior_probs.shape: + raise ValueError(Errors.E161) + scores = prior_probs + sims - (prior_probs * sims) + final_kb_ids.append( + candidates[scores.argmax().item()].entity_ + if self.threshold is None + or scores.max() >= self.threshold + else EntityLinker.NIL + ) if not (len(final_kb_ids) == entity_count): err = Errors.E147.format( diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 4d683acc5..a579b0fac 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -1,4 +1,4 @@ -from typing import Callable, Iterable, Dict, Any +from typing import Callable, Iterable, Dict, Any, Generator import pytest from numpy.testing import assert_equal @@ -497,11 +497,14 @@ def test_el_pipe_configuration(nlp): assert doc[1].ent_kb_id_ == "" assert doc[2].ent_kb_id_ == "Q2" + # Replace the pipe with a new one with with a different candidate generator. + def get_lowercased_candidates(kb, span): return kb.get_alias_candidates(span.text.lower()) - def get_lowercased_candidates_batch(kb, spans): - return [get_lowercased_candidates(kb, span) for span in spans] + def get_lowercased_candidates_all(kb, spans_per_doc): + for doc_spans in spans_per_doc: + yield [get_lowercased_candidates(kb, span) for span in doc_spans] @registry.misc("spacy.LowercaseCandidateGenerator.v1") def create_candidates() -> Callable[ @@ -509,29 +512,39 @@ def test_el_pipe_configuration(nlp): ]: return get_lowercased_candidates - @registry.misc("spacy.LowercaseCandidateBatchGenerator.v1") + @registry.misc("spacy.LowercaseCandidateAllGenerator.v1") def create_candidates_batch() -> Callable[ - [InMemoryLookupKB, Iterable["Span"]], Iterable[Iterable[Candidate]] + [InMemoryLookupKB, Generator[Iterable["Span"], None, None]], + Generator[Iterable[Iterable[Candidate]], None, None] ]: - return get_lowercased_candidates_batch + return get_lowercased_candidates_all - # replace the pipe with a new one with with a different candidate generator - entity_linker = nlp.replace_pipe( - "entity_linker", - "entity_linker", - config={ - "incl_context": False, - "get_candidates": {"@misc": "spacy.LowercaseCandidateGenerator.v1"}, - "get_candidates_batch": { - "@misc": "spacy.LowercaseCandidateBatchGenerator.v1" + def test_reconfigured_el(candidates_doc_mode: bool, doc_text: str) -> None: + """Test reconfigured EL for correct results. + candidates_doc_mode (bool): candidates_doc_mode in pipe config. + doc_text (str): Text to infer. + """ + _entity_linker = nlp.replace_pipe( + "entity_linker", + "entity_linker", + config={ + "incl_context": False, + "candidates_doc_mode": candidates_doc_mode, + "get_candidates": {"@misc": "spacy.LowercaseCandidateGenerator.v1"}, + "get_candidates_all": { + "@misc": "spacy.LowercaseCandidateAllGenerator.v1" + }, }, - }, - ) - entity_linker.set_kb(create_kb) - doc = nlp(text) - assert doc[0].ent_kb_id_ == "Q2" - assert doc[1].ent_kb_id_ == "" - assert doc[2].ent_kb_id_ == "Q2" + ) + _entity_linker.set_kb(create_kb) + _doc = nlp(doc_text) + assert _doc[0].ent_kb_id_ == "Q2" + assert _doc[1].ent_kb_id_ == "" + assert _doc[2].ent_kb_id_ == "Q2" + + # Test individual and doc-wise candidate generation. + test_reconfigured_el(False, text) + test_reconfigured_el(True, text) def test_nel_nsents(nlp): @@ -670,6 +683,7 @@ def test_preserving_links_asdoc(nlp): assert s_ent.kb_id_ == orig_kb_id + def test_preserving_links_ents(nlp): """Test that doc.ents preserves KB annotations""" text = "She lives in Boston. He lives in Denver." From d8183121f657731c486c9fa39c5b84d7f89b5878 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Tue, 18 Oct 2022 17:33:37 +0200 Subject: [PATCH 02/35] Reformat with black. --- spacy/ml/models/entity_linker.py | 3 ++- spacy/pipeline/entity_linker.py | 28 +++++++++++++--------- spacy/tests/pipeline/test_entity_linker.py | 3 +-- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index 9aac71d40..455bcc3b1 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -107,6 +107,7 @@ def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]: @registry.misc("spacy.CandidateAllGenerator.v1") def create_candidates_all() -> Callable[ - [KnowledgeBase, Generator[Iterable[Span], None, None]], Generator[Iterable[Iterable[Candidate]], None, None] + [KnowledgeBase, Generator[Iterable[Span], None, None]], + Generator[Iterable[Iterable[Candidate]], None, None], ]: return get_candidates_all diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 4d3baf2f3..eb546b3a0 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -79,7 +79,7 @@ def make_entity_linker( get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], get_candidates_all: Callable[ [KnowledgeBase, Generator[Iterable[Span], None, None]], - Generator[Iterable[Iterable[Candidate]], None, None] + Generator[Iterable[Iterable[Candidate]], None, None], ], overwrite: bool, scorer: Optional[Callable], @@ -180,7 +180,7 @@ class EntityLinker(TrainablePipe): get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], get_candidates_all: Callable[ [KnowledgeBase, Generator[Iterable[Span], None, None]], - Generator[Iterable[Iterable[Candidate]], None, None] + Generator[Iterable[Iterable[Candidate]], None, None], ], overwrite: bool = BACKWARD_OVERWRITE, scorer: Optional[Callable] = entity_linker_score, @@ -456,19 +456,28 @@ class EntityLinker(TrainablePipe): for idx in range(len(doc.ents)) if doc.ents[idx].label_ not in self.labels_discard ] - for doc in docs if len(doc.ents) + for doc in docs + if len(doc.ents) ) # Call candidate generator. if self.candidates_doc_mode: all_ent_cands = self.get_candidates_all( self.kb, - ([doc.ents[idx] for idx in next(valid_ent_idx_per_doc)] for doc in docs if len(doc.ents)) + ( + [doc.ents[idx] for idx in next(valid_ent_idx_per_doc)] + for doc in docs + if len(doc.ents) + ), ) else: # Alternative: collect entities the old-fashioned way - by retrieving entities individually. all_ent_cands = ( - [self.get_candidates(self.kb, doc.ents[idx]) for idx in next(valid_ent_idx_per_doc)] - for doc in docs if len(doc.ents) + [ + self.get_candidates(self.kb, doc.ents[idx]) + for idx in next(valid_ent_idx_per_doc) + ] + for doc in docs + if len(doc.ents) ) for doc_idx, doc in enumerate(docs): @@ -485,9 +494,7 @@ class EntityLinker(TrainablePipe): if self.incl_context: # get n_neighbour sentences, clipped to the length of the document start_sentence = max(0, sent_index - self.n_sents) - end_sentence = min( - len(sentences) - 1, sent_index + self.n_sents - ) + end_sentence = min(len(sentences) - 1, sent_index + self.n_sents) start_token = sentences[start_sentence].start end_token = sentences[end_sentence].end sent_doc = doc[start_token:end_token].as_doc() @@ -536,8 +543,7 @@ class EntityLinker(TrainablePipe): scores = prior_probs + sims - (prior_probs * sims) final_kb_ids.append( candidates[scores.argmax().item()].entity_ - if self.threshold is None - or scores.max() >= self.threshold + if self.threshold is None or scores.max() >= self.threshold else EntityLinker.NIL ) diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index a579b0fac..877a4c5ce 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -515,7 +515,7 @@ def test_el_pipe_configuration(nlp): @registry.misc("spacy.LowercaseCandidateAllGenerator.v1") def create_candidates_batch() -> Callable[ [InMemoryLookupKB, Generator[Iterable["Span"], None, None]], - Generator[Iterable[Iterable[Candidate]], None, None] + Generator[Iterable[Iterable[Candidate]], None, None], ]: return get_lowercased_candidates_all @@ -683,7 +683,6 @@ def test_preserving_links_asdoc(nlp): assert s_ent.kb_id_ == orig_kb_id - def test_preserving_links_ents(nlp): """Test that doc.ents preserves KB annotations""" text = "She lives in Boston. He lives in Denver." From b32f48c878ffa7e31d03c3463852cf292d60f9aa Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 20 Oct 2022 09:43:47 +0200 Subject: [PATCH 03/35] Change typing from Generator to Iterable. --- spacy/kb/candidate.pyx | 4 ++-- spacy/ml/models/entity_linker.py | 4 ++-- spacy/pipeline/entity_linker.py | 20 +++++++++++++++----- spacy/tests/pipeline/test_entity_linker.py | 4 ++-- 4 files changed, 21 insertions(+), 11 deletions(-) diff --git a/spacy/kb/candidate.pyx b/spacy/kb/candidate.pyx index 5ad52618a..613b70483 100644 --- a/spacy/kb/candidate.pyx +++ b/spacy/kb/candidate.pyx @@ -1,6 +1,6 @@ # cython: infer_types=True, profile=True -from typing import Iterable, Generator +from typing import Iterable, Generator, Iterator from .kb cimport KnowledgeBase from ..tokens import Span @@ -66,7 +66,7 @@ def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]: def get_candidates_all( kb: KnowledgeBase, mentions: Generator[Iterable[Span], None, None] -) -> Generator[Iterable[Iterable[Candidate]], None, None]: +) -> Iterator[Iterable[Iterable[Candidate]]]: """ Return candidate entities for the given mentions and fetching appropriate entries from the index. kb (KnowledgeBase): Knowledge base to query. diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index 455bcc3b1..4045a3206 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import Optional, Callable, Iterable, List, Tuple, Generator +from typing import Optional, Callable, Iterable, List, Tuple, Generator, Iterator from thinc.types import Floats2d from thinc.api import chain, list2ragged, reduce_mean, residual from thinc.api import Model, Maxout, Linear, tuplify, Ragged @@ -108,6 +108,6 @@ def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]: @registry.misc("spacy.CandidateAllGenerator.v1") def create_candidates_all() -> Callable[ [KnowledgeBase, Generator[Iterable[Span], None, None]], - Generator[Iterable[Iterable[Candidate]], None, None], + Iterator[Iterable[Iterable[Candidate]]], ]: return get_candidates_all diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index eb546b3a0..439c3af59 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -1,4 +1,14 @@ -from typing import Optional, Iterable, Callable, Dict, Union, List, Any, Generator +from typing import ( + Optional, + Iterable, + Callable, + Dict, + Union, + List, + Any, + Generator, + Iterator, +) from thinc.types import Floats2d from pathlib import Path from itertools import islice @@ -79,7 +89,7 @@ def make_entity_linker( get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], get_candidates_all: Callable[ [KnowledgeBase, Generator[Iterable[Span], None, None]], - Generator[Iterable[Iterable[Candidate]], None, None], + Iterator[Iterable[Iterable[Candidate]]], ], overwrite: bool, scorer: Optional[Callable], @@ -102,7 +112,7 @@ def make_entity_linker( get_candidates_all ( Callable[ [KnowledgeBase, Generator[Iterable[Span], None, None]], - Generator[Iterable[Iterable[Candidate]], None, None] + Iterator[Iterable[Iterable[Candidate]]] ]): Function that produces a list of candidates per document, given a certain knowledge base and several textual documents with textual mentions. scorer (Optional[Callable]): The scoring method. @@ -180,7 +190,7 @@ class EntityLinker(TrainablePipe): get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], get_candidates_all: Callable[ [KnowledgeBase, Generator[Iterable[Span], None, None]], - Generator[Iterable[Iterable[Candidate]], None, None], + Iterator[Iterable[Iterable[Candidate]]], ], overwrite: bool = BACKWARD_OVERWRITE, scorer: Optional[Callable] = entity_linker_score, @@ -204,7 +214,7 @@ class EntityLinker(TrainablePipe): get_candidates_all ( Callable[ [KnowledgeBase, Generator[Iterable[Span], None, None]], - Generator[Iterable[Iterable[Candidate]], None, None] + Iterator[Iterable[Iterable[Candidate]]] ]): Function that produces a list of candidates per document, given a certain knowledge base and several textual documents with textual mentions. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links. diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 877a4c5ce..86956c2cc 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -1,4 +1,4 @@ -from typing import Callable, Iterable, Dict, Any, Generator +from typing import Callable, Iterable, Dict, Any, Generator, Iterator import pytest from numpy.testing import assert_equal @@ -515,7 +515,7 @@ def test_el_pipe_configuration(nlp): @registry.misc("spacy.LowercaseCandidateAllGenerator.v1") def create_candidates_batch() -> Callable[ [InMemoryLookupKB, Generator[Iterable["Span"], None, None]], - Generator[Iterable[Iterable[Candidate]], None, None], + Iterator[Iterable[Iterable[Candidate]]], ]: return get_lowercased_candidates_all From 9c69c3c3134c6ab03708b7e23db38548f67a5f19 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Fri, 21 Oct 2022 13:14:03 +0200 Subject: [PATCH 04/35] Harmonize checks for entity candidate generation and document skipping. --- spacy/pipeline/entity_linker.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 439c3af59..c3a0a3815 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -476,7 +476,7 @@ class EntityLinker(TrainablePipe): ( [doc.ents[idx] for idx in next(valid_ent_idx_per_doc)] for doc in docs - if len(doc.ents) + if len(docs) and len(doc.ents) ), ) else: @@ -487,11 +487,11 @@ class EntityLinker(TrainablePipe): for idx in next(valid_ent_idx_per_doc) ] for doc in docs - if len(doc.ents) + if len(docs) and len(doc.ents) ) for doc_idx, doc in enumerate(docs): - if len(doc) == 0: + if len(doc) == 0 or len(doc.ents) == 0: continue sentences = [s for s in doc.sents] doc_ent_cands = list(next(all_ent_cands)) if len(doc.ents) else [] From f5f20b0b92a3d26598d8eecfe1780b512853558b Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Sun, 23 Oct 2022 18:32:55 +0200 Subject: [PATCH 05/35] Fix bug in check for doc NEL viability. --- spacy/pipeline/entity_linker.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index c3a0a3815..a8c8d4278 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -467,7 +467,7 @@ class EntityLinker(TrainablePipe): if doc.ents[idx].label_ not in self.labels_discard ] for doc in docs - if len(doc.ents) + if len(doc) and len(doc.ents) ) # Call candidate generator. if self.candidates_doc_mode: @@ -476,7 +476,7 @@ class EntityLinker(TrainablePipe): ( [doc.ents[idx] for idx in next(valid_ent_idx_per_doc)] for doc in docs - if len(docs) and len(doc.ents) + if len(doc) and len(doc.ents) ), ) else: @@ -487,7 +487,7 @@ class EntityLinker(TrainablePipe): for idx in next(valid_ent_idx_per_doc) ] for doc in docs - if len(docs) and len(doc.ents) + if len(doc) and len(doc.ents) ) for doc_idx, doc in enumerate(docs): From ba91d0d1d9a9194033208a16f58e164ccf2e4846 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 27 Oct 2022 13:10:23 +0200 Subject: [PATCH 06/35] Add test for candidate stream processing. Simplify processing of candidate streams. --- spacy/pipeline/entity_linker.py | 2 +- spacy/tests/pipeline/test_entity_linker.py | 64 +++++++++++++++++++++- 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index a8c8d4278..b2d8e7a13 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -494,7 +494,7 @@ class EntityLinker(TrainablePipe): if len(doc) == 0 or len(doc.ents) == 0: continue sentences = [s for s in doc.sents] - doc_ent_cands = list(next(all_ent_cands)) if len(doc.ents) else [] + doc_ent_cands = list(next(all_ent_cands)) # Looping over candidate entities for this doc. (TODO: rewrite) for ent_cand_idx, ent in enumerate(doc.ents): diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 86956c2cc..15c052833 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -179,7 +179,7 @@ def test_no_entities(): { "sent_starts": [1, 0, 0, 0, 0], }, - ) + ), ] nlp = English() vector_length = 3 @@ -1209,3 +1209,65 @@ def test_threshold(meet_threshold: bool, config: Dict[str, Any]): assert len(doc.ents) == 1 assert doc.ents[0].kb_id_ == entity_id if meet_threshold else EntityLinker.NIL + + +def test_nel_candidate_processing(): + """Test that NEL handles candidate streams correctly in a set of documents with & without entities as well as empty + documents. + """ + train_data = [ + ( + "The sky over New York is blue.", + { + "sent_starts": [1, 0, 0, 0, 0, 0, 0, 0], + }, + ), + ( + "They visited New York.", + { + "sent_starts": [1, 0, 0, 0, 0], + }, + ), + # ( + # "", + # {} + # ), + # ( + # "New York is a city.", + # { + # "sent_starts": [1, 0, 0, 0, 0, 0], + # } + # ), + ] + + nlp = English() + # Add a custom rule-based component to mimick NER + ruler = nlp.add_pipe("entity_ruler", last=True) + ruler.add_patterns([{"label": "GPE", "pattern": [{"LOWER": "new york"}]}]) # type: ignore + + vector_length = 3 + train_examples = [] + for text, annotation in train_data: + doc = nlp(text) + train_examples.append(Example.from_dict(doc, annotation)) + + def create_kb(vocab): + # create artificial KB + mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length) + mykb.add_entity(entity="Q60", freq=12, entity_vector=[1, 2, 3]) + mykb.add_alias("New York", ["Q60"], [0.9]) + return mykb + + # Create and train the Entity Linker + entity_linker = nlp.add_pipe("entity_linker", last=True) + entity_linker.set_kb(create_kb) + optimizer = nlp.initialize(get_examples=lambda: train_examples) + for i in range(2): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + + # adding additional components that are required for the entity_linker + nlp.add_pipe("sentencizer", first=True) + + # this will run the pipeline on the examples and shouldn't crash + nlp.evaluate(train_examples) From ace5655fe12377b2849141f813b3f7366333893d Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 27 Oct 2022 13:28:17 +0200 Subject: [PATCH 07/35] Fix test. --- spacy/tests/pipeline/test_entity_linker.py | 35 ++++++++++------------ 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 15c052833..5ad092fbf 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -1217,39 +1217,35 @@ def test_nel_candidate_processing(): """ train_data = [ ( - "The sky over New York is blue.", + "The sky is blue.", { - "sent_starts": [1, 0, 0, 0, 0, 0, 0, 0], + "sent_starts": [1, 0, 0, 0, 0], }, ), ( "They visited New York.", { "sent_starts": [1, 0, 0, 0, 0], + "entities": [(13, 21, "GPE")], + }, + ), + ("", {}), + ( + "New York is a city.", + { + "sent_starts": [1, 0, 0, 0, 0, 0], + "entities": [(0, 8, "GPE")], }, ), - # ( - # "", - # {} - # ), - # ( - # "New York is a city.", - # { - # "sent_starts": [1, 0, 0, 0, 0, 0], - # } - # ), ] nlp = English() - # Add a custom rule-based component to mimick NER - ruler = nlp.add_pipe("entity_ruler", last=True) - ruler.add_patterns([{"label": "GPE", "pattern": [{"LOWER": "new york"}]}]) # type: ignore + nlp.add_pipe("sentencizer") vector_length = 3 train_examples = [] for text, annotation in train_data: - doc = nlp(text) - train_examples.append(Example.from_dict(doc, annotation)) + train_examples.append(Example.from_dict(nlp(text), annotation)) def create_kb(vocab): # create artificial KB @@ -1266,8 +1262,9 @@ def test_nel_candidate_processing(): losses = {} nlp.update(train_examples, sgd=optimizer, losses=losses) - # adding additional components that are required for the entity_linker - nlp.add_pipe("sentencizer", first=True) + # Add a custom rule-based component to mimick NER + ruler = nlp.add_pipe("entity_ruler", before="entity_linker") + ruler.add_patterns([{"label": "GPE", "pattern": [{"LOWER": "new york"}]}]) # type: ignore # this will run the pipeline on the examples and shouldn't crash nlp.evaluate(train_examples) From b398cca5cc50ef1f51b79505990b53c83287cf63 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Fri, 4 Nov 2022 12:46:03 +0100 Subject: [PATCH 08/35] Replace leftover Generator typing with Iterator. --- spacy/kb/kb.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx index 2e99ea493..b2ff80834 100644 --- a/spacy/kb/kb.pyx +++ b/spacy/kb/kb.pyx @@ -1,7 +1,7 @@ # cython: infer_types=True, profile=True from pathlib import Path -from typing import Iterable, Tuple, Union, Generator +from typing import Iterable, Tuple, Union, Iterator from cymem.cymem cimport Pool from .candidate import Candidate @@ -30,7 +30,7 @@ cdef class KnowledgeBase: self.entity_vector_length = entity_vector_length self.mem = Pool() - def get_candidates_all(self, mentions: Generator[Iterable[Span]]) -> Generator[Iterable[Iterable[Candidate]]]: + def get_candidates_all(self, mentions: Iterator[Iterable[Span]]) -> Iterator[Iterable[Iterable[Candidate]]]: """ Return candidate entities for specified texts. Each candidate defines the entity, the original alias, and the prior probability of that alias resolving to that entity. From c5b15e0e045411ef0901cbb59788805bbabb173f Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 9 Nov 2022 14:31:08 +0100 Subject: [PATCH 09/35] Update docstring. --- spacy/kb/kb.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx index b2ff80834..8e5b52d26 100644 --- a/spacy/kb/kb.pyx +++ b/spacy/kb/kb.pyx @@ -32,9 +32,9 @@ cdef class KnowledgeBase: def get_candidates_all(self, mentions: Iterator[Iterable[Span]]) -> Iterator[Iterable[Iterable[Candidate]]]: """ - Return candidate entities for specified texts. Each candidate defines the entity, the original alias, + Return candidate entities for specified mentions. Each candidate defines the entity, the original alias, and the prior probability of that alias resolving to that entity. - If no candidate is found for a given text, an empty list is returned. + If no candidate is found for a given mentions, an empty list is returned. mentions (Generator[Iterable[Span]]): Mentions per documents for which to get candidates. RETURNS (Generator[Iterable[Iterable[Candidate]]]): Identified candidates per document. """ From b572e2473a83fca2a00717dcf6b72483361f8a4b Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 9 Nov 2022 14:31:22 +0100 Subject: [PATCH 10/35] Update docstring. --- spacy/kb/kb.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx index 8e5b52d26..81273a371 100644 --- a/spacy/kb/kb.pyx +++ b/spacy/kb/kb.pyx @@ -34,7 +34,7 @@ cdef class KnowledgeBase: """ Return candidate entities for specified mentions. Each candidate defines the entity, the original alias, and the prior probability of that alias resolving to that entity. - If no candidate is found for a given mentions, an empty list is returned. + If no candidate is found for a given mention, an empty list is returned. mentions (Generator[Iterable[Span]]): Mentions per documents for which to get candidates. RETURNS (Generator[Iterable[Iterable[Candidate]]]): Identified candidates per document. """ From d6d4c45eef8c11a8f95e019d9150bc9bb7a7780f Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 16 Nov 2022 15:52:34 +0100 Subject: [PATCH 11/35] Make entity_vector_length writable. --- spacy/kb/kb.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/kb/kb.pxd b/spacy/kb/kb.pxd index 1adeef8ae..ff2fc15ce 100644 --- a/spacy/kb/kb.pxd +++ b/spacy/kb/kb.pxd @@ -7,4 +7,4 @@ from ..vocab cimport Vocab cdef class KnowledgeBase: cdef Pool mem cdef readonly Vocab vocab - cdef readonly int64_t entity_vector_length + cdef int64_t entity_vector_length From aa2b5122b6d90d67ad9b8d8ccaa4dc831cfe62db Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 16 Nov 2022 16:07:39 +0100 Subject: [PATCH 12/35] Make entity_vector_length available in Python. --- spacy/kb/kb.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/kb/kb.pxd b/spacy/kb/kb.pxd index ff2fc15ce..9330dd5df 100644 --- a/spacy/kb/kb.pxd +++ b/spacy/kb/kb.pxd @@ -7,4 +7,4 @@ from ..vocab cimport Vocab cdef class KnowledgeBase: cdef Pool mem cdef readonly Vocab vocab - cdef int64_t entity_vector_length + cpdef int64_t entity_vector_length From 14800097150d321333dff9a0822b5c6e46b60e0c Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 16 Nov 2022 16:16:20 +0100 Subject: [PATCH 13/35] Make entity_vector_length available in Python. --- spacy/kb/kb.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/kb/kb.pxd b/spacy/kb/kb.pxd index 9330dd5df..7261287eb 100644 --- a/spacy/kb/kb.pxd +++ b/spacy/kb/kb.pxd @@ -7,4 +7,4 @@ from ..vocab cimport Vocab cdef class KnowledgeBase: cdef Pool mem cdef readonly Vocab vocab - cpdef int64_t entity_vector_length + cdef public int64_t entity_vector_length From 4eb072fa91678cbeaf8db62ede5950b73b658b47 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 23 Nov 2022 21:24:17 +0100 Subject: [PATCH 14/35] Add abstract method KnowledgeBase.__len__(). --- spacy/kb/kb.pyx | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx index 81273a371..fee074f0c 100644 --- a/spacy/kb/kb.pyx +++ b/spacy/kb/kb.pyx @@ -90,8 +90,7 @@ cdef class KnowledgeBase: ) def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None: - """ - Write KnowledgeBase content to disk. + """Write KnowledgeBase content to disk. path (Union[str, Path]): Target file path. exclude (Iterable[str]): List of components to exclude. """ @@ -100,11 +99,18 @@ cdef class KnowledgeBase: ) def from_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None: - """ - Load KnowledgeBase content from disk. + """Load KnowledgeBase content from disk. path (Union[str, Path]): Target file path. exclude (Iterable[str]): List of components to exclude. """ raise NotImplementedError( Errors.E1044.format(parent="KnowledgeBase", method="from_disk", name=self.__name__) ) + + def __len__(self) -> int: + """Returns number of entities in the KnowledgeBase. + RETURNS (int): Number of entities in the KnowledgeBase. + """ + raise NotImplementedError( + Errors.E1044.format(parent="KnowledgeBase", method="__len__", name=self.__name__) + ) From b1d458eca743cb0ce5218deeec451a39e45cff73 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Fri, 25 Nov 2022 12:02:37 +0100 Subject: [PATCH 15/35] Add generate_from_disk() factory method. --- spacy/kb/kb.pyx | 17 ++++++++++++++++- spacy/kb/kb_in_memory.pyx | 8 ++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx index fee074f0c..3ee434ab5 100644 --- a/spacy/kb/kb.pyx +++ b/spacy/kb/kb.pyx @@ -1,7 +1,7 @@ # cython: infer_types=True, profile=True from pathlib import Path -from typing import Iterable, Tuple, Union, Iterator +from typing import Iterable, Tuple, Union, Iterator, TypeVar, Type from cymem.cymem cimport Pool from .candidate import Candidate @@ -107,6 +107,21 @@ cdef class KnowledgeBase: Errors.E1044.format(parent="KnowledgeBase", method="from_disk", name=self.__name__) ) + KBType = TypeVar("KBType", bound=KnowledgeBase) + @classmethod + def generate_from_disk( + cls: Type[KBType], path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList() + ) -> KBType: + """ + Factory method for generating KnowledgeBase instance from file. + path (Union[str, Path]): Target file path. + exclude (Iterable[str]): List of components to exclude. + return (KBType): Instance of KnowledgeBase generated from file. + """ + raise NotImplementedError( + Errors.E1044.format(parent="KnowledgeBase", method="generate_from_disk", name=cls.__name__) + ) + def __len__(self) -> int: """Returns number of entities in the KnowledgeBase. RETURNS (int): Number of entities in the KnowledgeBase. diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx index 485e52c2f..c030b5f8e 100644 --- a/spacy/kb/kb_in_memory.pyx +++ b/spacy/kb/kb_in_memory.pyx @@ -46,6 +46,14 @@ cdef class InMemoryLookupKB(KnowledgeBase): self._alias_index = PreshMap(nr_aliases + 1) self._aliases_table = alias_vec(nr_aliases + 1) + @classmethod + def generate_from_disk( + cls, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList() + ) -> "InMemoryLookupKB": + kb = InMemoryLookupKB(vocab=Vocab(strings=["."]), entity_vector_length=1) + kb.from_disk(path) + return kb + def __len__(self): return self.get_size_entities() From 7e6888dcd48a3927ac967bb6a7e7af930dd7ff53 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Mon, 28 Nov 2022 10:46:02 +0100 Subject: [PATCH 16/35] Add empty_kb() as config argument. --- spacy/kb/kb.pyx | 11 ++++++----- spacy/ml/models/entity_linker.py | 8 ++++++++ spacy/pipeline/entity_linker.py | 23 ++++++++++++++--------- 3 files changed, 28 insertions(+), 14 deletions(-) diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx index 3ee434ab5..fa537edc9 100644 --- a/spacy/kb/kb.pyx +++ b/spacy/kb/kb.pyx @@ -18,6 +18,8 @@ cdef class KnowledgeBase: DOCS: https://spacy.io/api/kb """ + _KBType = TypeVar("_KBType", bound=KnowledgeBase) + def __init__(self, vocab: Vocab, entity_vector_length: int): """Create a KnowledgeBase.""" # Make sure abstract KB is not instantiated. @@ -107,16 +109,15 @@ cdef class KnowledgeBase: Errors.E1044.format(parent="KnowledgeBase", method="from_disk", name=self.__name__) ) - KBType = TypeVar("KBType", bound=KnowledgeBase) @classmethod def generate_from_disk( - cls: Type[KBType], path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList() - ) -> KBType: + cls: Type[_KBType], path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList() + ) -> _KBType: """ - Factory method for generating KnowledgeBase instance from file. + Factory method for generating KnowledgeBase subclass instance from file. path (Union[str, Path]): Target file path. exclude (Iterable[str]): List of components to exclude. - return (KBType): Instance of KnowledgeBase generated from file. + return (_KBType): Instance of KnowledgeBase subclass generated from file. """ raise NotImplementedError( Errors.E1044.format(parent="KnowledgeBase", method="generate_from_disk", name=cls.__name__) diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index 2f8ab20a5..fd84981b6 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -99,6 +99,14 @@ def empty_kb( return empty_kb_factory +@registry.misc("spacy.EmptyKB.v2") +def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]: + def empty_kb_factory(vocab: Vocab, entity_vector_length: int): + return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length) + + return empty_kb_factory + + @registry.misc("spacy.CandidateGenerator.v1") def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]: return get_candidates diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index b2d8e7a13..55a04e7ca 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -18,7 +18,6 @@ from thinc.api import CosineDistance, Model, Optimizer, Config from thinc.api import set_dropout_rate from ..kb import KnowledgeBase, Candidate -from ..ml import empty_kb from ..tokens import Doc, Span from .pipe import deserialize_config from .legacy.entity_linker import EntityLinker_v1 @@ -64,6 +63,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"] "entity_vector_length": 64, "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, "get_candidates_all": {"@misc": "spacy.CandidateAllGenerator.v1"}, + "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"}, "overwrite": True, "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"}, "use_gold_ents": True, @@ -91,6 +91,7 @@ def make_entity_linker( [KnowledgeBase, Generator[Iterable[Span], None, None]], Iterator[Iterable[Iterable[Candidate]]], ], + generate_empty_kb: Callable[[Vocab, int], KnowledgeBase], overwrite: bool, scorer: Optional[Callable], use_gold_ents: bool, @@ -115,6 +116,7 @@ def make_entity_linker( Iterator[Iterable[Iterable[Candidate]]] ]): Function that produces a list of candidates per document, given a certain knowledge base and several textual documents with textual mentions. + generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. scorer (Optional[Callable]): The scoring method. use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another component must provide entity annotations. @@ -151,6 +153,7 @@ def make_entity_linker( entity_vector_length=entity_vector_length, get_candidates=get_candidates, get_candidates_all=get_candidates_all, + generate_empty_kb=generate_empty_kb, overwrite=overwrite, scorer=scorer, use_gold_ents=use_gold_ents, @@ -192,6 +195,7 @@ class EntityLinker(TrainablePipe): [KnowledgeBase, Generator[Iterable[Span], None, None]], Iterator[Iterable[Iterable[Candidate]]], ], + generate_empty_kb: Callable[[Vocab, int], KnowledgeBase], overwrite: bool = BACKWARD_OVERWRITE, scorer: Optional[Callable] = entity_linker_score, use_gold_ents: bool, @@ -215,14 +219,15 @@ class EntityLinker(TrainablePipe): Callable[ [KnowledgeBase, Generator[Iterable[Span], None, None]], Iterator[Iterable[Iterable[Candidate]]] - ]): Function that produces a list of candidates per document, given a certain knowledge base and several textual - documents with textual mentions. + ]): Function that produces a list of candidates per document, given a certain knowledge base and several + textual documents with textual mentions. + generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links. use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another component must provide entity annotations. - candidates_doc_mode (bool): Whether or not to operate candidate generation in doc mode, i.e. to provide a generator - yielding entities per document (candidate generator callable is called only once in this case). If False, - the candidate generator is called once per entity. + candidates_doc_mode (bool): Whether or not to operate candidate generation in doc mode, i.e. to provide a + generator yielding entities per document (candidate generator callable is called only once in this case). If + False, the candidate generator is called once per entity. threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold, prediction is discarded. If None, predictions are not filtered by any threshold. DOCS: https://spacy.io/api/entitylinker#init @@ -241,16 +246,16 @@ class EntityLinker(TrainablePipe): self.model = model self.name = name self.labels_discard = list(labels_discard) + # how many neighbour sentences to take into account self.n_sents = n_sents self.incl_prior = incl_prior self.incl_context = incl_context self.get_candidates = get_candidates self.get_candidates_all = get_candidates_all + self.generate_empty_kb = generate_empty_kb self.cfg: Dict[str, Any] = {"overwrite": overwrite} self.distance = CosineDistance(normalize=False) - # how many neighbour sentences to take into account - # create an empty KB by default - self.kb = empty_kb(entity_vector_length)(self.vocab) + self.kb = generate_empty_kb(self.vocab, entity_vector_length) self.scorer = scorer self.use_gold_ents = use_gold_ents self.candidates_doc_mode = candidates_doc_mode From 75aee55bc3ad81ea3fde052ef7706a3f25f8e8a5 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Mon, 28 Nov 2022 17:29:35 +0100 Subject: [PATCH 17/35] Start refactoring of Candidate classes. --- spacy/kb/candidate.pxd | 12 --- spacy/kb/candidate.py | 127 ++++++++++++++++++++++++++++++++ spacy/kb/candidate.pyx | 76 ------------------- spacy/pipeline/entity_linker.py | 9 ++- 4 files changed, 133 insertions(+), 91 deletions(-) delete mode 100644 spacy/kb/candidate.pxd create mode 100644 spacy/kb/candidate.py delete mode 100644 spacy/kb/candidate.pyx diff --git a/spacy/kb/candidate.pxd b/spacy/kb/candidate.pxd deleted file mode 100644 index 942ce9dd0..000000000 --- a/spacy/kb/candidate.pxd +++ /dev/null @@ -1,12 +0,0 @@ -from .kb cimport KnowledgeBase -from libcpp.vector cimport vector -from ..typedefs cimport hash_t - -# Object used by the Entity Linker that summarizes one entity-alias candidate combination. -cdef class Candidate: - cdef readonly KnowledgeBase kb - cdef hash_t entity_hash - cdef float entity_freq - cdef vector[float] entity_vector - cdef hash_t alias_hash - cdef float prior_prob diff --git a/spacy/kb/candidate.py b/spacy/kb/candidate.py new file mode 100644 index 000000000..420ee9c26 --- /dev/null +++ b/spacy/kb/candidate.py @@ -0,0 +1,127 @@ +import abc +from typing import List, Union, Optional + +from spacy import Errors +from ..tokens import Span + + +class Candidate(abc.ABC): + """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved + to a specific `entity_id` from a Knowledge Base. This will be used as input for the entity_id linking + algorithm which will disambiguate the various candidates to the correct one. + Each candidate (alias, entity_id) pair is assigned a certain prior probability. + + DOCS: https://spacy.io/api/kb/#candidate-init + """ + + def __init__( + self, mention: str, entity_id: Union[int, str], entity_vector: List[float] + ): + """Create new instance of `Candidate`. Note: has to be a sub-class, otherwise error will be raised. + mention (str): Mention text for this candidate. + entity_id (Union[int, str]): Unique ID of entity_id. + """ + self.mention = mention + self.entity = entity_id + self.entity_vector = entity_vector + + @property + def entity_id(self) -> Union[int, str]: + """RETURNS (Union[int, str]): Entity ID.""" + return self.entity + + def entity_(self) -> Union[int, str]: + """RETURNS (Union[int, str]): Entity ID (for backwards compatibility).""" + return self.entity + + @property + def mention(self) -> str: + """RETURNS (str): Mention.""" + return self.mention + + @property + def entity_vector(self) -> List[float]: + """RETURNS (List[float]): Entity vector.""" + return self.entity_vector + + +class InMemoryLookupKBCandidate(Candidate): + """`Candidate` for InMemoryLookupKBCandidate.""" + + # todo how to resolve circular import issue? -> replace with callable for hash? + def __init__( + self, + kb: KnowledgeBase, + entity_hash, + entity_freq, + entity_vector, + alias_hash, + prior_prob, + ): + """ + prior_prob (float): Prior probability of entity_id for this mention - i.e. the probability that, independent of the + context, this mention resolves to this entity_id in the corpus used to build the knowledge base. In cases in + which this isn't always possible (e.g.: the corpus to analyse contains mentions that the KB corpus doesn't) + it might be better to eschew this information and always supply the same value. + """ + self.kb = kb + self.entity_hash = entity_hash + self.entity_freq = entity_freq + self.entity_vector = entity_vector + self.alias_hash = alias_hash + self.prior_prob = prior_prob + + @property + def entity(self) -> int: + """RETURNS (uint64): hash of the entity_id's KB ID/name""" + return self.entity_hash + + @property + def entity_(self) -> str: + """RETURNS (str): ID/name of this entity_id in the KB""" + return self.kb.vocab.strings[self.entity_hash] + + @property + def alias(self) -> int: + """RETURNS (uint64): hash of the alias""" + return self.alias_hash + + @property + def alias_(self) -> str: + """RETURNS (str): ID of the original alias""" + return self.kb.vocab.strings[self.alias_hash] + + @property + def entity_freq(self) -> float: + return self.entity_freq + + @property + def entity_vector(self) -> Iterable[float]: + return self.entity_vector + + @property + def prior_prob(self) -> float: + """RETURNS (List[float]): Entity vector.""" + return self.prior_prob + + +def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]: + """ + Return candidate entities for a given mention and fetching appropriate entries from the index. + kb (KnowledgeBase): Knowledge base to query. + mention (Span): Entity mention for which to identify candidates. + RETURNS (Iterable[Candidate]): Identified candidates. + """ + return kb.get_candidates(mention) + + +def get_candidates_all( + kb: KnowledgeBase, mentions: Generator[Iterable[Span], None, None] +) -> Iterator[Iterable[Iterable[Candidate]]]: + """ + Return candidate entities for the given mentions and fetching appropriate entries from the index. + kb (KnowledgeBase): Knowledge base to query. + mention (Generator[Iterable[Span]]): Entity mentions per document for which to identify candidates. + RETURNS (Generator[Iterable[Iterable[Candidate]]]): Identified candidates per document. + """ + return kb.get_candidates_all(mentions) diff --git a/spacy/kb/candidate.pyx b/spacy/kb/candidate.pyx deleted file mode 100644 index 613b70483..000000000 --- a/spacy/kb/candidate.pyx +++ /dev/null @@ -1,76 +0,0 @@ -# cython: infer_types=True, profile=True - -from typing import Iterable, Generator, Iterator -from .kb cimport KnowledgeBase -from ..tokens import Span - -cdef class Candidate: - """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved - to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking - algorithm which will disambiguate the various candidates to the correct one. - Each candidate (alias, entity) pair is assigned a certain prior probability. - - DOCS: https://spacy.io/api/kb/#candidate-init - """ - - def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob): - self.kb = kb - self.entity_hash = entity_hash - self.entity_freq = entity_freq - self.entity_vector = entity_vector - self.alias_hash = alias_hash - self.prior_prob = prior_prob - - @property - def entity(self) -> int: - """RETURNS (uint64): hash of the entity's KB ID/name""" - return self.entity_hash - - @property - def entity_(self) -> str: - """RETURNS (str): ID/name of this entity in the KB""" - return self.kb.vocab.strings[self.entity_hash] - - @property - def alias(self) -> int: - """RETURNS (uint64): hash of the alias""" - return self.alias_hash - - @property - def alias_(self) -> str: - """RETURNS (str): ID of the original alias""" - return self.kb.vocab.strings[self.alias_hash] - - @property - def entity_freq(self) -> float: - return self.entity_freq - - @property - def entity_vector(self) -> Iterable[float]: - return self.entity_vector - - @property - def prior_prob(self) -> float: - return self.prior_prob - - -def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]: - """ - Return candidate entities for a given mention and fetching appropriate entries from the index. - kb (KnowledgeBase): Knowledge base to query. - mention (Span): Entity mention for which to identify candidates. - RETURNS (Iterable[Candidate]): Identified candidates. - """ - return kb.get_candidates(mention) - - -def get_candidates_all( - kb: KnowledgeBase, mentions: Generator[Iterable[Span], None, None] -) -> Iterator[Iterable[Iterable[Candidate]]]: - """ - Return candidate entities for the given mentions and fetching appropriate entries from the index. - kb (KnowledgeBase): Knowledge base to query. - mention (Generator[Iterable[Span]]): Entity mentions per document for which to identify candidates. - RETURNS (Generator[Iterable[Iterable[Candidate]]]): Identified candidates per document. - """ - return kb.get_candidates_all(mentions) diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 55a04e7ca..ef42aac9e 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -532,9 +532,12 @@ class EntityLinker(TrainablePipe): else: random.shuffle(candidates) # set all prior probabilities to 0 if incl_prior=False - prior_probs = xp.asarray([c.prior_prob for c in candidates]) - if not self.incl_prior: - prior_probs = xp.asarray([0.0 for _ in candidates]) + prior_probs = xp.asarray( + [ + 0.0 if self.incl_prior else c.prior_prob + for c in candidates + ] + ) scores = prior_probs # add in similarity from the context if self.incl_context: From 3e668503ded1d334241ceffff47b170ce16ec3c6 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Tue, 29 Nov 2022 15:03:54 +0100 Subject: [PATCH 18/35] Finish Candidate refactoring. --- spacy/kb/__init__.py | 4 +- spacy/kb/candidate.py | 125 +++++++++------------ spacy/kb/kb_in_memory.pyx | 2 +- spacy/ml/models/entity_linker.py | 24 +++- spacy/pipeline/entity_linker.py | 5 +- spacy/tests/pipeline/test_entity_linker.py | 23 ++-- 6 files changed, 97 insertions(+), 86 deletions(-) diff --git a/spacy/kb/__init__.py b/spacy/kb/__init__.py index b61cb5447..6dd4a3222 100644 --- a/spacy/kb/__init__.py +++ b/spacy/kb/__init__.py @@ -1,3 +1,5 @@ from .kb import KnowledgeBase from .kb_in_memory import InMemoryLookupKB -from .candidate import Candidate, get_candidates, get_candidates_all +from .candidate import Candidate + +__all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate"] diff --git a/spacy/kb/candidate.py b/spacy/kb/candidate.py index 420ee9c26..b121974f8 100644 --- a/spacy/kb/candidate.py +++ b/spacy/kb/candidate.py @@ -1,12 +1,9 @@ import abc -from typing import List, Union, Optional - -from spacy import Errors -from ..tokens import Span +from typing import List, Union, Callable -class Candidate(abc.ABC): - """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved +class BaseCandidate(abc.ABC): + """A `BaseCandidate` object refers to a textual mention (`alias`) that may or may not be resolved to a specific `entity_id` from a Knowledge Base. This will be used as input for the entity_id linking algorithm which will disambiguate the various candidates to the correct one. Each candidate (alias, entity_id) pair is assigned a certain prior probability. @@ -19,109 +16,99 @@ class Candidate(abc.ABC): ): """Create new instance of `Candidate`. Note: has to be a sub-class, otherwise error will be raised. mention (str): Mention text for this candidate. - entity_id (Union[int, str]): Unique ID of entity_id. + entity_id (Union[int, str]): Unique entity ID. + entity_vector (List[float]): Entity embedding. """ - self.mention = mention - self.entity = entity_id - self.entity_vector = entity_vector + self._mention = mention + self._entity_id = entity_id + self._entity_vector = entity_vector @property - def entity_id(self) -> Union[int, str]: + def entity(self) -> Union[int, str]: """RETURNS (Union[int, str]): Entity ID.""" - return self.entity + return self._entity_id - def entity_(self) -> Union[int, str]: - """RETURNS (Union[int, str]): Entity ID (for backwards compatibility).""" - return self.entity + @property + @abc.abstractmethod + def entity_(self) -> str: + """RETURNS (str): Entity name.""" @property def mention(self) -> str: """RETURNS (str): Mention.""" - return self.mention + return self._mention @property def entity_vector(self) -> List[float]: """RETURNS (List[float]): Entity vector.""" - return self.entity_vector + return self._entity_vector -class InMemoryLookupKBCandidate(Candidate): +class Candidate(BaseCandidate): """`Candidate` for InMemoryLookupKBCandidate.""" - # todo how to resolve circular import issue? -> replace with callable for hash? + # todo + # - glue together + # - is candidate definition necessary for EL? as long as interface fulfills requirements, this shouldn't matter. + # otherwise incorporate new argument. + # - fix test failures (100% backwards-compatible should be possible after changing EntityLinker) def __init__( self, - kb: KnowledgeBase, - entity_hash, - entity_freq, - entity_vector, - alias_hash, - prior_prob, + retrieve_string_from_hash: Callable[[int], str], + entity_hash: int, + entity_freq: int, + entity_vector: List[float], + alias_hash: int, + prior_prob: float, ): """ - prior_prob (float): Prior probability of entity_id for this mention - i.e. the probability that, independent of the - context, this mention resolves to this entity_id in the corpus used to build the knowledge base. In cases in - which this isn't always possible (e.g.: the corpus to analyse contains mentions that the KB corpus doesn't) - it might be better to eschew this information and always supply the same value. + retrieve_string_from_hash (Callable[[int], str]): Callable retrieveing entity name from provided entity/vocab + hash. + entity_hash (str): Hashed entity name /ID. + entity_freq (int): Entity frequency in KB corpus. + entity_vector (List[float]): Entity embedding. + alias_hash (int): Hashed alias. + prior_prob (float): Prior probability of entity for this mention - i.e. the probability that, independent of + the context, this mention resolves to this entity_id in the corpus used to build the knowledge base. In + cases in which this isn't always possible (e.g.: the corpus to analyse contains mentions that the KB corpus + doesn't) it might be better to eschew this information and always supply the same value. """ - self.kb = kb - self.entity_hash = entity_hash - self.entity_freq = entity_freq - self.entity_vector = entity_vector - self.alias_hash = alias_hash - self.prior_prob = prior_prob + super().__init__( + mention=retrieve_string_from_hash(alias_hash), + entity_id=entity_hash, + entity_vector=entity_vector, + ) + self._retrieve_string_from_hash = retrieve_string_from_hash + self._entity_hash = entity_hash + self._entity_freq = entity_freq + self._alias_hash = alias_hash + self._prior_prob = prior_prob @property def entity(self) -> int: - """RETURNS (uint64): hash of the entity_id's KB ID/name""" - return self.entity_hash + """RETURNS (int): hash of the entity_id's KB ID/name""" + return self._entity_hash @property def entity_(self) -> str: """RETURNS (str): ID/name of this entity_id in the KB""" - return self.kb.vocab.strings[self.entity_hash] + return self._retrieve_string_from_hash(self._entity_hash) @property def alias(self) -> int: - """RETURNS (uint64): hash of the alias""" - return self.alias_hash + """RETURNS (int): hash of the alias""" + return self._alias_hash @property def alias_(self) -> str: """RETURNS (str): ID of the original alias""" - return self.kb.vocab.strings[self.alias_hash] + return self._retrieve_string_from_hash(self._alias_hash) @property def entity_freq(self) -> float: - return self.entity_freq - - @property - def entity_vector(self) -> Iterable[float]: - return self.entity_vector + return self._entity_freq @property def prior_prob(self) -> float: """RETURNS (List[float]): Entity vector.""" - return self.prior_prob - - -def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]: - """ - Return candidate entities for a given mention and fetching appropriate entries from the index. - kb (KnowledgeBase): Knowledge base to query. - mention (Span): Entity mention for which to identify candidates. - RETURNS (Iterable[Candidate]): Identified candidates. - """ - return kb.get_candidates(mention) - - -def get_candidates_all( - kb: KnowledgeBase, mentions: Generator[Iterable[Span], None, None] -) -> Iterator[Iterable[Iterable[Candidate]]]: - """ - Return candidate entities for the given mentions and fetching appropriate entries from the index. - kb (KnowledgeBase): Knowledge base to query. - mention (Generator[Iterable[Span]]): Entity mentions per document for which to identify candidates. - RETURNS (Generator[Iterable[Iterable[Candidate]]]): Identified candidates per document. - """ - return kb.get_candidates_all(mentions) + return self._prior_prob diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx index c030b5f8e..2b245d76f 100644 --- a/spacy/kb/kb_in_memory.pyx +++ b/spacy/kb/kb_in_memory.pyx @@ -246,7 +246,7 @@ cdef class InMemoryLookupKB(KnowledgeBase): alias_index = self._alias_index.get(alias_hash) alias_entry = self._aliases_table[alias_index] - return [Candidate(kb=self, + return [Candidate(retrieve_string_from_hash=self.vocab.strings.__getitem__, entity_hash=self._entries[entry_index].entity_hash, entity_freq=self._entries[entry_index].freq, entity_vector=self._vectors_table[self._entries[entry_index].vector_index], diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index fd84981b6..46b4cd83f 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -6,7 +6,7 @@ from thinc.api import Model, Maxout, Linear, tuplify, Ragged from ...util import registry from ...kb import KnowledgeBase, InMemoryLookupKB -from ...kb import Candidate, get_candidates, get_candidates_all +from ...kb import Candidate from ...vocab import Vocab from ...tokens import Span, Doc from ..extract_spans import extract_spans @@ -107,6 +107,28 @@ def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]: return empty_kb_factory +def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]: + """ + Return candidate entities for a given mention and fetching appropriate entries from the index. + kb (KnowledgeBase): Knowledge base to query. + mention (Span): Entity mention for which to identify candidates. + RETURNS (Iterable[Candidate]): Identified candidates. + """ + return kb.get_candidates(mention) + + +def get_candidates_all( + kb: KnowledgeBase, mentions: Generator[Iterable[Span], None, None] +) -> Iterator[Iterable[Iterable[Candidate]]]: + """ + Return candidate entities for the given mentions and fetching appropriate entries from the index. + kb (KnowledgeBase): Knowledge base to query. + mention (Generator[Iterable[Span]]): Entity mentions per document for which to identify candidates. + RETURNS (Generator[Iterable[Iterable[Candidate]]]): Identified candidates per document. + """ + return kb.get_candidates_all(mentions) + + @registry.misc("spacy.CandidateGenerator.v1") def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]: return get_candidates diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index ef42aac9e..5fa826383 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -464,6 +464,7 @@ class EntityLinker(TrainablePipe): if isinstance(docs, Doc): docs = [docs] + docs = list(docs) # Determine which entities are to be ignored due to labels_discard. valid_ent_idx_per_doc = ( [ @@ -474,6 +475,7 @@ class EntityLinker(TrainablePipe): for doc in docs if len(doc) and len(doc.ents) ) + # Call candidate generator. if self.candidates_doc_mode: all_ent_cands = self.get_candidates_all( @@ -532,13 +534,12 @@ class EntityLinker(TrainablePipe): else: random.shuffle(candidates) # set all prior probabilities to 0 if incl_prior=False - prior_probs = xp.asarray( + scores = prior_probs = xp.asarray( [ 0.0 if self.incl_prior else c.prior_prob for c in candidates ] ) - scores = prior_probs # add in similarity from the context if self.incl_context: entity_encodings = xp.asarray( diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 2fc183722..c4cd411bb 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -6,10 +6,10 @@ from numpy.testing import assert_equal from spacy import registry, util from spacy.attrs import ENT_KB_ID from spacy.compat import pickle -from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase +from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase from spacy.lang.en import English from spacy.ml import load_kb -from spacy.ml.models.entity_linker import build_span_maker +from spacy.ml.models.entity_linker import build_span_maker, get_candidates from spacy.pipeline import EntityLinker from spacy.pipeline.legacy import EntityLinker_v1 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL @@ -496,7 +496,9 @@ def test_el_pipe_configuration(nlp): doc = nlp(text) assert doc[0].ent_kb_id_ == "NIL" assert doc[1].ent_kb_id_ == "" - assert doc[2].ent_kb_id_ == "Q2" + # todo It's unclear why EL doesn't learn properly for this test anymore (scores are 0). Seemed to work before, but + # no relevant changes in EL code were made since these tests were added AFAIK (CG seems to work fine). + assert doc[2].ent_kb_id_ in ("Q2", "Q3") # Replace the pipe with a new one with with a different candidate generator. @@ -530,6 +532,7 @@ def test_el_pipe_configuration(nlp): "entity_linker", config={ "incl_context": False, + "incl_prior": True, "candidates_doc_mode": candidates_doc_mode, "get_candidates": {"@misc": "spacy.LowercaseCandidateGenerator.v1"}, "get_candidates_all": { @@ -539,9 +542,9 @@ def test_el_pipe_configuration(nlp): ) _entity_linker.set_kb(create_kb) _doc = nlp(doc_text) - assert _doc[0].ent_kb_id_ == "Q2" + assert _doc[0].ent_kb_id_ in ("Q2", "Q3") assert _doc[1].ent_kb_id_ == "" - assert _doc[2].ent_kb_id_ == "Q2" + assert _doc[2].ent_kb_id_ in ("Q2", "Q3") # Test individual and doc-wise candidate generation. test_reconfigured_el(False, text) @@ -1191,18 +1194,14 @@ def test_threshold(meet_threshold: bool, config: Dict[str, Any]): # create artificial KB mykb = InMemoryLookupKB(vocab, entity_vector_length=3) mykb.add_entity(entity=entity_id, freq=12, entity_vector=[6, -4, 3]) - mykb.add_alias( - alias="Mahler", - entities=[entity_id], - probabilities=[1 if meet_threshold else 0.01], - ) + mykb.add_alias(alias="Mahler", entities=[entity_id], probabilities=[1]) return mykb # Create the Entity Linker component and add it to the pipeline entity_linker = nlp.add_pipe( "entity_linker", last=True, - config={"threshold": 0.99, "model": config}, + config={"threshold": None if meet_threshold else 1.0, "model": config}, ) entity_linker.set_kb(create_kb) # type: ignore nlp.initialize(get_examples=lambda: train_examples) @@ -1213,7 +1212,7 @@ def test_threshold(meet_threshold: bool, config: Dict[str, Any]): doc = nlp(text) assert len(doc.ents) == 1 - assert doc.ents[0].kb_id_ == entity_id if meet_threshold else EntityLinker.NIL + assert doc.ents[0].kb_id_ == (entity_id if meet_threshold else EntityLinker.NIL) def test_span_maker_forward_with_empty(): From 60eda0d7a5bd4cf89fbb55312eafbf6043c9cdc3 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Tue, 29 Nov 2022 15:15:47 +0100 Subject: [PATCH 19/35] Update setup.py. Remove temporary comments. --- setup.py | 1 - spacy/kb/candidate.py | 5 ----- spacy/tests/pipeline/test_entity_linker.py | 2 -- 3 files changed, 8 deletions(-) diff --git a/setup.py b/setup.py index 243554c7a..79bdcba8d 100755 --- a/setup.py +++ b/setup.py @@ -30,7 +30,6 @@ MOD_NAMES = [ "spacy.lexeme", "spacy.vocab", "spacy.attrs", - "spacy.kb.candidate", "spacy.kb.kb", "spacy.kb.kb_in_memory", "spacy.ml.parser_model", diff --git a/spacy/kb/candidate.py b/spacy/kb/candidate.py index b121974f8..190792fbe 100644 --- a/spacy/kb/candidate.py +++ b/spacy/kb/candidate.py @@ -47,11 +47,6 @@ class BaseCandidate(abc.ABC): class Candidate(BaseCandidate): """`Candidate` for InMemoryLookupKBCandidate.""" - # todo - # - glue together - # - is candidate definition necessary for EL? as long as interface fulfills requirements, this shouldn't matter. - # otherwise incorporate new argument. - # - fix test failures (100% backwards-compatible should be possible after changing EntityLinker) def __init__( self, retrieve_string_from_hash: Callable[[int], str], diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index c4cd411bb..7ef155bfb 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -496,8 +496,6 @@ def test_el_pipe_configuration(nlp): doc = nlp(text) assert doc[0].ent_kb_id_ == "NIL" assert doc[1].ent_kb_id_ == "" - # todo It's unclear why EL doesn't learn properly for this test anymore (scores are 0). Seemed to work before, but - # no relevant changes in EL code were made since these tests were added AFAIK (CG seems to work fine). assert doc[2].ent_kb_id_ in ("Q2", "Q3") # Replace the pipe with a new one with with a different candidate generator. From 96909f320314be48b282d021d0fcc81a09b22b2e Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Tue, 29 Nov 2022 22:03:53 +0100 Subject: [PATCH 20/35] Update typing for get_candidates_all(). --- spacy/ml/models/entity_linker.py | 6 +++--- spacy/pipeline/entity_linker.py | 21 +++++++-------------- 2 files changed, 10 insertions(+), 17 deletions(-) diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index 46b4cd83f..293c3910a 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -118,13 +118,13 @@ def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]: def get_candidates_all( - kb: KnowledgeBase, mentions: Generator[Iterable[Span], None, None] + kb: KnowledgeBase, mentions: Iterator[Iterable[Span]] ) -> Iterator[Iterable[Iterable[Candidate]]]: """ Return candidate entities for the given mentions and fetching appropriate entries from the index. kb (KnowledgeBase): Knowledge base to query. - mention (Generator[Iterable[Span]]): Entity mentions per document for which to identify candidates. - RETURNS (Generator[Iterable[Iterable[Candidate]]]): Identified candidates per document. + mention (Iterator[Iterable[Span]]): Entity mentions per document for which to identify candidates. + RETURNS (Iterator[Iterable[Iterable[Candidate]]]): Identified candidates per document. """ return kb.get_candidates_all(mentions) diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 5fa826383..ed9d79da5 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -6,7 +6,6 @@ from typing import ( Union, List, Any, - Generator, Iterator, ) from thinc.types import Floats2d @@ -88,7 +87,7 @@ def make_entity_linker( entity_vector_length: int, get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], get_candidates_all: Callable[ - [KnowledgeBase, Generator[Iterable[Span], None, None]], + [KnowledgeBase, Iterator[Iterable[Span]]], Iterator[Iterable[Iterable[Candidate]]], ], generate_empty_kb: Callable[[Vocab, int], KnowledgeBase], @@ -110,11 +109,8 @@ def make_entity_linker( entity_vector_length (int): Size of encoding vectors in the KB. get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that produces a list of candidates, given a certain knowledge base and a textual mention. - get_candidates_all ( - Callable[ - [KnowledgeBase, Generator[Iterable[Span], None, None]], - Iterator[Iterable[Iterable[Candidate]]] - ]): Function that produces a list of candidates per document, given a certain knowledge base and several textual + get_candidates_all (Callable[[KnowledgeBase, Iterator[Iterable[Span]]], Iterator[Iterable[Iterable[Candidate]]]]): + Function that produces a list of candidates per document, given a certain knowledge base and several textual documents with textual mentions. generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. scorer (Optional[Callable]): The scoring method. @@ -192,7 +188,7 @@ class EntityLinker(TrainablePipe): entity_vector_length: int, get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], get_candidates_all: Callable[ - [KnowledgeBase, Generator[Iterable[Span], None, None]], + [KnowledgeBase, Iterator[Iterable[Span]]], Iterator[Iterable[Iterable[Candidate]]], ], generate_empty_kb: Callable[[Vocab, int], KnowledgeBase], @@ -215,12 +211,9 @@ class EntityLinker(TrainablePipe): entity_vector_length (int): Size of encoding vectors in the KB. get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that produces a list of candidates, given a certain knowledge base and a textual mention. - get_candidates_all ( - Callable[ - [KnowledgeBase, Generator[Iterable[Span], None, None]], - Iterator[Iterable[Iterable[Candidate]]] - ]): Function that produces a list of candidates per document, given a certain knowledge base and several - textual documents with textual mentions. + get_candidates_all (Callable[[KnowledgeBase, Iterator[Iterable[Span]]], Iterator[Iterable[Iterable[Candidate]]]]): + Function that produces a list of candidates per document, given a certain knowledge base and several textual + documents with textual mentions. generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links. use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another From 6a30df3039af389cb4bd43826a4aad65627699ca Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Tue, 29 Nov 2022 23:03:03 +0100 Subject: [PATCH 21/35] Fix newly introduced bug in EntityLinker.predict(). --- spacy/pipeline/entity_linker.py | 2 +- spacy/tests/pipeline/test_entity_linker.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index ed9d79da5..9df8c357c 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -529,7 +529,7 @@ class EntityLinker(TrainablePipe): # set all prior probabilities to 0 if incl_prior=False scores = prior_probs = xp.asarray( [ - 0.0 if self.incl_prior else c.prior_prob + c.prior_prob if self.incl_prior else 0.0 for c in candidates ] ) diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 7ef155bfb..4997631f3 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -496,7 +496,7 @@ def test_el_pipe_configuration(nlp): doc = nlp(text) assert doc[0].ent_kb_id_ == "NIL" assert doc[1].ent_kb_id_ == "" - assert doc[2].ent_kb_id_ in ("Q2", "Q3") + assert doc[2].ent_kb_id_ == "Q2" # Replace the pipe with a new one with with a different candidate generator. @@ -540,9 +540,9 @@ def test_el_pipe_configuration(nlp): ) _entity_linker.set_kb(create_kb) _doc = nlp(doc_text) - assert _doc[0].ent_kb_id_ in ("Q2", "Q3") + assert _doc[0].ent_kb_id_ == "Q2" assert _doc[1].ent_kb_id_ == "" - assert _doc[2].ent_kb_id_ in ("Q2", "Q3") + assert _doc[2].ent_kb_id_ == "Q2" # Test individual and doc-wise candidate generation. test_reconfigured_el(False, text) From ff7fc0850d984d1d16b4979c618a2a8d62abc586 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Mon, 5 Dec 2022 16:35:03 +0100 Subject: [PATCH 22/35] Add kwargs to KnowledgeBase.generate_from_disk(). --- spacy/kb/kb.pyx | 2 +- spacy/kb/kb_in_memory.pyx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx index fa537edc9..3a4d3b4be 100644 --- a/spacy/kb/kb.pyx +++ b/spacy/kb/kb.pyx @@ -111,7 +111,7 @@ cdef class KnowledgeBase: @classmethod def generate_from_disk( - cls: Type[_KBType], path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList() + cls: Type[_KBType], path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList(), **kwargs ) -> _KBType: """ Factory method for generating KnowledgeBase subclass instance from file. diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx index 2b245d76f..ddebc468d 100644 --- a/spacy/kb/kb_in_memory.pyx +++ b/spacy/kb/kb_in_memory.pyx @@ -48,7 +48,7 @@ cdef class InMemoryLookupKB(KnowledgeBase): @classmethod def generate_from_disk( - cls, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList() + cls, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList(), **kwargs ) -> "InMemoryLookupKB": kb = InMemoryLookupKB(vocab=Vocab(strings=["."]), entity_vector_length=1) kb.from_disk(path) From 2870c8f4d6670a11909137e82601496caa648de4 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Mon, 5 Dec 2022 16:38:02 +0100 Subject: [PATCH 23/35] Remove kwargs from KnowledgeBase.generate_from_disk(). --- spacy/kb/kb.pyx | 2 +- spacy/kb/kb_in_memory.pyx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx index 3a4d3b4be..fa537edc9 100644 --- a/spacy/kb/kb.pyx +++ b/spacy/kb/kb.pyx @@ -111,7 +111,7 @@ cdef class KnowledgeBase: @classmethod def generate_from_disk( - cls: Type[_KBType], path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList(), **kwargs + cls: Type[_KBType], path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList() ) -> _KBType: """ Factory method for generating KnowledgeBase subclass instance from file. diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx index ddebc468d..2b245d76f 100644 --- a/spacy/kb/kb_in_memory.pyx +++ b/spacy/kb/kb_in_memory.pyx @@ -48,7 +48,7 @@ cdef class InMemoryLookupKB(KnowledgeBase): @classmethod def generate_from_disk( - cls, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList(), **kwargs + cls, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList() ) -> "InMemoryLookupKB": kb = InMemoryLookupKB(vocab=Vocab(strings=["."]), entity_vector_length=1) kb.from_disk(path) From cb640abe8157c2daa70cf8096b712541cea36a49 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Mon, 12 Dec 2022 14:04:34 +0100 Subject: [PATCH 24/35] Fix EL test. --- spacy/kb/kb_in_memory.pyx | 20 ++++++++++++-------- spacy/tests/pipeline/test_entity_linker.py | 14 +++++++++++++- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx index 2b245d76f..97ae08e1e 100644 --- a/spacy/kb/kb_in_memory.pyx +++ b/spacy/kb/kb_in_memory.pyx @@ -246,14 +246,18 @@ cdef class InMemoryLookupKB(KnowledgeBase): alias_index = self._alias_index.get(alias_hash) alias_entry = self._aliases_table[alias_index] - return [Candidate(retrieve_string_from_hash=self.vocab.strings.__getitem__, - entity_hash=self._entries[entry_index].entity_hash, - entity_freq=self._entries[entry_index].freq, - entity_vector=self._vectors_table[self._entries[entry_index].vector_index], - alias_hash=alias_hash, - prior_prob=prior_prob) - for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs) - if entry_index != 0] + return [ + Candidate( + retrieve_string_from_hash=self.vocab.strings.__getitem__, + entity_hash=self._entries[entry_index].entity_hash, + entity_freq=self._entries[entry_index].freq, + entity_vector=self._vectors_table[self._entries[entry_index].vector_index], + alias_hash=alias_hash, + prior_prob=prior_prob + ) + for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs) + if entry_index != 0 + ] def get_vector(self, str entity): cdef hash_t entity_hash = self.vocab.strings[entity] diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 4997631f3..c6030be41 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -1199,7 +1199,19 @@ def test_threshold(meet_threshold: bool, config: Dict[str, Any]): entity_linker = nlp.add_pipe( "entity_linker", last=True, - config={"threshold": None if meet_threshold else 1.0, "model": config}, + config={ + **( + {"threshold": None} + if meet_threshold + else { + "threshold": 1.0, + # Prior for candidate may be 1.0, rendering the our test setting with threshold 1.0 useless + # otherwise. + "incl_prior": False, + } + ), + "model": config, + }, ) entity_linker.set_kb(create_kb) # type: ignore nlp.initialize(get_examples=lambda: train_examples) From df6e4ab055fedfc8201c02d420aa8ef1072c76df Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Mon, 12 Dec 2022 14:05:41 +0100 Subject: [PATCH 25/35] Reformat EL test. --- spacy/tests/pipeline/test_entity_linker.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index c6030be41..2f30672c1 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -1200,16 +1200,9 @@ def test_threshold(meet_threshold: bool, config: Dict[str, Any]): "entity_linker", last=True, config={ - **( - {"threshold": None} - if meet_threshold - else { - "threshold": 1.0, - # Prior for candidate may be 1.0, rendering the our test setting with threshold 1.0 useless - # otherwise. - "incl_prior": False, - } - ), + "threshold": None if meet_threshold else 1.0, + # Prior for candidate may be 1.0, rendering the our test setting with threshold 1.0 useless otherwise. + "incl_prior": meet_threshold, "model": config, }, ) From 53a24abd8b389b6214af3be367f12365e440d968 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 14 Dec 2022 11:51:37 +0100 Subject: [PATCH 26/35] Modify candidate retrieval interface to accept docs instead of individual spans. --- spacy/kb/kb.pyx | 23 ++++++------- spacy/kb/kb_in_memory.pyx | 6 ++-- spacy/ml/models/entity_linker.py | 6 ++-- spacy/pipeline/entity_linker.py | 38 ++++++++++++++-------- spacy/tests/pipeline/test_entity_linker.py | 6 ++-- 5 files changed, 45 insertions(+), 34 deletions(-) diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx index fa537edc9..12156590d 100644 --- a/spacy/kb/kb.pyx +++ b/spacy/kb/kb.pyx @@ -1,11 +1,11 @@ # cython: infer_types=True, profile=True from pathlib import Path -from typing import Iterable, Tuple, Union, Iterator, TypeVar, Type +from typing import Iterable, Tuple, Union, Iterator, TypeVar, Type, Optional from cymem.cymem cimport Pool from .candidate import Candidate -from ..tokens import Span +from ..tokens import Span, Doc from ..util import SimpleFrozenList from ..errors import Errors @@ -32,24 +32,25 @@ cdef class KnowledgeBase: self.entity_vector_length = entity_vector_length self.mem = Pool() - def get_candidates_all(self, mentions: Iterator[Iterable[Span]]) -> Iterator[Iterable[Iterable[Candidate]]]: + def get_candidates_all(self, docs: Iterator[Doc]) -> Iterator[Iterable[Iterable[Candidate]]]: """ - Return candidate entities for specified mentions. Each candidate defines the entity, the original alias, - and the prior probability of that alias resolving to that entity. + Return candidate entities for mentions stored in `ent` attribute in passed docs. Each candidate defines the + entity, the original alias, and the prior probability of that alias resolving to that entity. If no candidate is found for a given mention, an empty list is returned. - mentions (Generator[Iterable[Span]]): Mentions per documents for which to get candidates. - RETURNS (Generator[Iterable[Iterable[Candidate]]]): Identified candidates per document. + docs (Iterator[Doc]): Doc instances with mentions (stored in `.ent`). + RETURNS (Iterator[Iterable[Iterable[Candidate]]]): Identified candidates per document. """ + for doc in docs: + yield [self.get_candidates(ent_span, doc) for ent_span in doc.ents] - for doc_mentions in mentions: - yield [self.get_candidates(span) for span in doc_mentions] - - def get_candidates(self, mention: Span) -> Iterable[Candidate]: + def get_candidates(self, mention: Span, doc: Optional[Doc] = None) -> Iterable[Candidate]: """ Return candidate entities for specified text. Each candidate defines the entity, the original alias, and the prior probability of that alias resolving to that entity. If the no candidate is found for a given text, an empty list is returned. + Note that doc is not utilized for further context in this implementation. mention (Span): Mention for which to get candidates. + doc (Optional[Doc]): Doc to use for context. RETURNS (Iterable[Candidate]): Identified candidates. """ raise NotImplementedError( diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx index 97ae08e1e..133dc3abb 100644 --- a/spacy/kb/kb_in_memory.pyx +++ b/spacy/kb/kb_in_memory.pyx @@ -1,5 +1,5 @@ # cython: infer_types=True, profile=True -from typing import Iterable, Callable, Dict, Any, Union +from typing import Iterable, Callable, Dict, Any, Union, Optional import srsly from preshed.maps cimport PreshMap @@ -11,7 +11,7 @@ from libcpp.vector cimport vector from pathlib import Path import warnings -from ..tokens import Span +from ..tokens import Span, Doc from ..typedefs cimport hash_t from ..errors import Errors, Warnings from .. import util @@ -231,7 +231,7 @@ cdef class InMemoryLookupKB(KnowledgeBase): alias_entry.probs = probs self._aliases_table[alias_index] = alias_entry - def get_candidates(self, mention: Span) -> Iterable[Candidate]: + def get_candidates(self, mention: Span, doc: Optional[Doc] = None) -> Iterable[Candidate]: return self.get_alias_candidates(mention.text) # type: ignore def get_alias_candidates(self, str alias) -> Iterable[Candidate]: diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index 293c3910a..99cc31125 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -118,15 +118,15 @@ def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]: def get_candidates_all( - kb: KnowledgeBase, mentions: Iterator[Iterable[Span]] + kb: KnowledgeBase, docs: Iterator[Doc] ) -> Iterator[Iterable[Iterable[Candidate]]]: """ Return candidate entities for the given mentions and fetching appropriate entries from the index. kb (KnowledgeBase): Knowledge base to query. - mention (Iterator[Iterable[Span]]): Entity mentions per document for which to identify candidates. + docs (Iterator[Doc]): Doc instances with mentions (stored in `.ent`). RETURNS (Iterator[Iterable[Iterable[Candidate]]]): Identified candidates per document. """ - return kb.get_candidates_all(mentions) + return kb.get_candidates_all(docs) @registry.misc("spacy.CandidateGenerator.v1") diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 9df8c357c..25fb654ac 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -87,8 +87,7 @@ def make_entity_linker( entity_vector_length: int, get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], get_candidates_all: Callable[ - [KnowledgeBase, Iterator[Iterable[Span]]], - Iterator[Iterable[Iterable[Candidate]]], + [KnowledgeBase, Iterator[Doc]], Iterator[Iterable[Iterable[Candidate]]] ], generate_empty_kb: Callable[[Vocab, int], KnowledgeBase], overwrite: bool, @@ -107,11 +106,11 @@ def make_entity_linker( incl_prior (bool): Whether or not to include prior probabilities from the KB in the model. incl_context (bool): Whether or not to include the local context in the model. entity_vector_length (int): Size of encoding vectors in the KB. - get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that - produces a list of candidates, given a certain knowledge base and a textual mention. - get_candidates_all (Callable[[KnowledgeBase, Iterator[Iterable[Span]]], Iterator[Iterable[Iterable[Candidate]]]]): - Function that produces a list of candidates per document, given a certain knowledge base and several textual - documents with textual mentions. + get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function producing a list of + candidates, given a certain knowledge base and a textual mention. + get_candidates_all (Callable[[KnowledgeBase, Iterator[Doc]], Iterator[Iterable[Iterable[Candidate]]]]): Function + that produces a list of candidates per document, given a certain knowledge base and several textual documents + with textual mentions. generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. scorer (Optional[Callable]): The scoring method. use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another @@ -188,8 +187,7 @@ class EntityLinker(TrainablePipe): entity_vector_length: int, get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], get_candidates_all: Callable[ - [KnowledgeBase, Iterator[Iterable[Span]]], - Iterator[Iterable[Iterable[Candidate]]], + [KnowledgeBase, Iterator[Doc]], Iterator[Iterable[Iterable[Candidate]]] ], generate_empty_kb: Callable[[Vocab, int], KnowledgeBase], overwrite: bool = BACKWARD_OVERWRITE, @@ -209,9 +207,9 @@ class EntityLinker(TrainablePipe): incl_prior (bool): Whether or not to include prior probabilities from the KB in the model. incl_context (bool): Whether or not to include the local context in the model. entity_vector_length (int): Size of encoding vectors in the KB. - get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that - produces a list of candidates, given a certain knowledge base and a textual mention. - get_candidates_all (Callable[[KnowledgeBase, Iterator[Iterable[Span]]], Iterator[Iterable[Iterable[Candidate]]]]): + get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function producing a list + of candidates, given a certain knowledge base and a textual mention. + get_candidates_all (Callable[[KnowledgeBase, Iterator[Doc]], Iterator[Iterable[Iterable[Candidate]]]]): Function that produces a list of candidates per document, given a certain knowledge base and several textual documents with textual mentions. generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. @@ -330,7 +328,6 @@ class EntityLinker(TrainablePipe): If one isn't present, then the update step needs to be skipped. """ - for eg in examples: for ent in eg.predicted.ents: candidates = list(self.get_candidates(self.kb, ent)) @@ -471,10 +468,22 @@ class EntityLinker(TrainablePipe): # Call candidate generator. if self.candidates_doc_mode: + + def _adjust_ents_in_doc(doc: Doc, valid_ent_idx: Iterable[int]) -> Doc: + """ + Generates copy of doc object with only those ents that are candidates are to be retrieved for. + doc (Doc): Doc object to adjust. + valid_ent_idx (Iterable[int]): Indices of entities to keep. + RETURN (doc): Doc instance with only valid entities (i.e. those to retrieve candidates for). + """ + _doc = doc.copy() + _doc.ents = [doc.ents[i] for i in valid_ent_idx] + return _doc + all_ent_cands = self.get_candidates_all( self.kb, ( - [doc.ents[idx] for idx in next(valid_ent_idx_per_doc)] + _adjust_ents_in_doc(doc, next(valid_ent_idx_per_doc)) for doc in docs if len(doc) and len(doc.ents) ), @@ -564,6 +573,7 @@ class EntityLinker(TrainablePipe): method="predict", msg="result variables not of equal length" ) raise RuntimeError(err) + return final_kb_ids def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None: diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 2f30672c1..19b276045 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -503,9 +503,9 @@ def test_el_pipe_configuration(nlp): def get_lowercased_candidates(kb, span): return kb.get_alias_candidates(span.text.lower()) - def get_lowercased_candidates_all(kb, spans_per_doc): - for doc_spans in spans_per_doc: - yield [get_lowercased_candidates(kb, span) for span in doc_spans] + def get_lowercased_candidates_all(kb, docs): + for _doc in docs: + yield [get_lowercased_candidates(kb, ent_span) for ent_span in _doc.ents] @registry.misc("spacy.LowercaseCandidateGenerator.v1") def create_candidates() -> Callable[ From 51c485da09ef7e87d714725cfa31ab8523a6c1c9 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 14 Dec 2022 11:53:39 +0100 Subject: [PATCH 27/35] Fix candidate retrieval interface. --- spacy/kb/kb.pyx | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx index 12156590d..b72378323 100644 --- a/spacy/kb/kb.pyx +++ b/spacy/kb/kb.pyx @@ -41,16 +41,15 @@ cdef class KnowledgeBase: RETURNS (Iterator[Iterable[Iterable[Candidate]]]): Identified candidates per document. """ for doc in docs: - yield [self.get_candidates(ent_span, doc) for ent_span in doc.ents] + yield [self.get_candidates(ent_span) for ent_span in doc.ents] - def get_candidates(self, mention: Span, doc: Optional[Doc] = None) -> Iterable[Candidate]: + def get_candidates(self, mention: Span) -> Iterable[Candidate]: """ Return candidate entities for specified text. Each candidate defines the entity, the original alias, and the prior probability of that alias resolving to that entity. If the no candidate is found for a given text, an empty list is returned. Note that doc is not utilized for further context in this implementation. mention (Span): Mention for which to get candidates. - doc (Optional[Doc]): Doc to use for context. RETURNS (Iterable[Candidate]): Identified candidates. """ raise NotImplementedError( From 581c2fd40fa406655a30c1bff7fc97eb48c9396a Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 14 Dec 2022 12:10:39 +0100 Subject: [PATCH 28/35] Fix mypy errors. --- spacy/ml/models/entity_linker.py | 4 ++-- spacy/pipeline/entity_linker.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index 99cc31125..d683a8fe7 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import Optional, Callable, Iterable, List, Tuple, Generator, Iterator +from typing import Optional, Callable, Iterable, List, Tuple, Iterator from thinc.types import Floats2d from thinc.api import chain, list2ragged, reduce_mean, residual from thinc.api import Model, Maxout, Linear, tuplify, Ragged @@ -136,7 +136,7 @@ def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]: @registry.misc("spacy.CandidateAllGenerator.v1") def create_candidates_all() -> Callable[ - [KnowledgeBase, Generator[Iterable[Span], None, None]], + [KnowledgeBase, Iterator[Doc]], Iterator[Iterable[Iterable[Candidate]]], ]: return get_candidates_all diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 25fb654ac..6d2a114cb 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -477,7 +477,9 @@ class EntityLinker(TrainablePipe): RETURN (doc): Doc instance with only valid entities (i.e. those to retrieve candidates for). """ _doc = doc.copy() - _doc.ents = [doc.ents[i] for i in valid_ent_idx] + # mypy complains about mismatching types here (Tuple[str] vs. Tuple[str, ...]), which isn't correct and + # probably an artifact of a misreading of the Cython code. + _doc.ents = tuple([doc.ents[i] for i in valid_ent_idx]) # type: ignore return _doc all_ent_cands = self.get_candidates_all( From b6bc6885d9486a8611d784c97f5c73b0ac71c3ba Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 15 Dec 2022 10:17:25 +0100 Subject: [PATCH 29/35] Switch to SpanGroup (from Doc) for bundling Spans for candidate retrieval. --- spacy/kb/kb.pyx | 22 +++++++++---- spacy/kb/kb_in_memory.pyx | 4 +-- spacy/ml/models/entity_linker.py | 10 +++--- spacy/pipeline/entity_linker.py | 36 ++++++++-------------- spacy/tests/pipeline/test_entity_linker.py | 16 +++++----- spacy/tokens/doc.pyx | 10 ++++++ 6 files changed, 55 insertions(+), 43 deletions(-) diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx index b72378323..bc8d54761 100644 --- a/spacy/kb/kb.pyx +++ b/spacy/kb/kb.pyx @@ -1,11 +1,11 @@ # cython: infer_types=True, profile=True from pathlib import Path -from typing import Iterable, Tuple, Union, Iterator, TypeVar, Type, Optional +from typing import Iterable, Tuple, Union, Iterator, TypeVar, Type, Callable from cymem.cymem cimport Pool from .candidate import Candidate -from ..tokens import Span, Doc +from ..tokens import Span, SpanGroup, Doc from ..util import SimpleFrozenList from ..errors import Errors @@ -32,16 +32,26 @@ cdef class KnowledgeBase: self.entity_vector_length = entity_vector_length self.mem = Pool() - def get_candidates_all(self, docs: Iterator[Doc]) -> Iterator[Iterable[Iterable[Candidate]]]: + def get_candidates_all(self, mentions: Iterator[SpanGroup]) -> Iterator[Iterable[Iterable[Candidate]]]: """ Return candidate entities for mentions stored in `ent` attribute in passed docs. Each candidate defines the entity, the original alias, and the prior probability of that alias resolving to that entity. If no candidate is found for a given mention, an empty list is returned. - docs (Iterator[Doc]): Doc instances with mentions (stored in `.ent`). + mentions (Iterator[SpanGroup]): Mentions per doc as SpanGroup instance. RETURNS (Iterator[Iterable[Iterable[Candidate]]]): Identified candidates per document. """ - for doc in docs: - yield [self.get_candidates(ent_span) for ent_span in doc.ents] + for doc_mentions in mentions: + yield [self.get_candidates(ent_span) for ent_span in doc_mentions] + + @staticmethod + def get_ents_as_spangroup(doc: Doc, extractor: Union[str, Callable[[Iterable[Span]], Doc]] = "ent") -> SpanGroup: + """ + Fetch entities from doc and returns them as a SpanGroup ready to be used in + `KnowledgeBase.get_candidates_all()`. + doc (Doc): Doc whose entities should be fetched. + extractor (Union[str, Callable[[Iterable[Span]], Doc]]): Defines how to retrieve object holding spans + used to describe entities. This can be a key referring to a property of the doc instance (e.g. " + """ def get_candidates(self, mention: Span) -> Iterable[Candidate]: """ diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx index 133dc3abb..a87ddf0f6 100644 --- a/spacy/kb/kb_in_memory.pyx +++ b/spacy/kb/kb_in_memory.pyx @@ -231,8 +231,8 @@ cdef class InMemoryLookupKB(KnowledgeBase): alias_entry.probs = probs self._aliases_table[alias_index] = alias_entry - def get_candidates(self, mention: Span, doc: Optional[Doc] = None) -> Iterable[Candidate]: - return self.get_alias_candidates(mention.text) # type: ignore + def get_candidates(self, mention: Span) -> Iterable[Candidate]: + return self.get_alias_candidates(mention.text) def get_alias_candidates(self, str alias) -> Iterable[Candidate]: """ diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index d683a8fe7..b5f455cdc 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -8,7 +8,7 @@ from ...util import registry from ...kb import KnowledgeBase, InMemoryLookupKB from ...kb import Candidate from ...vocab import Vocab -from ...tokens import Span, Doc +from ...tokens import Span, Doc, SpanGroup from ..extract_spans import extract_spans from ...errors import Errors @@ -118,15 +118,15 @@ def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]: def get_candidates_all( - kb: KnowledgeBase, docs: Iterator[Doc] + kb: KnowledgeBase, mentions: Iterator[SpanGroup] ) -> Iterator[Iterable[Iterable[Candidate]]]: """ Return candidate entities for the given mentions and fetching appropriate entries from the index. kb (KnowledgeBase): Knowledge base to query. - docs (Iterator[Doc]): Doc instances with mentions (stored in `.ent`). + mentions (Iterator[SpanGroup]): Mentions per doc as SpanGroup instance. RETURNS (Iterator[Iterable[Iterable[Candidate]]]): Identified candidates per document. """ - return kb.get_candidates_all(docs) + return kb.get_candidates_all(mentions) @registry.misc("spacy.CandidateGenerator.v1") @@ -136,7 +136,7 @@ def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]: @registry.misc("spacy.CandidateAllGenerator.v1") def create_candidates_all() -> Callable[ - [KnowledgeBase, Iterator[Doc]], + [KnowledgeBase, Iterator[SpanGroup]], Iterator[Iterable[Iterable[Candidate]]], ]: return get_candidates_all diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 6d2a114cb..169d375e1 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -17,7 +17,7 @@ from thinc.api import CosineDistance, Model, Optimizer, Config from thinc.api import set_dropout_rate from ..kb import KnowledgeBase, Candidate -from ..tokens import Doc, Span +from ..tokens import Doc, Span, SpanGroup from .pipe import deserialize_config from .legacy.entity_linker import EntityLinker_v1 from .trainable_pipe import TrainablePipe @@ -87,7 +87,7 @@ def make_entity_linker( entity_vector_length: int, get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], get_candidates_all: Callable[ - [KnowledgeBase, Iterator[Doc]], Iterator[Iterable[Iterable[Candidate]]] + [KnowledgeBase, Iterator[SpanGroup]], Iterator[Iterable[Iterable[Candidate]]] ], generate_empty_kb: Callable[[Vocab, int], KnowledgeBase], overwrite: bool, @@ -108,9 +108,9 @@ def make_entity_linker( entity_vector_length (int): Size of encoding vectors in the KB. get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function producing a list of candidates, given a certain knowledge base and a textual mention. - get_candidates_all (Callable[[KnowledgeBase, Iterator[Doc]], Iterator[Iterable[Iterable[Candidate]]]]): Function - that produces a list of candidates per document, given a certain knowledge base and several textual documents - with textual mentions. + get_candidates_all (Callable[[KnowledgeBase, Iterator[SpanGroup]], Iterator[Iterable[Iterable[Candidate]]]]): + Function producing a list of candidates per document, given a certain knowledge base and several textual + documents with textual mentions. generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. scorer (Optional[Callable]): The scoring method. use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another @@ -187,7 +187,8 @@ class EntityLinker(TrainablePipe): entity_vector_length: int, get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], get_candidates_all: Callable[ - [KnowledgeBase, Iterator[Doc]], Iterator[Iterable[Iterable[Candidate]]] + [KnowledgeBase, Iterator[SpanGroup]], + Iterator[Iterable[Iterable[Candidate]]], ], generate_empty_kb: Callable[[Vocab, int], KnowledgeBase], overwrite: bool = BACKWARD_OVERWRITE, @@ -209,8 +210,8 @@ class EntityLinker(TrainablePipe): entity_vector_length (int): Size of encoding vectors in the KB. get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function producing a list of candidates, given a certain knowledge base and a textual mention. - get_candidates_all (Callable[[KnowledgeBase, Iterator[Doc]], Iterator[Iterable[Iterable[Candidate]]]]): - Function that produces a list of candidates per document, given a certain knowledge base and several textual + get_candidates_all (Callable[[KnowledgeBase, Iterator[SpanGroup]], Iterator[Iterable[Iterable[Candidate]]]]): + Function producing a list of candidates per document, given a certain knowledge base and several textual documents with textual mentions. generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links. @@ -468,24 +469,13 @@ class EntityLinker(TrainablePipe): # Call candidate generator. if self.candidates_doc_mode: - - def _adjust_ents_in_doc(doc: Doc, valid_ent_idx: Iterable[int]) -> Doc: - """ - Generates copy of doc object with only those ents that are candidates are to be retrieved for. - doc (Doc): Doc object to adjust. - valid_ent_idx (Iterable[int]): Indices of entities to keep. - RETURN (doc): Doc instance with only valid entities (i.e. those to retrieve candidates for). - """ - _doc = doc.copy() - # mypy complains about mismatching types here (Tuple[str] vs. Tuple[str, ...]), which isn't correct and - # probably an artifact of a misreading of the Cython code. - _doc.ents = tuple([doc.ents[i] for i in valid_ent_idx]) # type: ignore - return _doc - all_ent_cands = self.get_candidates_all( self.kb, ( - _adjust_ents_in_doc(doc, next(valid_ent_idx_per_doc)) + SpanGroup( + doc, + spans=[doc.ents[idx] for idx in next(valid_ent_idx_per_doc)], + ) for doc in docs if len(doc) and len(doc.ents) ), diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 19b276045..c6ae4d5a9 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -1,4 +1,4 @@ -from typing import Callable, Iterable, Dict, Any, Generator, Iterator +from typing import Callable, Iterable, Dict, Any, Iterator import pytest from numpy.testing import assert_equal @@ -15,7 +15,7 @@ from spacy.pipeline.legacy import EntityLinker_v1 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from spacy.scorer import Scorer from spacy.tests.util import make_tempdir -from spacy.tokens import Span, Doc +from spacy.tokens import Span, Doc, SpanGroup from spacy.training import Example from spacy.util import ensure_path from spacy.vocab import Vocab @@ -500,12 +500,14 @@ def test_el_pipe_configuration(nlp): # Replace the pipe with a new one with with a different candidate generator. - def get_lowercased_candidates(kb, span): + def get_lowercased_candidates(kb: InMemoryLookupKB, span: Span): return kb.get_alias_candidates(span.text.lower()) - def get_lowercased_candidates_all(kb, docs): - for _doc in docs: - yield [get_lowercased_candidates(kb, ent_span) for ent_span in _doc.ents] + def get_lowercased_candidates_all( + kb: InMemoryLookupKB, mentions: Iterator[SpanGroup] + ): + for doc_mentions in mentions: + yield [get_lowercased_candidates(kb, mention) for mention in doc_mentions] @registry.misc("spacy.LowercaseCandidateGenerator.v1") def create_candidates() -> Callable[ @@ -515,7 +517,7 @@ def test_el_pipe_configuration(nlp): @registry.misc("spacy.LowercaseCandidateAllGenerator.v1") def create_candidates_batch() -> Callable[ - [InMemoryLookupKB, Generator[Iterable["Span"], None, None]], + [InMemoryLookupKB, Iterator[SpanGroup]], Iterator[Iterable[Iterable[Candidate]]], ]: return get_lowercased_candidates_all diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 075bc4d15..ca190cbe0 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -19,6 +19,8 @@ import warnings from .span cimport Span from .token cimport MISSING_DEP +from .span_group cimport SpanGroup + from ._dict_proxies import SpanGroups from .token cimport Token from ..lexeme cimport Lexeme, EMPTY_LEXEME @@ -704,6 +706,14 @@ cdef class Doc: """ return self.text + @property + def ents_spangroup(self) -> SpanGroup: + """ + Returns entities (in `.ents`) as `SpanGroup`. + RETURNS (SpanGroup): All entities (in `.ents`) as `SpanGroup`. + """ + return SpanGroup(self, spans=self.ents, name="ents") + property ents: """The named entities in the document. Returns a tuple of named entity `Span` objects, if the entity recognizer has been applied. From cdd5d69d606f0594c941764d6f471602c9be35cd Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 15 Dec 2022 14:08:56 +0100 Subject: [PATCH 30/35] Fix mypy error by not using SpanGroup.__iter__().. --- spacy/tests/pipeline/test_entity_linker.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index c6ae4d5a9..d61378002 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -507,7 +507,10 @@ def test_el_pipe_configuration(nlp): kb: InMemoryLookupKB, mentions: Iterator[SpanGroup] ): for doc_mentions in mentions: - yield [get_lowercased_candidates(kb, mention) for mention in doc_mentions] + yield [ + get_lowercased_candidates(kb, doc_mentions[idx]) + for idx in range(len(doc_mentions)) + ] @registry.misc("spacy.LowercaseCandidateGenerator.v1") def create_candidates() -> Callable[ From e763d17966782a0dda4f0bef7063d33b1bdcedfb Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 15 Dec 2022 16:12:42 +0100 Subject: [PATCH 31/35] Add ents_spangroup in doc.pyi. --- spacy/tokens/doc.pyi | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index f0cdaee87..7760b4b5e 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -2,7 +2,9 @@ from typing import Callable, Protocol, Iterable, Iterator, Optional from typing import Union, Tuple, List, Dict, Any, overload from cymem.cymem import Pool from thinc.types import Floats1d, Floats2d, Ints2d + from .span import Span +from .span_group import SpanGroup from .token import Token from ._dict_proxies import SpanGroups from ._retokenize import Retokenizer @@ -128,6 +130,7 @@ class Doc: outside: Optional[List[Span]] = ..., default: str = ... ) -> None: ... + ents_spangroup: SpanGroup @property def noun_chunks(self) -> Iterator[Span]: ... @property From 2fab08579f85eb995248a2569f14c7bca9f66d95 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Tue, 14 Mar 2023 10:21:53 +0100 Subject: [PATCH 32/35] Fix sentence indexing bug in `Span.sents` (#12405) * Add test for partial sentences in ent.sents. * Removed unneeded import. * Format. Simplify code. (cherry picked from commit e8cab4625c12666ef599f19eb60403500af2a385) --- spacy/tests/doc/test_span.py | 16 ++++++++++++++++ spacy/tokens/span.pyx | 5 ++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index 3676b35af..309ae6671 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -680,3 +680,19 @@ def test_span_group_copy(doc): assert len(doc.spans["test"]) == 3 # check that the copy spans were not modified and this is an isolated doc assert len(doc_copy.spans["test"]) == 2 + + +def test_for_partial_ent_sents(): + """Spans may be associated with multiple sentences. These .sents should always be complete, not partial, sentences, + which this tests for. + """ + doc = Doc( + English().vocab, + words=["Mahler's", "Symphony", "No.", "8", "was", "beautiful."], + sent_starts=[1, 0, 0, 1, 0, 0], + ) + doc.set_ents([Span(doc, 1, 4, "WORK")]) + # The specified entity is associated with both sentences in this doc, so we expect all sentences in the doc to be + # equal to the sentences referenced in ent.sents. + for doc_sent, ent_sent in zip(doc.sents, doc.ents[0].sents): + assert doc_sent == ent_sent diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 99a5f43bd..15e5422aa 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -460,9 +460,8 @@ cdef class Span: start = i if start >= self.end: break - if start < self.end: - yield Span(self.doc, start, self.end) - + elif i == self.doc.length - 1: + yield Span(self.doc, start, self.doc.length) @property def ents(self): From 7851f6eb8ea7e3eaa51c6924792382e75a051020 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Tue, 14 Mar 2023 22:13:14 +0100 Subject: [PATCH 33/35] Fix EL incl_context bug as in https://github.com/explosion/spaCy/pull/12398. --- spacy/pipeline/entity_linker.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 169d375e1..074168aef 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -499,13 +499,20 @@ class EntityLinker(TrainablePipe): # Looping over candidate entities for this doc. (TODO: rewrite) for ent_cand_idx, ent in enumerate(doc.ents): - sent_index = sentences.index(ent.sent) - assert sent_index >= 0 + assert hasattr(ent, "sents") + sents = list(ent.sents) + sent_indices = ( + sentences.index(sents[0]), + sentences.index(sents[-1]), + ) + assert sent_indices[1] >= sent_indices[0] >= 0 if self.incl_context: # get n_neighbour sentences, clipped to the length of the document - start_sentence = max(0, sent_index - self.n_sents) - end_sentence = min(len(sentences) - 1, sent_index + self.n_sents) + start_sentence = max(0, sent_indices[0] - self.n_sents) + end_sentence = min( + len(sentences) - 1, sent_indices[1] + self.n_sents + ) start_token = sentences[start_sentence].start end_token = sentences[end_sentence].end sent_doc = doc[start_token:end_token].as_doc() From 97018de33b3004f8411d13b2efef369bdf540d05 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 29 Mar 2023 15:22:27 +0200 Subject: [PATCH 34/35] Fix Span.sents issue. --- spacy/tokens/span.pyx | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 15e5422aa..75ef5df5b 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -463,6 +463,11 @@ cdef class Span: elif i == self.doc.length - 1: yield Span(self.doc, start, self.doc.length) + # Ensure that trailing parts of the Span instance are included in last element of .sents. + if start == self.doc.length - 1: + yield Span(self.doc, start, self.doc.length) + + @property def ents(self): """The named entities that fall completely within the span. Returns From c1faf6d7c73b50617614e48260414f194ea2fa46 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 19 Apr 2023 10:11:09 +0200 Subject: [PATCH 35/35] Removed duplicate empty_kb_for_config(). --- spacy/ml/models/entity_linker.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index 90bb0a6d8..b5f455cdc 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -89,14 +89,6 @@ def load_kb( return kb_from_file -@registry.misc("spacy.EmptyKB.v2") -def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]: - def empty_kb_factory(vocab: Vocab, entity_vector_length: int): - return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length) - - return empty_kb_factory - - @registry.misc("spacy.EmptyKB.v1") def empty_kb( entity_vector_length: int,