From 41b3a0d932aafb4db9db02ae2e03b560305e0d53 Mon Sep 17 00:00:00 2001
From: Raphael Mitsch
Date: Tue, 7 Mar 2023 13:10:45 +0100
Subject: [PATCH 1/4] Drop support for EntityLinker_v1. (#12377)

---
 spacy/errors.py                            |  1 +
 spacy/pipeline/entity_linker.py            | 23 ++--------------------
 spacy/tests/pipeline/test_entity_linker.py |  7 +------
 3 files changed, 4 insertions(+), 27 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 5049100d8..390de126e 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -960,6 +960,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E4003 = ("Training examples for distillation must have the exact same tokens in the "
              "reference and predicted docs.")
     E4004 = ("Backprop is not supported when is_train is not set.")
+    E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")

 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}

diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index cd13a4b21..6a187b6c3 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -117,28 +117,10 @@ def make_entity_linker(
         prediction is discarded. If None, predictions are not filtered by any threshold.
     save_activations (bool): save model activations in Doc when annotating.
     """
     if not model.attrs.get("include_span_maker", False):
-        try:
-            from spacy_legacy.components.entity_linker import EntityLinker_v1
-        except:
-            raise ImportError(
-                "In order to use v1 of the EntityLinker, you must use spacy-legacy>=3.0.12."
-            )
-        # The only difference in arguments here is that use_gold_ents and threshold aren't available.
-        return EntityLinker_v1(
-            nlp.vocab,
-            model,
-            name,
-            labels_discard=labels_discard,
-            n_sents=n_sents,
-            incl_prior=incl_prior,
-            incl_context=incl_context,
-            entity_vector_length=entity_vector_length,
-            get_candidates=get_candidates,
-            overwrite=overwrite,
-            scorer=scorer,
-        )
+        raise ValueError(Errors.E4005)
+
     return EntityLinker(
         nlp.vocab,
         model,
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index ed84ce674..87cacfc9d 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -993,13 +993,11 @@ def test_scorer_links():
 @pytest.mark.parametrize(
     "name,config",
     [
-        ("entity_linker", {"@architectures": "spacy.EntityLinker.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL}),
         ("entity_linker", {"@architectures": "spacy.EntityLinker.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL}),
     ],
 )
 # fmt: on
 def test_legacy_architectures(name, config):
-    from spacy_legacy.components.entity_linker import EntityLinker_v1
-
     # Ensure that the legacy architectures still work
     vector_length = 3
@@ -1022,10 +1020,7 @@
         return mykb

     entity_linker = nlp.add_pipe(name, config={"model": config})
-    if config["@architectures"] == "spacy.EntityLinker.v1":
-        assert isinstance(entity_linker, EntityLinker_v1)
-    else:
-        assert isinstance(entity_linker, EntityLinker)
+    assert isinstance(entity_linker, EntityLinker)
     entity_linker.set_kb(create_kb)
     optimizer = nlp.initialize(get_examples=lambda: train_examples)

From 520279ff7c9af199928e2a727999162cb79c38a3 Mon Sep 17 00:00:00 2001
From: Madeesh Kannan
Date: Thu, 9 Mar 2023 09:37:19 +0100
Subject: [PATCH 2/4] `Tok2Vec`: Add `distill` method (#12108)

* `Tok2Vec`: Add `distill` method
* `Tok2Vec`: Refactor `update`
* Add `Tok2Vec.distill` test
* Update `distill` signature to accept `Example`s instead of separate teacher and student docs
* Add docs
* Remove 
docstring * Update test * Remove `update` calls from test * Update `Tok2Vec.distill` docstring --- spacy/pipeline/tok2vec.py | 125 ++++++++++++++++++++------- spacy/tests/pipeline/test_tok2vec.py | 83 ++++++++++++++++++ website/docs/api/tok2vec.mdx | 37 ++++++++ 3 files changed, 213 insertions(+), 32 deletions(-) diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index c742aaeaa..d9639f8d5 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -1,5 +1,6 @@ -from typing import Sequence, Iterable, Optional, Dict, Callable, List, Any +from typing import Sequence, Iterable, Optional, Dict, Callable, List, Any, Tuple from thinc.api import Model, set_dropout_rate, Optimizer, Config +from thinc.types import Floats2d from itertools import islice from .trainable_pipe import TrainablePipe @@ -157,39 +158,9 @@ class Tok2Vec(TrainablePipe): DOCS: https://spacy.io/api/tok2vec#update """ - if losses is None: - losses = {} validate_examples(examples, "Tok2Vec.update") docs = [eg.predicted for eg in examples] - set_dropout_rate(self.model, drop) - tokvecs, bp_tokvecs = self.model.begin_update(docs) - d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs] - losses.setdefault(self.name, 0.0) - - def accumulate_gradient(one_d_tokvecs): - """Accumulate tok2vec loss and gradient. This is passed as a callback - to all but the last listener. Only the last one does the backprop. - """ - nonlocal d_tokvecs - for i in range(len(one_d_tokvecs)): - d_tokvecs[i] += one_d_tokvecs[i] - losses[self.name] += float((one_d_tokvecs[i] ** 2).sum()) - return [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs] - - def backprop(one_d_tokvecs): - """Callback to actually do the backprop. Passed to last listener.""" - accumulate_gradient(one_d_tokvecs) - d_docs = bp_tokvecs(d_tokvecs) - if sgd is not None: - self.finish_update(sgd) - return d_docs - - batch_id = Tok2VecListener.get_batch_id(docs) - for listener in self.listeners[:-1]: - listener.receive(batch_id, tokvecs, accumulate_gradient) - if self.listeners: - self.listeners[-1].receive(batch_id, tokvecs, backprop) - return losses + return self._update_with_docs(docs, drop=drop, sgd=sgd, losses=losses) def get_loss(self, examples, scores) -> None: pass @@ -219,6 +190,96 @@ class Tok2Vec(TrainablePipe): def add_label(self, label): raise NotImplementedError + def distill( + self, + teacher_pipe: Optional["TrainablePipe"], + examples: Iterable["Example"], + *, + drop: float = 0.0, + sgd: Optional[Optimizer] = None, + losses: Optional[Dict[str, float]] = None, + ) -> Dict[str, float]: + """Performs an update of the student pipe's model using the + student's distillation examples and sets the annotations + of the teacher's distillation examples using the teacher pipe. + + teacher_pipe (Optional[TrainablePipe]): The teacher pipe to use + for prediction. + examples (Iterable[Example]): Distillation examples. The reference (teacher) + and predicted (student) docs must have the same number of tokens and the + same orthography. + drop (float): dropout rate. + sgd (Optional[Optimizer]): An optimizer. Will be created via + create_optimizer if not set. + losses (Optional[Dict[str, float]]): Optional record of loss during + distillation. + RETURNS: The updated losses dictionary. + + DOCS: https://spacy.io/api/tok2vec#distill + """ + # By default we require a teacher pipe, but there are downstream + # implementations that don't require a pipe. 
+ if teacher_pipe is None: + raise ValueError(Errors.E4002.format(name=self.name)) + teacher_docs = [eg.reference for eg in examples] + student_docs = [eg.predicted for eg in examples] + teacher_preds = teacher_pipe.predict(teacher_docs) + teacher_pipe.set_annotations(teacher_docs, teacher_preds) + return self._update_with_docs(student_docs, drop=drop, sgd=sgd, losses=losses) + + def _update_with_docs( + self, + docs: Iterable[Doc], + *, + drop: float = 0.0, + sgd: Optional[Optimizer] = None, + losses: Optional[Dict[str, float]] = None, + ): + if losses is None: + losses = {} + losses.setdefault(self.name, 0.0) + set_dropout_rate(self.model, drop) + + tokvecs, accumulate_gradient, backprop = self._create_backprops( + docs, losses, sgd=sgd + ) + batch_id = Tok2VecListener.get_batch_id(docs) + for listener in self.listeners[:-1]: + listener.receive(batch_id, tokvecs, accumulate_gradient) + if self.listeners: + self.listeners[-1].receive(batch_id, tokvecs, backprop) + return losses + + def _create_backprops( + self, + docs: Iterable[Doc], + losses: Dict[str, float], + *, + sgd: Optional[Optimizer] = None, + ) -> Tuple[Floats2d, Callable, Callable]: + tokvecs, bp_tokvecs = self.model.begin_update(docs) + d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs] + + def accumulate_gradient(one_d_tokvecs): + """Accumulate tok2vec loss and gradient. This is passed as a callback + to all but the last listener. Only the last one does the backprop. + """ + nonlocal d_tokvecs + for i in range(len(one_d_tokvecs)): + d_tokvecs[i] += one_d_tokvecs[i] + losses[self.name] += float((one_d_tokvecs[i] ** 2).sum()) + return [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs] + + def backprop(one_d_tokvecs): + """Callback to actually do the backprop. Passed to last listener.""" + accumulate_gradient(one_d_tokvecs) + d_docs = bp_tokvecs(d_tokvecs) + if sgd is not None: + self.finish_update(sgd) + return d_docs + + return tokvecs, accumulate_gradient, backprop + class Tok2VecListener(Model): """A layer that gets fed its answers from an upstream connection, diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index ee62b1ab4..6929b76fa 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -540,3 +540,86 @@ def test_tok2vec_listeners_textcat(): assert cats1["imperative"] < 0.9 assert [t.tag_ for t in docs[0]] == ["V", "J", "N"] assert [t.tag_ for t in docs[1]] == ["N", "V", "J", "N"] + + +cfg_string_distillation = """ + [nlp] + lang = "en" + pipeline = ["tok2vec","tagger"] + + [components] + + [components.tagger] + factory = "tagger" + + [components.tagger.model] + @architectures = "spacy.Tagger.v2" + nO = null + + [components.tagger.model.tok2vec] + @architectures = "spacy.Tok2VecListener.v1" + width = ${components.tok2vec.model.encode.width} + + [components.tok2vec] + factory = "tok2vec" + + [components.tok2vec.model] + @architectures = "spacy.Tok2Vec.v2" + + [components.tok2vec.model.embed] + @architectures = "spacy.MultiHashEmbed.v2" + width = ${components.tok2vec.model.encode.width} + rows = [2000, 1000, 1000, 1000] + attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] + include_static_vectors = false + + [components.tok2vec.model.encode] + @architectures = "spacy.MaxoutWindowEncoder.v2" + width = 96 + depth = 4 + window_size = 1 + maxout_pieces = 3 + """ + + +def test_tok2vec_distillation_teacher_annotations(): + orig_config = Config().from_str(cfg_string_distillation) + teacher_nlp = util.load_model_from_config( + 
orig_config, auto_fill=True, validate=True
+    )
+    student_nlp = util.load_model_from_config(
+        orig_config, auto_fill=True, validate=True
+    )
+
+    train_examples_teacher = []
+    train_examples_student = []
+    for t in TRAIN_DATA:
+        train_examples_teacher.append(
+            Example.from_dict(teacher_nlp.make_doc(t[0]), t[1])
+        )
+        train_examples_student.append(
+            Example.from_dict(student_nlp.make_doc(t[0]), t[1])
+        )
+
+    optimizer = teacher_nlp.initialize(lambda: train_examples_teacher)
+    student_nlp.initialize(lambda: train_examples_student)
+
+    # Since Language.distill creates a copy of the examples to use as
+    # its internal teacher/student docs, we'll need to monkey-patch the
+    # tok2vec pipe's distill method.
+    student_tok2vec = student_nlp.get_pipe("tok2vec")
+    student_tok2vec._old_distill = student_tok2vec.distill
+
+    def tok2vec_distill_wrapper(
+        self,
+        teacher_pipe,
+        examples,
+        **kwargs,
+    ):
+        assert all(not eg.reference.tensor.any() for eg in examples)
+        out = self._old_distill(teacher_pipe, examples, **kwargs)
+        assert all(eg.reference.tensor.any() for eg in examples)
+        return out
+
+    student_tok2vec.distill = tok2vec_distill_wrapper.__get__(student_tok2vec, Tok2Vec)
+    student_nlp.distill(teacher_nlp, train_examples_student, sgd=optimizer, losses={})
diff --git a/website/docs/api/tok2vec.mdx b/website/docs/api/tok2vec.mdx
index a1bb1265e..8b6d2380b 100644
--- a/website/docs/api/tok2vec.mdx
+++ b/website/docs/api/tok2vec.mdx
@@ -100,6 +100,43 @@ pipeline components are applied to the `Doc` in order. Both
 | `doc`       | The document to process. ~~Doc~~ |
 | **RETURNS** | The processed document. ~~Doc~~  |
 
+## Tok2Vec.distill {id="distill", tag="method,experimental", version="4"}
+
+Performs an update of the student pipe's model using the student's distillation
+examples and sets the annotations of the teacher's distillation examples using
+the teacher pipe.
+
+Unlike other trainable pipes, the student pipe doesn't directly learn its
+representations from the teacher. However, since downstream pipes that do
+perform distillation expect the tok2vec annotations to be present on the
+correct distillation examples, we need to ensure that they are set beforehand.
+
+The distillation is performed on ~~Example~~ objects. The `Example.reference`
+and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+same orthography. Even though the reference does not have to have gold
+annotations, the teacher can add its own annotations when necessary.
+
+This feature is experimental.
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.add_pipe("tok2vec")
+> student_pipe = student.add_pipe("tok2vec")
+> optimizer = nlp.resume_training()
+> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
+> ```
+
+| Name           | Description |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to use for prediction. ~~Optional[TrainablePipe]~~ |
+| `examples`     | Distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ | |
+| `drop`         | Dropout rate. ~~float~~ |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. 
~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | + ## Tok2Vec.pipe {id="pipe",tag="method"} Apply the pipe to a stream of documents. This usually happens under the hood From 6ae7618418d1a2514d55bff041fb196957844de6 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Sun, 19 Mar 2023 23:41:20 +0100 Subject: [PATCH 3/4] Clean up Vocab constructor (#12290) * Clean up Vocab constructor * Change effective type of `strings` from `Iterable[str]` to `Optional[StringStore]` * Don't automatically add strings to vocab * Change default values to `None` * Remove `**deprecated_kwargs` * Format --- spacy/strings.pyi | 2 +- spacy/tests/pipeline/test_pipe_methods.py | 3 ++- .../serialize/test_serialize_vocab_strings.py | 27 +++++++++++-------- spacy/tests/vocab_vectors/test_lexeme.py | 2 +- spacy/vocab.pyi | 2 +- spacy/vocab.pyx | 18 +++++++------ website/docs/api/vocab.mdx | 5 ++-- 7 files changed, 34 insertions(+), 25 deletions(-) diff --git a/spacy/strings.pyi b/spacy/strings.pyi index d9509ff57..38dee7034 100644 --- a/spacy/strings.pyi +++ b/spacy/strings.pyi @@ -2,7 +2,7 @@ from typing import List, Optional, Iterable, Iterator, Union, Any, Tuple, overlo from pathlib import Path class StringStore: - def __init__(self, strings: Optional[Iterable[str]]) -> None: ... + def __init__(self, strings: Optional[Iterable[str]] = None) -> None: ... @overload def __getitem__(self, string_or_hash: str) -> int: ... @overload diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index 9b9786f04..39611a742 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -9,6 +9,7 @@ from spacy.lang.en import English from spacy.lang.en.syntax_iterators import noun_chunks from spacy.language import Language from spacy.pipeline import TrainablePipe +from spacy.strings import StringStore from spacy.tokens import Doc from spacy.training import Example from spacy.util import SimpleFrozenList, get_arg_names, make_tempdir @@ -131,7 +132,7 @@ def test_issue5458(): # Test that the noun chuncker does not generate overlapping spans # fmt: off words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."] - vocab = Vocab(strings=words) + vocab = Vocab(strings=StringStore(words)) deps = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"] pos = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"] heads = [0, 2, 0, 9, 6, 6, 2, 6, 7, 7, 0] diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py index fd80c3d8e..f6356ac9e 100644 --- a/spacy/tests/serialize/test_serialize_vocab_strings.py +++ b/spacy/tests/serialize/test_serialize_vocab_strings.py @@ -13,8 +13,11 @@ from spacy.vocab import Vocab from ..util import make_tempdir -test_strings = [([], []), (["rats", "are", "cute"], ["i", "like", "rats"])] -test_strings_attrs = [(["rats", "are", "cute"], "Hello")] +test_strings = [ + (StringStore(), StringStore()), + (StringStore(["rats", "are", "cute"]), StringStore(["i", "like", "rats"])), +] +test_strings_attrs = [(StringStore(["rats", "are", "cute"]), "Hello")] @pytest.mark.issue(599) @@ -81,7 +84,7 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2): vocab2 = Vocab(strings=strings2) vocab1_b = vocab1.to_bytes() vocab2_b = vocab2.to_bytes() - if strings1 == strings2: + if strings1.to_bytes() == 
strings2.to_bytes(): assert vocab1_b == vocab2_b else: assert vocab1_b != vocab2_b @@ -117,11 +120,12 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2): def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr): vocab1 = Vocab(strings=strings) vocab2 = Vocab() - vocab1[strings[0]].norm_ = lex_attr - assert vocab1[strings[0]].norm_ == lex_attr - assert vocab2[strings[0]].norm_ != lex_attr + s = next(iter(vocab1.strings)) + vocab1[s].norm_ = lex_attr + assert vocab1[s].norm_ == lex_attr + assert vocab2[s].norm_ != lex_attr vocab2 = vocab2.from_bytes(vocab1.to_bytes()) - assert vocab2[strings[0]].norm_ == lex_attr + assert vocab2[s].norm_ == lex_attr @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) @@ -136,14 +140,15 @@ def test_deserialize_vocab_seen_entries(strings, lex_attr): def test_serialize_vocab_lex_attrs_disk(strings, lex_attr): vocab1 = Vocab(strings=strings) vocab2 = Vocab() - vocab1[strings[0]].norm_ = lex_attr - assert vocab1[strings[0]].norm_ == lex_attr - assert vocab2[strings[0]].norm_ != lex_attr + s = next(iter(vocab1.strings)) + vocab1[s].norm_ = lex_attr + assert vocab1[s].norm_ == lex_attr + assert vocab2[s].norm_ != lex_attr with make_tempdir() as d: file_path = d / "vocab" vocab1.to_disk(file_path) vocab2 = vocab2.from_disk(file_path) - assert vocab2[strings[0]].norm_ == lex_attr + assert vocab2[s].norm_ == lex_attr @pytest.mark.parametrize("strings1,strings2", test_strings) diff --git a/spacy/tests/vocab_vectors/test_lexeme.py b/spacy/tests/vocab_vectors/test_lexeme.py index d91f41db3..cd7f954ae 100644 --- a/spacy/tests/vocab_vectors/test_lexeme.py +++ b/spacy/tests/vocab_vectors/test_lexeme.py @@ -17,7 +17,7 @@ def test_issue361(en_vocab, text1, text2): @pytest.mark.issue(600) def test_issue600(): - vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}}) + vocab = Vocab() doc = Doc(vocab, words=["hello"]) doc[0].tag_ = "NN" diff --git a/spacy/vocab.pyi b/spacy/vocab.pyi index 871044fff..e4a88bfd8 100644 --- a/spacy/vocab.pyi +++ b/spacy/vocab.pyi @@ -26,7 +26,7 @@ class Vocab: def __init__( self, lex_attr_getters: Optional[Dict[str, Callable[[str], Any]]] = ..., - strings: Optional[Union[List[str], StringStore]] = ..., + strings: Optional[StringStore] = ..., lookups: Optional[Lookups] = ..., oov_prob: float = ..., writing_system: Dict[str, Any] = ..., diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index f3c3595ef..0d3c9c883 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -49,9 +49,8 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab """ - def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None, - oov_prob=-20., writing_system={}, get_noun_chunks=None, - **deprecated_kwargs): + def __init__(self, lex_attr_getters=None, strings=None, lookups=None, + oov_prob=-20., writing_system=None, get_noun_chunks=None): """Create the vocabulary. 
lex_attr_getters (dict): A dictionary mapping attribute IDs to @@ -69,16 +68,19 @@ cdef class Vocab: self.cfg = {'oov_prob': oov_prob} self.mem = Pool() self._by_orth = PreshMap() - self.strings = StringStore() self.length = 0 - if strings: - for string in strings: - _ = self[string] + if strings is None: + self.strings = StringStore() + else: + self.strings = strings self.lex_attr_getters = lex_attr_getters self.morphology = Morphology(self.strings) self.vectors = Vectors(strings=self.strings) self.lookups = lookups - self.writing_system = writing_system + if writing_system is None: + self.writing_system = {} + else: + self.writing_system = writing_system self.get_noun_chunks = get_noun_chunks property vectors: diff --git a/website/docs/api/vocab.mdx b/website/docs/api/vocab.mdx index 3faf1f1a0..304040f9c 100644 --- a/website/docs/api/vocab.mdx +++ b/website/docs/api/vocab.mdx @@ -17,14 +17,15 @@ Create the vocabulary. > #### Example > > ```python +> from spacy.strings import StringStore > from spacy.vocab import Vocab -> vocab = Vocab(strings=["hello", "world"]) +> vocab = Vocab(strings=StringStore(["hello", "world"])) > ``` | Name | Description | | ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `lex_attr_getters` | A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`. ~~Optional[Dict[str, Callable[[str], Any]]]~~ | -| `strings` | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. ~~Union[List[str], StringStore]~~ | +| `strings` | A [`StringStore`](/api/stringstore) that maps strings to hash values. ~~Optional[StringStore]~~ | | `lookups` | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~ | | `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ | | `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ | From 9340eb8ad2a7525096c902112c7cf1a21d145df2 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Mon, 20 Mar 2023 00:34:35 +0100 Subject: [PATCH 4/4] Introduce hierarchy for EL `Candidate` objects (#12341) * Convert Candidate from Cython to Python class. * Format. * Fix .entity_ typo in _add_activations() usage. * Update spacy/kb/candidate.py Co-authored-by: Sofie Van Landeghem * Update doc string of BaseCandidate.__init__(). * Update spacy/kb/candidate.py Co-authored-by: Sofie Van Landeghem * Rename Candidate to InMemoryCandidate, BaseCandidate to Candidate. * Adjust Candidate to support and mandate numerical entity IDs. * Format. * Fix docstring and docs. * Update website/docs/api/kb.mdx Co-authored-by: Sofie Van Landeghem * Rename alias -> mention. * Refactor Candidate attribute names. Update docs and tests accordingly. * Refacor Candidate attributes and their usage. * Format. * Fix mypy error. * Update error code in line with v4 convention. * Update spacy/kb/candidate.py Co-authored-by: Sofie Van Landeghem * Updated error code. * Simplify interface for int/str representations. * Update website/docs/api/kb.mdx Co-authored-by: Sofie Van Landeghem * Rename 'alias' to 'mention'. * Port Candidate and InMemoryCandidate to Cython. * Remove redundant entry in setup.py. * Add abstract class check. * Drop storing mention. 
* Update spacy/kb/candidate.pxd Co-authored-by: Sofie Van Landeghem * Fix entity_id refactoring problems in docstrings. * Drop unused InMemoryCandidate._entity_hash. * Update docstrings. * Move attributes out of Candidate. * Partially fix alias/mention terminology usage. Convert Candidate to interface. * Remove prior_prob from supported properties in Candidate. Introduce KnowledgeBase.supports_prior_probs(). * Update docstrings related to prior_prob. * Update alias/mention usage in doc(strings). * Update spacy/ml/models/entity_linker.py Co-authored-by: Sofie Van Landeghem * Update spacy/ml/models/entity_linker.py Co-authored-by: Sofie Van Landeghem * Mention -> alias renaming. Drop Candidate.mentions(). Drop InMemoryLookupKB.get_alias_candidates() from docs. * Update docstrings. * Fix InMemoryCandidate attribute names. * Update spacy/kb/kb.pyx Co-authored-by: Sofie Van Landeghem * Update spacy/ml/models/entity_linker.py Co-authored-by: Sofie Van Landeghem * Update W401 test. * Update spacy/errors.py Co-authored-by: Sofie Van Landeghem * Update spacy/kb/kb.pyx Co-authored-by: Sofie Van Landeghem * Use Candidate output type for toy generators in the test suite to mimick best practices * fix docs * fix import --------- Co-authored-by: Sofie Van Landeghem --- spacy/errors.py | 8 +- spacy/kb/__init__.py | 5 +- spacy/kb/candidate.pxd | 19 ++-- spacy/kb/candidate.pyx | 120 ++++++++++++--------- spacy/kb/kb.pyx | 21 ++-- spacy/kb/kb_in_memory.pyx | 31 +++--- spacy/ml/models/entity_linker.py | 24 ++++- spacy/pipeline/entity_linker.py | 22 ++-- spacy/tests/pipeline/test_entity_linker.py | 49 +++++---- spacy/tests/serialize/test_serialize_kb.py | 12 ++- website/docs/api/inmemorylookupkb.mdx | 40 +++---- website/docs/api/kb.mdx | 51 ++++----- 12 files changed, 223 insertions(+), 179 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 390de126e..e1f7e7400 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -82,7 +82,7 @@ class Warnings(metaclass=ErrorsWithCodes): "ignoring the duplicate entry.") W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be " "incorrect. Modify PhraseMatcher._terminal_hash to fix.") - W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in " + W024 = ("Entity '{entity}' - alias '{alias}' combination already exists in " "the Knowledge Base.") W026 = ("Unable to set all sentence boundaries from dependency parses. If " "you are constructing a parse tree incrementally by setting " @@ -209,7 +209,11 @@ class Warnings(metaclass=ErrorsWithCodes): "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.") W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.") + # v4 warning strings W400 = ("`use_upper=False` is ignored, the upper layer is always enabled") + W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability " + "lookups so this setting will be ignored. If your KB does support prior probability lookups, make sure " + "to return `True` in `.supports_prior_probs`.") class Errors(metaclass=ErrorsWithCodes): @@ -961,6 +965,8 @@ class Errors(metaclass=ErrorsWithCodes): "reference and predicted docs.") E4004 = ("Backprop is not supported when is_train is not set.") E4005 = ("EntityLinker_v1 is not supported in spaCy v4. 
Update your configuration.") + E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.") + RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"} diff --git a/spacy/kb/__init__.py b/spacy/kb/__init__.py index 1d70a9b34..ff0e209e3 100644 --- a/spacy/kb/__init__.py +++ b/spacy/kb/__init__.py @@ -1,3 +1,6 @@ from .kb import KnowledgeBase from .kb_in_memory import InMemoryLookupKB -from .candidate import Candidate, get_candidates, get_candidates_batch +from .candidate import Candidate, InMemoryCandidate + + +__all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"] diff --git a/spacy/kb/candidate.pxd b/spacy/kb/candidate.pxd index 942ce9dd0..f21f423e4 100644 --- a/spacy/kb/candidate.pxd +++ b/spacy/kb/candidate.pxd @@ -1,12 +1,15 @@ -from .kb cimport KnowledgeBase from libcpp.vector cimport vector +from .kb_in_memory cimport InMemoryLookupKB from ..typedefs cimport hash_t -# Object used by the Entity Linker that summarizes one entity-alias candidate combination. cdef class Candidate: - cdef readonly KnowledgeBase kb - cdef hash_t entity_hash - cdef float entity_freq - cdef vector[float] entity_vector - cdef hash_t alias_hash - cdef float prior_prob + pass + + +cdef class InMemoryCandidate(Candidate): + cdef readonly hash_t _entity_hash + cdef readonly hash_t _alias_hash + cpdef vector[float] _entity_vector + cdef float _prior_prob + cdef readonly InMemoryLookupKB _kb + cdef float _entity_freq diff --git a/spacy/kb/candidate.pyx b/spacy/kb/candidate.pyx index c89efeb03..3d8da4b95 100644 --- a/spacy/kb/candidate.pyx +++ b/spacy/kb/candidate.pyx @@ -1,74 +1,96 @@ # cython: infer_types=True, profile=True -from typing import Iterable -from .kb cimport KnowledgeBase -from ..tokens import Span +from .kb_in_memory cimport InMemoryLookupKB +from ..errors import Errors cdef class Candidate: - """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved - to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking + """A `Candidate` object refers to a textual mention that may or may not be resolved + to a specific entity from a Knowledge Base. This will be used as input for the entity linking algorithm which will disambiguate the various candidates to the correct one. - Each candidate (alias, entity) pair is assigned a certain prior probability. + Each candidate, which represents a possible link between one textual mention and one entity in the knowledge base, + is assigned a certain prior probability. DOCS: https://spacy.io/api/kb/#candidate-init """ - def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob): - self.kb = kb - self.entity_hash = entity_hash - self.entity_freq = entity_freq - self.entity_vector = entity_vector - self.alias_hash = alias_hash - self.prior_prob = prior_prob + def __init__(self): + # Make sure abstract Candidate is not instantiated. 
+ if self.__class__ == Candidate: + raise TypeError( + Errors.E1046.format(cls_name=self.__class__.__name__) + ) @property - def entity(self) -> int: - """RETURNS (uint64): hash of the entity's KB ID/name""" - return self.entity_hash + def entity_id(self) -> int: + """RETURNS (int): Numerical representation of entity ID (if entity ID is numerical, this is just the entity ID, + otherwise the hash of the entity ID string).""" + raise NotImplementedError @property - def entity_(self) -> str: - """RETURNS (str): ID/name of this entity in the KB""" - return self.kb.vocab.strings[self.entity_hash] + def entity_id_(self) -> str: + """RETURNS (str): String representation of entity ID.""" + raise NotImplementedError @property - def alias(self) -> int: - """RETURNS (uint64): hash of the alias""" - return self.alias_hash + def entity_vector(self) -> vector[float]: + """RETURNS (vector[float]): Entity vector.""" + raise NotImplementedError + + +cdef class InMemoryCandidate(Candidate): + """Candidate for InMemoryLookupKB.""" + + def __init__( + self, + kb: InMemoryLookupKB, + entity_hash: int, + alias_hash: int, + entity_vector: vector[float], + prior_prob: float, + entity_freq: float + ): + """ + kb (InMemoryLookupKB]): InMemoryLookupKB instance. + entity_id (int): Entity ID as hash that can be looked up with InMemoryKB.vocab.strings.__getitem__(). + entity_freq (int): Entity frequency in KB corpus. + entity_vector (List[float]): Entity embedding. + alias_hash (int): Alias hash. + prior_prob (float): Prior probability of entity for this alias. I. e. the probability that, independent of + the context, this alias - which matches one of this entity's aliases - resolves to one this entity. + """ + super().__init__() + + self._entity_hash = entity_hash + self._entity_vector = entity_vector + self._prior_prob = prior_prob + self._kb = kb + self._alias_hash = alias_hash + self._entity_freq = entity_freq @property - def alias_(self) -> str: - """RETURNS (str): ID of the original alias""" - return self.kb.vocab.strings[self.alias_hash] + def entity_id(self) -> int: + return self._entity_hash @property - def entity_freq(self) -> float: - return self.entity_freq - - @property - def entity_vector(self) -> Iterable[float]: - return self.entity_vector + def entity_vector(self) -> vector[float]: + return self._entity_vector @property def prior_prob(self) -> float: - return self.prior_prob + """RETURNS (float): Prior probability that this alias, which matches one of this entity's synonyms, resolves to + this entity.""" + return self._prior_prob + @property + def alias(self) -> str: + """RETURNS (str): Alias.""" + return self._kb.vocab.strings[self._alias_hash] -def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]: - """ - Return candidate entities for a given mention and fetching appropriate entries from the index. - kb (KnowledgeBase): Knowledge base to query. - mention (Span): Entity mention for which to identify candidates. - RETURNS (Iterable[Candidate]): Identified candidates. - """ - return kb.get_candidates(mention) + @property + def entity_id_(self) -> str: + return self._kb.vocab.strings[self._entity_hash] - -def get_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]: - """ - Return candidate entities for the given mentions and fetching appropriate entries from the index. - kb (KnowledgeBase): Knowledge base to query. - mention (Iterable[Span]): Entity mentions for which to identify candidates. 
- RETURNS (Iterable[Iterable[Candidate]]): Identified candidates. - """ - return kb.get_candidates_batch(mentions) + @property + def entity_freq(self) -> float: + """RETURNS (float): Entity frequency in KB corpus.""" + return self._entity_freq diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx index ce4bc0138..1cb08f488 100644 --- a/spacy/kb/kb.pyx +++ b/spacy/kb/kb.pyx @@ -32,9 +32,10 @@ cdef class KnowledgeBase: def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]: """ - Return candidate entities for specified texts. Each candidate defines the entity, the original alias, - and the prior probability of that alias resolving to that entity. - If no candidate is found for a given text, an empty list is returned. + Return candidate entities for a specified Span mention. Each candidate defines at least the entity and the + entity's embedding vector. Depending on the KB implementation, further properties - such as the prior + probability of the specified mention text resolving to that entity - might be included. + If no candidates are found for a given mention, an empty list is returned. mentions (Iterable[Span]): Mentions for which to get candidates. RETURNS (Iterable[Iterable[Candidate]]): Identified candidates. """ @@ -42,9 +43,10 @@ cdef class KnowledgeBase: def get_candidates(self, mention: Span) -> Iterable[Candidate]: """ - Return candidate entities for specified text. Each candidate defines the entity, the original alias, - and the prior probability of that alias resolving to that entity. - If the no candidate is found for a given text, an empty list is returned. + Return candidate entities for a specific mention. Each candidate defines at least the entity and the + entity's embedding vector. Depending on the KB implementation, further properties - such as the prior + probability of the specified mention text resolving to that entity - might be included. + If no candidate is found for the given mention, an empty list is returned. mention (Span): Mention for which to get candidates. RETURNS (Iterable[Candidate]): Identified candidates. """ @@ -106,3 +108,10 @@ cdef class KnowledgeBase: raise NotImplementedError( Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__) ) + + @property + def supports_prior_probs(self) -> bool: + """RETURNS (bool): Whether this KB type supports looking up prior probabilities for entity mentions.""" + raise NotImplementedError( + Errors.E1045.format(parent="KnowledgeBase", method="supports_prior_probs", name=self.__name__) + ) diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx index 2a74d047b..c9ced8309 100644 --- a/spacy/kb/kb_in_memory.pyx +++ b/spacy/kb/kb_in_memory.pyx @@ -18,7 +18,7 @@ from .. 
import util from ..util import SimpleFrozenList, ensure_path from ..vocab cimport Vocab from .kb cimport KnowledgeBase -from .candidate import Candidate as Candidate +from .candidate import InMemoryCandidate cdef class InMemoryLookupKB(KnowledgeBase): @@ -226,10 +226,10 @@ cdef class InMemoryLookupKB(KnowledgeBase): alias_entry.probs = probs self._aliases_table[alias_index] = alias_entry - def get_candidates(self, mention: Span) -> Iterable[Candidate]: - return self.get_alias_candidates(mention.text) # type: ignore + def get_candidates(self, mention: Span) -> Iterable[InMemoryCandidate]: + return self._get_alias_candidates(mention.text) # type: ignore - def get_alias_candidates(self, str alias) -> Iterable[Candidate]: + def _get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]: """ Return candidate entities for an alias. Each candidate defines the entity, the original alias, and the prior probability of that alias resolving to that entity. @@ -241,14 +241,18 @@ cdef class InMemoryLookupKB(KnowledgeBase): alias_index = self._alias_index.get(alias_hash) alias_entry = self._aliases_table[alias_index] - return [Candidate(kb=self, - entity_hash=self._entries[entry_index].entity_hash, - entity_freq=self._entries[entry_index].freq, - entity_vector=self._vectors_table[self._entries[entry_index].vector_index], - alias_hash=alias_hash, - prior_prob=prior_prob) - for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs) - if entry_index != 0] + return [ + InMemoryCandidate( + kb=self, + entity_hash=self._entries[entry_index].entity_hash, + alias_hash=alias_hash, + entity_vector=self._vectors_table[self._entries[entry_index].vector_index], + prior_prob=prior_prob, + entity_freq=self._entries[entry_index].freq + ) + for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs) + if entry_index != 0 + ] def get_vector(self, str entity): cdef hash_t entity_hash = self.vocab.strings[entity] @@ -279,6 +283,9 @@ cdef class InMemoryLookupKB(KnowledgeBase): return 0.0 + def supports_prior_probs(self) -> bool: + return True + def to_bytes(self, **kwargs): """Serialize the current state to a binary string. """ diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index 7332ca199..7fe0b4741 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -6,7 +6,7 @@ from thinc.api import Model, Maxout, Linear, tuplify, Ragged from ...util import registry from ...kb import KnowledgeBase, InMemoryLookupKB -from ...kb import Candidate, get_candidates, get_candidates_batch +from ...kb import Candidate from ...vocab import Vocab from ...tokens import Span, Doc from ..extract_spans import extract_spans @@ -117,3 +117,25 @@ def create_candidates_batch() -> Callable[ [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] ]: return get_candidates_batch + + +def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]: + """ + Return candidate entities for a given mention and fetching appropriate entries from the index. + kb (KnowledgeBase): Knowledge base to query. + mention (Span): Entity mention for which to identify candidates. + RETURNS (Iterable[Candidate]): Identified candidates. + """ + return kb.get_candidates(mention) + + +def get_candidates_batch( + kb: KnowledgeBase, mentions: Iterable[Span] +) -> Iterable[Iterable[Candidate]]: + """ + Return candidate entities for the given mentions and fetching appropriate entries from the index. 
+ kb (KnowledgeBase): Knowledge base to query. + mentions (Iterable[Span]): Entity mentions for which to identify candidates. + RETURNS (Iterable[Iterable[Candidate]]): Identified candidates. + """ + return kb.get_candidates_batch(mentions) diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 6a187b6c3..caced9cfd 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -1,5 +1,5 @@ -from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any -from typing import cast +import warnings +from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any, cast from numpy import dtype from thinc.types import Floats1d, Floats2d, Ints1d, Ragged from pathlib import Path @@ -10,14 +10,13 @@ from thinc.api import CosineDistance, Model, Optimizer, Config from thinc.api import set_dropout_rate from ..kb import KnowledgeBase, Candidate -from ..ml import empty_kb from ..tokens import Doc, Span from .pipe import deserialize_config from .trainable_pipe import TrainablePipe from ..language import Language from ..vocab import Vocab from ..training import Example, validate_examples, validate_get_examples -from ..errors import Errors +from ..errors import Errors, Warnings from ..util import SimpleFrozenList, registry from .. import util from ..scorer import Scorer @@ -240,6 +239,8 @@ class EntityLinker(TrainablePipe): if candidates_batch_size < 1: raise ValueError(Errors.E1044) + if self.incl_prior and not self.kb.supports_prior_probs: + warnings.warn(Warnings.W401) def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]): """Define the KB of this pipe by providing a function that will @@ -522,18 +523,19 @@ class EntityLinker(TrainablePipe): ) elif len(candidates) == 1 and self.threshold is None: # shortcut for efficiency reasons: take the 1 candidate - final_kb_ids.append(candidates[0].entity_) + final_kb_ids.append(candidates[0].entity_id_) self._add_activations( doc_scores=doc_scores, doc_ents=doc_ents, scores=[1.0], - ents=[candidates[0].entity_], + ents=[candidates[0].entity_id], ) else: random.shuffle(candidates) # set all prior probabilities to 0 if incl_prior=False - prior_probs = xp.asarray([c.prior_prob for c in candidates]) - if not self.incl_prior: + if self.incl_prior and self.kb.supports_prior_probs: + prior_probs = xp.asarray([c.prior_prob for c in candidates]) # type: ignore + else: prior_probs = xp.asarray([0.0 for _ in candidates]) scores = prior_probs # add in similarity from the context @@ -557,7 +559,7 @@ class EntityLinker(TrainablePipe): raise ValueError(Errors.E161) scores = prior_probs + sims - (prior_probs * sims) final_kb_ids.append( - candidates[scores.argmax().item()].entity_ + candidates[scores.argmax().item()].entity_id_ if self.threshold is None or scores.max() >= self.threshold else EntityLinker.NIL @@ -566,7 +568,7 @@ class EntityLinker(TrainablePipe): doc_scores=doc_scores, doc_ents=doc_ents, scores=scores, - ents=[c.entity for c in candidates], + ents=[c.entity_id for c in candidates], ) self._add_doc_activations( docs_scores=docs_scores, diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 87cacfc9d..65406a36e 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -7,10 +7,10 @@ from thinc.types import Ragged from spacy import registry, util from spacy.attrs import ENT_KB_ID from spacy.compat import pickle -from spacy.kb import Candidate, InMemoryLookupKB, 
get_candidates, KnowledgeBase +from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase from spacy.lang.en import English from spacy.ml import load_kb -from spacy.ml.models.entity_linker import build_span_maker +from spacy.ml.models.entity_linker import build_span_maker, get_candidates from spacy.pipeline import EntityLinker, TrainablePipe from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from spacy.scorer import Scorer @@ -465,16 +465,17 @@ def test_candidate_generation(nlp): mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) # test the size of the relevant candidates + adam_ent_cands = get_candidates(mykb, adam_ent) assert len(get_candidates(mykb, douglas_ent)) == 2 - assert len(get_candidates(mykb, adam_ent)) == 1 + assert len(adam_ent_cands) == 1 assert len(get_candidates(mykb, Adam_ent)) == 0 # default case sensitive assert len(get_candidates(mykb, shrubbery_ent)) == 0 # test the content of the candidates - assert get_candidates(mykb, adam_ent)[0].entity_ == "Q2" - assert get_candidates(mykb, adam_ent)[0].alias_ == "adam" - assert_almost_equal(get_candidates(mykb, adam_ent)[0].entity_freq, 12) - assert_almost_equal(get_candidates(mykb, adam_ent)[0].prior_prob, 0.9) + assert adam_ent_cands[0].entity_id_ == "Q2" + assert adam_ent_cands[0].alias == "adam" + assert_almost_equal(adam_ent_cands[0].entity_freq, 12) + assert_almost_equal(adam_ent_cands[0].prior_prob, 0.9) def test_el_pipe_configuration(nlp): @@ -502,7 +503,7 @@ def test_el_pipe_configuration(nlp): assert doc[2].ent_kb_id_ == "Q2" def get_lowercased_candidates(kb, span): - return kb.get_alias_candidates(span.text.lower()) + return kb._get_alias_candidates(span.text.lower()) def get_lowercased_candidates_batch(kb, spans): return [get_lowercased_candidates(kb, span) for span in spans] @@ -561,24 +562,22 @@ def test_vocab_serialization(nlp): mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1]) adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) - candidates = mykb.get_alias_candidates("adam") + candidates = mykb._get_alias_candidates("adam") assert len(candidates) == 1 - assert candidates[0].entity == q2_hash - assert candidates[0].entity_ == "Q2" - assert candidates[0].alias == adam_hash - assert candidates[0].alias_ == "adam" + assert candidates[0].entity_id == q2_hash + assert candidates[0].entity_id_ == "Q2" + assert candidates[0].alias == "adam" with make_tempdir() as d: mykb.to_disk(d / "kb") kb_new_vocab = InMemoryLookupKB(Vocab(), entity_vector_length=1) kb_new_vocab.from_disk(d / "kb") - candidates = kb_new_vocab.get_alias_candidates("adam") + candidates = kb_new_vocab._get_alias_candidates("adam") assert len(candidates) == 1 - assert candidates[0].entity == q2_hash - assert candidates[0].entity_ == "Q2" - assert candidates[0].alias == adam_hash - assert candidates[0].alias_ == "adam" + assert candidates[0].entity_id == q2_hash + assert candidates[0].entity_id_ == "Q2" + assert candidates[0].alias == "adam" assert kb_new_vocab.get_vector("Q2") == [2] assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4) @@ -598,20 +597,20 @@ def test_append_alias(nlp): mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) # test the size of the relevant candidates - assert len(mykb.get_alias_candidates("douglas")) == 2 + assert len(mykb._get_alias_candidates("douglas")) == 2 # append an alias mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2) # test the size of the relevant candidates has been incremented - 
assert len(mykb.get_alias_candidates("douglas")) == 3 + assert len(mykb._get_alias_candidates("douglas")) == 3 # append the same alias-entity pair again should not work (will throw a warning) with pytest.warns(UserWarning): mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3) # test the size of the relevant candidates remained unchanged - assert len(mykb.get_alias_candidates("douglas")) == 3 + assert len(mykb._get_alias_candidates("douglas")) == 3 @pytest.mark.filterwarnings("ignore:\\[W036") @@ -908,11 +907,11 @@ def test_kb_to_bytes(): assert kb_2.contains_alias("Russ Cochran") assert kb_1.get_size_aliases() == kb_2.get_size_aliases() assert kb_1.get_alias_strings() == kb_2.get_alias_strings() - assert len(kb_1.get_alias_candidates("Russ Cochran")) == len( - kb_2.get_alias_candidates("Russ Cochran") + assert len(kb_1._get_alias_candidates("Russ Cochran")) == len( + kb_2._get_alias_candidates("Russ Cochran") ) - assert len(kb_1.get_alias_candidates("Randomness")) == len( - kb_2.get_alias_candidates("Randomness") + assert len(kb_1._get_alias_candidates("Randomness")) == len( + kb_2._get_alias_candidates("Randomness") ) diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index f9d2e226b..eb4254d31 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -66,19 +66,21 @@ def _check_kb(kb): assert alias_string not in kb.get_alias_strings() # check candidates & probabilities - candidates = sorted(kb.get_alias_candidates("double07"), key=lambda x: x.entity_) + candidates = sorted( + kb._get_alias_candidates("double07"), key=lambda x: x.entity_id_ + ) assert len(candidates) == 2 - assert candidates[0].entity_ == "Q007" + assert candidates[0].entity_id_ == "Q007" assert 6.999 < candidates[0].entity_freq < 7.01 assert candidates[0].entity_vector == [0, 0, 7] - assert candidates[0].alias_ == "double07" + assert candidates[0].alias == "double07" assert 0.899 < candidates[0].prior_prob < 0.901 - assert candidates[1].entity_ == "Q17" + assert candidates[1].entity_id_ == "Q17" assert 1.99 < candidates[1].entity_freq < 2.01 assert candidates[1].entity_vector == [7, 1, 0] - assert candidates[1].alias_ == "double07" + assert candidates[1].alias == "double07" assert 0.099 < candidates[1].prior_prob < 0.101 diff --git a/website/docs/api/inmemorylookupkb.mdx b/website/docs/api/inmemorylookupkb.mdx index c24fe78d6..6fa6cb235 100644 --- a/website/docs/api/inmemorylookupkb.mdx +++ b/website/docs/api/inmemorylookupkb.mdx @@ -10,9 +10,9 @@ version: 3.5 The `InMemoryLookupKB` class inherits from [`KnowledgeBase`](/api/kb) and implements all of its methods. It stores all KB data in-memory and generates -[`Candidate`](/api/kb#candidate) objects by exactly matching mentions with -entity names. It's highly optimized for both a low memory footprint and speed of -retrieval. +[`InMemoryCandidate`](/api/kb#candidate) objects by exactly matching mentions +with entity names. It's highly optimized for both a low memory footprint and +speed of retrieval. ## InMemoryLookupKB.\_\_init\_\_ {id="init",tag="method"} @@ -156,7 +156,7 @@ Get a list of all aliases in the knowledge base. ## InMemoryLookupKB.get_candidates {id="get_candidates",tag="method"} Given a certain textual mention as input, retrieve a list of candidate entities -of type [`Candidate`](/api/kb#candidate). Wraps +of type [`InMemoryCandidate`](/api/kb#candidate). Wraps [`get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates). 
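Because the candidates returned here are `InMemoryCandidate` objects, the
renamed accessors from this PR apply: `entity_id_` replaces `entity_`, and
`alias` replaces `alias_`. A minimal end-to-end sketch, assuming this branch;
the toy `"Q2"`/`"adam"` entries are illustrative and mirror the fixtures in
`test_candidate_generation`:

```python
import spacy
from spacy.kb import InMemoryLookupKB

nlp = spacy.blank("en")
# Toy KB with a single entity and one alias that carries a prior probability.
kb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
kb.add_entity(entity="Q2", freq=12, entity_vector=[2])
kb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])

mention = nlp("adam")[0:1]  # get_candidates expects a Span, not a string
candidate = kb.get_candidates(mention)[0]
print(candidate.entity_id_, candidate.alias, candidate.prior_prob)
# entity ID "Q2", alias "adam", prior probability ~0.9
```

Note that matching is exact on the mention text (`"Adam"` would yield no
candidates); the test suite works around this with a lowercasing wrapper
around the now-private `_get_alias_candidates`.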
> #### Example @@ -168,10 +168,10 @@ of type [`Candidate`](/api/kb#candidate). Wraps > candidates = kb.get_candidates(doc[0:2]) > ``` -| Name | Description | -| ----------- | -------------------------------------------------------------------- | -| `mention` | The textual mention or alias. ~~Span~~ | -| **RETURNS** | An iterable of relevant `Candidate` objects. ~~Iterable[Candidate]~~ | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------ | +| `mention` | The textual mention or alias. ~~Span~~ | +| **RETURNS** | An iterable of relevant `InMemoryCandidate` objects. ~~Iterable[InMemoryCandidate]~~ | ## InMemoryLookupKB.get_candidates_batch {id="get_candidates_batch",tag="method"} @@ -194,26 +194,10 @@ to you. > candidates = kb.get_candidates((doc[0:2], doc[3:])) > ``` -| Name | Description | -| ----------- | -------------------------------------------------------------------------------------------- | -| `mentions` | The textual mention or alias. ~~Iterable[Span]~~ | -| **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ | - -## InMemoryLookupKB.get_alias_candidates {id="get_alias_candidates",tag="method"} - -Given a certain textual mention as input, retrieve a list of candidate entities -of type [`Candidate`](/api/kb#candidate). - -> #### Example -> -> ```python -> candidates = kb.get_alias_candidates("Douglas") -> ``` - -| Name | Description | -| ----------- | ------------------------------------------------------------- | -| `alias` | The textual mention or alias. ~~str~~ | -| **RETURNS** | The list of relevant `Candidate` objects. ~~List[Candidate]~~ | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------------------------ | +| `mentions` | The textual mentions. ~~Iterable[Span]~~ | +| **RETURNS** | An iterable of iterable with relevant `InMemoryCandidate` objects. ~~Iterable[Iterable[InMemoryCandidate]]~~ | ## InMemoryLookupKB.get_vector {id="get_vector",tag="method"} diff --git a/website/docs/api/kb.mdx b/website/docs/api/kb.mdx index 2b0d4d9d6..9536a3fe3 100644 --- a/website/docs/api/kb.mdx +++ b/website/docs/api/kb.mdx @@ -103,23 +103,6 @@ to you. | `mentions` | The textual mention or alias. ~~Iterable[Span]~~ | | **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ | -## KnowledgeBase.get_alias_candidates {id="get_alias_candidates",tag="method"} - - - This method is _not_ available from spaCy 3.5 onwards. - - -From spaCy 3.5 on `KnowledgeBase` is an abstract class (with -[`InMemoryLookupKB`](/api/inmemorylookupkb) being a drop-in replacement) to -allow more flexibility in customizing knowledge bases. Some of its methods were -moved to [`InMemoryLookupKB`](/api/inmemorylookupkb) during this refactoring, -one of those being `get_alias_candidates()`. This method is now available as -[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates). -Note: -[`InMemoryLookupKB.get_candidates()`](/api/inmemorylookupkb#get_candidates) -defaults to -[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates). - ## KnowledgeBase.get_vector {id="get_vector",tag="method"} Given a certain entity ID, retrieve its pretrained entity vector. @@ -190,25 +173,27 @@ Restore the state of the knowledge base from a given directory. Note that the | `exclude` | List of components to exclude. 
~~Iterable[str]~~ |
| **RETURNS** | The modified `KnowledgeBase` object. ~~KnowledgeBase~~ |

## InMemoryCandidate {id="candidate",tag="class"}

An `InMemoryCandidate` object refers to a textual mention (alias) that may or
may not be resolved to a specific entity from a `KnowledgeBase`. This will be
used as input for the entity linking algorithm which will disambiguate the
various candidates to the correct one. Each candidate `(alias, entity)` pair is
assigned a certain prior probability.

### InMemoryCandidate.\_\_init\_\_ {id="candidate-init",tag="method"}

Construct an `InMemoryCandidate` object. Usually this constructor is not called
directly, but instead these objects are returned by the `get_candidates` method
of the [`entity_linker`](/api/entitylinker) pipe.

> #### Example
>
> ```python
> from spacy.kb import InMemoryCandidate
> candidate = InMemoryCandidate(kb, entity_hash, alias_hash, entity_vector, prior_prob, entity_freq)
> ```

| Name            | Description                                                                |
| --------------- | -------------------------------------------------------------------------- |
| `kb`            | The knowledge base that defined this candidate. ~~InMemoryLookupKB~~        |
| `entity_hash`   | The hash of the entity's KB ID. ~~int~~                                     |
| `alias_hash`    | The hash of the entity alias. ~~int~~                                       |
| `entity_vector` | The entity's embedding vector. ~~List[float]~~                              |
| `prior_prob`    | The prior probability of the `alias` referring to the `entity`. ~~float~~   |
| `entity_freq`   | The entity frequency as recorded in the KB. ~~float~~                       |

## InMemoryCandidate attributes {id="candidate-attributes"}

| Name | Description |
| --------------- | ------------------------------------------------------------------------ |
| `entity_id`     | Numerical representation of the entity ID (the hash of the entity ID string, if the ID is not numerical). ~~int~~ |
| `entity_id_`    | String representation of the entity ID. ~~str~~                           |
| `entity_vector` | The entity's embedding vector. ~~List[float]~~                            |
| `alias`         | The alias. ~~str~~                                                        |
| `prior_prob`    | The prior probability that this alias resolves to this entity. ~~float~~  |
| `entity_freq`   | The entity frequency as recorded in the KB corpus. ~~float~~              |
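As a closing illustration of the new `supports_prior_probs` contract introduced
in this PR: `EntityLinker` consults the flag before reading `prior_prob` from
candidates, and emits W401 when `incl_prior` is set but the KB reports `False`.
The sketch below shows a KB that opts out of priors; the class name and the
empty lookup body are hypothetical placeholders, not part of this patch:

```python
from typing import Iterable

from spacy.kb import Candidate, KnowledgeBase
from spacy.tokens import Span


class ContextOnlyKB(KnowledgeBase):
    """Hypothetical KB whose candidates carry no prior probabilities."""

    def get_candidates(self, mention: Span) -> Iterable[Candidate]:
        # Placeholder: a real implementation would look up candidates
        # for the mention's surface form here.
        return []

    @property
    def supports_prior_probs(self) -> bool:
        # EntityLinker checks this flag: with incl_prior=True it warns (W401)
        # and scores candidates as if all priors were 0.0.
        return False
```

With this split, `prior_prob` lives on `InMemoryCandidate` rather than on the
abstract `Candidate`, so custom knowledge bases are free to omit it entirely.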