From 358cbb21e391afff49739cf47c50554bfd4641e1 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Tue, 18 Aug 2020 16:10:36 +0200
Subject: [PATCH] Define candidate generator in EL config (#5876)

* candidate generator as separate part of EL config
* update comment
* ent instead of str as input for candidate generation
* Span instead of str: correct type indication
* fix types
* unit test to create new candidate generator
* fix replace_pipe argument passing
* move error message, general cleanup
* add vocab back to KB constructor
* provide KB as callable from Vocab arg
* rename to kb_loader, fix KB serialization as part of the EL pipe
* fix typo
* reformatting
* cleanup
* fix comment
* fix wrongly duplicated code from merge conflict
* rename dump to to_disk
* from_disk instead of load_bulk
* update test after recent removal of set_morphology in tagger
* remove old doc
---
 bin/ud/ud_train.py                            |   3 +-
 examples/training/create_kb.py                |  10 +-
 examples/training/train_entity_linker.py      |   2 +-
 spacy/errors.py                               |   8 +-
 spacy/kb.pxd                                  |   2 +-
 spacy/kb.pyx                                  |  54 +++---
 spacy/language.py                             |   4 +-
 spacy/ml/models/entity_linker.py              |  28 +--
 spacy/pipeline/entity_linker.py               |  53 ++----
 spacy/pipeline/nn_parser.pyx                  |   0
 spacy/pipeline/tagger.pyx                     |   1 -
 spacy/tests/pipeline/test_entity_linker.py    | 173 ++++++++++++------
 spacy/tests/pipeline/test_pipe_methods.py     |   8 +
 spacy/tests/regression/test_issue4501-5000.py |  10 +-
 spacy/tests/regression/test_issue5230.py      |  24 +--
 spacy/tests/serialize/test_serialize_kb.py    |  64 ++++++-
 website/docs/api/kb.md                        |   8 +-
 17 files changed, 272 insertions(+), 180 deletions(-)
 delete mode 100644 spacy/pipeline/nn_parser.pyx

diff --git a/bin/ud/ud_train.py b/bin/ud/ud_train.py
index 11ad564ec..362057b37 100644
--- a/bin/ud/ud_train.py
+++ b/bin/ud/ud_train.py
@@ -15,7 +15,8 @@ import spacy.util
 from bin.ud import conll17_ud_eval
 from spacy.tokens import Token, Doc
 from spacy.gold import Example
-from spacy.util import compounding, minibatch, minibatch_by_words
+from spacy.util import compounding, minibatch
+from spacy.gold.batchers import minibatch_by_words
 from spacy.pipeline._parser_internals.nonproj import projectivize
 from spacy.matcher import Matcher
 from spacy import displacy
diff --git a/examples/training/create_kb.py b/examples/training/create_kb.py
index 0c6e29226..a455c8d7e 100644
--- a/examples/training/create_kb.py
+++ b/examples/training/create_kb.py
@@ -48,8 +48,7 @@ def main(model, output_dir=None):
     # You can change the dimension of vectors in your KB by using an encoder that changes the dimensionality.
     # For simplicity, we'll just use the original vector dimension here instead.
     vectors_dim = nlp.vocab.vectors.shape[1]
-    kb = KnowledgeBase(entity_vector_length=vectors_dim)
-    kb.initialize(nlp.vocab)
+    kb = KnowledgeBase(nlp.vocab, entity_vector_length=vectors_dim)

     # set up the data
     entity_ids = []
@@ -81,7 +80,7 @@ def main(model, output_dir=None):
         if not output_dir.exists():
             output_dir.mkdir()
         kb_path = str(output_dir / "kb")
-        kb.dump(kb_path)
+        kb.to_disk(kb_path)
         print()
         print("Saved KB to", kb_path)

@@ -96,9 +95,8 @@ def main(model, output_dir=None):
         print("Loading vocab from", vocab_path)
         print("Loading KB from", kb_path)
         vocab2 = Vocab().from_disk(vocab_path)
-        kb2 = KnowledgeBase(entity_vector_length=1)
-        kb.initialize(vocab2)
-        kb2.load_bulk(kb_path)
+        kb2 = KnowledgeBase(vocab2, entity_vector_length=1)
+        kb2.from_disk(kb_path)
         print()
         _print_kb(kb2)
diff --git a/examples/training/train_entity_linker.py b/examples/training/train_entity_linker.py
index 8a69ae39c..d2bd61e5b 100644
--- a/examples/training/train_entity_linker.py
+++ b/examples/training/train_entity_linker.py
@@ -83,7 +83,7 @@ def main(kb_path, vocab_path, output_dir=None, n_iter=50):
     if "entity_linker" not in nlp.pipe_names:
         print("Loading Knowledge Base from '%s'" % kb_path)
         cfg = {
-            "kb": {
+            "kb_loader": {
                 "@assets": "spacy.KBFromFile.v1",
                 "vocab_path": vocab_path,
                 "kb_path": kb_path,
diff --git a/spacy/errors.py b/spacy/errors.py
index 1ad5197f7..d1e9489d1 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -477,6 +477,10 @@ class Errors:
     E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")

     # TODO: fix numbering after merging develop into master
+    E928 = ("A 'KnowledgeBase' should be written to / read from a file, but the "
+            "provided argument {loc} is an existing directory.")
+    E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does "
+            "not seem to exist.")
     E930 = ("Received invalid get_examples callback in {name}.begin_training. "
             "Expected function that returns an iterable of Example objects but "
             "got: {obj}")
@@ -504,8 +508,6 @@ class Errors:
             "not found in pipeline. Available components: {opts}")
     E945 = ("Can't copy pipeline component '{name}' from source. Expected loaded "
             "nlp object, but got: {source}")
-    E946 = ("The Vocab for the knowledge base is not initialized. Did you forget to "
-            "call kb.initialize()?")
     E947 = ("Matcher.add received invalid 'greedy' argument: expected "
             "a string value from {expected} but got: '{arg}'")
     E948 = ("Matcher.add received invalid 'patterns' argument: expected "
@@ -612,8 +614,6 @@ class Errors:
             "of the training data in spaCy 3.0 onwards. The 'update' "
             "function should now be called with a batch of 'Example' "
             "objects, instead of (text, annotation) tuples. ")
") - E990 = ("An entity linking component needs to be initialized with a " - "KnowledgeBase object, but found {type} instead.") E991 = ("The function 'select_pipes' should be called with either a " "'disable' argument to list the names of the pipe components " "that should be disabled, or with an 'enable' argument that " diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 53038b5db..695693666 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -140,7 +140,7 @@ cdef class KnowledgeBase: self._entries.push_back(entry) self._aliases_table.push_back(alias) - cpdef load_bulk(self, loc) + cpdef from_disk(self, loc) cpdef set_entities(self, entity_list, freq_list, vector_list) diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 9035f7e6a..3b8017a0c 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -1,4 +1,5 @@ # cython: infer_types=True, profile=True +from typing import Iterator from cymem.cymem cimport Pool from preshed.maps cimport PreshMap from cpython.exc cimport PyErr_SetFromErrno @@ -64,6 +65,16 @@ cdef class Candidate: return self.prior_prob +def get_candidates(KnowledgeBase kb, span) -> Iterator[Candidate]: + """ + Return candidate entities for a given span by using the text of the span as the alias + and fetching appropriate entries from the index. + This particular function is optimized to work with the built-in KB functionality, + but any other custom candidate generation method can be used in combination with the KB as well. + """ + return kb.get_alias_candidates(span.text) + + cdef class KnowledgeBase: """A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases, to support entity linking of named entities to real-world concepts. @@ -71,25 +82,16 @@ cdef class KnowledgeBase: DOCS: https://spacy.io/api/kb """ - def __init__(self, entity_vector_length): - """Create a KnowledgeBase. Make sure to call kb.initialize() before using it.""" + def __init__(self, Vocab vocab, entity_vector_length): + """Create a KnowledgeBase.""" self.mem = Pool() self.entity_vector_length = entity_vector_length - self._entry_index = PreshMap() self._alias_index = PreshMap() - self.vocab = None - - - def initialize(self, Vocab vocab): self.vocab = vocab self.vocab.strings.add("") self._create_empty_vectors(dummy_hash=self.vocab.strings[""]) - def require_vocab(self): - if self.vocab is None: - raise ValueError(Errors.E946) - @property def entity_vector_length(self): """RETURNS (uint64): length of the entity vectors""" @@ -102,14 +104,12 @@ cdef class KnowledgeBase: return len(self._entry_index) def get_entity_strings(self): - self.require_vocab() return [self.vocab.strings[x] for x in self._entry_index] def get_size_aliases(self): return len(self._alias_index) def get_alias_strings(self): - self.require_vocab() return [self.vocab.strings[x] for x in self._alias_index] def add_entity(self, unicode entity, float freq, vector[float] entity_vector): @@ -117,7 +117,6 @@ cdef class KnowledgeBase: Add an entity to the KB, optionally specifying its log probability based on corpus frequency Return the hash of the entity ID/name at the end. 
""" - self.require_vocab() cdef hash_t entity_hash = self.vocab.strings.add(entity) # Return if this entity was added before @@ -140,7 +139,6 @@ cdef class KnowledgeBase: return entity_hash cpdef set_entities(self, entity_list, freq_list, vector_list): - self.require_vocab() if len(entity_list) != len(freq_list) or len(entity_list) != len(vector_list): raise ValueError(Errors.E140) @@ -176,12 +174,10 @@ cdef class KnowledgeBase: i += 1 def contains_entity(self, unicode entity): - self.require_vocab() cdef hash_t entity_hash = self.vocab.strings.add(entity) return entity_hash in self._entry_index def contains_alias(self, unicode alias): - self.require_vocab() cdef hash_t alias_hash = self.vocab.strings.add(alias) return alias_hash in self._alias_index @@ -190,7 +186,6 @@ cdef class KnowledgeBase: For a given alias, add its potential entities and prior probabilies to the KB. Return the alias_hash at the end """ - self.require_vocab() # Throw an error if the length of entities and probabilities are not the same if not len(entities) == len(probabilities): raise ValueError(Errors.E132.format(alias=alias, @@ -234,7 +229,6 @@ cdef class KnowledgeBase: Throw an error if this entity+prior prob would exceed the sum of 1. For efficiency, it's best to use the method `add_alias` as much as possible instead of this one. """ - self.require_vocab() # Check if the alias exists in the KB cdef hash_t alias_hash = self.vocab.strings[alias] if not alias_hash in self._alias_index: @@ -274,14 +268,12 @@ cdef class KnowledgeBase: alias_entry.probs = probs self._aliases_table[alias_index] = alias_entry - - def get_candidates(self, unicode alias): + def get_alias_candidates(self, unicode alias) -> Iterator[Candidate]: """ Return candidate entities for an alias. Each candidate defines the entity, the original alias, and the prior probability of that alias resolving to that entity. If the alias is not known in the KB, and empty list is returned. """ - self.require_vocab() cdef hash_t alias_hash = self.vocab.strings[alias] if not alias_hash in self._alias_index: return [] @@ -298,7 +290,6 @@ cdef class KnowledgeBase: if entry_index != 0] def get_vector(self, unicode entity): - self.require_vocab() cdef hash_t entity_hash = self.vocab.strings[entity] # Return an empty list if this entity is unknown in this KB @@ -311,7 +302,6 @@ cdef class KnowledgeBase: def get_prior_prob(self, unicode entity, unicode alias): """ Return the prior probability of a given alias being linked to a given entity, or return 0.0 when this combination is not known in the knowledge base""" - self.require_vocab() cdef hash_t alias_hash = self.vocab.strings[alias] cdef hash_t entity_hash = self.vocab.strings[entity] @@ -329,8 +319,7 @@ cdef class KnowledgeBase: return 0.0 - def dump(self, loc): - self.require_vocab() + def to_disk(self, loc): cdef Writer writer = Writer(loc) writer.write_header(self.get_size_entities(), self.entity_vector_length) @@ -370,7 +359,7 @@ cdef class KnowledgeBase: writer.close() - cpdef load_bulk(self, loc): + cpdef from_disk(self, loc): cdef hash_t entity_hash cdef hash_t alias_hash cdef int64_t entry_index @@ -462,12 +451,11 @@ cdef class KnowledgeBase: cdef class Writer: def __init__(self, object loc): - if path.exists(loc): - assert not path.isdir(loc), f"{loc} is directory" if isinstance(loc, Path): loc = bytes(loc) if path.exists(loc): - assert not path.isdir(loc), "%s is directory." 
+        if path.isdir(loc):
+            raise ValueError(Errors.E928.format(loc=loc))
         cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
         self._fp = fopen(bytes_loc, 'wb')
         if not self._fp:
@@ -511,8 +499,10 @@ cdef class Reader:
     def __init__(self, object loc):
         if isinstance(loc, Path):
             loc = bytes(loc)
-        assert path.exists(loc)
-        assert not path.isdir(loc)
+        if not path.exists(loc):
+            raise ValueError(Errors.E929.format(loc=loc))
+        if path.isdir(loc):
+            raise ValueError(Errors.E928.format(loc=loc))
         cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
         self._fp = fopen(bytes_loc, 'rb')
         if not self._fp:
diff --git a/spacy/language.py b/spacy/language.py
index bf3bdb9aa..6fc780f3e 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -772,9 +772,9 @@ class Language:
         self.remove_pipe(name)
         if not len(self.pipeline) or pipe_index == len(self.pipeline):
             # we have no components to insert before/after, or we're replacing the last component
-            self.add_pipe(factory_name, name=name)
+            self.add_pipe(factory_name, name=name, config=config, validate=validate)
         else:
-            self.add_pipe(factory_name, name=name, before=pipe_index)
+            self.add_pipe(factory_name, name=name, before=pipe_index, config=config, validate=validate)

     def rename_pipe(self, old_name: str, new_name: str) -> None:
         """Rename a pipeline component.
diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py
index f96d50a7b..55d8614e1 100644
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@@ -1,9 +1,9 @@
-from typing import Optional
+from typing import Optional, Callable, Iterable
 from thinc.api import chain, clone, list2ragged, reduce_mean, residual
 from thinc.api import Model, Maxout, Linear

 from ...util import registry
-from ...kb import KnowledgeBase
+from ...kb import KnowledgeBase, Candidate, get_candidates
 from ...vocab import Vocab


@@ -25,15 +25,21 @@ def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model:


 @registry.assets.register("spacy.KBFromFile.v1")
-def load_kb(vocab_path: str, kb_path: str) -> KnowledgeBase:
-    vocab = Vocab().from_disk(vocab_path)
-    kb = KnowledgeBase(entity_vector_length=1)
-    kb.initialize(vocab)
-    kb.load_bulk(kb_path)
-    return kb
+def load_kb(kb_path: str) -> Callable[[Vocab], KnowledgeBase]:
+    def kb_from_file(vocab):
+        kb = KnowledgeBase(vocab, entity_vector_length=1)
+        kb.from_disk(kb_path)
+        return kb
+    return kb_from_file


 @registry.assets.register("spacy.EmptyKB.v1")
-def empty_kb(entity_vector_length: int) -> KnowledgeBase:
-    kb = KnowledgeBase(entity_vector_length=entity_vector_length)
-    return kb
+def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
+    def empty_kb_factory(vocab):
+        return KnowledgeBase(vocab=vocab, entity_vector_length=entity_vector_length)
+    return empty_kb_factory
+
+
+@registry.assets.register("spacy.CandidateGenerator.v1")
+def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
+    return get_candidates
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 35bf2906e..d92c700ba 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -6,7 +6,7 @@ from thinc.api import CosineDistance, get_array_module, Model, Optimizer, Config
 from thinc.api import set_dropout_rate
 import warnings

-from ..kb import KnowledgeBase
+from ..kb import KnowledgeBase, Candidate
 from ..tokens import Doc
 from .pipe import Pipe, deserialize_config
 from ..language import Language
@@ -32,35 +32,30 @@
 subword_features = true
 """
 DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]

-default_kb_config = """
-[kb]
-@assets = "spacy.EmptyKB.v1"
-entity_vector_length = 64
-"""
-DEFAULT_NEL_KB = Config().from_str(default_kb_config)["kb"]
-

 @Language.factory(
     "entity_linker",
     requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
     assigns=["token.ent_kb_id"],
     default_config={
-        "kb": DEFAULT_NEL_KB,
+        "kb_loader": {"@assets": "spacy.EmptyKB.v1", "entity_vector_length": 64},
         "model": DEFAULT_NEL_MODEL,
         "labels_discard": [],
         "incl_prior": True,
         "incl_context": True,
+        "get_candidates": {"@assets": "spacy.CandidateGenerator.v1"},
     },
 )
 def make_entity_linker(
     nlp: Language,
     name: str,
     model: Model,
-    kb: KnowledgeBase,
+    kb_loader: Callable[[Vocab], KnowledgeBase],
     *,
     labels_discard: Iterable[str],
     incl_prior: bool,
     incl_context: bool,
+    get_candidates: Callable[[KnowledgeBase, "Span"], Iterable[Candidate]],
 ):
     """Construct an EntityLinker component.

@@ -76,10 +71,11 @@ def make_entity_linker(
         nlp.vocab,
         model,
         name,
-        kb=kb,
+        kb_loader=kb_loader,
         labels_discard=labels_discard,
         incl_prior=incl_prior,
         incl_context=incl_context,
+        get_candidates=get_candidates,
     )


@@ -97,10 +93,11 @@ class EntityLinker(Pipe):
         model: Model,
         name: str = "entity_linker",
         *,
-        kb: KnowledgeBase,
+        kb_loader: Callable[[Vocab], KnowledgeBase],
         labels_discard: Iterable[str],
         incl_prior: bool,
         incl_context: bool,
+        get_candidates: Callable[[KnowledgeBase, "Span"], Iterable[Candidate]],
     ) -> None:
         """Initialize an entity linker.

@@ -108,7 +105,7 @@ class EntityLinker(Pipe):
         model (thinc.api.Model): The Thinc Model powering the pipeline component.
         name (str): The component instance name, used to add entries to the
             losses during training.
-        kb (KnowledgeBase): The KnowledgeBase holding all entities and their aliases.
+        kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates a KnowledgeBase from a Vocab instance.
         labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction.
         incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
         incl_context (bool): Whether or not to include the local context in the model.
@@ -119,17 +116,12 @@ class EntityLinker(Pipe):
         self.model = model
         self.name = name
         cfg = {
-            "kb": kb,
             "labels_discard": list(labels_discard),
             "incl_prior": incl_prior,
             "incl_context": incl_context,
         }
-        if not isinstance(kb, KnowledgeBase):
-            raise ValueError(Errors.E990.format(type=type(self.kb)))
-        kb.initialize(vocab)
-        self.kb = kb
-        if "kb" in cfg:
-            del cfg["kb"]  # we don't want to duplicate its serialization
+        self.kb = kb_loader(self.vocab)
+        self.get_candidates = get_candidates
         self.cfg = dict(cfg)
         self.distance = CosineDistance(normalize=False)
         # how many neighbour sentences to take into account
@@ -326,10 +318,11 @@ class EntityLinker(Pipe):
                     end_token = sentences[end_sentence].end
                     sent_doc = doc[start_token:end_token].as_doc()
                     # currently, the context is the same for each entity in a sentence (should be refined)
-                    sentence_encoding = self.model.predict([sent_doc])[0]
-                    xp = get_array_module(sentence_encoding)
-                    sentence_encoding_t = sentence_encoding.T
-                    sentence_norm = xp.linalg.norm(sentence_encoding_t)
+                    xp = self.model.ops.xp
+                    if self.cfg.get("incl_context"):
+                        sentence_encoding = self.model.predict([sent_doc])[0]
+                        sentence_encoding_t = sentence_encoding.T
+                        sentence_norm = xp.linalg.norm(sentence_encoding_t)
                     for ent in sent.ents:
                         entity_count += 1
                         to_discard = self.cfg.get("labels_discard", [])
@@ -337,7 +330,7 @@
                             # ignoring this entity - setting to NIL
                             final_kb_ids.append(self.NIL)
                         else:
-                            candidates = self.kb.get_candidates(ent.text)
+                            candidates = self.get_candidates(self.kb, ent)
                             if not candidates:
                                 # no prediction possible for this entity - setting to NIL
                                 final_kb_ids.append(self.NIL)
@@ -421,10 +414,9 @@ class EntityLinker(Pipe):
         DOCS: https://spacy.io/api/entitylinker#to_disk
         """
         serialize = {}
-        self.cfg["entity_width"] = self.kb.entity_vector_length
         serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
         serialize["vocab"] = lambda p: self.vocab.to_disk(p)
-        serialize["kb"] = lambda p: self.kb.dump(p)
+        serialize["kb"] = lambda p: self.kb.to_disk(p)
         serialize["model"] = lambda p: self.model.to_disk(p)
         util.to_disk(path, serialize, exclude)

@@ -446,15 +438,10 @@
         except AttributeError:
             raise ValueError(Errors.E149) from None

-        def load_kb(p):
-            self.kb = KnowledgeBase(entity_vector_length=self.cfg["entity_width"])
-            self.kb.initialize(self.vocab)
-            self.kb.load_bulk(p)
-
         deserialize = {}
         deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
         deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p))
-        deserialize["kb"] = load_kb
+        deserialize["kb"] = lambda p: self.kb.from_disk(p)
         deserialize["model"] = load_model
         util.from_disk(path, deserialize, exclude)
         return self
diff --git a/spacy/pipeline/nn_parser.pyx b/spacy/pipeline/nn_parser.pyx
deleted file mode 100644
index e69de29bb..000000000
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 9070329e8..2a4274597 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -68,7 +68,6 @@ class Tagger(Pipe):
         name (str): The component instance name, used to add entries to the
             losses during training.
         labels (List): The set of labels. Defaults to None.
-        set_morphology (bool): Whether to set morphological features.

         DOCS: https://spacy.io/api/tagger#init
         """
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index b3fb6d0fc..4385d2bf9 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -1,6 +1,7 @@
+from typing import Callable, Iterable
 import pytest

-from spacy.kb import KnowledgeBase
+from spacy.kb import KnowledgeBase, get_candidates, Candidate

 from spacy import util, registry
 from spacy.gold import Example
@@ -21,8 +22,7 @@ def assert_almost_equal(a, b):

 def test_kb_valid_entities(nlp):
     """Test the valid construction of a KB with 3 entities and 2 aliases"""
-    mykb = KnowledgeBase(entity_vector_length=3)
-    mykb.initialize(nlp.vocab)
+    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)

     # adding entities
     mykb.add_entity(entity="Q1", freq=19, entity_vector=[8, 4, 3])
@@ -51,8 +51,7 @@ def test_kb_valid_entities(nlp):

 def test_kb_invalid_entities(nlp):
     """Test the invalid construction of a KB with an alias linked to a non-existing entity"""
-    mykb = KnowledgeBase(entity_vector_length=1)
-    mykb.initialize(nlp.vocab)
+    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

     # adding entities
     mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
@@ -68,8 +67,7 @@ def test_kb_invalid_entities(nlp):

 def test_kb_invalid_probabilities(nlp):
     """Test the invalid construction of a KB with wrong prior probabilities"""
-    mykb = KnowledgeBase(entity_vector_length=1)
-    mykb.initialize(nlp.vocab)
+    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

     # adding entities
     mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
@@ -83,8 +81,7 @@ def test_kb_invalid_probabilities(nlp):

 def test_kb_invalid_combination(nlp):
     """Test the invalid construction of a KB with non-matching entity and probability lists"""
-    mykb = KnowledgeBase(entity_vector_length=1)
-    mykb.initialize(nlp.vocab)
+    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

     # adding entities
     mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
@@ -100,8 +97,7 @@ def test_kb_invalid_combination(nlp):

 def test_kb_invalid_entity_vector(nlp):
     """Test the invalid construction of a KB with non-matching entity vector lengths"""
-    mykb = KnowledgeBase(entity_vector_length=3)
-    mykb.initialize(nlp.vocab)
+    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)

     # adding entities
     mykb.add_entity(entity="Q1", freq=19, entity_vector=[1, 2, 3])
@@ -117,14 +113,14 @@ def test_kb_default(nlp):
     assert len(entity_linker.kb) == 0
     assert entity_linker.kb.get_size_entities() == 0
     assert entity_linker.kb.get_size_aliases() == 0
-    # default value from pipeline.entity_linker
+    # 64 is the default value from pipeline.entity_linker
     assert entity_linker.kb.entity_vector_length == 64


 def test_kb_custom_length(nlp):
     """Test that the default (empty) KB can be configured with a custom entity length"""
     entity_linker = nlp.add_pipe(
-        "entity_linker", config={"kb": {"entity_vector_length": 35}}
+        "entity_linker", config={"kb_loader": {"entity_vector_length": 35}}
     )
     assert len(entity_linker.kb) == 0
     assert entity_linker.kb.get_size_entities() == 0
@@ -141,7 +137,7 @@ def test_kb_undefined(nlp):

 def test_kb_empty(nlp):
     """Test that the EL can't train with an empty KB"""
-    config = {"kb": {"@assets": "spacy.EmptyKB.v1", "entity_vector_length": 342}}
+    config = {"kb_loader": {"@assets": "spacy.EmptyKB.v1", "entity_vector_length": 342}}
     entity_linker = nlp.add_pipe("entity_linker", config=config)
     assert len(entity_linker.kb) == 0
     with pytest.raises(ValueError):
@@ -150,8 +146,13 @@ def test_kb_empty(nlp):

 def test_candidate_generation(nlp):
     """Test correct candidate generation"""
-    mykb = KnowledgeBase(entity_vector_length=1)
-    mykb.initialize(nlp.vocab)
+    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+    doc = nlp("douglas adam Adam shrubbery")
+
+    douglas_ent = doc[0:1]
+    adam_ent = doc[1:2]
+    Adam_ent = doc[2:3]
+    shrubbery_ent = doc[3:4]

     # adding entities
     mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
@@ -163,21 +164,76 @@ def test_candidate_generation(nlp):
     mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])

     # test the size of the relevant candidates
-    assert len(mykb.get_candidates("douglas")) == 2
-    assert len(mykb.get_candidates("adam")) == 1
-    assert len(mykb.get_candidates("shrubbery")) == 0
+    assert len(get_candidates(mykb, douglas_ent)) == 2
+    assert len(get_candidates(mykb, adam_ent)) == 1
+    assert len(get_candidates(mykb, Adam_ent)) == 0  # default is case-sensitive
+    assert len(get_candidates(mykb, shrubbery_ent)) == 0

     # test the content of the candidates
-    assert mykb.get_candidates("adam")[0].entity_ == "Q2"
-    assert mykb.get_candidates("adam")[0].alias_ == "adam"
-    assert_almost_equal(mykb.get_candidates("adam")[0].entity_freq, 12)
-    assert_almost_equal(mykb.get_candidates("adam")[0].prior_prob, 0.9)
+    assert get_candidates(mykb, adam_ent)[0].entity_ == "Q2"
+    assert get_candidates(mykb, adam_ent)[0].alias_ == "adam"
+    assert_almost_equal(get_candidates(mykb, adam_ent)[0].entity_freq, 12)
+    assert_almost_equal(get_candidates(mykb, adam_ent)[0].prior_prob, 0.9)
+
+
+def test_el_pipe_configuration(nlp):
+    """Test correct candidate generation as part of the EL pipe"""
+    nlp.add_pipe("sentencizer")
+    pattern = {"label": "PERSON", "pattern": [{"LOWER": "douglas"}]}
+    ruler = nlp.add_pipe("entity_ruler")
+    ruler.add_patterns([pattern])
+
+    @registry.assets.register("myAdamKB.v1")
+    def mykb() -> Callable[["Vocab"], KnowledgeBase]:
+        def create_kb(vocab):
+            kb = KnowledgeBase(vocab, entity_vector_length=1)
+            kb.add_entity(entity="Q2", freq=12, entity_vector=[2])
+            kb.add_entity(entity="Q3", freq=5, entity_vector=[3])
+            kb.add_alias(
+                alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1]
+            )
+            return kb
+
+        return create_kb
+
+    # run an EL pipe without a trained context encoder, to check the candidate generation step only
+    nlp.add_pipe(
+        "entity_linker",
+        config={"kb_loader": {"@assets": "myAdamKB.v1"}, "incl_context": False},
+    )
+    # With the default get_candidates function, matching is case-sensitive
+    text = "Douglas and douglas are not the same."
+    doc = nlp(text)
+    assert doc[0].ent_kb_id_ == "NIL"
+    assert doc[1].ent_kb_id_ == ""
+    assert doc[2].ent_kb_id_ == "Q2"
+
+    def get_lowercased_candidates(kb, span):
+        return kb.get_alias_candidates(span.text.lower())
+
+    @registry.assets.register("spacy.LowercaseCandidateGenerator.v1")
+    def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
+        return get_lowercased_candidates
+
+    # replace the pipe with a new one using a different candidate generator
+    nlp.replace_pipe(
+        "entity_linker",
+        "entity_linker",
+        config={
+            "kb_loader": {"@assets": "myAdamKB.v1"},
+            "incl_context": False,
+            "get_candidates": {"@assets": "spacy.LowercaseCandidateGenerator.v1"},
+        },
+    )
+    doc = nlp(text)
+    assert doc[0].ent_kb_id_ == "Q2"
+    assert doc[1].ent_kb_id_ == ""
+    assert doc[2].ent_kb_id_ == "Q2"


 def test_append_alias(nlp):
     """Test that we can append additional alias-entity pairs"""
-    mykb = KnowledgeBase(entity_vector_length=1)
-    mykb.initialize(nlp.vocab)
+    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

     # adding entities
     mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
@@ -189,26 +245,25 @@ def test_append_alias(nlp):
     mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])

     # test the size of the relevant candidates
-    assert len(mykb.get_candidates("douglas")) == 2
+    assert len(mykb.get_alias_candidates("douglas")) == 2

     # append an alias
     mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)

     # test the size of the relevant candidates has been incremented
-    assert len(mykb.get_candidates("douglas")) == 3
+    assert len(mykb.get_alias_candidates("douglas")) == 3

     # appending the same alias-entity pair again should not work (will throw a warning)
     with pytest.warns(UserWarning):
         mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3)

     # test the size of the relevant candidates remained unchanged
-    assert len(mykb.get_candidates("douglas")) == 3
+    assert len(mykb.get_alias_candidates("douglas")) == 3


 def test_append_invalid_alias(nlp):
     """Test that appending an alias throws an error if the prior probs would exceed 1"""
-    mykb = KnowledgeBase(entity_vector_length=1)
-    mykb.initialize(nlp.vocab)
+    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

     # adding entities
     mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
@@ -228,16 +283,18 @@ def test_preserving_links_asdoc(nlp):
     """Test that Span.as_doc preserves the existing entity links"""

     @registry.assets.register("myLocationsKB.v1")
-    def dummy_kb() -> KnowledgeBase:
-        mykb = KnowledgeBase(entity_vector_length=1)
-        mykb.initialize(nlp.vocab)
-        # adding entities
-        mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
-        mykb.add_entity(entity="Q2", freq=8, entity_vector=[1])
-        # adding aliases
-        mykb.add_alias(alias="Boston", entities=["Q1"], probabilities=[0.7])
-        mykb.add_alias(alias="Denver", entities=["Q2"], probabilities=[0.6])
-        return mykb
+    def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]:
+        def create_kb(vocab):
+            mykb = KnowledgeBase(vocab, entity_vector_length=1)
+            # adding entities
+            mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
+            mykb.add_entity(entity="Q2", freq=8, entity_vector=[1])
+            # adding aliases
+            mykb.add_alias(alias="Boston", entities=["Q1"], probabilities=[0.7])
+            mykb.add_alias(alias="Denver", entities=["Q2"], probabilities=[0.6])
+            return mykb
+
+        return create_kb

     # set up pipeline with NER (Entity Ruler) and NEL (prior probability only, model not trained)
     nlp.add_pipe("sentencizer")
@@ -247,7 +304,7 @@ def test_preserving_links_asdoc(nlp):
     ]
     ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
-    el_config = {"kb": {"@assets": "myLocationsKB.v1"}, "incl_prior": False}
+    el_config = {"kb_loader": {"@assets": "myLocationsKB.v1"}, "incl_prior": False}
     el_pipe = nlp.add_pipe("entity_linker", config=el_config, last=True)
     el_pipe.begin_training(lambda: [])
     el_pipe.incl_context = False
@@ -331,24 +388,28 @@ def test_overfitting_IO():
         train_examples.append(Example.from_dict(doc, annotation))

     @registry.assets.register("myOverfittingKB.v1")
-    def dummy_kb() -> KnowledgeBase:
-        # create artificial KB - assign same prior weight to the two russ cochran's
-        # Q2146908 (Russ Cochran): American golfer
-        # Q7381115 (Russ Cochran): publisher
-        mykb = KnowledgeBase(entity_vector_length=3)
-        mykb.initialize(nlp.vocab)
-        mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
-        mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
-        mykb.add_alias(
-            alias="Russ Cochran",
-            entities=["Q2146908", "Q7381115"],
-            probabilities=[0.5, 0.5],
-        )
-        return mykb
+    def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]:
+        def create_kb(vocab):
+            # create artificial KB - assign same prior weight to the two russ cochran's
+            # Q2146908 (Russ Cochran): American golfer
+            # Q7381115 (Russ Cochran): publisher
+            mykb = KnowledgeBase(vocab, entity_vector_length=3)
+            mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
+            mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
+            mykb.add_alias(
+                alias="Russ Cochran",
+                entities=["Q2146908", "Q7381115"],
+                probabilities=[0.5, 0.5],
+            )
+            return mykb
+
+        return create_kb

     # Create the Entity Linker component and add it to the pipeline
     nlp.add_pipe(
-        "entity_linker", config={"kb": {"@assets": "myOverfittingKB.v1"}}, last=True
+        "entity_linker",
+        config={"kb_loader": {"@assets": "myOverfittingKB.v1"}},
+        last=True,
     )

     # train the NEL pipe
diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index 0141708b4..feb11cabc 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -78,6 +78,14 @@ def test_replace_last_pipe(nlp):
     assert nlp.pipe_names == ["sentencizer", "ner"]


+def test_replace_pipe_config(nlp):
+    nlp.add_pipe("entity_linker")
+    nlp.add_pipe("sentencizer")
+    assert nlp.get_pipe("entity_linker").cfg["incl_prior"] == True
+    nlp.replace_pipe("entity_linker", "entity_linker", config={"incl_prior": False})
+    assert nlp.get_pipe("entity_linker").cfg["incl_prior"] == False
+
+
 @pytest.mark.parametrize("old_name,new_name", [("old_pipe", "new_pipe")])
 def test_rename_pipe(nlp, old_name, new_name):
     with pytest.raises(ValueError):
diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py
index 1e655851f..0d4ce9a30 100644
--- a/spacy/tests/regression/test_issue4501-5000.py
+++ b/spacy/tests/regression/test_issue4501-5000.py
@@ -139,8 +139,7 @@ def test_issue4665():
 def test_issue4674():
     """Test that setting entities with overlapping identifiers does not mess up IO"""
     nlp = English()
-    kb = KnowledgeBase(entity_vector_length=3)
-    kb.initialize(nlp.vocab)
+    kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
     vector1 = [0.9, 1.1, 1.01]
     vector2 = [1.8, 2.25, 2.01]
     with pytest.warns(UserWarning):
@@ -156,10 +155,9 @@ def test_issue4674():
         if not dir_path.exists():
             dir_path.mkdir()
         file_path = dir_path / "kb"
-        kb.dump(str(file_path))
-        kb2 = KnowledgeBase(entity_vector_length=3)
-        kb2.initialize(nlp.vocab)
-        kb2.load_bulk(str(file_path))
+        kb.to_disk(str(file_path))
+        kb2 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+        kb2.from_disk(str(file_path))
         assert kb2.get_size_entities() == 1

diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py
index 93069d9a3..58d03ca8b 100644
--- a/spacy/tests/regression/test_issue5230.py
+++ b/spacy/tests/regression/test_issue5230.py
@@ -1,3 +1,4 @@
+from typing import Callable
 import warnings
 from unittest import TestCase
 import pytest
@@ -70,13 +71,14 @@ def entity_linker():
     nlp = Language()

     @registry.assets.register("TestIssue5230KB.v1")
-    def dummy_kb() -> KnowledgeBase:
-        kb = KnowledgeBase(entity_vector_length=1)
-        kb.initialize(nlp.vocab)
-        kb.add_entity("test", 0.0, zeros((1, 1), dtype="f"))
-        return kb
+    def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]:
+        def create_kb(vocab):
+            kb = KnowledgeBase(vocab, entity_vector_length=1)
+            kb.add_entity("test", 0.0, zeros((1, 1), dtype="f"))
+            return kb
+        return create_kb

-    config = {"kb": {"@assets": "TestIssue5230KB.v1"}}
+    config = {"kb_loader": {"@assets": "TestIssue5230KB.v1"}}
     entity_linker = nlp.add_pipe("entity_linker", config=config)
     # need to add model for two reasons:
     # 1. no model leads to error in serialization,
@@ -121,19 +123,17 @@ def test_writer_with_path_py35():

 def test_save_and_load_knowledge_base():
     nlp = Language()
-    kb = KnowledgeBase(entity_vector_length=1)
-    kb.initialize(nlp.vocab)
+    kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
     with make_tempdir() as d:
         path = d / "kb"
         try:
-            kb.dump(path)
+            kb.to_disk(path)
         except Exception as e:
             pytest.fail(str(e))

         try:
-            kb_loaded = KnowledgeBase(entity_vector_length=1)
-            kb_loaded.initialize(nlp.vocab)
-            kb_loaded.load_bulk(path)
+            kb_loaded = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+            kb_loaded.from_disk(path)
         except Exception as e:
             pytest.fail(str(e))
diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py
index 3f33c6f06..3cf5485d7 100644
--- a/spacy/tests/serialize/test_serialize_kb.py
+++ b/spacy/tests/serialize/test_serialize_kb.py
@@ -1,4 +1,8 @@
-from spacy.util import ensure_path
+from typing import Callable
+
+from spacy import util
+from spacy.lang.en import English
+from spacy.util import ensure_path, registry
 from spacy.kb import KnowledgeBase

 from ..util import make_tempdir
@@ -15,20 +19,16 @@ def test_serialize_kb_disk(en_vocab):
     if not dir_path.exists():
         dir_path.mkdir()
     file_path = dir_path / "kb"
-    kb1.dump(str(file_path))
-
-    kb2 = KnowledgeBase(entity_vector_length=3)
-    kb2.initialize(en_vocab)
-    kb2.load_bulk(str(file_path))
+    kb1.to_disk(str(file_path))
+    kb2 = KnowledgeBase(vocab=en_vocab, entity_vector_length=3)
+    kb2.from_disk(str(file_path))

     # final assertions
     _check_kb(kb2)


 def _get_dummy_kb(vocab):
-    kb = KnowledgeBase(entity_vector_length=3)
-    kb.initialize(vocab)
-
+    kb = KnowledgeBase(vocab, entity_vector_length=3)
     kb.add_entity(entity="Q53", freq=33, entity_vector=[0, 5, 3])
     kb.add_entity(entity="Q17", freq=2, entity_vector=[7, 1, 0])
     kb.add_entity(entity="Q007", freq=7, entity_vector=[0, 0, 7])
@@ -61,7 +61,7 @@ def _check_kb(kb):
         assert alias_string not in kb.get_alias_strings()

     # check candidates & probabilities
-    candidates = sorted(kb.get_candidates("double07"), key=lambda x: x.entity_)
+    candidates = sorted(kb.get_alias_candidates("double07"), key=lambda x: x.entity_)
     assert len(candidates) == 2

     assert candidates[0].entity_ == "Q007"
@@ -75,3 +75,47 @@ def _check_kb(kb):
     assert candidates[1].entity_vector == [7, 1, 0]
     assert candidates[1].alias_ == "double07"
     assert 0.099 < candidates[1].prior_prob < 0.101
+
+
+def test_serialize_subclassed_kb():
+    """Check that IO of a custom KB works fine as part of an EL pipe."""
+
+    class SubKnowledgeBase(KnowledgeBase):
+        def __init__(self, vocab, entity_vector_length, custom_field):
+            super().__init__(vocab, entity_vector_length)
+            self.custom_field = custom_field
+
+    @registry.assets.register("spacy.CustomKB.v1")
+    def custom_kb(
+        entity_vector_length: int, custom_field: int
+    ) -> Callable[["Vocab"], KnowledgeBase]:
+        def custom_kb_factory(vocab):
+            return SubKnowledgeBase(
+                vocab=vocab,
+                entity_vector_length=entity_vector_length,
+                custom_field=custom_field,
+            )
+
+        return custom_kb_factory
+
+    nlp = English()
+    config = {
+        "kb_loader": {
+            "@assets": "spacy.CustomKB.v1",
+            "entity_vector_length": 342,
+            "custom_field": 666,
+        }
+    }
+    entity_linker = nlp.add_pipe("entity_linker", config=config)
+    assert type(entity_linker.kb) == SubKnowledgeBase
+    assert entity_linker.kb.entity_vector_length == 342
+    assert entity_linker.kb.custom_field == 666
+
+    # Make sure the custom KB is serialized correctly
+    with make_tempdir() as tmp_dir:
+        nlp.to_disk(tmp_dir)
+        nlp2 = util.load_model_from_path(tmp_dir)
+        entity_linker2 = nlp2.get_pipe("entity_linker")
+        assert type(entity_linker2.kb) == SubKnowledgeBase
+        assert entity_linker2.kb.entity_vector_length == 342
+        assert entity_linker2.kb.custom_field == 666
diff --git a/website/docs/api/kb.md b/website/docs/api/kb.md
index 3db5d6bac..855dead27 100644
--- a/website/docs/api/kb.md
+++ b/website/docs/api/kb.md
@@ -200,21 +200,21 @@ probability of the fact that the mention links to the entity ID.
 | `alias`     | The textual mention or alias. ~~str~~                                      |
 | **RETURNS** | The prior probability of the `alias` referring to the `entity`. ~~float~~ |

-## KnowledgeBase.dump {#dump tag="method"}
+## KnowledgeBase.to_disk {#to_disk tag="method"}

 Save the current state of the knowledge base to a file.

 > #### Example
 >
 > ```python
-> kb.dump(loc)
+> kb.to_disk(loc)
 > ```

 | Name  | Description                                                                                                                     |
 | ----- | ------------------------------------------------------------------------------------------------------------------------------ |
 | `loc` | A path to a file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |

-## KnowledgeBase.load_bulk {#load_bulk tag="method"}
+## KnowledgeBase.from_disk {#from_disk tag="method"}

 Restore the state of the knowledge base from a given file. Note that the
 [`Vocab`](/api/vocab) should also be the same as the one used to create the KB.

 > #### Example
 >
 > ```python
 > from spacy.vocab import Vocab
 > vocab = Vocab().from_disk("/path/to/vocab")
 > kb = KnowledgeBase(vocab=vocab, entity_vector_length=64)
-> kb.load_bulk("/path/to/kb")
+> kb.from_disk("/path/to/kb")
 > ```

 | Name | Description |
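
---

Taken together, this patch replaces the two-step `KnowledgeBase` setup (construct, then `initialize`) with a single vocab-aware constructor, moves KB creation behind a registered `kb_loader` callable, and makes candidate generation a pluggable `get_candidates` callable resolved from the config. Below is a minimal usage sketch of the resulting API, assuming a spaCy build that includes this patch; the registered names `demoKB.v1` and `demoLowercaseCandidates.v1` are hypothetical placeholders, not names shipped by spaCy.

```python
from typing import Callable, Iterable

from spacy.kb import KnowledgeBase, Candidate
from spacy.lang.en import English
from spacy.util import registry


# Hypothetical KB loader: spaCy calls this with the pipeline's Vocab,
# so the KB no longer needs a separate initialize() step.
@registry.assets.register("demoKB.v1")
def demo_kb() -> Callable[["Vocab"], KnowledgeBase]:
    def create_kb(vocab):
        kb = KnowledgeBase(vocab, entity_vector_length=1)
        kb.add_entity(entity="Q42", freq=12, entity_vector=[1])
        kb.add_alias(alias="douglas", entities=["Q42"], probabilities=[0.8])
        return kb

    return create_kb


# Hypothetical candidate generator: matches aliases case-insensitively
# instead of the default lookup on the verbatim span text.
@registry.assets.register("demoLowercaseCandidates.v1")
def demo_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
    return lambda kb, span: kb.get_alias_candidates(span.text.lower())


nlp = English()
nlp.add_pipe(
    "entity_linker",
    config={
        "kb_loader": {"@assets": "demoKB.v1"},
        "get_candidates": {"@assets": "demoLowercaseCandidates.v1"},
        "incl_context": False,  # skip the context encoder for this sketch
    },
)
```

Because `Language.replace_pipe` now forwards its `config` argument (see the `spacy/language.py` hunk and `test_replace_pipe_config`), an existing component can also be swapped for one with a different candidate generator without rebuilding the pipeline.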