Various fixes to NEL functionality, Example class etc (#5460)

* setting the KB in the EL constructor, similar to how the model is passed in (see the usage sketch below the file summary)

* removing Wikipedia example files - moved to projects

* throw an error when nlp.update is called with 2 positional arguments

* rewriting the config logic in create_pipe to accommodate other objects (e.g. the KB) in the config

* update config files with new parameters

* avoid training pipeline components that don't have a model (like sentencizer)

* various small fixes + UX improvements

* small fixes

* set thinc to 8.0.0a9 everywhere

* remove outdated comment
Sofie Van Landeghem 2020-05-20 11:41:12 +02:00 committed by GitHub
parent 664a3603b0
commit 7f5715a081
15 changed files with 108 additions and 46 deletions
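
The recurring API change across these files is that the knowledge base is now handed to the entity linker through its component config, just like the model, instead of via a separate set_kb() call. A minimal sketch of the new pattern, modelled on the updated tests further down (the entity ID, alias, frequency and vector values are placeholders):

import spacy
from spacy.kb import KnowledgeBase

nlp = spacy.blank("en")

# Build a tiny KB; entity ID, frequency and vector are illustrative only.
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])

# The KB travels in the component config; there is no set_kb() anymore.
entity_linker = nlp.create_pipe("entity_linker", config={"kb": kb, "incl_prior": False})
nlp.add_pipe(entity_linker, last=True)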

View File

@@ -12,6 +12,8 @@ use_gpu = 0
 scores = ["tags_acc", "uas", "las"]
 score_weights = {"las": 0.8, "tags_acc": 0.2}
 limit = 0
+seed = 0
+accumulate_gradient = 2

 [training.batch_size]
 @schedules = "compounding.v1"

View File

@@ -12,6 +12,8 @@ use_gpu = -1
 scores = ["tags_acc", "uas", "las"]
 score_weights = {"las": 0.8, "tags_acc": 0.2}
 limit = 0
+seed = 0
+accumulate_gradient = 2

 [training.batch_size]
 @schedules = "compounding.v1"

View File

@@ -10,6 +10,8 @@ orth_variant_level = 0.0
 gold_preproc = true
 max_length = 0
 batch_size = 25
+seed = 0
+accumulate_gradient = 2

 [optimizer]
 @optimizers = "Adam.v1"

View File

@@ -9,6 +9,8 @@ score_weights = {"ents_f": 1}
 orth_variant_level = 0.0
 gold_preproc = true
 max_length = 0
+seed = 0
+accumulate_gradient = 2

 [training.batch_size]
 @schedules = "compounding.v1"

View File

@@ -83,13 +83,13 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
     # Create the Entity Linker component and add it to the pipeline.
     if "entity_linker" not in nlp.pipe_names:
-        # use only the predicted EL score and not the prior probability (for demo purposes)
-        cfg = {"incl_prior": False}
-        entity_linker = nlp.create_pipe("entity_linker", cfg)
         kb = KnowledgeBase(vocab=nlp.vocab)
         kb.load_bulk(kb_path)
         print("Loaded Knowledge Base from '%s'" % kb_path)
-        entity_linker.set_kb(kb)
+
+        # use only the predicted EL score and not the prior probability (for demo purposes)
+        cfg = {"kb": kb, "incl_prior": False}
+        entity_linker = nlp.create_pipe("entity_linker", cfg)
         nlp.add_pipe(entity_linker, last=True)

     # Convert the texts to docs to make sure we have doc.ents set for the training examples.

View File

@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc==8.0.0a8",
+    "thinc==8.0.0a9",
     "blis>=0.4.0,<0.5.0"
 ]
 build-backend = "setuptools.build_meta"

View File

@@ -36,7 +36,7 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc==8.0.0a8
+    thinc==8.0.0a9
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0

View File

@@ -12,7 +12,7 @@ import random
 from ..gold import GoldCorpus
 from .. import util
+from ..errors import Errors

 registry = util.registry

@@ -233,6 +233,8 @@ def create_train_batches(nlp, corpus, cfg):
         max_length=cfg["max_length"],
         ignore_misaligned=True,
     ))
+    if len(train_examples) == 0:
+        raise ValueError(Errors.E988)
     random.shuffle(train_examples)
     batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"])
     for batch in batches:

@@ -313,12 +315,14 @@ def train_while_improving(
     dropouts = dropout
     results = []
     losses = {}
+    to_enable = [name for name, proc in nlp.pipeline if hasattr(proc, "model")]

     for step, batch in enumerate(train_data):
         dropout = next(dropouts)
-        for subbatch in subdivide_batch(batch, accumulate_gradient):
-            nlp.update(subbatch, drop=dropout, losses=losses, sgd=False)
-        for name, proc in nlp.pipeline:
-            if hasattr(proc, "model"):
+        with nlp.select_pipes(enable=to_enable):
+            for subbatch in subdivide_batch(batch, accumulate_gradient):
+                nlp.update(subbatch, drop=dropout, losses=losses, sgd=False)
+            for name, proc in nlp.pipeline:
                 proc.model.finish_update(optimizer)
         optimizer.step_schedules()
         if not (step % eval_frequency):

View File

@@ -195,7 +195,7 @@ class Errors(object):
             "the documentation:\nhttps://spacy.io/usage/models")
     E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
             "component to the pipeline with: "
-            "nlp.add_pipe(nlp.create_pipe('sentencizer')) "
+            "nlp.add_pipe(nlp.create_pipe('sentencizer')). "
             "Alternatively, add the dependency parser, or set sentence "
             "boundaries by setting doc[i].is_sent_start.")
     E031 = ("Invalid token: empty string ('') at position {i}.")

@@ -430,8 +430,7 @@ class Errors(object):
     E138 = ("Invalid JSONL format for raw text '{text}'. Make sure the input "
             "includes either the `text` or `tokens` key. For more info, see "
             "the docs:\nhttps://spacy.io/api/cli#pretrain-jsonl")
-    E139 = ("Knowledge Base for component '{name}' not initialized. Did you "
-            "forget to call set_kb()?")
+    E139 = ("Knowledge Base for component '{name}' is empty.")
     E140 = ("The list of entities, prior probabilities and entity vectors "
             "should be of equal length.")
     E141 = ("Entity vectors should be of length {required} instead of the "

@@ -548,6 +547,18 @@ class Errors(object):
             "token itself.")

     # TODO: fix numbering after merging develop into master
+    E987 = ("The text of an example training instance is either a Doc or "
+            "a string, but found {type} instead.")
+    E988 = ("Could not parse any training examples. Ensure the data is "
+            "formatted correctly.")
+    E989 = ("'nlp.update()' was called with two positional arguments. This "
+            "may be due to a backwards-incompatible change to the format "
+            "of the training data in spaCy 3.0 onwards. The 'update' "
+            "function should now be called with a batch of 'Example' "
+            "objects, instead of (text, annotation) tuples. ")
+    E990 = ("An entity linking component needs to be initialized with a "
+            "KnowledgeBase object, but found {type} instead.")
     E991 = ("The function 'select_pipes' should be called with either a "
             "'disable' argument to list the names of the pipe components "
             "that should be disabled, or with an 'enable' argument that "

@@ -562,8 +573,8 @@ class Errors(object):
     E997 = ("Tokenizer special cases are not allowed to modify the text. "
             "This would map '{chunk}' to '{orth}' given token attributes "
             "'{token_attrs}'.")
-    E998 = ("Can only create GoldParse objects from Example objects without a "
-            "Doc if get_gold_parses() is called with a Vocab object.")
+    E998 = ("To create GoldParse objects from Example objects without a "
+            "Doc, get_gold_parses() should be called with a Vocab object.")
     E999 = ("Encountered an unexpected format for the dictionary holding "
             "gold annotations: {gold_dict}")

View File

@@ -212,6 +212,8 @@ class GoldCorpus(object):
                         doc = ex_dict.get("doc", None)
                         if doc is None:
                             doc = ex_dict.get("text", None)
+                        if not (doc is None or isinstance(doc, Doc) or isinstance(doc, str)):
+                            raise ValueError(Errors.E987.format(type=type(doc)))
                         examples.append(Example.from_dict(ex_dict, doc=doc))
             elif file_name.endswith("msg"):

@@ -288,7 +290,6 @@ class GoldCorpus(object):
        """ Setting gold_preproc will result in creating a doc per sentence """
        for example in examples:
            if gold_preproc:
-               example.doc = None
                split_examples = example.split_sents()
                example_golds = []
                for split_example in split_examples:

@@ -716,6 +717,12 @@ cdef class TokenAnnotation:
     def get_sent_start(self, i):
         return self.sent_starts[i] if i < len(self.sent_starts) else None

+    def __str__(self):
+        return str(self.to_dict())
+
+    def __repr__(self):
+        return self.__str__()
+

 cdef class DocAnnotation:
     def __init__(self, cats=None, links=None):

@@ -729,6 +736,12 @@ cdef class DocAnnotation:
     def to_dict(self):
         return {"cats": self.cats, "links": self.links}

+    def __str__(self):
+        return str(self.to_dict())
+
+    def __repr__(self):
+        return self.__str__()
+

 cdef class Example:
     def __init__(self, doc_annotation=None, token_annotation=None, doc=None,

@@ -747,9 +760,9 @@ cdef class Example:
     @classmethod
     def from_dict(cls, example_dict, doc=None):
-        token_dict = example_dict["token_annotation"]
+        token_dict = example_dict.get("token_annotation", {})
         token_annotation = TokenAnnotation.from_dict(token_dict)
-        doc_dict = example_dict["doc_annotation"]
+        doc_dict = example_dict.get("doc_annotation", {})
         doc_annotation = DocAnnotation.from_dict(doc_dict)
         return cls(doc_annotation, token_annotation, doc)

@@ -791,6 +804,8 @@ cdef class Example:
     def split_sents(self):
         """ Split the token annotations into multiple Examples based on
         sent_starts and return a list of the new Examples"""
+        if not self.token_annotation.words:
+            return [self]
         s_example = Example(doc=None, doc_annotation=self.doc_annotation)
         s_ids, s_words, s_tags, s_pos, s_morphs = [], [], [], [], []
         s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], []

@@ -842,7 +857,7 @@ cdef class Example:
         if merge:
             t = self.token_annotation
             doc = self.doc
-            if self.doc is None:
+            if doc is None or not isinstance(doc, Doc):
                 if not vocab:
                     raise ValueError(Errors.E998)
                 doc = Doc(vocab, words=t.words)

@@ -1052,7 +1067,7 @@ cdef class GoldParse:
             self.sent_starts = [None] * len(doc)

             # This needs to be done before we align the words
-            if make_projective and heads is not None and deps is not None:
+            if make_projective and any(heads) and any(deps):
                 heads, deps = nonproj.projectivize(heads, deps)

             # Do many-to-one alignment for misaligned tokens.
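
A small sketch of what the Example.from_dict change above buys (hedged: it assumes the v3 development layout where Example lives in spacy.gold): missing annotation keys now fall back to empty dicts instead of raising a KeyError, and the new __str__/__repr__ methods make the annotation objects printable.

from spacy.gold import Example

# A bare dict used to fail on example_dict["token_annotation"]; with .get(..., {})
# both annotation objects are simply created empty.
eg = Example.from_dict({}, doc=None)
print(eg.token_annotation)  # the new __str__ shows the to_dict() form
print(eg.doc_annotation)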

View File

@@ -314,19 +314,20 @@ class Language(object):
         # transform the model's config to an actual Model
         factory_cfg = dict(config)
-        model_cfg = None
+
+        # check whether we have a proper model config, or load a default one
+        if "model" in factory_cfg and not isinstance(factory_cfg["model"], dict):
+            warnings.warn(Warnings.W099.format(type=type(factory_cfg["model"]), pipe=name))
+
+        # refer to the model configuration in the cfg settings for this component
         if "model" in factory_cfg:
-            model_cfg = factory_cfg["model"]
-            if not isinstance(model_cfg, dict):
-                warnings.warn(Warnings.W099.format(type=type(model_cfg), pipe=name))
-                model_cfg = None
+            self.config[name] = {"model": factory_cfg["model"]}
+
+        # create all objects in the config
+        factory_cfg = registry.make_from_config({"config": factory_cfg}, validate=True)["config"]
+        model = factory_cfg.get("model", None)
+        if model is not None:
             del factory_cfg["model"]
-        model = None
-        if model_cfg is not None:
-            self.config[name] = {"model": model_cfg}
-            model = registry.make_from_config({"model": model_cfg}, validate=True)[
-                "model"
-            ]
         return factory(self, model, **factory_cfg)

@@ -517,10 +518,11 @@ class Language(object):
     def make_doc(self, text):
         return self.tokenizer(text)

-    def update(self, examples, drop=0.0, sgd=None, losses=None, component_cfg=None):
+    def update(self, examples, dummy=None, *, drop=0.0, sgd=None, losses=None, component_cfg=None):
         """Update the models in the pipeline.

         examples (iterable): A batch of `Example` or `Doc` objects.
+        dummy: Should not be set - serves to catch backwards-incompatible scripts.
         drop (float): The dropout rate.
         sgd (callable): An optimizer.
         losses (dict): Dictionary to update with the loss, keyed by component.

@@ -529,6 +531,9 @@ class Language(object):
         DOCS: https://spacy.io/api/language#update
         """
+        if dummy is not None:
+            raise ValueError(Errors.E989)
+
         if len(examples) == 0:
             return
         examples = Example.to_example_objects(examples, make_doc=self.make_doc)

@@ -735,7 +740,7 @@ class Language(object):
         contexts = [
             pipe.use_params(params)
             for name, pipe in self.pipeline
-            if hasattr(pipe, "use_params")
+            if hasattr(pipe, "use_params") and hasattr(pipe, "model")
         ]
         # TODO: Having trouble with contextlib
         # Workaround: these aren't actually context managers atm.
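
To illustrate the new guard in Language.update(): a spaCy 2.x style call with two positional arguments now lands in the dummy parameter and raises E989, while the batch itself may be a list of Example or Doc objects (per the updated docstring). A rough sketch on a blank pipeline; the text and entity offsets are placeholders:

import spacy

nlp = spacy.blank("en")
texts = ["Sofie lives in Antwerp"]
annotations = [{"entities": [(0, 5, "PERSON")]}]

try:
    # old 2.x style: (texts, annotations) as two positional arguments
    nlp.update(texts, annotations)
except ValueError as err:
    print(err)  # E989: update() now expects a batch of Example objects

# new style: a single batch of Example/Doc objects, everything else keyword-only
nlp.update([nlp("Sofie lives in Antwerp")], losses={})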

View File

@@ -1,7 +1,11 @@
+from pathlib import Path
+
 from thinc.api import chain, clone, list2ragged, reduce_mean, residual
 from thinc.api import Model, Maxout, Linear

 from ...util import registry
+from ...kb import KnowledgeBase
+from ...vocab import Vocab


 @registry.architectures.register("spacy.EntityLinker.v1")

@@ -19,3 +23,11 @@ def build_nel_encoder(tok2vec, nO=None):
     model.set_ref("output_layer", output_layer)
     model.set_ref("tok2vec", tok2vec)
     return model
+
+
+@registry.assets.register("spacy.KBFromFile.v1")
+def load_kb(nlp_path, kb_path) -> KnowledgeBase:
+    vocab = Vocab().from_disk(Path(nlp_path) / "vocab")
+    kb = KnowledgeBase(vocab=vocab)
+    kb.load_bulk(kb_path)
+    return kb
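
For context, the registered asset can also be resolved through the registry by name; this is what registry.make_from_config() does when a component config refers to "spacy.KBFromFile.v1". A hypothetical sketch: the module path is inferred from the relative imports above, and both paths are placeholders that would need to point at a real pipeline directory and KB dump.

from spacy.util import registry
from spacy.ml.models import entity_linker  # noqa: F401 - assumed module path; importing it registers the asset

kb_loader = registry.assets.get("spacy.KBFromFile.v1")
kb = kb_loader(nlp_path="/path/to/trained_pipeline", kb_path="/path/to/kb")
print(kb.get_size_entities())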

View File

@@ -2,6 +2,7 @@
 import numpy
 import srsly
 import random
+from ast import literal_eval

 from thinc.api import CosineDistance, to_categorical, get_array_module
 from thinc.api import set_dropout_rate, SequenceCategoricalCrossentropy

@@ -1244,15 +1245,20 @@ class EntityLinker(Pipe):
         self.vocab = vocab
         self.model = model
         self.kb = None
+        self.kb = cfg.get("kb", None)
+        if self.kb is None:
+            # create an empty KB that should be filled by calling from_disk
+            self.kb = KnowledgeBase(vocab=vocab)
+        else:
+            del cfg["kb"]  # we don't want to duplicate its serialization
+        if not isinstance(self.kb, KnowledgeBase):
+            raise ValueError(Errors.E990.format(type=type(self.kb)))
         self.cfg = dict(cfg)
         self.distance = CosineDistance(normalize=False)

-    def set_kb(self, kb):
-        self.kb = kb
-
     def require_kb(self):
         # Raise an error if the knowledge base is not initialized.
-        if getattr(self, "kb", None) in (None, True, False):
+        if len(self.kb) == 0:
             raise ValueError(Errors.E139.format(name=self.name))

     def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):

@@ -1285,6 +1291,8 @@ class EntityLinker(Pipe):
                 ents_by_offset[(ent.start_char, ent.end_char)] = ent

             for entity, kb_dict in gold.links.items():
+                if isinstance(entity, str):
+                    entity = literal_eval(entity)
                 start, end = entity
                 mention = doc.text[start:end]

@@ -1375,7 +1383,6 @@ class EntityLinker(Pipe):
     def predict(self, docs):
         """ Return the KB IDs for each entity in each doc, including NIL if there is no prediction """
         self.require_kb()
-
         entity_count = 0
         final_kb_ids = []
         final_tensors = []

@@ -1486,9 +1493,8 @@ class EntityLinker(Pipe):
             raise ValueError(Errors.E149)

         def load_kb(p):
-            kb = KnowledgeBase(vocab=self.vocab, entity_vector_length=self.cfg["entity_width"])
-            kb.load_bulk(p)
-            self.set_kb(kb)
+            self.kb = KnowledgeBase(vocab=self.vocab, entity_vector_length=self.cfg["entity_width"])
+            self.kb.load_bulk(p)

         deserialize = {}
         deserialize["vocab"] = lambda p: self.vocab.from_disk(p)

View File

@@ -203,8 +203,8 @@ def test_preserving_links_asdoc(nlp):
     ruler.add_patterns(patterns)
     nlp.add_pipe(ruler)

-    el_pipe = nlp.create_pipe(name="entity_linker")
-    el_pipe.set_kb(mykb)
+    cfg = {"kb": mykb, "incl_prior": False}
+    el_pipe = nlp.create_pipe(name="entity_linker", config=cfg)
     el_pipe.begin_training()
     el_pipe.incl_context = False
     el_pipe.incl_prior = True

@@ -288,8 +288,7 @@ def test_overfitting_IO():
     mykb.add_alias(alias="Russ Cochran", entities=["Q2146908", "Q7381115"], probabilities=[0.5, 0.5])

     # Create the Entity Linker component and add it to the pipeline
-    entity_linker = nlp.create_pipe("entity_linker")
-    entity_linker.set_kb(mykb)
+    entity_linker = nlp.create_pipe("entity_linker", config={"kb": mykb})
     nlp.add_pipe(entity_linker, last=True)

     # train the NEL pipe
# train the NEL pipe # train the NEL pipe

View File

@@ -34,6 +34,7 @@ class registry(thinc.registry):
     lookups = catalogue.create("spacy", "lookups", entry_points=True)
     factories = catalogue.create("spacy", "factories", entry_points=True)
     displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True)
+    assets = catalogue.create("spacy", "assets", entry_points=True)


 def set_env_log(value):

@@ -160,6 +161,7 @@ def load_model_from_path(model_path, meta=False, **overrides):
     for name in pipeline:
         if name not in disable:
             config = meta.get("pipeline_args", {}).get(name, {})
+            config.update(overrides)
             factory = factories.get(name, name)
             if nlp_config.get(name, None):
                 model_config = nlp_config[name]["model"]