From 7f5715a08159c06c249c3efe4d8934df2c98544d Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Wed, 20 May 2020 11:41:12 +0200
Subject: [PATCH] Various fixes to NEL functionality, Example class etc (#5460)

* setting KB in the EL constructor, similar to how the model is passed on

* removing wikipedia example files - moved to projects

* throw an error when nlp.update is called with 2 positional arguments

* rewriting the config logic in create_pipe to accommodate other objects (e.g. KB) in the config

* update config files with new parameters

* avoid training pipeline components that don't have a model (like sentencizer)

* various small fixes + UX improvements

* small fixes

* set thinc to 8.0.0a9 everywhere

* remove outdated comment
---
 .../ptb-joint-pos-dep/bilstm_tok2vec.cfg    |  2 ++
 .../ptb-joint-pos-dep/defaults.cfg          |  2 ++
 .../tok2vec-ner/charembed_tok2vec.cfg       |  2 ++
 .../tok2vec-ner/multihashembed_tok2vec.cfg  |  2 ++
 examples/training/train_entity_linker.py    |  8 ++---
 pyproject.toml                              |  2 +-
 setup.cfg                                   |  2 +-
 spacy/cli/train_from_config.py              | 14 ++++++---
 spacy/errors.py                             | 21 ++++++++++---
 spacy/gold.pyx                              | 25 ++++++++++++---
 spacy/language.py                           | 31 +++++++++++--------
 spacy/ml/models/entity_linker.py            | 12 +++++++
 spacy/pipeline/pipes.pyx                    | 22 ++++++++-----
 spacy/tests/pipeline/test_entity_linker.py  |  7 ++---
 spacy/util.py                               |  2 ++
 15 files changed, 108 insertions(+), 46 deletions(-)

diff --git a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
index b6b4e82b6..e152fa5e0 100644
--- a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
+++ b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
@@ -12,6 +12,8 @@ use_gpu = 0
 scores = ["tags_acc", "uas", "las"]
 score_weights = {"las": 0.8, "tags_acc": 0.2}
 limit = 0
+seed = 0
+accumulate_gradient = 2
 
 [training.batch_size]
 @schedules = "compounding.v1"
diff --git a/examples/experiments/ptb-joint-pos-dep/defaults.cfg b/examples/experiments/ptb-joint-pos-dep/defaults.cfg
index 2ceaab0be..9a10c45f0 100644
--- a/examples/experiments/ptb-joint-pos-dep/defaults.cfg
+++ b/examples/experiments/ptb-joint-pos-dep/defaults.cfg
@@ -12,6 +12,8 @@ use_gpu = -1
 scores = ["tags_acc", "uas", "las"]
 score_weights = {"las": 0.8, "tags_acc": 0.2}
 limit = 0
+seed = 0
+accumulate_gradient = 2
 
 [training.batch_size]
 @schedules = "compounding.v1"
diff --git a/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg b/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg
index b8219ad10..796c8670f 100644
--- a/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg
+++ b/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg
@@ -10,6 +10,8 @@ orth_variant_level = 0.0
 gold_preproc = true
 max_length = 0
 batch_size = 25
+seed = 0
+accumulate_gradient = 2
 
 [optimizer]
 @optimizers = "Adam.v1"
diff --git a/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg b/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg
index dc25a1c3b..3ac70675b 100644
--- a/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg
+++ b/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg
@@ -9,6 +9,8 @@ score_weights = {"ents_f": 1}
 orth_variant_level = 0.0
 gold_preproc = true
 max_length = 0
+seed = 0
+accumulate_gradient = 2
 
 [training.batch_size]
 @schedules = "compounding.v1"
diff --git a/examples/training/train_entity_linker.py b/examples/training/train_entity_linker.py
index a22f255e7..2da1db26d 100644
--- a/examples/training/train_entity_linker.py
+++ b/examples/training/train_entity_linker.py
@@ -83,13 +83,13 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
 
     # Create the Entity Linker component and add it to the pipeline.
     if "entity_linker" not in nlp.pipe_names:
-        # use only the predicted EL score and not the prior probability (for demo purposes)
-        cfg = {"incl_prior": False}
-        entity_linker = nlp.create_pipe("entity_linker", cfg)
         kb = KnowledgeBase(vocab=nlp.vocab)
         kb.load_bulk(kb_path)
         print("Loaded Knowledge Base from '%s'" % kb_path)
-        entity_linker.set_kb(kb)
+
+        # use only the predicted EL score and not the prior probability (for demo purposes)
+        cfg = {"kb": kb, "incl_prior": False}
+        entity_linker = nlp.create_pipe("entity_linker", cfg)
         nlp.add_pipe(entity_linker, last=True)
 
     # Convert the texts to docs to make sure we have doc.ents set for the training examples.
diff --git a/pyproject.toml b/pyproject.toml
index 548664e89..66a06c1d9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc==8.0.0a8",
+    "thinc==8.0.0a9",
     "blis>=0.4.0,<0.5.0"
 ]
 build-backend = "setuptools.build_meta"
diff --git a/setup.cfg b/setup.cfg
index df1658fd0..1cd088279 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -36,7 +36,7 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc==8.0.0a8
+    thinc==8.0.0a9
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py
index bd83deb04..96c5b676e 100644
--- a/spacy/cli/train_from_config.py
+++ b/spacy/cli/train_from_config.py
@@ -12,7 +12,7 @@ import random
 
 from ..gold import GoldCorpus
 from .. import util
-
+from ..errors import Errors
 
 registry = util.registry
 
@@ -233,6 +233,8 @@ def create_train_batches(nlp, corpus, cfg):
             max_length=cfg["max_length"],
             ignore_misaligned=True,
         ))
+    if len(train_examples) == 0:
+        raise ValueError(Errors.E988)
     random.shuffle(train_examples)
     batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"])
     for batch in batches:
@@ -313,12 +315,14 @@ def train_while_improving(
     dropouts = dropout
     results = []
     losses = {}
+    to_enable = [name for name, proc in nlp.pipeline if hasattr(proc, "model")]
+
     for step, batch in enumerate(train_data):
         dropout = next(dropouts)
-        for subbatch in subdivide_batch(batch, accumulate_gradient):
-            nlp.update(subbatch, drop=dropout, losses=losses, sgd=False)
-        for name, proc in nlp.pipeline:
-            if hasattr(proc, "model"):
+        with nlp.select_pipes(enable=to_enable):
+            for subbatch in subdivide_batch(batch, accumulate_gradient):
+                nlp.update(subbatch, drop=dropout, losses=losses, sgd=False)
+            for name, proc in nlp.pipeline:
                 proc.model.finish_update(optimizer)
         optimizer.step_schedules()
         if not (step % eval_frequency):
diff --git a/spacy/errors.py b/spacy/errors.py
index 7a7b44731..4d38ab586 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -195,7 +195,7 @@ class Errors(object):
             "the documentation:\nhttps://spacy.io/usage/models")
     E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
             "component to the pipeline with: "
-            "nlp.add_pipe(nlp.create_pipe('sentencizer')) "
+            "nlp.add_pipe(nlp.create_pipe('sentencizer')). "
             "Alternatively, add the dependency parser, or set sentence "
             "boundaries by setting doc[i].is_sent_start.")
     E031 = ("Invalid token: empty string ('') at position {i}.")
@@ -430,8 +430,7 @@ class Errors(object):
     E138 = ("Invalid JSONL format for raw text '{text}'. Make sure the input "
             "includes either the `text` or `tokens` key. For more info, see "
             "the docs:\nhttps://spacy.io/api/cli#pretrain-jsonl")
-    E139 = ("Knowledge Base for component '{name}' not initialized. Did you "
-            "forget to call set_kb()?")
+    E139 = ("Knowledge Base for component '{name}' is empty.")
     E140 = ("The list of entities, prior probabilities and entity vectors "
             "should be of equal length.")
     E141 = ("Entity vectors should be of length {required} instead of the "
@@ -548,6 +547,18 @@ class Errors(object):
             "token itself.")
 
     # TODO: fix numbering after merging develop into master
+
+    E987 = ("The text of an example training instance must be either a Doc "
+            "or a string, but found {type} instead.")
+    E988 = ("Could not parse any training examples. Ensure the data is "
+            "formatted correctly.")
+    E989 = ("'nlp.update()' was called with two positional arguments. This "
+            "may be due to a backwards-incompatible change to the format "
+            "of the training data from spaCy 3.0 onwards. The 'update' "
+            "function should now be called with a batch of 'Example' "
+            "objects, instead of (text, annotation) tuples.")
+    E990 = ("An entity linking component needs to be initialized with a "
+            "KnowledgeBase object, but found {type} instead.")
     E991 = ("The function 'select_pipes' should be called with either a "
             "'disable' argument to list the names of the pipe components "
             "that should be disabled, or with an 'enable' argument that "
@@ -562,8 +573,8 @@ class Errors(object):
     E997 = ("Tokenizer special cases are not allowed to modify the text. "
             "This would map '{chunk}' to '{orth}' given token attributes "
             "'{token_attrs}'.")
-    E998 = ("Can only create GoldParse objects from Example objects without a "
-            "Doc if get_gold_parses() is called with a Vocab object.")
+    E998 = ("To create GoldParse objects from Example objects without a "
+            "Doc, get_gold_parses() should be called with a Vocab object.")
     E999 = ("Encountered an unexpected format for the dictionary holding "
             "gold annotations: {gold_dict}")
 
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 6647e41b4..46a6ae583 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -212,6 +212,8 @@ class GoldCorpus(object):
                     doc = ex_dict.get("doc", None)
                     if doc is None:
                         doc = ex_dict.get("text", None)
+                    if not (doc is None or isinstance(doc, Doc) or isinstance(doc, str)):
+                        raise ValueError(Errors.E987.format(type=type(doc)))
                     examples.append(Example.from_dict(ex_dict, doc=doc))
 
         elif file_name.endswith("msg"):
@@ -288,7 +290,6 @@ class GoldCorpus(object):
         """ Setting gold_preproc will result in creating a doc per sentence """
         for example in examples:
             if gold_preproc:
-                example.doc = None
                 split_examples = example.split_sents()
                 example_golds = []
                 for split_example in split_examples:
@@ -716,6 +717,12 @@ cdef class TokenAnnotation:
     def get_sent_start(self, i):
         return self.sent_starts[i] if i < len(self.sent_starts) else None
 
+    def __str__(self):
+        return str(self.to_dict())
+
+    def __repr__(self):
+        return self.__str__()
+
 
 cdef class DocAnnotation:
     def __init__(self, cats=None, links=None):
@@ -729,6 +736,12 @@ cdef class DocAnnotation:
     def to_dict(self):
         return {"cats": self.cats, "links": self.links}
 
+    def __str__(self):
+        return str(self.to_dict())
+
+    def __repr__(self):
+        return self.__str__()
+
 
 cdef class Example:
     def __init__(self, doc_annotation=None, token_annotation=None, doc=None,
@@ -747,9 +760,9 @@ cdef class Example:
 
     @classmethod
     def from_dict(cls, example_dict, doc=None):
-        token_dict = example_dict["token_annotation"]
+        token_dict = example_dict.get("token_annotation", {})
         token_annotation = TokenAnnotation.from_dict(token_dict)
-        doc_dict = example_dict["doc_annotation"]
+        doc_dict = example_dict.get("doc_annotation", {})
         doc_annotation = DocAnnotation.from_dict(doc_dict)
         return cls(doc_annotation, token_annotation, doc)
 
@@ -791,6 +804,8 @@ cdef class Example:
     def split_sents(self):
         """ Split the token annotations into multiple Examples based on
         sent_starts and return a list of the new Examples"""
+        if not self.token_annotation.words:
+            return [self]
         s_example = Example(doc=None, doc_annotation=self.doc_annotation)
         s_ids, s_words, s_tags, s_pos, s_morphs = [], [], [], [], []
         s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], []
@@ -842,7 +857,7 @@ cdef class Example:
         if merge:
             t = self.token_annotation
             doc = self.doc
-            if self.doc is None:
+            if doc is None or not isinstance(doc, Doc):
                 if not vocab:
                     raise ValueError(Errors.E998)
                 doc = Doc(vocab, words=t.words)
@@ -1052,7 +1067,7 @@ cdef class GoldParse:
             self.sent_starts = [None] * len(doc)
 
             # This needs to be done before we align the words
-            if make_projective and heads is not None and deps is not None:
+            if make_projective and any(heads) and any(deps):
                 heads, deps = nonproj.projectivize(heads, deps)
 
             # Do many-to-one alignment for misaligned tokens.
diff --git a/spacy/language.py b/spacy/language.py
index 2b8fa129e..d71c27406 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -314,19 +314,20 @@ class Language(object):
 
         # transform the model's config to an actual Model
         factory_cfg = dict(config)
-        model_cfg = None
+
+        # check whether we have a proper model config, or load a default one
+        if "model" in factory_cfg and not isinstance(factory_cfg["model"], dict):
+            warnings.warn(Warnings.W099.format(type=type(factory_cfg["model"]), pipe=name))
+
+        # refer to the model configuration in the cfg settings for this component
         if "model" in factory_cfg:
-            model_cfg = factory_cfg["model"]
-            if not isinstance(model_cfg, dict):
-                warnings.warn(Warnings.W099.format(type=type(model_cfg), pipe=name))
-                model_cfg = None
+            self.config[name] = {"model": factory_cfg["model"]}
+
+        # create all objects in the config
+        factory_cfg = registry.make_from_config({"config": factory_cfg}, validate=True)["config"]
+        model = factory_cfg.get("model", None)
+        if model is not None:
             del factory_cfg["model"]
-        model = None
-        if model_cfg is not None:
-            self.config[name] = {"model": model_cfg}
-            model = registry.make_from_config({"model": model_cfg}, validate=True)[
-                "model"
-            ]
         return factory(self, model, **factory_cfg)
 
     def add_pipe(
@@ -517,10 +518,11 @@ class Language(object):
     def make_doc(self, text):
         return self.tokenizer(text)
 
-    def update(self, examples, drop=0.0, sgd=None, losses=None, component_cfg=None):
+    def update(self, examples, dummy=None, *, drop=0.0, sgd=None, losses=None, component_cfg=None):
         """Update the models in the pipeline.
 
         examples (iterable): A batch of `Example` or `Doc` objects.
+        dummy: Should not be set; serves to catch backwards-incompatible scripts.
         drop (float): The dropout rate.
         sgd (callable): An optimizer.
         losses (dict): Dictionary to update with the loss, keyed by component.
@@ -529,6 +531,9 @@ class Language(object):
 
         DOCS: https://spacy.io/api/language#update
         """
+        if dummy is not None:
+            raise ValueError(Errors.E989)
+
         if len(examples) == 0:
             return
         examples = Example.to_example_objects(examples, make_doc=self.make_doc)
@@ -735,7 +740,7 @@ class Language(object):
             contexts = [
                 pipe.use_params(params)
                 for name, pipe in self.pipeline
-                if hasattr(pipe, "use_params")
+                if hasattr(pipe, "use_params") and hasattr(pipe, "model")
             ]
             # TODO: Having trouble with contextlib
             # Workaround: these aren't actually context managers atm.
diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py
index 9cbaba984..00689e85b 100644
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@@ -1,7 +1,11 @@
+from pathlib import Path
+
 from thinc.api import chain, clone, list2ragged, reduce_mean, residual
 from thinc.api import Model, Maxout, Linear
 
 from ...util import registry
+from ...kb import KnowledgeBase
+from ...vocab import Vocab
 
 
 @registry.architectures.register("spacy.EntityLinker.v1")
@@ -19,3 +23,11 @@ def build_nel_encoder(tok2vec, nO=None):
         model.set_ref("output_layer", output_layer)
         model.set_ref("tok2vec", tok2vec)
     return model
+
+
+@registry.assets.register("spacy.KBFromFile.v1")
+def load_kb(nlp_path, kb_path) -> KnowledgeBase:
+    vocab = Vocab().from_disk(Path(nlp_path) / "vocab")
+    kb = KnowledgeBase(vocab=vocab)
+    kb.load_bulk(kb_path)
+    return kb
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 4ff956e1d..56fe54664 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -2,6 +2,7 @@
 import numpy
 import srsly
 import random
+from ast import literal_eval
 
 from thinc.api import CosineDistance, to_categorical, get_array_module
 from thinc.api import set_dropout_rate, SequenceCategoricalCrossentropy
@@ -1244,15 +1245,20 @@ class EntityLinker(Pipe):
         self.vocab = vocab
         self.model = model
         self.kb = None
+        self.kb = cfg.get("kb", None)
+        if self.kb is None:
+            # create an empty KB that should be filled by calling from_disk
+            self.kb = KnowledgeBase(vocab=vocab)
+        else:
+            del cfg["kb"]  # we don't want to duplicate its serialization
+        if not isinstance(self.kb, KnowledgeBase):
+            raise ValueError(Errors.E990.format(type=type(self.kb)))
         self.cfg = dict(cfg)
         self.distance = CosineDistance(normalize=False)
 
-    def set_kb(self, kb):
-        self.kb = kb
-
     def require_kb(self):
         # Raise an error if the knowledge base is not initialized.
-        if getattr(self, "kb", None) in (None, True, False):
+        if len(self.kb) == 0:
             raise ValueError(Errors.E139.format(name=self.name))
 
     def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
@@ -1285,6 +1291,8 @@ class EntityLinker(Pipe):
                         ents_by_offset[(ent.start_char, ent.end_char)] = ent
 
                     for entity, kb_dict in gold.links.items():
+                        if isinstance(entity, str):
+                            entity = literal_eval(entity)
                         start, end = entity
                         mention = doc.text[start:end]
 
@@ -1375,7 +1383,6 @@ class EntityLinker(Pipe):
     def predict(self, docs):
         """ Return the KB IDs for each entity in each doc, including NIL if there is no prediction """
         self.require_kb()
-
         entity_count = 0
         final_kb_ids = []
         final_tensors = []
@@ -1486,9 +1493,8 @@ class EntityLinker(Pipe):
             raise ValueError(Errors.E149)
 
         def load_kb(p):
-            kb = KnowledgeBase(vocab=self.vocab, entity_vector_length=self.cfg["entity_width"])
-            kb.load_bulk(p)
-            self.set_kb(kb)
+            self.kb = KnowledgeBase(vocab=self.vocab, entity_vector_length=self.cfg["entity_width"])
+            self.kb.load_bulk(p)
 
         deserialize = {}
         deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index cdd8451fd..32b434e04 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -203,8 +203,8 @@ def test_preserving_links_asdoc(nlp):
     ruler.add_patterns(patterns)
     nlp.add_pipe(ruler)
 
-    el_pipe = nlp.create_pipe(name="entity_linker")
-    el_pipe.set_kb(mykb)
+    cfg = {"kb": mykb, "incl_prior": False}
+    el_pipe = nlp.create_pipe(name="entity_linker", config=cfg)
     el_pipe.begin_training()
     el_pipe.incl_context = False
     el_pipe.incl_prior = True
@@ -288,8 +288,7 @@ def test_overfitting_IO():
     mykb.add_alias(alias="Russ Cochran", entities=["Q2146908", "Q7381115"], probabilities=[0.5, 0.5])
 
     # Create the Entity Linker component and add it to the pipeline
-    entity_linker = nlp.create_pipe("entity_linker")
-    entity_linker.set_kb(mykb)
+    entity_linker = nlp.create_pipe("entity_linker", config={"kb": mykb})
    nlp.add_pipe(entity_linker, last=True)
 
     # train the NEL pipe
diff --git a/spacy/util.py b/spacy/util.py
index 048d923ee..f39813694 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -34,6 +34,7 @@ class registry(thinc.registry):
     lookups = catalogue.create("spacy", "lookups", entry_points=True)
     factories = catalogue.create("spacy", "factories", entry_points=True)
     displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True)
+    assets = catalogue.create("spacy", "assets", entry_points=True)
 
 
 def set_env_log(value):
@@ -160,6 +161,7 @@ def load_model_from_path(model_path, meta=False, **overrides):
     for name in pipeline:
         if name not in disable:
             config = meta.get("pipeline_args", {}).get(name, {})
+            config.update(overrides)
             factory = factories.get(name, name)
             if nlp_config.get(name, None):
                 model_config = nlp_config[name]["model"]
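
Usage note for reviewers: with this change the KB travels through the component config instead of a separate set_kb() call. A minimal sketch of the new construction path against this development snapshot, mirroring test_overfitting_IO above (the blank "en" pipeline and the entity_vector_length=3 toy values are illustrative, not part of the patch):

    import spacy
    from spacy.kb import KnowledgeBase

    nlp = spacy.blank("en")

    # Build a toy KB; entity_vector_length=3 is an arbitrary illustrative value.
    mykb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
    mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
    mykb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])

    # New style: the KB is passed in the config dict; set_kb() no longer exists.
    entity_linker = nlp.create_pipe("entity_linker", config={"kb": mykb})
    nlp.add_pipe(entity_linker, last=True)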
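The dummy slot in Language.update exists purely to trap old two-positional-argument calls with E989; everything after the batch is now keyword-only, and the batch itself holds Example or Doc objects. A sketch of the updated calling convention (the blank English pipeline, toy doc, and dropout value are illustrative):

    from spacy.lang.en import English

    nlp = English()
    optimizer = nlp.begin_training()

    # One batch of Example or Doc objects, then keyword arguments only.
    # The old nlp.update(texts, annotations, ...) form now raises E989.
    docs = [nlp.make_doc("Some training text.")]
    losses = {}
    nlp.update(docs, drop=0.2, sgd=optimizer, losses=losses)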
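The new registry.assets table is where "spacy.KBFromFile.v1" (added in spacy/ml/models/entity_linker.py) is registered, so a KB can be declared in a config and materialized at load time. A hypothetical direct lookup; both paths are placeholders, with nlp_path pointing at a saved pipeline directory (containing a "vocab" subdirectory) and kb_path at a KB file written by kb.dump():

    from spacy.util import registry

    # Resolve the registered asset function and load a serialized KB from disk.
    kb_loader = registry.assets.get("spacy.KBFromFile.v1")
    kb = kb_loader(nlp_path="/path/to/nlp", kb_path="/path/to/kb")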