From 7f5715a08159c06c249c3efe4d8934df2c98544d Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Wed, 20 May 2020 11:41:12 +0200
Subject: [PATCH] Various fixes to NEL functionality, Example class etc (#5460)

* setting KB in the EL constructor, similar to how the model is passed on

* removing wikipedia example files - moved to projects

* throw an error when nlp.update is called with 2 positional arguments

* rewriting the config logic in create_pipe to accommodate other objects (e.g. KB) in the config

* update config files with new parameters

* avoid training pipeline components that don't have a model (like sentencizer)

* various small fixes + UX improvements

* small fixes

* set thinc to 8.0.0a9 everywhere

* remove outdated comment
---
 .../ptb-joint-pos-dep/bilstm_tok2vec.cfg    |  2 ++
 .../ptb-joint-pos-dep/defaults.cfg          |  2 ++
 .../tok2vec-ner/charembed_tok2vec.cfg       |  2 ++
 .../tok2vec-ner/multihashembed_tok2vec.cfg  |  2 ++
 examples/training/train_entity_linker.py    |  8 ++---
 pyproject.toml                              |  2 +-
 setup.cfg                                   |  2 +-
 spacy/cli/train_from_config.py              | 14 ++++++---
 spacy/errors.py                             | 21 ++++++++++---
 spacy/gold.pyx                              | 25 ++++++++++++---
 spacy/language.py                           | 31 +++++++++++--------
 spacy/ml/models/entity_linker.py            | 12 +++++++
 spacy/pipeline/pipes.pyx                    | 22 ++++++++-----
 spacy/tests/pipeline/test_entity_linker.py  |  7 ++---
 spacy/util.py                               |  2 ++
 15 files changed, 108 insertions(+), 46 deletions(-)

diff --git a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
index b6b4e82b6..e152fa5e0 100644
--- a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
+++ b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
@@ -12,6 +12,8 @@ use_gpu = 0
 scores = ["tags_acc", "uas", "las"]
 score_weights = {"las": 0.8, "tags_acc": 0.2}
 limit = 0
+seed = 0
+accumulate_gradient = 2
 
 [training.batch_size]
 @schedules = "compounding.v1"
diff --git a/examples/experiments/ptb-joint-pos-dep/defaults.cfg b/examples/experiments/ptb-joint-pos-dep/defaults.cfg
index 2ceaab0be..9a10c45f0 100644
--- a/examples/experiments/ptb-joint-pos-dep/defaults.cfg
+++ b/examples/experiments/ptb-joint-pos-dep/defaults.cfg
@@ -12,6 +12,8 @@ use_gpu = -1
 scores = ["tags_acc", "uas", "las"]
 score_weights = {"las": 0.8, "tags_acc": 0.2}
 limit = 0
+seed = 0
+accumulate_gradient = 2
 
 [training.batch_size]
 @schedules = "compounding.v1"
diff --git a/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg b/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg
index b8219ad10..796c8670f 100644
--- a/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg
+++ b/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg
@@ -10,6 +10,8 @@ orth_variant_level = 0.0
 gold_preproc = true
 max_length = 0
 batch_size = 25
+seed = 0
+accumulate_gradient = 2
 
 [optimizer]
 @optimizers = "Adam.v1"
diff --git a/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg b/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg
index dc25a1c3b..3ac70675b 100644
--- a/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg
+++ b/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg
@@ -9,6 +9,8 @@ score_weights = {"ents_f": 1}
 orth_variant_level = 0.0
 gold_preproc = true
 max_length = 0
+seed = 0
+accumulate_gradient = 2
 
 [training.batch_size]
 @schedules = "compounding.v1"
diff --git a/examples/training/train_entity_linker.py b/examples/training/train_entity_linker.py
index a22f255e7..2da1db26d 100644
--- a/examples/training/train_entity_linker.py
+++ b/examples/training/train_entity_linker.py
@@ -83,13 +83,13 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
 
     # Create the Entity Linker component and add it to the pipeline.
     if "entity_linker" not in nlp.pipe_names:
-        # use only the predicted EL score and not the prior probability (for demo purposes)
-        cfg = {"incl_prior": False}
-        entity_linker = nlp.create_pipe("entity_linker", cfg)
         kb = KnowledgeBase(vocab=nlp.vocab)
         kb.load_bulk(kb_path)
         print("Loaded Knowledge Base from '%s'" % kb_path)
-        entity_linker.set_kb(kb)
+
+        # use only the predicted EL score and not the prior probability (for demo purposes)
+        cfg = {"kb": kb, "incl_prior": False}
+        entity_linker = nlp.create_pipe("entity_linker", cfg)
         nlp.add_pipe(entity_linker, last=True)
 
     # Convert the texts to docs to make sure we have doc.ents set for the training examples.
diff --git a/pyproject.toml b/pyproject.toml
index 548664e89..66a06c1d9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc==8.0.0a8",
+    "thinc==8.0.0a9",
     "blis>=0.4.0,<0.5.0"
 ]
 build-backend = "setuptools.build_meta"
diff --git a/setup.cfg b/setup.cfg
index df1658fd0..1cd088279 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -36,7 +36,7 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc==8.0.0a8
+    thinc==8.0.0a9
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py
index bd83deb04..96c5b676e 100644
--- a/spacy/cli/train_from_config.py
+++ b/spacy/cli/train_from_config.py
@@ -12,7 +12,7 @@ import random
 
 from ..gold import GoldCorpus
 from .. import util
-
+from ..errors import Errors
 
 registry = util.registry
 
@@ -233,6 +233,8 @@ def create_train_batches(nlp, corpus, cfg):
             max_length=cfg["max_length"],
             ignore_misaligned=True,
         ))
+    if len(train_examples) == 0:
+        raise ValueError(Errors.E988)
     random.shuffle(train_examples)
     batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"])
     for batch in batches:
@@ -313,12 +315,14 @@ def train_while_improving(
     dropouts = dropout
     results = []
     losses = {}
+    to_enable = [name for name, proc in nlp.pipeline if hasattr(proc, "model")]
+
     for step, batch in enumerate(train_data):
         dropout = next(dropouts)
-        for subbatch in subdivide_batch(batch, accumulate_gradient):
-            nlp.update(subbatch, drop=dropout, losses=losses, sgd=False)
-        for name, proc in nlp.pipeline:
-            if hasattr(proc, "model"):
+        with nlp.select_pipes(enable=to_enable):
+            for subbatch in subdivide_batch(batch, accumulate_gradient):
+                nlp.update(subbatch, drop=dropout, losses=losses, sgd=False)
+            for name, proc in nlp.pipeline:
                 proc.model.finish_update(optimizer)
         optimizer.step_schedules()
         if not (step % eval_frequency):
diff --git a/spacy/errors.py b/spacy/errors.py
index 7a7b44731..4d38ab586 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -195,7 +195,7 @@ class Errors(object):
             "the documentation:\nhttps://spacy.io/usage/models")
     E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
             "component to the pipeline with: "
-            "nlp.add_pipe(nlp.create_pipe('sentencizer')) "
+            "nlp.add_pipe(nlp.create_pipe('sentencizer')). "
             "Alternatively, add the dependency parser, or set sentence "
             "boundaries by setting doc[i].is_sent_start.")
     E031 = ("Invalid token: empty string ('') at position {i}.")
@@ -430,8 +430,7 @@ class Errors(object):
     E138 = ("Invalid JSONL format for raw text '{text}'. Make sure the input "
             "includes either the `text` or `tokens` key. For more info, see "
             "the docs:\nhttps://spacy.io/api/cli#pretrain-jsonl")
-    E139 = ("Knowledge Base for component '{name}' not initialized. Did you "
-            "forget to call set_kb()?")
+    E139 = ("Knowledge Base for component '{name}' is empty.")
     E140 = ("The list of entities, prior probabilities and entity vectors "
             "should be of equal length.")
     E141 = ("Entity vectors should be of length {required} instead of the "
@@ -548,6 +547,18 @@ class Errors(object):
             "token itself.")
 
     # TODO: fix numbering after merging develop into master
+
+    E987 = ("The text of an example training instance must be either a Doc "
+            "or a string, but found {type} instead.")
+    E988 = ("Could not parse any training examples. Ensure the data is "
+            "formatted correctly.")
+    E989 = ("'nlp.update()' was called with two positional arguments. This "
+            "may be due to a backwards-incompatible change to the format "
+            "of the training data from spaCy 3.0 onwards. The 'update' "
+            "function should now be called with a batch of 'Example' "
+            "objects, instead of (text, annotation) tuples.")
+    E990 = ("An entity linking component needs to be initialized with a "
+            "KnowledgeBase object, but found {type} instead.")
     E991 = ("The function 'select_pipes' should be called with either a "
             "'disable' argument to list the names of the pipe components "
             "that should be disabled, or with an 'enable' argument that "
@@ -562,8 +573,8 @@ class Errors(object):
     E997 = ("Tokenizer special cases are not allowed to modify the text. "
             "This would map '{chunk}' to '{orth}' given token attributes "
             "'{token_attrs}'.")
-    E998 = ("Can only create GoldParse objects from Example objects without a "
-            "Doc if get_gold_parses() is called with a Vocab object.")
+    E998 = ("To create GoldParse objects from Example objects without a "
+            "Doc, get_gold_parses() should be called with a Vocab object.")
     E999 = ("Encountered an unexpected format for the dictionary holding "
             "gold annotations: {gold_dict}")
 
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 6647e41b4..46a6ae583 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -212,6 +212,8 @@ class GoldCorpus(object):
                     doc = ex_dict.get("doc", None)
                     if doc is None:
                         doc = ex_dict.get("text", None)
+                    if not (doc is None or isinstance(doc, Doc) or isinstance(doc, str)):
+                        raise ValueError(Errors.E987.format(type=type(doc)))
                     examples.append(Example.from_dict(ex_dict, doc=doc))
 
         elif file_name.endswith("msg"):
@@ -288,7 +290,6 @@ class GoldCorpus(object):
         """ Setting gold_preproc will result in creating a doc per sentence """
         for example in examples:
             if gold_preproc:
-                example.doc = None
                 split_examples = example.split_sents()
                 example_golds = []
                 for split_example in split_examples:
@@ -716,6 +717,12 @@ cdef class TokenAnnotation:
     def get_sent_start(self, i):
         return self.sent_starts[i] if i < len(self.sent_starts) else None
 
+    def __str__(self):
+        return str(self.to_dict())
+
+    def __repr__(self):
+        return self.__str__()
+
 
 cdef class DocAnnotation:
     def __init__(self, cats=None, links=None):
@@ -729,6 +736,12 @@ cdef class DocAnnotation:
     def to_dict(self):
         return {"cats": self.cats, "links": self.links}
 
+    def __str__(self):
+        return str(self.to_dict())
+
+    def __repr__(self):
+        return self.__str__()
+
 
 cdef class Example:
     def __init__(self, doc_annotation=None, token_annotation=None, doc=None,
@@ -747,9 +760,9 @@ cdef class Example:
 
     @classmethod
     def from_dict(cls, example_dict, doc=None):
-        token_dict = example_dict["token_annotation"]
+        token_dict = example_dict.get("token_annotation", {})
         token_annotation = TokenAnnotation.from_dict(token_dict)
-        doc_dict = example_dict["doc_annotation"]
+        doc_dict = example_dict.get("doc_annotation", {})
         doc_annotation = DocAnnotation.from_dict(doc_dict)
         return cls(doc_annotation, token_annotation, doc)
 
@@ -791,6 +804,8 @@ cdef class Example:
     def split_sents(self):
         """ Split the token annotations into multiple Examples based on
         sent_starts and return a list of the new Examples"""
+        if not self.token_annotation.words:
+            return [self]
         s_example = Example(doc=None, doc_annotation=self.doc_annotation)
         s_ids, s_words, s_tags, s_pos, s_morphs = [], [], [], [], []
         s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], []
@@ -842,7 +857,7 @@ cdef class Example:
         if merge:
             t = self.token_annotation
             doc = self.doc
-            if self.doc is None:
+            if doc is None or not isinstance(doc, Doc):
                 if not vocab:
                     raise ValueError(Errors.E998)
                 doc = Doc(vocab, words=t.words)
@@ -1052,7 +1067,7 @@ cdef class GoldParse:
             self.sent_starts = [None] * len(doc)
 
             # This needs to be done before we align the words
-            if make_projective and heads is not None and deps is not None:
+            if make_projective and any(heads) and any(deps):
                 heads, deps = nonproj.projectivize(heads, deps)
 
             # Do many-to-one alignment for misaligned tokens.
diff --git a/spacy/language.py b/spacy/language.py
index 2b8fa129e..d71c27406 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -314,19 +314,20 @@ class Language(object):
 
         # transform the model's config to an actual Model
         factory_cfg = dict(config)
-        model_cfg = None
+
+        # check whether we have a proper model config, or load a default one
+        if "model" in factory_cfg and not isinstance(factory_cfg["model"], dict):
+            warnings.warn(Warnings.W099.format(type=type(factory_cfg["model"]), pipe=name))
+
+        # refer to the model configuration in the cfg settings for this component
         if "model" in factory_cfg:
-            model_cfg = factory_cfg["model"]
-            if not isinstance(model_cfg, dict):
-                warnings.warn(Warnings.W099.format(type=type(model_cfg), pipe=name))
-                model_cfg = None
+            self.config[name] = {"model": factory_cfg["model"]}
+
+        # create all objects in the config
+        factory_cfg = registry.make_from_config({"config": factory_cfg}, validate=True)["config"]
+        model = factory_cfg.get("model", None)
+        if model is not None:
             del factory_cfg["model"]
-        model = None
-        if model_cfg is not None:
-            self.config[name] = {"model": model_cfg}
-            model = registry.make_from_config({"model": model_cfg}, validate=True)[
-                "model"
-            ]
         return factory(self, model, **factory_cfg)
 
     def add_pipe(
@@ -517,10 +518,11 @@ class Language(object):
     def make_doc(self, text):
         return self.tokenizer(text)
 
-    def update(self, examples, drop=0.0, sgd=None, losses=None, component_cfg=None):
+    def update(self, examples, dummy=None, *, drop=0.0, sgd=None, losses=None, component_cfg=None):
         """Update the models in the pipeline.
 
         examples (iterable): A batch of `Example` or `Doc` objects.
+        dummy: Should not be set; serves to catch backwards-incompatible scripts.
         drop (float): The dropout rate.
         sgd (callable): An optimizer.
         losses (dict): Dictionary to update with the loss, keyed by component.
@@ -529,6 +531,9 @@ class Language(object):
 
         DOCS: https://spacy.io/api/language#update
         """
+        if dummy is not None:
+            raise ValueError(Errors.E989)
+
         if len(examples) == 0:
             return
         examples = Example.to_example_objects(examples, make_doc=self.make_doc)
@@ -735,7 +740,7 @@ class Language(object):
             contexts = [
                 pipe.use_params(params)
                 for name, pipe in self.pipeline
-                if hasattr(pipe, "use_params")
+                if hasattr(pipe, "use_params") and hasattr(pipe, "model")
             ]
             # TODO: Having trouble with contextlib
             # Workaround: these aren't actually context managers atm.
diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py
index 9cbaba984..00689e85b 100644
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@@ -1,7 +1,11 @@
+from pathlib import Path
+
 from thinc.api import chain, clone, list2ragged, reduce_mean, residual
 from thinc.api import Model, Maxout, Linear
 
 from ...util import registry
+from ...kb import KnowledgeBase
+from ...vocab import Vocab
 
 
 @registry.architectures.register("spacy.EntityLinker.v1")
@@ -19,3 +23,11 @@ def build_nel_encoder(tok2vec, nO=None):
         model.set_ref("output_layer", output_layer)
         model.set_ref("tok2vec", tok2vec)
     return model
+
+
+@registry.assets.register("spacy.KBFromFile.v1")
+def load_kb(nlp_path, kb_path) -> KnowledgeBase:
+    vocab = Vocab().from_disk(Path(nlp_path) / "vocab")
+    kb = KnowledgeBase(vocab=vocab)
+    kb.load_bulk(kb_path)
+    return kb
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 4ff956e1d..56fe54664 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -2,6 +2,7 @@
 import numpy
 import srsly
 import random
+from ast import literal_eval
 
 from thinc.api import CosineDistance, to_categorical, get_array_module
 from thinc.api import set_dropout_rate, SequenceCategoricalCrossentropy
@@ -1244,15 +1245,20 @@ class EntityLinker(Pipe):
         self.vocab = vocab
         self.model = model
         self.kb = None
+        self.kb = cfg.get("kb", None)
+        if self.kb is None:
+            # create an empty KB that should be filled by calling from_disk
+            self.kb = KnowledgeBase(vocab=vocab)
+        else:
+            del cfg["kb"]  # we don't want to duplicate its serialization
+        if not isinstance(self.kb, KnowledgeBase):
+            raise ValueError(Errors.E990.format(type=type(self.kb)))
         self.cfg = dict(cfg)
         self.distance = CosineDistance(normalize=False)
 
-    def set_kb(self, kb):
-        self.kb = kb
-
     def require_kb(self):
         # Raise an error if the knowledge base is not initialized.
-        if getattr(self, "kb", None) in (None, True, False):
+        if len(self.kb) == 0:
             raise ValueError(Errors.E139.format(name=self.name))
 
     def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
@@ -1285,6 +1291,8 @@ class EntityLinker(Pipe):
                         ents_by_offset[(ent.start_char, ent.end_char)] = ent
 
                     for entity, kb_dict in gold.links.items():
+                        if isinstance(entity, str):
+                            entity = literal_eval(entity)
                         start, end = entity
                         mention = doc.text[start:end]
 
@@ -1375,7 +1383,6 @@ class EntityLinker(Pipe):
     def predict(self, docs):
         """ Return the KB IDs for each entity in each doc, including NIL if there is no prediction """
         self.require_kb()
-
         entity_count = 0
         final_kb_ids = []
         final_tensors = []
@@ -1486,9 +1493,8 @@ class EntityLinker(Pipe):
             raise ValueError(Errors.E149)
 
         def load_kb(p):
-            kb = KnowledgeBase(vocab=self.vocab, entity_vector_length=self.cfg["entity_width"])
-            kb.load_bulk(p)
-            self.set_kb(kb)
+            self.kb = KnowledgeBase(vocab=self.vocab, entity_vector_length=self.cfg["entity_width"])
+            self.kb.load_bulk(p)
 
         deserialize = {}
         deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index cdd8451fd..32b434e04 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -203,8 +203,8 @@ def test_preserving_links_asdoc(nlp):
     ruler.add_patterns(patterns)
     nlp.add_pipe(ruler)
 
-    el_pipe = nlp.create_pipe(name="entity_linker")
-    el_pipe.set_kb(mykb)
+    cfg = {"kb": mykb, "incl_prior": False}
+    el_pipe = nlp.create_pipe(name="entity_linker", config=cfg)
     el_pipe.begin_training()
     el_pipe.incl_context = False
     el_pipe.incl_prior = True
@@ -288,8 +288,7 @@ def test_overfitting_IO():
     mykb.add_alias(alias="Russ Cochran", entities=["Q2146908", "Q7381115"], probabilities=[0.5, 0.5])
 
     # Create the Entity Linker component and add it to the pipeline
-    entity_linker = nlp.create_pipe("entity_linker")
-    entity_linker.set_kb(mykb)
+    entity_linker = nlp.create_pipe("entity_linker", config={"kb": mykb})
    nlp.add_pipe(entity_linker, last=True)
 
     # train the NEL pipe
diff --git a/spacy/util.py b/spacy/util.py
index 048d923ee..f39813694 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -34,6 +34,7 @@ class registry(thinc.registry):
     lookups = catalogue.create("spacy", "lookups", entry_points=True)
     factories = catalogue.create("spacy", "factories", entry_points=True)
     displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True)
+    assets = catalogue.create("spacy", "assets", entry_points=True)
 
 
 def set_env_log(value):
@@ -160,6 +161,7 @@ def load_model_from_path(model_path, meta=False, **overrides):
     for name in pipeline:
         if name not in disable:
             config = meta.get("pipeline_args", {}).get(name, {})
+            config.update(overrides)
             factory = factories.get(name, name)
             if nlp_config.get(name, None):
                 model_config = nlp_config[name]["model"]
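
Usage note for reviewers: with this change the KB travels through the component config instead of a separate set_kb() call. A minimal sketch of the new construction path against this development snapshot, mirroring test_overfitting_IO above (the blank "en" pipeline and the entity_vector_length=3 toy values are illustrative, not part of the patch):

    import spacy
    from spacy.kb import KnowledgeBase

    nlp = spacy.blank("en")

    # Build a toy KB; entity_vector_length=3 is an arbitrary illustrative value.
    mykb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
    mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
    mykb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])

    # New style: the KB is passed in the config dict; set_kb() no longer exists.
    entity_linker = nlp.create_pipe("entity_linker", config={"kb": mykb})
    nlp.add_pipe(entity_linker, last=True)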
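The dummy slot in Language.update exists purely to trap old two-positional-argument calls with E989; everything after the batch is now keyword-only, and the batch itself holds Example or Doc objects. A sketch of the updated calling convention (the blank English pipeline, toy doc, and dropout value are illustrative):

    from spacy.lang.en import English

    nlp = English()
    optimizer = nlp.begin_training()

    # One batch of Example or Doc objects, then keyword arguments only.
    # The old nlp.update(texts, annotations, ...) form now raises E989.
    docs = [nlp.make_doc("Some training text.")]
    losses = {}
    nlp.update(docs, drop=0.2, sgd=optimizer, losses=losses)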
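The new registry.assets table is where "spacy.KBFromFile.v1" (added in spacy/ml/models/entity_linker.py) is registered, so a KB can be declared in a config and materialized at load time. A hypothetical direct lookup; both paths are placeholders, with nlp_path pointing at a saved pipeline directory (containing a "vocab" subdirectory) and kb_path at a KB file written by kb.dump():

    from spacy.util import registry

    # Resolve the registered asset function and load a serialized KB from disk.
    kb_loader = registry.assets.get("spacy.KBFromFile.v1")
    kb = kb_loader(nlp_path="/path/to/nlp", kb_path="/path/to/kb")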