Various fixes to NEL functionality, Example class etc (#5460)

* setting KB in the EL constructor, similar to how the model is passed in (see the usage sketch below)

* removing Wikipedia example files - moved to projects

* throw an error when nlp.update is called with 2 positional arguments

* rewriting the config logic in create_pipe to accommodate other objects (e.g. KB) in the config

* update config files with new parameters

* avoid training pipeline components that don't have a model (like sentencizer)

* various small fixes + UX improvements

* small fixes

* set thinc to 8.0.0a9 everywhere

* remove outdated comment
Sofie Van Landeghem 2020-05-20 11:41:12 +02:00 committed by GitHub
parent 664a3603b0
commit 7f5715a081
15 changed files with 108 additions and 46 deletions
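
The first bullet is the most visible API change: the knowledge base is now passed to the entity linker through the component config, the same way the model is. A minimal usage sketch, assuming a tiny in-memory KnowledgeBase (the entity ID, alias, frequencies and vectors are illustrative, borrowed from the updated tests):

    import spacy
    from spacy.kb import KnowledgeBase

    nlp = spacy.blank("en")

    # build a tiny KB; in practice it is usually filled from disk with load_bulk()
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
    kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
    kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[1.0])

    # the KB travels in the config, just like the model
    cfg = {"kb": kb, "incl_prior": False}
    entity_linker = nlp.create_pipe("entity_linker", config=cfg)
    nlp.add_pipe(entity_linker, last=True)

set_kb() still exists, but the constructor route above is what the updated example script and tests use.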

View File

@@ -12,6 +12,8 @@ use_gpu = 0
scores = ["tags_acc", "uas", "las"]
score_weights = {"las": 0.8, "tags_acc": 0.2}
limit = 0
seed = 0
accumulate_gradient = 2
[training.batch_size]
@schedules = "compounding.v1"

View File

@@ -12,6 +12,8 @@ use_gpu = -1
scores = ["tags_acc", "uas", "las"]
score_weights = {"las": 0.8, "tags_acc": 0.2}
limit = 0
seed = 0
accumulate_gradient = 2
[training.batch_size]
@schedules = "compounding.v1"

View File

@@ -10,6 +10,8 @@ orth_variant_level = 0.0
gold_preproc = true
max_length = 0
batch_size = 25
seed = 0
accumulate_gradient = 2
[optimizer]
@optimizers = "Adam.v1"

View File

@@ -9,6 +9,8 @@ score_weights = {"ents_f": 1}
orth_variant_level = 0.0
gold_preproc = true
max_length = 0
seed = 0
accumulate_gradient = 2
[training.batch_size]
@schedules = "compounding.v1"

View File

@@ -83,13 +83,13 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
# Create the Entity Linker component and add it to the pipeline.
if "entity_linker" not in nlp.pipe_names:
# use only the predicted EL score and not the prior probability (for demo purposes)
cfg = {"incl_prior": False}
entity_linker = nlp.create_pipe("entity_linker", cfg)
kb = KnowledgeBase(vocab=nlp.vocab)
kb.load_bulk(kb_path)
print("Loaded Knowledge Base from '%s'" % kb_path)
entity_linker.set_kb(kb)
# use only the predicted EL score and not the prior probability (for demo purposes)
cfg = {"kb": kb, "incl_prior": False}
entity_linker = nlp.create_pipe("entity_linker", cfg)
nlp.add_pipe(entity_linker, last=True)
# Convert the texts to docs to make sure we have doc.ents set for the training examples.

View File

@@ -6,7 +6,7 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
"thinc==8.0.0a8",
"thinc==8.0.0a9",
"blis>=0.4.0,<0.5.0"
]
build-backend = "setuptools.build_meta"

View File

@@ -36,7 +36,7 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
thinc==8.0.0a8
thinc==8.0.0a9
install_requires =
# Our libraries
murmurhash>=0.28.0,<1.1.0

View File

@@ -12,7 +12,7 @@ import random
from ..gold import GoldCorpus
from .. import util
from ..errors import Errors
registry = util.registry
@@ -233,6 +233,8 @@ def create_train_batches(nlp, corpus, cfg):
max_length=cfg["max_length"],
ignore_misaligned=True,
))
if len(train_examples) == 0:
raise ValueError(Errors.E988)
random.shuffle(train_examples)
batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"])
for batch in batches:
@@ -313,12 +315,14 @@ def train_while_improving(
dropouts = dropout
results = []
losses = {}
to_enable = [name for name, proc in nlp.pipeline if hasattr(proc, "model")]
for step, batch in enumerate(train_data):
dropout = next(dropouts)
for subbatch in subdivide_batch(batch, accumulate_gradient):
nlp.update(subbatch, drop=dropout, losses=losses, sgd=False)
for name, proc in nlp.pipeline:
if hasattr(proc, "model"):
with nlp.select_pipes(enable=to_enable):
for subbatch in subdivide_batch(batch, accumulate_gradient):
nlp.update(subbatch, drop=dropout, losses=losses, sgd=False)
for name, proc in nlp.pipeline:
proc.model.finish_update(optimizer)
optimizer.step_schedules()
if not (step % eval_frequency):
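
The to_enable list and the select_pipes context manager above implement the "don't train model-less components" bullet. The same pattern in isolation, as a hedged sketch (the sentencizer is used here because it has no model attribute):

    import spacy

    nlp = spacy.blank("en")
    nlp.add_pipe(nlp.create_pipe("sentencizer"))  # rule-based, no "model" attribute

    to_enable = [name for name, proc in nlp.pipeline if hasattr(proc, "model")]
    with nlp.select_pipes(enable=to_enable):
        # only components that own a model stay active here, so nlp.update()
        # never tries to compute gradients for the sentencizer
        assert "sentencizer" not in nlp.pipe_names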

View File

@@ -195,7 +195,7 @@ class Errors(object):
"the documentation:\nhttps://spacy.io/usage/models")
E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
"component to the pipeline with: "
"nlp.add_pipe(nlp.create_pipe('sentencizer')) "
"nlp.add_pipe(nlp.create_pipe('sentencizer')). "
"Alternatively, add the dependency parser, or set sentence "
"boundaries by setting doc[i].is_sent_start.")
E031 = ("Invalid token: empty string ('') at position {i}.")
@@ -430,8 +430,7 @@ class Errors(object):
E138 = ("Invalid JSONL format for raw text '{text}'. Make sure the input "
"includes either the `text` or `tokens` key. For more info, see "
"the docs:\nhttps://spacy.io/api/cli#pretrain-jsonl")
E139 = ("Knowledge Base for component '{name}' not initialized. Did you "
"forget to call set_kb()?")
E139 = ("Knowledge Base for component '{name}' is empty.")
E140 = ("The list of entities, prior probabilities and entity vectors "
"should be of equal length.")
E141 = ("Entity vectors should be of length {required} instead of the "
@@ -548,6 +547,18 @@ class Errors(object):
"token itself.")
# TODO: fix numbering after merging develop into master
E987 = ("The text of an example training instance is either a Doc or "
"a string, but found {type} instead.")
E988 = ("Could not parse any training examples. Ensure the data is "
"formatted correctly.")
E989 = ("'nlp.update()' was called with two positional arguments. This "
"may be due to a backwards-incompatible change to the format "
"of the training data in spaCy 3.0 onwards. The 'update' "
"function should now be called with a batch of 'Example' "
"objects, instead of (text, annotation) tuples. ")
E990 = ("An entity linking component needs to be initialized with a "
"KnowledgeBase object, but found {type} instead.")
E991 = ("The function 'select_pipes' should be called with either a "
"'disable' argument to list the names of the pipe components "
"that should be disabled, or with an 'enable' argument that "
@@ -562,8 +573,8 @@ class Errors(object):
E997 = ("Tokenizer special cases are not allowed to modify the text. "
"This would map '{chunk}' to '{orth}' given token attributes "
"'{token_attrs}'.")
E998 = ("Can only create GoldParse objects from Example objects without a "
"Doc if get_gold_parses() is called with a Vocab object.")
E998 = ("To create GoldParse objects from Example objects without a "
"Doc, get_gold_parses() should be called with a Vocab object.")
E999 = ("Encountered an unexpected format for the dictionary holding "
"gold annotations: {gold_dict}")

View File

@@ -212,6 +212,8 @@ class GoldCorpus(object):
doc = ex_dict.get("doc", None)
if doc is None:
doc = ex_dict.get("text", None)
if not (doc is None or isinstance(doc, Doc) or isinstance(doc, str)):
raise ValueError(Errors.E987.format(type=type(doc)))
examples.append(Example.from_dict(ex_dict, doc=doc))
elif file_name.endswith("msg"):
@@ -288,7 +290,6 @@ class GoldCorpus(object):
""" Setting gold_preproc will result in creating a doc per sentence """
for example in examples:
if gold_preproc:
example.doc = None
split_examples = example.split_sents()
example_golds = []
for split_example in split_examples:
@@ -716,6 +717,12 @@ cdef class TokenAnnotation:
def get_sent_start(self, i):
return self.sent_starts[i] if i < len(self.sent_starts) else None
def __str__(self):
return str(self.to_dict())
def __repr__(self):
return self.__str__()
cdef class DocAnnotation:
def __init__(self, cats=None, links=None):
@@ -729,6 +736,12 @@ cdef class DocAnnotation:
def to_dict(self):
return {"cats": self.cats, "links": self.links}
def __str__(self):
return str(self.to_dict())
def __repr__(self):
return self.__str__()
cdef class Example:
def __init__(self, doc_annotation=None, token_annotation=None, doc=None,
@@ -747,9 +760,9 @@ cdef class Example:
@classmethod
def from_dict(cls, example_dict, doc=None):
token_dict = example_dict["token_annotation"]
token_dict = example_dict.get("token_annotation", {})
token_annotation = TokenAnnotation.from_dict(token_dict)
doc_dict = example_dict["doc_annotation"]
doc_dict = example_dict.get("doc_annotation", {})
doc_annotation = DocAnnotation.from_dict(doc_dict)
return cls(doc_annotation, token_annotation, doc)
@@ -791,6 +804,8 @@ cdef class Example:
def split_sents(self):
""" Split the token annotations into multiple Examples based on
sent_starts and return a list of the new Examples"""
if not self.token_annotation.words:
return [self]
s_example = Example(doc=None, doc_annotation=self.doc_annotation)
s_ids, s_words, s_tags, s_pos, s_morphs = [], [], [], [], []
s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], []
@@ -842,7 +857,7 @@ cdef class Example:
if merge:
t = self.token_annotation
doc = self.doc
if self.doc is None:
if doc is None or not isinstance(doc, Doc):
if not vocab:
raise ValueError(Errors.E998)
doc = Doc(vocab, words=t.words)
@@ -1052,7 +1067,7 @@ cdef class GoldParse:
self.sent_starts = [None] * len(doc)
# This needs to be done before we align the words
if make_projective and heads is not None and deps is not None:
if make_projective and any(heads) and any(deps):
heads, deps = nonproj.projectivize(heads, deps)
# Do many-to-one alignment for misaligned tokens.
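
A short sketch of what the more tolerant from_dict and the new __str__/__repr__ methods allow (the text and the empty annotation dict are illustrative):

    from spacy.gold import Example

    # missing "token_annotation"/"doc_annotation" keys now fall back to empty annotations
    ex = Example.from_dict({}, doc="Russ Cochran published the comics.")
    print(ex.token_annotation)  # __str__/__repr__ now show the underlying dict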

View File

@@ -314,19 +314,20 @@ class Language(object):
# transform the model's config to an actual Model
factory_cfg = dict(config)
model_cfg = None
# check whether we have a proper model config, or load a default one
if "model" in factory_cfg and not isinstance(factory_cfg["model"], dict):
warnings.warn(Warnings.W099.format(type=type(factory_cfg["model"]), pipe=name))
# refer to the model configuration in the cfg settings for this component
if "model" in factory_cfg:
model_cfg = factory_cfg["model"]
if not isinstance(model_cfg, dict):
warnings.warn(Warnings.W099.format(type=type(model_cfg), pipe=name))
model_cfg = None
self.config[name] = {"model": factory_cfg["model"]}
# create all objects in the config
factory_cfg = registry.make_from_config({"config": factory_cfg}, validate=True)["config"]
model = factory_cfg.get("model", None)
if model is not None:
del factory_cfg["model"]
model = None
if model_cfg is not None:
self.config[name] = {"model": model_cfg}
model = registry.make_from_config({"model": model_cfg}, validate=True)[
"model"
]
return factory(self, model, **factory_cfg)
def add_pipe(
@@ -517,10 +518,11 @@ class Language(object):
def make_doc(self, text):
return self.tokenizer(text)
def update(self, examples, drop=0.0, sgd=None, losses=None, component_cfg=None):
def update(self, examples, dummy=None, *, drop=0.0, sgd=None, losses=None, component_cfg=None):
"""Update the models in the pipeline.
examples (iterable): A batch of `Example` or `Doc` objects.
dummy: Should not be set - serves to catch backwards-incompatible scripts.
drop (float): The dropout rate.
sgd (callable): An optimizer.
losses (dict): Dictionary to update with the loss, keyed by component.
@@ -529,6 +531,9 @@ class Language(object):
DOCS: https://spacy.io/api/language#update
"""
if dummy is not None:
raise ValueError(Errors.E989)
if len(examples) == 0:
return
examples = Example.to_example_objects(examples, make_doc=self.make_doc)
@@ -735,7 +740,7 @@ class Language(object):
contexts = [
pipe.use_params(params)
for name, pipe in self.pipeline
if hasattr(pipe, "use_params")
if hasattr(pipe, "use_params") and hasattr(pipe, "model")
]
# TODO: Having trouble with contextlib
# Workaround: these aren't actually context managers atm.
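
The keyword-only signature plus the dummy guard make old two-argument calls fail loudly with E989 instead of silently misreading the batch. A hedged sketch of the intended call pattern (examples, losses and optimizer are assumed to come from the surrounding training code):

    # spaCy 2.x style, now rejected because of the second positional argument:
    #     nlp.update(train_texts, train_annotations, sgd=optimizer)

    # new style: one batch of Example objects, everything else keyword-only
    nlp.update(examples, drop=0.2, losses=losses, sgd=optimizer)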

View File

@@ -1,7 +1,11 @@
from pathlib import Path
from thinc.api import chain, clone, list2ragged, reduce_mean, residual
from thinc.api import Model, Maxout, Linear
from ...util import registry
from ...kb import KnowledgeBase
from ...vocab import Vocab
@registry.architectures.register("spacy.EntityLinker.v1")
@@ -19,3 +23,11 @@ def build_nel_encoder(tok2vec, nO=None):
model.set_ref("output_layer", output_layer)
model.set_ref("tok2vec", tok2vec)
return model
@registry.assets.register("spacy.KBFromFile.v1")
def load_kb(nlp_path, kb_path) -> KnowledgeBase:
vocab = Vocab().from_disk(Path(nlp_path) / "vocab")
kb = KnowledgeBase(vocab=vocab)
kb.load_bulk(kb_path)
return kb
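
A hedged sketch of how a registered asset like this can be resolved through the same config machinery that create_pipe now uses (the paths are placeholders):

    from spacy import util

    cfg = {
        "kb": {
            "@assets": "spacy.KBFromFile.v1",
            "nlp_path": "/path/to/trained/pipeline",  # placeholder
            "kb_path": "/path/to/kb",                 # placeholder
        }
    }
    # make_from_config swaps the "@assets" block for the loaded KnowledgeBase
    kb = util.registry.make_from_config(cfg, validate=True)["kb"]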

View File

@@ -2,6 +2,7 @@
import numpy
import srsly
import random
from ast import literal_eval
from thinc.api import CosineDistance, to_categorical, get_array_module
from thinc.api import set_dropout_rate, SequenceCategoricalCrossentropy
@@ -1244,15 +1245,20 @@ class EntityLinker(Pipe):
self.vocab = vocab
self.model = model
self.kb = None
self.kb = cfg.get("kb", None)
if self.kb is None:
# create an empty KB that should be filled by calling from_disk
self.kb = KnowledgeBase(vocab=vocab)
else:
del cfg["kb"] # we don't want to duplicate its serialization
if not isinstance(self.kb, KnowledgeBase):
raise ValueError(Errors.E990.format(type=type(self.kb)))
self.cfg = dict(cfg)
self.distance = CosineDistance(normalize=False)
def set_kb(self, kb):
self.kb = kb
def require_kb(self):
# Raise an error if the knowledge base is not initialized.
if getattr(self, "kb", None) in (None, True, False):
if len(self.kb) == 0:
raise ValueError(Errors.E139.format(name=self.name))
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
@@ -1285,6 +1291,8 @@ class EntityLinker(Pipe):
ents_by_offset[(ent.start_char, ent.end_char)] = ent
for entity, kb_dict in gold.links.items():
if isinstance(entity, str):
entity = literal_eval(entity)
start, end = entity
mention = doc.text[start:end]
@@ -1375,7 +1383,6 @@ class EntityLinker(Pipe):
def predict(self, docs):
""" Return the KB IDs for each entity in each doc, including NIL if there is no prediction """
self.require_kb()
entity_count = 0
final_kb_ids = []
final_tensors = []
@@ -1486,9 +1493,8 @@ class EntityLinker(Pipe):
raise ValueError(Errors.E149)
def load_kb(p):
kb = KnowledgeBase(vocab=self.vocab, entity_vector_length=self.cfg["entity_width"])
kb.load_bulk(p)
self.set_kb(kb)
self.kb = KnowledgeBase(vocab=self.vocab, entity_vector_length=self.cfg["entity_width"])
self.kb.load_bulk(p)
deserialize = {}
deserialize["vocab"] = lambda p: self.vocab.from_disk(p)

View File

@@ -203,8 +203,8 @@ def test_preserving_links_asdoc(nlp):
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
el_pipe = nlp.create_pipe(name="entity_linker")
el_pipe.set_kb(mykb)
cfg = {"kb": mykb, "incl_prior": False}
el_pipe = nlp.create_pipe(name="entity_linker", config=cfg)
el_pipe.begin_training()
el_pipe.incl_context = False
el_pipe.incl_prior = True
@@ -288,8 +288,7 @@ def test_overfitting_IO():
mykb.add_alias(alias="Russ Cochran", entities=["Q2146908", "Q7381115"], probabilities=[0.5, 0.5])
# Create the Entity Linker component and add it to the pipeline
entity_linker = nlp.create_pipe("entity_linker")
entity_linker.set_kb(mykb)
entity_linker = nlp.create_pipe("entity_linker", config={"kb": mykb})
nlp.add_pipe(entity_linker, last=True)
# train the NEL pipe

View File

@@ -34,6 +34,7 @@ class registry(thinc.registry):
lookups = catalogue.create("spacy", "lookups", entry_points=True)
factories = catalogue.create("spacy", "factories", entry_points=True)
displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True)
assets = catalogue.create("spacy", "assets", entry_points=True)
def set_env_log(value):
@@ -160,6 +161,7 @@ def load_model_from_path(model_path, meta=False, **overrides):
for name in pipeline:
if name not in disable:
config = meta.get("pipeline_args", {}).get(name, {})
config.update(overrides)
factory = factories.get(name, name)
if nlp_config.get(name, None):
model_config = nlp_config[name]["model"]