Various fixes to NEL functionality, Example class etc (#5460)

* setting the KB in the EL constructor, similar to how the model is passed in (see the usage sketch below the file summary)

* removing Wikipedia example files - moved to projects

* throw an error when nlp.update is called with 2 positional arguments

* rewriting the config logic in create_pipe to accommodate other objects (e.g. the KB) in the config

* update config files with new parameters

* avoid training pipeline components that don't have a model (like sentencizer)

* various small fixes + UX improvements

* small fixes

* set thinc to 8.0.0a9 everywhere

* remove outdated comment
Sofie Van Landeghem 2020-05-20 11:41:12 +02:00 committed by GitHub
parent 664a3603b0
commit 7f5715a081
15 changed files with 108 additions and 46 deletions
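
The recurring API change across these files is that the knowledge base is now handed to the entity linker through its component config, just like the model, instead of via a separate set_kb() call. A minimal sketch of the new pattern, modelled on the updated tests further down (the entity ID, alias, frequency and vector values are placeholders):

import spacy
from spacy.kb import KnowledgeBase

nlp = spacy.blank("en")

# Build a tiny KB; entity ID, frequency and vector are illustrative only.
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])

# The KB travels in the component config; there is no set_kb() anymore.
entity_linker = nlp.create_pipe("entity_linker", config={"kb": kb, "incl_prior": False})
nlp.add_pipe(entity_linker, last=True)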

View File

@@ -12,6 +12,8 @@ use_gpu = 0
 scores = ["tags_acc", "uas", "las"]
 score_weights = {"las": 0.8, "tags_acc": 0.2}
 limit = 0
+seed = 0
+accumulate_gradient = 2

 [training.batch_size]
 @schedules = "compounding.v1"

View File

@@ -12,6 +12,8 @@ use_gpu = -1
 scores = ["tags_acc", "uas", "las"]
 score_weights = {"las": 0.8, "tags_acc": 0.2}
 limit = 0
+seed = 0
+accumulate_gradient = 2

 [training.batch_size]
 @schedules = "compounding.v1"

View File

@@ -10,6 +10,8 @@ orth_variant_level = 0.0
 gold_preproc = true
 max_length = 0
 batch_size = 25
+seed = 0
+accumulate_gradient = 2

 [optimizer]
 @optimizers = "Adam.v1"

View File

@@ -9,6 +9,8 @@ score_weights = {"ents_f": 1}
 orth_variant_level = 0.0
 gold_preproc = true
 max_length = 0
+seed = 0
+accumulate_gradient = 2

 [training.batch_size]
 @schedules = "compounding.v1"

View File

@@ -83,13 +83,13 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
     # Create the Entity Linker component and add it to the pipeline.
     if "entity_linker" not in nlp.pipe_names:
-        # use only the predicted EL score and not the prior probability (for demo purposes)
-        cfg = {"incl_prior": False}
-        entity_linker = nlp.create_pipe("entity_linker", cfg)
         kb = KnowledgeBase(vocab=nlp.vocab)
         kb.load_bulk(kb_path)
         print("Loaded Knowledge Base from '%s'" % kb_path)
-        entity_linker.set_kb(kb)
+
+        # use only the predicted EL score and not the prior probability (for demo purposes)
+        cfg = {"kb": kb, "incl_prior": False}
+        entity_linker = nlp.create_pipe("entity_linker", cfg)
         nlp.add_pipe(entity_linker, last=True)

     # Convert the texts to docs to make sure we have doc.ents set for the training examples.

View File

@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc==8.0.0a8",
+    "thinc==8.0.0a9",
     "blis>=0.4.0,<0.5.0"
 ]
 build-backend = "setuptools.build_meta"

View File

@@ -36,7 +36,7 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc==8.0.0a8
+    thinc==8.0.0a9
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0

View File

@@ -12,7 +12,7 @@ import random
 from ..gold import GoldCorpus
 from .. import util
+from ..errors import Errors

 registry = util.registry

@@ -233,6 +233,8 @@ def create_train_batches(nlp, corpus, cfg):
         max_length=cfg["max_length"],
         ignore_misaligned=True,
     ))
+    if len(train_examples) == 0:
+        raise ValueError(Errors.E988)
     random.shuffle(train_examples)
     batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"])
     for batch in batches:

@@ -313,12 +315,14 @@ def train_while_improving(
     dropouts = dropout
     results = []
     losses = {}
+    to_enable = [name for name, proc in nlp.pipeline if hasattr(proc, "model")]

     for step, batch in enumerate(train_data):
         dropout = next(dropouts)
-        for subbatch in subdivide_batch(batch, accumulate_gradient):
-            nlp.update(subbatch, drop=dropout, losses=losses, sgd=False)
-        for name, proc in nlp.pipeline:
-            if hasattr(proc, "model"):
+        with nlp.select_pipes(enable=to_enable):
+            for subbatch in subdivide_batch(batch, accumulate_gradient):
+                nlp.update(subbatch, drop=dropout, losses=losses, sgd=False)
+            for name, proc in nlp.pipeline:
                 proc.model.finish_update(optimizer)
         optimizer.step_schedules()
         if not (step % eval_frequency):

View File

@@ -195,7 +195,7 @@ class Errors(object):
             "the documentation:\nhttps://spacy.io/usage/models")
     E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
             "component to the pipeline with: "
-            "nlp.add_pipe(nlp.create_pipe('sentencizer')) "
+            "nlp.add_pipe(nlp.create_pipe('sentencizer')). "
             "Alternatively, add the dependency parser, or set sentence "
             "boundaries by setting doc[i].is_sent_start.")
     E031 = ("Invalid token: empty string ('') at position {i}.")

@@ -430,8 +430,7 @@ class Errors(object):
     E138 = ("Invalid JSONL format for raw text '{text}'. Make sure the input "
             "includes either the `text` or `tokens` key. For more info, see "
             "the docs:\nhttps://spacy.io/api/cli#pretrain-jsonl")
-    E139 = ("Knowledge Base for component '{name}' not initialized. Did you "
-            "forget to call set_kb()?")
+    E139 = ("Knowledge Base for component '{name}' is empty.")
     E140 = ("The list of entities, prior probabilities and entity vectors "
             "should be of equal length.")
     E141 = ("Entity vectors should be of length {required} instead of the "

@@ -548,6 +547,18 @@ class Errors(object):
             "token itself.")

     # TODO: fix numbering after merging develop into master
+    E987 = ("The text of an example training instance is either a Doc or "
+            "a string, but found {type} instead.")
+    E988 = ("Could not parse any training examples. Ensure the data is "
+            "formatted correctly.")
+    E989 = ("'nlp.update()' was called with two positional arguments. This "
+            "may be due to a backwards-incompatible change to the format "
+            "of the training data in spaCy 3.0 onwards. The 'update' "
+            "function should now be called with a batch of 'Example' "
+            "objects, instead of (text, annotation) tuples. ")
+    E990 = ("An entity linking component needs to be initialized with a "
+            "KnowledgeBase object, but found {type} instead.")
     E991 = ("The function 'select_pipes' should be called with either a "
             "'disable' argument to list the names of the pipe components "
             "that should be disabled, or with an 'enable' argument that "

@@ -562,8 +573,8 @@ class Errors(object):
     E997 = ("Tokenizer special cases are not allowed to modify the text. "
             "This would map '{chunk}' to '{orth}' given token attributes "
             "'{token_attrs}'.")
-    E998 = ("Can only create GoldParse objects from Example objects without a "
-            "Doc if get_gold_parses() is called with a Vocab object.")
+    E998 = ("To create GoldParse objects from Example objects without a "
+            "Doc, get_gold_parses() should be called with a Vocab object.")
     E999 = ("Encountered an unexpected format for the dictionary holding "
             "gold annotations: {gold_dict}")

View File

@@ -212,6 +212,8 @@ class GoldCorpus(object):
                         doc = ex_dict.get("doc", None)
                         if doc is None:
                             doc = ex_dict.get("text", None)
+                        if not (doc is None or isinstance(doc, Doc) or isinstance(doc, str)):
+                            raise ValueError(Errors.E987.format(type=type(doc)))
                         examples.append(Example.from_dict(ex_dict, doc=doc))
             elif file_name.endswith("msg"):

@@ -288,7 +290,6 @@ class GoldCorpus(object):
        """ Setting gold_preproc will result in creating a doc per sentence """
        for example in examples:
            if gold_preproc:
-               example.doc = None
                split_examples = example.split_sents()
                example_golds = []
                for split_example in split_examples:

@@ -716,6 +717,12 @@ cdef class TokenAnnotation:
     def get_sent_start(self, i):
         return self.sent_starts[i] if i < len(self.sent_starts) else None

+    def __str__(self):
+        return str(self.to_dict())
+
+    def __repr__(self):
+        return self.__str__()
+

 cdef class DocAnnotation:
     def __init__(self, cats=None, links=None):

@@ -729,6 +736,12 @@ cdef class DocAnnotation:
     def to_dict(self):
         return {"cats": self.cats, "links": self.links}

+    def __str__(self):
+        return str(self.to_dict())
+
+    def __repr__(self):
+        return self.__str__()
+

 cdef class Example:
     def __init__(self, doc_annotation=None, token_annotation=None, doc=None,

@@ -747,9 +760,9 @@ cdef class Example:
     @classmethod
     def from_dict(cls, example_dict, doc=None):
-        token_dict = example_dict["token_annotation"]
+        token_dict = example_dict.get("token_annotation", {})
         token_annotation = TokenAnnotation.from_dict(token_dict)
-        doc_dict = example_dict["doc_annotation"]
+        doc_dict = example_dict.get("doc_annotation", {})
         doc_annotation = DocAnnotation.from_dict(doc_dict)
         return cls(doc_annotation, token_annotation, doc)

@@ -791,6 +804,8 @@ cdef class Example:
     def split_sents(self):
         """ Split the token annotations into multiple Examples based on
         sent_starts and return a list of the new Examples"""
+        if not self.token_annotation.words:
+            return [self]
         s_example = Example(doc=None, doc_annotation=self.doc_annotation)
         s_ids, s_words, s_tags, s_pos, s_morphs = [], [], [], [], []
         s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], []

@@ -842,7 +857,7 @@ cdef class Example:
         if merge:
             t = self.token_annotation
             doc = self.doc
-            if self.doc is None:
+            if doc is None or not isinstance(doc, Doc):
                 if not vocab:
                     raise ValueError(Errors.E998)
                 doc = Doc(vocab, words=t.words)

@@ -1052,7 +1067,7 @@ cdef class GoldParse:
             self.sent_starts = [None] * len(doc)

             # This needs to be done before we align the words
-            if make_projective and heads is not None and deps is not None:
+            if make_projective and any(heads) and any(deps):
                 heads, deps = nonproj.projectivize(heads, deps)

             # Do many-to-one alignment for misaligned tokens.
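
A small sketch of what the Example.from_dict change above buys (hedged: it assumes the v3 development layout where Example lives in spacy.gold): missing annotation keys now fall back to empty dicts instead of raising a KeyError, and the new __str__/__repr__ methods make the annotation objects printable.

from spacy.gold import Example

# A bare dict used to fail on example_dict["token_annotation"]; with .get(..., {})
# both annotation objects are simply created empty.
eg = Example.from_dict({}, doc=None)
print(eg.token_annotation)  # the new __str__ shows the to_dict() form
print(eg.doc_annotation)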

View File

@@ -314,19 +314,20 @@ class Language(object):
         # transform the model's config to an actual Model
         factory_cfg = dict(config)
-        model_cfg = None
+
+        # check whether we have a proper model config, or load a default one
+        if "model" in factory_cfg and not isinstance(factory_cfg["model"], dict):
+            warnings.warn(Warnings.W099.format(type=type(factory_cfg["model"]), pipe=name))
+
+        # refer to the model configuration in the cfg settings for this component
         if "model" in factory_cfg:
-            model_cfg = factory_cfg["model"]
-            if not isinstance(model_cfg, dict):
-                warnings.warn(Warnings.W099.format(type=type(model_cfg), pipe=name))
-                model_cfg = None
+            self.config[name] = {"model": factory_cfg["model"]}
+
+        # create all objects in the config
+        factory_cfg = registry.make_from_config({"config": factory_cfg}, validate=True)["config"]
+        model = factory_cfg.get("model", None)
+        if model is not None:
             del factory_cfg["model"]
-        model = None
-        if model_cfg is not None:
-            self.config[name] = {"model": model_cfg}
-            model = registry.make_from_config({"model": model_cfg}, validate=True)[
-                "model"
-            ]
         return factory(self, model, **factory_cfg)

@@ -517,10 +518,11 @@ class Language(object):
     def make_doc(self, text):
         return self.tokenizer(text)

-    def update(self, examples, drop=0.0, sgd=None, losses=None, component_cfg=None):
+    def update(self, examples, dummy=None, *, drop=0.0, sgd=None, losses=None, component_cfg=None):
         """Update the models in the pipeline.

         examples (iterable): A batch of `Example` or `Doc` objects.
+        dummy: Should not be set - serves to catch backwards-incompatible scripts.
         drop (float): The dropout rate.
         sgd (callable): An optimizer.
         losses (dict): Dictionary to update with the loss, keyed by component.

@@ -529,6 +531,9 @@ class Language(object):
         DOCS: https://spacy.io/api/language#update
         """
+        if dummy is not None:
+            raise ValueError(Errors.E989)
+
         if len(examples) == 0:
             return
         examples = Example.to_example_objects(examples, make_doc=self.make_doc)

@@ -735,7 +740,7 @@ class Language(object):
         contexts = [
             pipe.use_params(params)
             for name, pipe in self.pipeline
-            if hasattr(pipe, "use_params")
+            if hasattr(pipe, "use_params") and hasattr(pipe, "model")
         ]
         # TODO: Having trouble with contextlib
         # Workaround: these aren't actually context managers atm.
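
To illustrate the new guard in Language.update(): a spaCy 2.x style call with two positional arguments now lands in the dummy parameter and raises E989, while the batch itself may be a list of Example or Doc objects (per the updated docstring). A rough sketch on a blank pipeline; the text and entity offsets are placeholders:

import spacy

nlp = spacy.blank("en")
texts = ["Sofie lives in Antwerp"]
annotations = [{"entities": [(0, 5, "PERSON")]}]

try:
    # old 2.x style: (texts, annotations) as two positional arguments
    nlp.update(texts, annotations)
except ValueError as err:
    print(err)  # E989: update() now expects a batch of Example objects

# new style: a single batch of Example/Doc objects, everything else keyword-only
nlp.update([nlp("Sofie lives in Antwerp")], losses={})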

View File

@@ -1,7 +1,11 @@
+from pathlib import Path
+
 from thinc.api import chain, clone, list2ragged, reduce_mean, residual
 from thinc.api import Model, Maxout, Linear

 from ...util import registry
+from ...kb import KnowledgeBase
+from ...vocab import Vocab


 @registry.architectures.register("spacy.EntityLinker.v1")

@@ -19,3 +23,11 @@ def build_nel_encoder(tok2vec, nO=None):
     model.set_ref("output_layer", output_layer)
     model.set_ref("tok2vec", tok2vec)
     return model
+
+
+@registry.assets.register("spacy.KBFromFile.v1")
+def load_kb(nlp_path, kb_path) -> KnowledgeBase:
+    vocab = Vocab().from_disk(Path(nlp_path) / "vocab")
+    kb = KnowledgeBase(vocab=vocab)
+    kb.load_bulk(kb_path)
+    return kb
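
For context, the registered asset can also be resolved through the registry by name; this is what registry.make_from_config() does when a component config refers to "spacy.KBFromFile.v1". A hypothetical sketch: the module path is inferred from the relative imports above, and both paths are placeholders that would need to point at a real pipeline directory and KB dump.

from spacy.util import registry
from spacy.ml.models import entity_linker  # noqa: F401 - assumed module path; importing it registers the asset

kb_loader = registry.assets.get("spacy.KBFromFile.v1")
kb = kb_loader(nlp_path="/path/to/trained_pipeline", kb_path="/path/to/kb")
print(kb.get_size_entities())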

View File

@@ -2,6 +2,7 @@
 import numpy
 import srsly
 import random
+from ast import literal_eval

 from thinc.api import CosineDistance, to_categorical, get_array_module
 from thinc.api import set_dropout_rate, SequenceCategoricalCrossentropy

@@ -1244,15 +1245,20 @@ class EntityLinker(Pipe):
         self.vocab = vocab
         self.model = model
         self.kb = None
+        self.kb = cfg.get("kb", None)
+        if self.kb is None:
+            # create an empty KB that should be filled by calling from_disk
+            self.kb = KnowledgeBase(vocab=vocab)
+        else:
+            del cfg["kb"]  # we don't want to duplicate its serialization
+        if not isinstance(self.kb, KnowledgeBase):
+            raise ValueError(Errors.E990.format(type=type(self.kb)))
         self.cfg = dict(cfg)
         self.distance = CosineDistance(normalize=False)

-    def set_kb(self, kb):
-        self.kb = kb
-
     def require_kb(self):
         # Raise an error if the knowledge base is not initialized.
-        if getattr(self, "kb", None) in (None, True, False):
+        if len(self.kb) == 0:
             raise ValueError(Errors.E139.format(name=self.name))

     def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):

@@ -1285,6 +1291,8 @@ class EntityLinker(Pipe):
                 ents_by_offset[(ent.start_char, ent.end_char)] = ent

             for entity, kb_dict in gold.links.items():
+                if isinstance(entity, str):
+                    entity = literal_eval(entity)
                 start, end = entity
                 mention = doc.text[start:end]

@@ -1375,7 +1383,6 @@ class EntityLinker(Pipe):
     def predict(self, docs):
         """ Return the KB IDs for each entity in each doc, including NIL if there is no prediction """
         self.require_kb()
-
         entity_count = 0
         final_kb_ids = []
         final_tensors = []

@@ -1486,9 +1493,8 @@ class EntityLinker(Pipe):
             raise ValueError(Errors.E149)

         def load_kb(p):
-            kb = KnowledgeBase(vocab=self.vocab, entity_vector_length=self.cfg["entity_width"])
-            kb.load_bulk(p)
-            self.set_kb(kb)
+            self.kb = KnowledgeBase(vocab=self.vocab, entity_vector_length=self.cfg["entity_width"])
+            self.kb.load_bulk(p)

         deserialize = {}
         deserialize["vocab"] = lambda p: self.vocab.from_disk(p)

View File

@@ -203,8 +203,8 @@ def test_preserving_links_asdoc(nlp):
     ruler.add_patterns(patterns)
     nlp.add_pipe(ruler)

-    el_pipe = nlp.create_pipe(name="entity_linker")
-    el_pipe.set_kb(mykb)
+    cfg = {"kb": mykb, "incl_prior": False}
+    el_pipe = nlp.create_pipe(name="entity_linker", config=cfg)
     el_pipe.begin_training()
     el_pipe.incl_context = False
     el_pipe.incl_prior = True

@@ -288,8 +288,7 @@ def test_overfitting_IO():
     mykb.add_alias(alias="Russ Cochran", entities=["Q2146908", "Q7381115"], probabilities=[0.5, 0.5])

     # Create the Entity Linker component and add it to the pipeline
-    entity_linker = nlp.create_pipe("entity_linker")
-    entity_linker.set_kb(mykb)
+    entity_linker = nlp.create_pipe("entity_linker", config={"kb": mykb})
     nlp.add_pipe(entity_linker, last=True)

     # train the NEL pipe
# train the NEL pipe # train the NEL pipe

View File

@@ -34,6 +34,7 @@ class registry(thinc.registry):
     lookups = catalogue.create("spacy", "lookups", entry_points=True)
     factories = catalogue.create("spacy", "factories", entry_points=True)
     displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True)
+    assets = catalogue.create("spacy", "assets", entry_points=True)


 def set_env_log(value):

@@ -160,6 +161,7 @@ def load_model_from_path(model_path, meta=False, **overrides):
     for name in pipeline:
         if name not in disable:
             config = meta.get("pipeline_args", {}).get(name, {})
+            config.update(overrides)
             factory = factories.get(name, name)
             if nlp_config.get(name, None):
                 model_config = nlp_config[name]["model"]