mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
Various fixes to NEL functionality, Example class etc (#5460)
* setting KB in the EL constructor, similar to how the model is passed on * removing wikipedia example files - moved to projects * throw an error when nlp.update is called with 2 positional arguments * rewriting the config logic in create pipe to accommodate other objects (e.g. KB) in the config * update config files with new parameters * avoid training pipeline components that don't have a model (like sentencizer) * various small fixes + UX improvements * small fixes * set thinc to 8.0.0a9 everywhere * remove outdated comment
This commit is contained in:
parent
664a3603b0
commit
7f5715a081
|
@ -12,6 +12,8 @@ use_gpu = 0
|
|||
scores = ["tags_acc", "uas", "las"]
|
||||
score_weights = {"las": 0.8, "tags_acc": 0.2}
|
||||
limit = 0
|
||||
seed = 0
|
||||
accumulate_gradient = 2
|
||||
|
||||
[training.batch_size]
|
||||
@schedules = "compounding.v1"
|
||||
|
|
|
@ -12,6 +12,8 @@ use_gpu = -1
|
|||
scores = ["tags_acc", "uas", "las"]
|
||||
score_weights = {"las": 0.8, "tags_acc": 0.2}
|
||||
limit = 0
|
||||
seed = 0
|
||||
accumulate_gradient = 2
|
||||
|
||||
[training.batch_size]
|
||||
@schedules = "compounding.v1"
|
||||
|
|
|
@ -10,6 +10,8 @@ orth_variant_level = 0.0
|
|||
gold_preproc = true
|
||||
max_length = 0
|
||||
batch_size = 25
|
||||
seed = 0
|
||||
accumulate_gradient = 2
|
||||
|
||||
[optimizer]
|
||||
@optimizers = "Adam.v1"
|
||||
|
|
|
@ -9,6 +9,8 @@ score_weights = {"ents_f": 1}
|
|||
orth_variant_level = 0.0
|
||||
gold_preproc = true
|
||||
max_length = 0
|
||||
seed = 0
|
||||
accumulate_gradient = 2
|
||||
|
||||
[training.batch_size]
|
||||
@schedules = "compounding.v1"
|
||||
|
|
|
@ -83,13 +83,13 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
|
|||
|
||||
# Create the Entity Linker component and add it to the pipeline.
|
||||
if "entity_linker" not in nlp.pipe_names:
|
||||
# use only the predicted EL score and not the prior probability (for demo purposes)
|
||||
cfg = {"incl_prior": False}
|
||||
entity_linker = nlp.create_pipe("entity_linker", cfg)
|
||||
kb = KnowledgeBase(vocab=nlp.vocab)
|
||||
kb.load_bulk(kb_path)
|
||||
print("Loaded Knowledge Base from '%s'" % kb_path)
|
||||
entity_linker.set_kb(kb)
|
||||
|
||||
# use only the predicted EL score and not the prior probability (for demo purposes)
|
||||
cfg = {"kb": kb, "incl_prior": False}
|
||||
entity_linker = nlp.create_pipe("entity_linker", cfg)
|
||||
nlp.add_pipe(entity_linker, last=True)
|
||||
|
||||
# Convert the texts to docs to make sure we have doc.ents set for the training examples.
|
||||
|
|
|
@ -6,7 +6,7 @@ requires = [
|
|||
"cymem>=2.0.2,<2.1.0",
|
||||
"preshed>=3.0.2,<3.1.0",
|
||||
"murmurhash>=0.28.0,<1.1.0",
|
||||
"thinc==8.0.0a8",
|
||||
"thinc==8.0.0a9",
|
||||
"blis>=0.4.0,<0.5.0"
|
||||
]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
|
|
@ -36,7 +36,7 @@ setup_requires =
|
|||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
thinc==8.0.0a8
|
||||
thinc==8.0.0a9
|
||||
install_requires =
|
||||
# Our libraries
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
|
|
|
@ -12,7 +12,7 @@ import random
|
|||
|
||||
from ..gold import GoldCorpus
|
||||
from .. import util
|
||||
|
||||
from ..errors import Errors
|
||||
|
||||
registry = util.registry
|
||||
|
||||
|
@ -233,6 +233,8 @@ def create_train_batches(nlp, corpus, cfg):
|
|||
max_length=cfg["max_length"],
|
||||
ignore_misaligned=True,
|
||||
))
|
||||
if len(train_examples) == 0:
|
||||
raise ValueError(Errors.E988)
|
||||
random.shuffle(train_examples)
|
||||
batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"])
|
||||
for batch in batches:
|
||||
|
@ -313,12 +315,14 @@ def train_while_improving(
|
|||
dropouts = dropout
|
||||
results = []
|
||||
losses = {}
|
||||
to_enable = [name for name, proc in nlp.pipeline if hasattr(proc, "model")]
|
||||
|
||||
for step, batch in enumerate(train_data):
|
||||
dropout = next(dropouts)
|
||||
for subbatch in subdivide_batch(batch, accumulate_gradient):
|
||||
nlp.update(subbatch, drop=dropout, losses=losses, sgd=False)
|
||||
for name, proc in nlp.pipeline:
|
||||
if hasattr(proc, "model"):
|
||||
with nlp.select_pipes(enable=to_enable):
|
||||
for subbatch in subdivide_batch(batch, accumulate_gradient):
|
||||
nlp.update(subbatch, drop=dropout, losses=losses, sgd=False)
|
||||
for name, proc in nlp.pipeline:
|
||||
proc.model.finish_update(optimizer)
|
||||
optimizer.step_schedules()
|
||||
if not (step % eval_frequency):
|
||||
|
|
|
@ -195,7 +195,7 @@ class Errors(object):
|
|||
"the documentation:\nhttps://spacy.io/usage/models")
|
||||
E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
|
||||
"component to the pipeline with: "
|
||||
"nlp.add_pipe(nlp.create_pipe('sentencizer')) "
|
||||
"nlp.add_pipe(nlp.create_pipe('sentencizer')). "
|
||||
"Alternatively, add the dependency parser, or set sentence "
|
||||
"boundaries by setting doc[i].is_sent_start.")
|
||||
E031 = ("Invalid token: empty string ('') at position {i}.")
|
||||
|
@ -430,8 +430,7 @@ class Errors(object):
|
|||
E138 = ("Invalid JSONL format for raw text '{text}'. Make sure the input "
|
||||
"includes either the `text` or `tokens` key. For more info, see "
|
||||
"the docs:\nhttps://spacy.io/api/cli#pretrain-jsonl")
|
||||
E139 = ("Knowledge Base for component '{name}' not initialized. Did you "
|
||||
"forget to call set_kb()?")
|
||||
E139 = ("Knowledge Base for component '{name}' is empty.")
|
||||
E140 = ("The list of entities, prior probabilities and entity vectors "
|
||||
"should be of equal length.")
|
||||
E141 = ("Entity vectors should be of length {required} instead of the "
|
||||
|
@ -548,6 +547,18 @@ class Errors(object):
|
|||
"token itself.")
|
||||
|
||||
# TODO: fix numbering after merging develop into master
|
||||
|
||||
E987 = ("The text of an example training instance is either a Doc or "
|
||||
"a string, but found {type} instead.")
|
||||
E988 = ("Could not parse any training examples. Ensure the data is "
|
||||
"formatted correctly.")
|
||||
E989 = ("'nlp.update()' was called with two positional arguments. This "
|
||||
"may be due to a backwards-incompatible change to the format "
|
||||
"of the training data in spaCy 3.0 onwards. The 'update' "
|
||||
"function should now be called with a batch of 'Example' "
|
||||
"objects, instead of (text, annotation) tuples. ")
|
||||
E990 = ("An entity linking component needs to be initialized with a "
|
||||
"KnowledgeBase object, but found {type} instead.")
|
||||
E991 = ("The function 'select_pipes' should be called with either a "
|
||||
"'disable' argument to list the names of the pipe components "
|
||||
"that should be disabled, or with an 'enable' argument that "
|
||||
|
@ -562,8 +573,8 @@ class Errors(object):
|
|||
E997 = ("Tokenizer special cases are not allowed to modify the text. "
|
||||
"This would map '{chunk}' to '{orth}' given token attributes "
|
||||
"'{token_attrs}'.")
|
||||
E998 = ("Can only create GoldParse objects from Example objects without a "
|
||||
"Doc if get_gold_parses() is called with a Vocab object.")
|
||||
E998 = ("To create GoldParse objects from Example objects without a "
|
||||
"Doc, get_gold_parses() should be called with a Vocab object.")
|
||||
E999 = ("Encountered an unexpected format for the dictionary holding "
|
||||
"gold annotations: {gold_dict}")
|
||||
|
||||
|
|
|
@ -212,6 +212,8 @@ class GoldCorpus(object):
|
|||
doc = ex_dict.get("doc", None)
|
||||
if doc is None:
|
||||
doc = ex_dict.get("text", None)
|
||||
if not (doc is None or isinstance(doc, Doc) or isinstance(doc, str)):
|
||||
raise ValueError(Errors.E987.format(type=type(doc)))
|
||||
examples.append(Example.from_dict(ex_dict, doc=doc))
|
||||
|
||||
elif file_name.endswith("msg"):
|
||||
|
@ -288,7 +290,6 @@ class GoldCorpus(object):
|
|||
""" Setting gold_preproc will result in creating a doc per sentence """
|
||||
for example in examples:
|
||||
if gold_preproc:
|
||||
example.doc = None
|
||||
split_examples = example.split_sents()
|
||||
example_golds = []
|
||||
for split_example in split_examples:
|
||||
|
@ -716,6 +717,12 @@ cdef class TokenAnnotation:
|
|||
def get_sent_start(self, i):
|
||||
return self.sent_starts[i] if i < len(self.sent_starts) else None
|
||||
|
||||
def __str__(self):
|
||||
return str(self.to_dict())
|
||||
|
||||
def __repr__(self):
|
||||
return self.__str__()
|
||||
|
||||
|
||||
cdef class DocAnnotation:
|
||||
def __init__(self, cats=None, links=None):
|
||||
|
@ -729,6 +736,12 @@ cdef class DocAnnotation:
|
|||
def to_dict(self):
|
||||
return {"cats": self.cats, "links": self.links}
|
||||
|
||||
def __str__(self):
|
||||
return str(self.to_dict())
|
||||
|
||||
def __repr__(self):
|
||||
return self.__str__()
|
||||
|
||||
|
||||
cdef class Example:
|
||||
def __init__(self, doc_annotation=None, token_annotation=None, doc=None,
|
||||
|
@ -747,9 +760,9 @@ cdef class Example:
|
|||
|
||||
@classmethod
|
||||
def from_dict(cls, example_dict, doc=None):
|
||||
token_dict = example_dict["token_annotation"]
|
||||
token_dict = example_dict.get("token_annotation", {})
|
||||
token_annotation = TokenAnnotation.from_dict(token_dict)
|
||||
doc_dict = example_dict["doc_annotation"]
|
||||
doc_dict = example_dict.get("doc_annotation", {})
|
||||
doc_annotation = DocAnnotation.from_dict(doc_dict)
|
||||
return cls(doc_annotation, token_annotation, doc)
|
||||
|
||||
|
@ -791,6 +804,8 @@ cdef class Example:
|
|||
def split_sents(self):
|
||||
""" Split the token annotations into multiple Examples based on
|
||||
sent_starts and return a list of the new Examples"""
|
||||
if not self.token_annotation.words:
|
||||
return [self]
|
||||
s_example = Example(doc=None, doc_annotation=self.doc_annotation)
|
||||
s_ids, s_words, s_tags, s_pos, s_morphs = [], [], [], [], []
|
||||
s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], []
|
||||
|
@ -842,7 +857,7 @@ cdef class Example:
|
|||
if merge:
|
||||
t = self.token_annotation
|
||||
doc = self.doc
|
||||
if self.doc is None:
|
||||
if doc is None or not isinstance(doc, Doc):
|
||||
if not vocab:
|
||||
raise ValueError(Errors.E998)
|
||||
doc = Doc(vocab, words=t.words)
|
||||
|
@ -1052,7 +1067,7 @@ cdef class GoldParse:
|
|||
self.sent_starts = [None] * len(doc)
|
||||
|
||||
# This needs to be done before we align the words
|
||||
if make_projective and heads is not None and deps is not None:
|
||||
if make_projective and any(heads) and any(deps) :
|
||||
heads, deps = nonproj.projectivize(heads, deps)
|
||||
|
||||
# Do many-to-one alignment for misaligned tokens.
|
||||
|
|
|
@ -314,19 +314,20 @@ class Language(object):
|
|||
|
||||
# transform the model's config to an actual Model
|
||||
factory_cfg = dict(config)
|
||||
model_cfg = None
|
||||
|
||||
# check whether we have a proper model config, or load a default one
|
||||
if "model" in factory_cfg and not isinstance(factory_cfg["model"], dict):
|
||||
warnings.warn(Warnings.W099.format(type=type(factory_cfg["model"]), pipe=name))
|
||||
|
||||
# refer to the model configuration in the cfg settings for this component
|
||||
if "model" in factory_cfg:
|
||||
model_cfg = factory_cfg["model"]
|
||||
if not isinstance(model_cfg, dict):
|
||||
warnings.warn(Warnings.W099.format(type=type(model_cfg), pipe=name))
|
||||
model_cfg = None
|
||||
self.config[name] = {"model": factory_cfg["model"]}
|
||||
|
||||
# create all objects in the config
|
||||
factory_cfg = registry.make_from_config({"config": factory_cfg}, validate=True)["config"]
|
||||
model = factory_cfg.get("model", None)
|
||||
if model is not None:
|
||||
del factory_cfg["model"]
|
||||
model = None
|
||||
if model_cfg is not None:
|
||||
self.config[name] = {"model": model_cfg}
|
||||
model = registry.make_from_config({"model": model_cfg}, validate=True)[
|
||||
"model"
|
||||
]
|
||||
return factory(self, model, **factory_cfg)
|
||||
|
||||
def add_pipe(
|
||||
|
@ -517,10 +518,11 @@ class Language(object):
|
|||
def make_doc(self, text):
|
||||
return self.tokenizer(text)
|
||||
|
||||
def update(self, examples, drop=0.0, sgd=None, losses=None, component_cfg=None):
|
||||
def update(self, examples, dummy=None, *, drop=0.0, sgd=None, losses=None, component_cfg=None):
|
||||
"""Update the models in the pipeline.
|
||||
|
||||
examples (iterable): A batch of `Example` or `Doc` objects.
|
||||
dummy: Should not be set - serves to catch backwards-incompatible scripts.
|
||||
drop (float): The dropout rate.
|
||||
sgd (callable): An optimizer.
|
||||
losses (dict): Dictionary to update with the loss, keyed by component.
|
||||
|
@ -529,6 +531,9 @@ class Language(object):
|
|||
|
||||
DOCS: https://spacy.io/api/language#update
|
||||
"""
|
||||
if dummy is not None:
|
||||
raise ValueError(Errors.E989)
|
||||
|
||||
if len(examples) == 0:
|
||||
return
|
||||
examples = Example.to_example_objects(examples, make_doc=self.make_doc)
|
||||
|
@ -735,7 +740,7 @@ class Language(object):
|
|||
contexts = [
|
||||
pipe.use_params(params)
|
||||
for name, pipe in self.pipeline
|
||||
if hasattr(pipe, "use_params")
|
||||
if hasattr(pipe, "use_params") and hasattr(pipe, "model")
|
||||
]
|
||||
# TODO: Having trouble with contextlib
|
||||
# Workaround: these aren't actually context managers atm.
|
||||
|
|
|
@ -1,7 +1,11 @@
|
|||
from pathlib import Path
|
||||
|
||||
from thinc.api import chain, clone, list2ragged, reduce_mean, residual
|
||||
from thinc.api import Model, Maxout, Linear
|
||||
|
||||
from ...util import registry
|
||||
from ...kb import KnowledgeBase
|
||||
from ...vocab import Vocab
|
||||
|
||||
|
||||
@registry.architectures.register("spacy.EntityLinker.v1")
|
||||
|
@ -19,3 +23,11 @@ def build_nel_encoder(tok2vec, nO=None):
|
|||
model.set_ref("output_layer", output_layer)
|
||||
model.set_ref("tok2vec", tok2vec)
|
||||
return model
|
||||
|
||||
|
||||
@registry.assets.register("spacy.KBFromFile.v1")
def load_kb(nlp_path, kb_path) -> KnowledgeBase:
    """Build a KnowledgeBase from files on disk.

    nlp_path: Directory containing a "vocab" subdirectory; the vocab is
        read from that subdirectory.
    kb_path: Location passed on to `KnowledgeBase.load_bulk`.
    RETURNS (KnowledgeBase): The knowledge base after `load_bulk` has run.
    """
    vocab_dir = Path(nlp_path) / "vocab"
    # The KB is created on top of the vocab that was just read from disk.
    knowledge_base = KnowledgeBase(vocab=Vocab().from_disk(vocab_dir))
    knowledge_base.load_bulk(kb_path)
    return knowledge_base
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
import numpy
|
||||
import srsly
|
||||
import random
|
||||
from ast import literal_eval
|
||||
|
||||
from thinc.api import CosineDistance, to_categorical, get_array_module
|
||||
from thinc.api import set_dropout_rate, SequenceCategoricalCrossentropy
|
||||
|
@ -1244,15 +1245,20 @@ class EntityLinker(Pipe):
|
|||
self.vocab = vocab
|
||||
self.model = model
|
||||
self.kb = None
|
||||
self.kb = cfg.get("kb", None)
|
||||
if self.kb is None:
|
||||
# create an empty KB that should be filled by calling from_disk
|
||||
self.kb = KnowledgeBase(vocab=vocab)
|
||||
else:
|
||||
del cfg["kb"] # we don't want to duplicate its serialization
|
||||
if not isinstance(self.kb, KnowledgeBase):
|
||||
raise ValueError(Errors.E990.format(type=type(self.kb)))
|
||||
self.cfg = dict(cfg)
|
||||
self.distance = CosineDistance(normalize=False)
|
||||
|
||||
def set_kb(self, kb):
|
||||
self.kb = kb
|
||||
|
||||
def require_kb(self):
|
||||
# Raise an error if the knowledge base is not initialized.
|
||||
if getattr(self, "kb", None) in (None, True, False):
|
||||
if len(self.kb) == 0:
|
||||
raise ValueError(Errors.E139.format(name=self.name))
|
||||
|
||||
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
|
||||
|
@ -1285,6 +1291,8 @@ class EntityLinker(Pipe):
|
|||
ents_by_offset[(ent.start_char, ent.end_char)] = ent
|
||||
|
||||
for entity, kb_dict in gold.links.items():
|
||||
if isinstance(entity, str):
|
||||
entity = literal_eval(entity)
|
||||
start, end = entity
|
||||
mention = doc.text[start:end]
|
||||
|
||||
|
@ -1375,7 +1383,6 @@ class EntityLinker(Pipe):
|
|||
def predict(self, docs):
|
||||
""" Return the KB IDs for each entity in each doc, including NIL if there is no prediction """
|
||||
self.require_kb()
|
||||
|
||||
entity_count = 0
|
||||
final_kb_ids = []
|
||||
final_tensors = []
|
||||
|
@ -1486,9 +1493,8 @@ class EntityLinker(Pipe):
|
|||
raise ValueError(Errors.E149)
|
||||
|
||||
def load_kb(p):
|
||||
kb = KnowledgeBase(vocab=self.vocab, entity_vector_length=self.cfg["entity_width"])
|
||||
kb.load_bulk(p)
|
||||
self.set_kb(kb)
|
||||
self.kb = KnowledgeBase(vocab=self.vocab, entity_vector_length=self.cfg["entity_width"])
|
||||
self.kb.load_bulk(p)
|
||||
|
||||
deserialize = {}
|
||||
deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
|
||||
|
|
|
@ -203,8 +203,8 @@ def test_preserving_links_asdoc(nlp):
|
|||
ruler.add_patterns(patterns)
|
||||
nlp.add_pipe(ruler)
|
||||
|
||||
el_pipe = nlp.create_pipe(name="entity_linker")
|
||||
el_pipe.set_kb(mykb)
|
||||
cfg = {"kb": mykb, "incl_prior": False}
|
||||
el_pipe = nlp.create_pipe(name="entity_linker", config=cfg)
|
||||
el_pipe.begin_training()
|
||||
el_pipe.incl_context = False
|
||||
el_pipe.incl_prior = True
|
||||
|
@ -288,8 +288,7 @@ def test_overfitting_IO():
|
|||
mykb.add_alias(alias="Russ Cochran", entities=["Q2146908", "Q7381115"], probabilities=[0.5, 0.5])
|
||||
|
||||
# Create the Entity Linker component and add it to the pipeline
|
||||
entity_linker = nlp.create_pipe("entity_linker")
|
||||
entity_linker.set_kb(mykb)
|
||||
entity_linker = nlp.create_pipe("entity_linker", config={"kb": mykb})
|
||||
nlp.add_pipe(entity_linker, last=True)
|
||||
|
||||
# train the NEL pipe
|
||||
|
|
|
@ -34,6 +34,7 @@ class registry(thinc.registry):
|
|||
lookups = catalogue.create("spacy", "lookups", entry_points=True)
|
||||
factories = catalogue.create("spacy", "factories", entry_points=True)
|
||||
displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True)
|
||||
assets = catalogue.create("spacy", "assets", entry_points=True)
|
||||
|
||||
|
||||
def set_env_log(value):
|
||||
|
@ -160,6 +161,7 @@ def load_model_from_path(model_path, meta=False, **overrides):
|
|||
for name in pipeline:
|
||||
if name not in disable:
|
||||
config = meta.get("pipeline_args", {}).get(name, {})
|
||||
config.update(overrides)
|
||||
factory = factories.get(name, name)
|
||||
if nlp_config.get(name, None):
|
||||
model_config = nlp_config[name]["model"]
|
||||
|
|
Loading…
Reference in New Issue
Block a user