Update EL example (#4789)

* update EL example script after sentence-central refactor

* version bump

* set incl_prior to False for quick demo purposes

* clean up
Author: Sofie Van Landeghem, 2019-12-11 18:19:42 +01:00 (committed by Ines Montani)
parent 38e1bc19f4
commit 5355b0038f
4 changed files with 42 additions and 25 deletions
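
For quick reference, the "incl_prior" change described above is passed as a component config when the entity linker is created. A minimal sketch of that pattern, assuming the spaCy v2.2.x API used in this commit ("my_kb" is a hypothetical path to a previously serialized knowledge base):

    # Sketch only: create an entity linker that ignores prior probabilities
    # and relies on the context-based EL score alone (demo setting).
    import spacy
    from spacy.kb import KnowledgeBase

    nlp = spacy.blank("en")
    cfg = {"incl_prior": False}
    entity_linker = nlp.create_pipe("entity_linker", cfg)
    kb = KnowledgeBase(vocab=nlp.vocab)
    kb.load_bulk("my_kb")  # hypothetical path to a serialized KB
    entity_linker.set_kb(kb)
    nlp.add_pipe(entity_linker, last=True)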

examples/training/create_kb.py

@@ -8,8 +8,8 @@ For more details, see the documentation:
 * Knowledge base: https://spacy.io/api/kb
 * Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking
-Compatible with: spaCy v2.2
-Last tested with: v2.2
+Compatible with: spaCy v2.2.3
+Last tested with: v2.2.3
 """
 from __future__ import unicode_literals, print_function

examples/training/train_entity_linker.py

@@ -8,8 +8,8 @@ For more details, see the documentation:
 * Training: https://spacy.io/usage/training
 * Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking
-Compatible with: spaCy v2.2
-Last tested with: v2.2
+Compatible with: spaCy v2.2.3
+Last tested with: v2.2.3
 """
 from __future__ import unicode_literals, print_function
@@ -22,6 +22,7 @@ from spacy.vocab import Vocab
 import spacy
 from spacy.kb import KnowledgeBase
+from spacy.pipeline import EntityRuler
 from spacy.tokens import Span
 from spacy.util import minibatch, compounding
 
@@ -70,22 +71,35 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
         nlp.vocab.vectors.name = "spacy_pretrained_vectors"
         print("Created blank 'en' model with vocab from '%s'" % vocab_path)
 
-    # create the built-in pipeline components and add them to the pipeline
-    # nlp.create_pipe works for built-ins that are registered with spaCy
+    # Add a sentencizer component. Alternatively, add a dependency parser for higher accuracy.
+    nlp.add_pipe(nlp.create_pipe('sentencizer'))
+
+    # Add a custom component to recognize "Russ Cochran" as an entity for the example training data.
+    # Note that in a realistic application, an actual NER algorithm should be used instead.
+    ruler = EntityRuler(nlp)
+    patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}]
+    ruler.add_patterns(patterns)
+    nlp.add_pipe(ruler)
+
+    # Create the Entity Linker component and add it to the pipeline.
     if "entity_linker" not in nlp.pipe_names:
-        entity_linker = nlp.create_pipe("entity_linker")
+        # use only the predicted EL score and not the prior probability (for demo purposes)
+        cfg = {"incl_prior": False}
+        entity_linker = nlp.create_pipe("entity_linker", cfg)
         kb = KnowledgeBase(vocab=nlp.vocab)
         kb.load_bulk(kb_path)
         print("Loaded Knowledge Base from '%s'" % kb_path)
         entity_linker.set_kb(kb)
         nlp.add_pipe(entity_linker, last=True)
-    else:
-        entity_linker = nlp.get_pipe("entity_linker")
-        kb = entity_linker.kb
 
-    # make sure the annotated examples correspond to known identifiers in the knowledge base
-    kb_ids = kb.get_entity_strings()
+    # Convert the texts to docs to make sure we have doc.ents set for the training examples.
+    # Also ensure that the annotated examples correspond to known identifiers in the knowledge base.
+    kb_ids = nlp.get_pipe("entity_linker").kb.get_entity_strings()
+    TRAIN_DOCS = []
     for text, annotation in TRAIN_DATA:
+        with nlp.disable_pipes("entity_linker"):
+            doc = nlp(text)
+        annotation_clean = annotation
         for offset, kb_id_dict in annotation["links"].items():
             new_dict = {}
             for kb_id, value in kb_id_dict.items():
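
As an aside, the EntityRuler stand-in introduced above can be sanity-checked in isolation, without any knowledge base. A minimal sketch, assuming spaCy v2.2.x (the sentencizer is included because the entity linker later needs sentence boundaries):

    import spacy
    from spacy.pipeline import EntityRuler

    nlp = spacy.blank("en")
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    ruler = EntityRuler(nlp)
    ruler.add_patterns([{"label": "PERSON",
                         "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}])
    nlp.add_pipe(ruler)

    doc = nlp("Russ Cochran published the comics.")
    print([(ent.text, ent.label_) for ent in doc.ents])
    # expected: [('Russ Cochran', 'PERSON')]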
@@ -95,7 +109,8 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
                     print(
                         "Removed", kb_id, "from training because it is not in the KB."
                     )
-            annotation["links"][offset] = new_dict
+            annotation_clean["links"][offset] = new_dict
+        TRAIN_DOCS.append((doc, annotation_clean))
 
     # get names of other pipes to disable them during training
     other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
@@ -103,10 +118,10 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
         # reset and initialize the weights randomly
         optimizer = nlp.begin_training()
         for itn in range(n_iter):
-            random.shuffle(TRAIN_DATA)
+            random.shuffle(TRAIN_DOCS)
             losses = {}
             # batch up the examples using spaCy's minibatch
-            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
+            batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))
             for batch in batches:
                 texts, annotations = zip(*batch)
                 nlp.update(
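
Note that shuffling and batching now operate on the precomputed (doc, annotation) pairs in TRAIN_DOCS rather than on raw texts. For readers unfamiliar with the batching helpers, a small self-contained sketch of how compounding grows the batch size from 4 toward 32:

    from spacy.util import minibatch, compounding

    items = list(range(20))  # stand-in for TRAIN_DOCS
    batches = minibatch(items, size=compounding(4.0, 32.0, 1.001))
    # batch sizes start at 4 and grow by a factor of 1.001 per batch, capped at 32
    print([len(batch) for batch in batches])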
@@ -138,16 +153,8 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
 def _apply_model(nlp):
     for text, annotation in TRAIN_DATA:
-        doc = nlp.tokenizer(text)
-
-        # set entities so the evaluation is independent of the NER step
-        # all the examples contain 'Russ Cochran' as the first two tokens in the sentence
-        rc_ent = Span(doc, 0, 2, label=PERSON)
-        doc.ents = [rc_ent]
-
         # apply the entity linker which will now make predictions for the 'Russ Cochran' entities
-        doc = nlp.get_pipe("entity_linker")(doc)
+        doc = nlp(text)
         print()
         print("Entities", [(ent.text, ent.label_, ent.kb_id_) for ent in doc.ents])
         print("Tokens", [(t.text, t.ent_type_, t.ent_kb_id_) for t in doc])

spacy/errors.py

@@ -531,6 +531,9 @@ class Errors(object):
             "{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.")
     E186 = ("'{tok_a}' and '{tok_b}' are different texts.")
     E187 = ("Only unicode strings are supported as labels.")
+    E188 = ("Could not match the gold entity links to entities in the doc - "
+            "make sure the gold EL data refers to valid results of the "
+            "named entity recognizer in the `nlp` pipeline.")
 
 
 @add_codes

spacy/pipeline/pipes.pyx

@@ -1220,13 +1220,20 @@ class EntityLinker(Pipe):
            for entity, kb_dict in gold.links.items():
                start, end = entity
                mention = doc.text[start:end]
                # the gold annotations should link to proper entities - if this fails, the dataset is likely corrupt
+               if not (start, end) in ents_by_offset:
+                   raise RuntimeError(Errors.E188)
                ent = ents_by_offset[(start, end)]
 
                for kb_id, value in kb_dict.items():
                    # Currently only training on the positive instances
                    if value:
-                       sentence_docs.append(ent.sent.as_doc())
+                       try:
+                           sentence_docs.append(ent.sent.as_doc())
+                       except AttributeError:
+                           # Catch the exception when ent.sent is None and provide a user-friendly warning
+                           raise RuntimeError(Errors.E030)
 
        sentence_encodings, bp_context = self.model.begin_update(sentence_docs, drop=drop)
        loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds, docs=None)
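
The try/except added above exists because ent.sent requires sentence boundaries on the Doc, which is exactly why the example script now adds a sentencizer. A minimal sketch of the failure mode it guards against, assuming spaCy v2.2.x and a blank pipeline with no sentencizer or parser:

    import spacy
    from spacy.tokens import Span

    nlp = spacy.blank("en")  # no sentencizer, no parser
    doc = nlp("Russ Cochran published the comics.")
    doc.ents = [Span(doc, 0, 2, label="PERSON")]
    try:
        sent_doc = doc.ents[0].sent.as_doc()  # needs sentence boundaries
    except (AttributeError, ValueError) as err:
        # Without sentence boundaries ent.sent is unusable; the new guard in
        # EntityLinker.update() surfaces this as the clearer E030 error.
        print("sentence boundaries missing:", err)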