Mirror of https://github.com/explosion/spaCy.git
Update EL example (#4789)

* update EL example script after sentence-central refactor
* version bump
* set incl_prior to False for quick demo purposes
* clean up

parent: 38e1bc19f4
commit: 5355b0038f
@@ -8,8 +8,8 @@ For more details, see the documentation:
 * Knowledge base: https://spacy.io/api/kb
 * Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking
 
-Compatible with: spaCy v2.2
-Last tested with: v2.2
+Compatible with: spaCy v2.2.3
+Last tested with: v2.2.3
 """
 from __future__ import unicode_literals, print_function
 
@@ -8,8 +8,8 @@ For more details, see the documentation:
 * Training: https://spacy.io/usage/training
 * Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking
 
-Compatible with: spaCy v2.2
-Last tested with: v2.2
+Compatible with: spaCy v2.2.3
+Last tested with: v2.2.3
 """
 from __future__ import unicode_literals, print_function
 
@@ -22,6 +22,7 @@ from spacy.vocab import Vocab
 
 import spacy
 from spacy.kb import KnowledgeBase
+from spacy.pipeline import EntityRuler
 from spacy.tokens import Span
 from spacy.util import minibatch, compounding
 
@@ -70,22 +71,35 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
     nlp.vocab.vectors.name = "spacy_pretrained_vectors"
     print("Created blank 'en' model with vocab from '%s'" % vocab_path)
 
-    # create the built-in pipeline components and add them to the pipeline
-    # nlp.create_pipe works for built-ins that are registered with spaCy
+    # Add a sentencizer component. Alternatively, add a dependency parser for higher accuracy.
+    nlp.add_pipe(nlp.create_pipe('sentencizer'))
+
+    # Add a custom component to recognize "Russ Cochran" as an entity for the example training data.
+    # Note that in a realistic application, an actual NER algorithm should be used instead.
+    ruler = EntityRuler(nlp)
+    patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}]
+    ruler.add_patterns(patterns)
+    nlp.add_pipe(ruler)
+
+    # Create the Entity Linker component and add it to the pipeline.
     if "entity_linker" not in nlp.pipe_names:
-        entity_linker = nlp.create_pipe("entity_linker")
+        # use only the predicted EL score and not the prior probability (for demo purposes)
+        cfg = {"incl_prior": False}
+        entity_linker = nlp.create_pipe("entity_linker", cfg)
         kb = KnowledgeBase(vocab=nlp.vocab)
         kb.load_bulk(kb_path)
         print("Loaded Knowledge Base from '%s'" % kb_path)
         entity_linker.set_kb(kb)
         nlp.add_pipe(entity_linker, last=True)
-    else:
-        entity_linker = nlp.get_pipe("entity_linker")
-        kb = entity_linker.kb
 
-    # make sure the annotated examples correspond to known identifiers in the knowledge base
-    kb_ids = kb.get_entity_strings()
+    # Convert the texts to docs to make sure we have doc.ents set for the training examples.
+    # Also ensure that the annotated examples correspond to known identifiers in the knowledge base.
+    kb_ids = nlp.get_pipe("entity_linker").kb.get_entity_strings()
+    TRAIN_DOCS = []
     for text, annotation in TRAIN_DATA:
+        with nlp.disable_pipes("entity_linker"):
+            doc = nlp(text)
+        annotation_clean = annotation
         for offset, kb_id_dict in annotation["links"].items():
             new_dict = {}
             for kb_id, value in kb_id_dict.items():
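
The pipeline changes above can be exercised on their own. Below is a minimal sketch (spaCy v2.x API, no knowledge base needed) showing that the sentencizer and the EntityRuler together give the entity linker what it later relies on: sentence boundaries and a PERSON entity for "Russ Cochran". The example sentence is illustrative.

# Sketch: the sentencizer + EntityRuler setup introduced in the hunk above.
import spacy
from spacy.pipeline import EntityRuler

nlp = spacy.blank("en")
nlp.add_pipe(nlp.create_pipe("sentencizer"))

ruler = EntityRuler(nlp)
ruler.add_patterns([{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}])
nlp.add_pipe(ruler)

doc = nlp("Russ Cochran reprinted EC Comics. He was also a publisher.")
print([(ent.text, ent.label_) for ent in doc.ents])  # [('Russ Cochran', 'PERSON')]
print(len(list(doc.sents)))                          # 2, thanks to the sentencizer
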
@@ -95,7 +109,8 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
                     print(
                         "Removed", kb_id, "from training because it is not in the KB."
                     )
-            annotation["links"][offset] = new_dict
+            annotation_clean["links"][offset] = new_dict
+        TRAIN_DOCS.append((doc, annotation_clean))
 
     # get names of other pipes to disable them during training
     other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
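
For reference, the annotations being cleaned and paired with docs here follow spaCy's entity-linking training format: each text carries a "links" dict that maps a mention's character offsets to candidate KB identifiers with gold values. A hypothetical single example (the offsets and QIDs are placeholders, not taken from this diff):

# Hypothetical sketch of the EL training format consumed by the loop above:
# (start_char, end_char) offsets of a mention, each mapped to KB IDs with
# a gold value (1.0 = correct entity, 0.0 = wrong candidate).
TRAIN_DATA = [
    (
        "Russ Cochran captured his first major title with his son as caddie.",
        {"links": {(0, 12): {"Q2146908": 1.0, "Q7381115": 0.0}}},
    )
]
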
@@ -103,10 +118,10 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
     # reset and initialize the weights randomly
     optimizer = nlp.begin_training()
     for itn in range(n_iter):
-        random.shuffle(TRAIN_DATA)
+        random.shuffle(TRAIN_DOCS)
         losses = {}
         # batch up the examples using spaCy's minibatch
-        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
+        batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))
         for batch in batches:
             texts, annotations = zip(*batch)
             nlp.update(
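
As context for the batching lines, minibatch with a compounding generator yields batches whose size starts at 4 and grows toward 32 by a factor of 1.001 per batch. A self-contained sketch of just that behavior:

# Sketch: batch sizes produced by minibatch + compounding (spacy.util).
from spacy.util import minibatch, compounding

items = list(range(10))  # stand-in for the (doc, annotation) pairs in TRAIN_DOCS
for batch in minibatch(items, size=compounding(4.0, 32.0, 1.001)):
    print(len(batch))  # 4, 4, 2: the size compounds from 4.0 toward 32.0
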
@@ -138,16 +153,8 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
 
 def _apply_model(nlp):
     for text, annotation in TRAIN_DATA:
-        doc = nlp.tokenizer(text)
-
-        # set entities so the evaluation is independent of the NER step
-        # all the examples contain 'Russ Cochran' as the first two tokens in the sentence
-        rc_ent = Span(doc, 0, 2, label=PERSON)
-        doc.ents = [rc_ent]
-
-        # apply the entity linker which will now make predictions for the 'Russ Cochran' entities
-        doc = nlp.get_pipe("entity_linker")(doc)
+        doc = nlp(text)
 
         print()
         print("Entities", [(ent.text, ent.label_, ent.kb_id_) for ent in doc.ents])
         print("Tokens", [(t.text, t.ent_type_, t.ent_kb_id_) for t in doc])
@@ -531,6 +531,9 @@ class Errors(object):
             "{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.")
     E186 = ("'{tok_a}' and '{tok_b}' are different texts.")
     E187 = ("Only unicode strings are supported as labels.")
+    E188 = ("Could not match the gold entity links to entities in the doc - "
+            "make sure the gold EL data refers to valid results of the "
+            "named entity recognizer in the `nlp` pipeline.")
 
 
 @add_codes
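
For context, messages on this Errors class are stored as plain strings, and the @add_codes decorator seen at the bottom of the hunk prefixes each message with its code at attribute access. A simplified sketch of that pattern (mirroring spaCy's decorator, not quoting it):

# Simplified sketch of the @add_codes pattern used by the Errors class above.
def add_codes(cls):
    class ErrorsWithCodes(object):
        def __getattribute__(self, code):
            msg = getattr(cls, code)
            return "[{}] {}".format(code, msg)
    return ErrorsWithCodes()

@add_codes
class Errors(object):
    E188 = ("Could not match the gold entity links to entities in the doc - "
            "make sure the gold EL data refers to valid results of the "
            "named entity recognizer in the `nlp` pipeline.")

print(Errors.E188)  # "[E188] Could not match the gold entity links ..."
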
@@ -1220,13 +1220,20 @@ class EntityLinker(Pipe):
             for entity, kb_dict in gold.links.items():
                 start, end = entity
                 mention = doc.text[start:end]
 
+                # the gold annotations should link to proper entities - if this fails, the dataset is likely corrupt
+                if not (start, end) in ents_by_offset:
+                    raise RuntimeError(Errors.E188)
                 ent = ents_by_offset[(start, end)]
 
                 for kb_id, value in kb_dict.items():
                     # Currently only training on the positive instances
                     if value:
-                        sentence_docs.append(ent.sent.as_doc())
+                        try:
+                            sentence_docs.append(ent.sent.as_doc())
+                        except AttributeError:
+                            # Catch the exception when ent.sent is None and provide a user-friendly warning
+                            raise RuntimeError(Errors.E030)
 
         sentence_encodings, bp_context = self.model.begin_update(sentence_docs, drop=drop)
         loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds, docs=None)
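
The try/except added above guards ent.sent, which only works once sentence boundaries are set (hence the sentencizer added to the example pipeline). A short sketch (spaCy v2.x API) of the Span.sent / Span.as_doc() behavior the sentence-central refactor relies on:

# Sketch: Span.sent and Span.as_doc(), the calls guarded in the hunk above.
import spacy

nlp = spacy.blank("en")
nlp.add_pipe(nlp.create_pipe("sentencizer"))  # span.sent needs sentence boundaries
doc = nlp("Russ Cochran was a publisher. He reprinted EC Comics.")

span = doc[0:2]                # the tokens "Russ Cochran"
sent_doc = span.sent.as_doc()  # the containing sentence as a standalone Doc
print(sent_doc.text)           # "Russ Cochran was a publisher."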