Update EL example (#4789)

* update EL example script after sentence-central refactor

* version bump

* set incl_prior to False for quick demo purposes

* clean up
Author: Sofie Van Landeghem, 2019-12-11 18:19:42 +01:00 (committed by Ines Montani)
parent 38e1bc19f4
commit 5355b0038f
4 changed files with 42 additions and 25 deletions
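
For quick reference, the "incl_prior" change described above is passed as a component config when the entity linker is created. A minimal sketch of that pattern, assuming the spaCy v2.2.x API used in this commit ("my_kb" is a hypothetical path to a previously serialized knowledge base):

    # Sketch only: create an entity linker that ignores prior probabilities
    # and relies on the context-based EL score alone (demo setting).
    import spacy
    from spacy.kb import KnowledgeBase

    nlp = spacy.blank("en")
    cfg = {"incl_prior": False}
    entity_linker = nlp.create_pipe("entity_linker", cfg)
    kb = KnowledgeBase(vocab=nlp.vocab)
    kb.load_bulk("my_kb")  # hypothetical path to a serialized KB
    entity_linker.set_kb(kb)
    nlp.add_pipe(entity_linker, last=True)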

examples/training/create_kb.py

@@ -8,8 +8,8 @@ For more details, see the documentation:
 * Knowledge base: https://spacy.io/api/kb
 * Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking
-Compatible with: spaCy v2.2
-Last tested with: v2.2
+Compatible with: spaCy v2.2.3
+Last tested with: v2.2.3
 """
 from __future__ import unicode_literals, print_function

examples/training/train_entity_linker.py

@@ -8,8 +8,8 @@ For more details, see the documentation:
 * Training: https://spacy.io/usage/training
 * Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking
-Compatible with: spaCy v2.2
-Last tested with: v2.2
+Compatible with: spaCy v2.2.3
+Last tested with: v2.2.3
 """
 from __future__ import unicode_literals, print_function
@@ -22,6 +22,7 @@ from spacy.vocab import Vocab
 import spacy
 from spacy.kb import KnowledgeBase
+from spacy.pipeline import EntityRuler
 from spacy.tokens import Span
 from spacy.util import minibatch, compounding
 
@@ -70,22 +71,35 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
         nlp.vocab.vectors.name = "spacy_pretrained_vectors"
         print("Created blank 'en' model with vocab from '%s'" % vocab_path)
 
-    # create the built-in pipeline components and add them to the pipeline
-    # nlp.create_pipe works for built-ins that are registered with spaCy
+    # Add a sentencizer component. Alternatively, add a dependency parser for higher accuracy.
+    nlp.add_pipe(nlp.create_pipe('sentencizer'))
+
+    # Add a custom component to recognize "Russ Cochran" as an entity for the example training data.
+    # Note that in a realistic application, an actual NER algorithm should be used instead.
+    ruler = EntityRuler(nlp)
+    patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}]
+    ruler.add_patterns(patterns)
+    nlp.add_pipe(ruler)
+
+    # Create the Entity Linker component and add it to the pipeline.
     if "entity_linker" not in nlp.pipe_names:
-        entity_linker = nlp.create_pipe("entity_linker")
+        # use only the predicted EL score and not the prior probability (for demo purposes)
+        cfg = {"incl_prior": False}
+        entity_linker = nlp.create_pipe("entity_linker", cfg)
         kb = KnowledgeBase(vocab=nlp.vocab)
         kb.load_bulk(kb_path)
         print("Loaded Knowledge Base from '%s'" % kb_path)
         entity_linker.set_kb(kb)
         nlp.add_pipe(entity_linker, last=True)
-    else:
-        entity_linker = nlp.get_pipe("entity_linker")
-        kb = entity_linker.kb
 
-    # make sure the annotated examples correspond to known identifiers in the knowledge base
-    kb_ids = kb.get_entity_strings()
+    # Convert the texts to docs to make sure we have doc.ents set for the training examples.
+    # Also ensure that the annotated examples correspond to known identifiers in the knowledge base.
+    kb_ids = nlp.get_pipe("entity_linker").kb.get_entity_strings()
+    TRAIN_DOCS = []
     for text, annotation in TRAIN_DATA:
+        with nlp.disable_pipes("entity_linker"):
+            doc = nlp(text)
+        annotation_clean = annotation
         for offset, kb_id_dict in annotation["links"].items():
             new_dict = {}
             for kb_id, value in kb_id_dict.items():
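
As an aside, the EntityRuler stand-in introduced above can be sanity-checked in isolation, without any knowledge base. A minimal sketch, assuming spaCy v2.2.x (the sentencizer is included because the entity linker later needs sentence boundaries):

    import spacy
    from spacy.pipeline import EntityRuler

    nlp = spacy.blank("en")
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    ruler = EntityRuler(nlp)
    ruler.add_patterns([{"label": "PERSON",
                         "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}])
    nlp.add_pipe(ruler)

    doc = nlp("Russ Cochran published the comics.")
    print([(ent.text, ent.label_) for ent in doc.ents])
    # expected: [('Russ Cochran', 'PERSON')]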
@@ -95,7 +109,8 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
                     print(
                         "Removed", kb_id, "from training because it is not in the KB."
                     )
-            annotation["links"][offset] = new_dict
+            annotation_clean["links"][offset] = new_dict
+        TRAIN_DOCS.append((doc, annotation_clean))
 
     # get names of other pipes to disable them during training
     other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
@@ -103,10 +118,10 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
         # reset and initialize the weights randomly
         optimizer = nlp.begin_training()
         for itn in range(n_iter):
-            random.shuffle(TRAIN_DATA)
+            random.shuffle(TRAIN_DOCS)
             losses = {}
             # batch up the examples using spaCy's minibatch
-            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
+            batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))
             for batch in batches:
                 texts, annotations = zip(*batch)
                 nlp.update(
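
Note that shuffling and batching now operate on the precomputed (doc, annotation) pairs in TRAIN_DOCS rather than on raw texts. For readers unfamiliar with the batching helpers, a small self-contained sketch of how compounding grows the batch size from 4 toward 32:

    from spacy.util import minibatch, compounding

    items = list(range(20))  # stand-in for TRAIN_DOCS
    batches = minibatch(items, size=compounding(4.0, 32.0, 1.001))
    # batch sizes start at 4 and grow by a factor of 1.001 per batch, capped at 32
    print([len(batch) for batch in batches])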
@@ -138,16 +153,8 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
 def _apply_model(nlp):
     for text, annotation in TRAIN_DATA:
-        doc = nlp.tokenizer(text)
-
-        # set entities so the evaluation is independent of the NER step
-        # all the examples contain 'Russ Cochran' as the first two tokens in the sentence
-        rc_ent = Span(doc, 0, 2, label=PERSON)
-        doc.ents = [rc_ent]
-
         # apply the entity linker which will now make predictions for the 'Russ Cochran' entities
-        doc = nlp.get_pipe("entity_linker")(doc)
+        doc = nlp(text)
         print()
         print("Entities", [(ent.text, ent.label_, ent.kb_id_) for ent in doc.ents])
         print("Tokens", [(t.text, t.ent_type_, t.ent_kb_id_) for t in doc])

spacy/errors.py

@@ -531,6 +531,9 @@ class Errors(object):
             "{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.")
     E186 = ("'{tok_a}' and '{tok_b}' are different texts.")
     E187 = ("Only unicode strings are supported as labels.")
+    E188 = ("Could not match the gold entity links to entities in the doc - "
+            "make sure the gold EL data refers to valid results of the "
+            "named entity recognizer in the `nlp` pipeline.")
 
 
 @add_codes

spacy/pipeline/pipes.pyx

@@ -1220,13 +1220,20 @@ class EntityLinker(Pipe):
            for entity, kb_dict in gold.links.items():
                start, end = entity
                mention = doc.text[start:end]
                # the gold annotations should link to proper entities - if this fails, the dataset is likely corrupt
+               if not (start, end) in ents_by_offset:
+                   raise RuntimeError(Errors.E188)
                ent = ents_by_offset[(start, end)]
 
                for kb_id, value in kb_dict.items():
                    # Currently only training on the positive instances
                    if value:
-                       sentence_docs.append(ent.sent.as_doc())
+                       try:
+                           sentence_docs.append(ent.sent.as_doc())
+                       except AttributeError:
+                           # Catch the exception when ent.sent is None and provide a user-friendly warning
+                           raise RuntimeError(Errors.E030)
 
        sentence_encodings, bp_context = self.model.begin_update(sentence_docs, drop=drop)
        loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds, docs=None)
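
The try/except added above exists because ent.sent requires sentence boundaries on the Doc, which is exactly why the example script now adds a sentencizer. A minimal sketch of the failure mode it guards against, assuming spaCy v2.2.x and a blank pipeline with no sentencizer or parser:

    import spacy
    from spacy.tokens import Span

    nlp = spacy.blank("en")  # no sentencizer, no parser
    doc = nlp("Russ Cochran published the comics.")
    doc.ents = [Span(doc, 0, 2, label="PERSON")]
    try:
        sent_doc = doc.ents[0].sent.as_doc()  # needs sentence boundaries
    except (AttributeError, ValueError) as err:
        # Without sentence boundaries ent.sent is unusable; the new guard in
        # EntityLinker.update() surfaces this as the clearer E030 error.
        print("sentence boundaries missing:", err)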