Update EL example (#4789)

* update EL example script after sentence-central refactor
* version bump
* set incl_prior to False for quick demo purposes
* clean up
2025-10-18 09:44:16 +03:00 · 2019-12-11 18:19:42 +01:00 · 2019-12-11 18:19:42 +01:00 · 5355b0038f
commit 5355b0038f
parent 38e1bc19f4
4 changed files with 42 additions and 25 deletions
--- a/examples/training/pretrain_kb.py
+++ b/examples/training/pretrain_kb.py
@ -8,8 +8,8 @@ For more details, see the documentation:
 * Knowledge base: https://spacy.io/api/kb
 * Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking

-Compatible with: spaCy v2.2
-Last tested with: v2.2
+Compatible with: spaCy v2.2.3
+Last tested with: v2.2.3
 """
 from __future__ import unicode_literals, print_function

--- a/examples/training/train_entity_linker.py
+++ b/examples/training/train_entity_linker.py
@ -8,8 +8,8 @@ For more details, see the documentation:
 * Training: https://spacy.io/usage/training
 * Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking

-Compatible with: spaCy v2.2
-Last tested with: v2.2
+Compatible with: spaCy v2.2.3
+Last tested with: v2.2.3
 """
 from __future__ import unicode_literals, print_function

@ -22,6 +22,7 @@ from spacy.vocab import Vocab

 import spacy
 from spacy.kb import KnowledgeBase
+from spacy.pipeline import EntityRuler
 from spacy.tokens import Span
 from spacy.util import minibatch, compounding

@ -70,22 +71,35 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
    nlp.vocab.vectors.name = "spacy_pretrained_vectors"
    print("Created blank 'en' model with vocab from '%s'" % vocab_path)

-    # create the built-in pipeline components and add them to the pipeline
-    # nlp.create_pipe works for built-ins that are registered with spaCy
+    # Add a sentencizer component. Alternatively, add a dependency parser for higher accuracy.
+    nlp.add_pipe(nlp.create_pipe('sentencizer'))
+
+    # Add a custom component to recognize "Russ Cochran" as an entity for the example training data.
+    # Note that in a realistic application, an actual NER algorithm should be used instead.
+    ruler = EntityRuler(nlp)
+    patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}]
+    ruler.add_patterns(patterns)
+    nlp.add_pipe(ruler)
+
+    # Create the Entity Linker component and add it to the pipeline.
    if "entity_linker" not in nlp.pipe_names:
-        entity_linker = nlp.create_pipe("entity_linker")
+        # use only the predicted EL score and not the prior probability (for demo purposes)
+        cfg = {"incl_prior": False}
+        entity_linker = nlp.create_pipe("entity_linker", cfg)
        kb = KnowledgeBase(vocab=nlp.vocab)
        kb.load_bulk(kb_path)
        print("Loaded Knowledge Base from '%s'" % kb_path)
        entity_linker.set_kb(kb)
        nlp.add_pipe(entity_linker, last=True)
-    else:
-        entity_linker = nlp.get_pipe("entity_linker")
-        kb = entity_linker.kb

-    # make sure the annotated examples correspond to known identifiers in the knowlege base
-    kb_ids = kb.get_entity_strings()
+    # Convert the texts to docs to make sure we have doc.ents set for the training examples.
+    # Also ensure that the annotated examples correspond to known identifiers in the knowledge base.
+    kb_ids = nlp.get_pipe("entity_linker").kb.get_entity_strings()
+    TRAIN_DOCS = []
    for text, annotation in TRAIN_DATA:
+        with nlp.disable_pipes("entity_linker"):
+            doc = nlp(text)
+        annotation_clean = annotation
        for offset, kb_id_dict in annotation["links"].items():
            new_dict = {}
            for kb_id, value in kb_id_dict.items():
@ -95,7 +109,8 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
                    print(
                        "Removed", kb_id, "from training because it is not in the KB."
                    )
-            annotation["links"][offset] = new_dict
+            annotation_clean["links"][offset] = new_dict
+        TRAIN_DOCS.append((doc, annotation_clean))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
@ -103,10 +118,10 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
        # reset and initialize the weights randomly
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
-            random.shuffle(TRAIN_DATA)
+            random.shuffle(TRAIN_DOCS)
            losses = {}
            # batch up the examples using spaCy's minibatch
-            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
+            batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
@ -138,16 +153,8 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):

 def _apply_model(nlp):
    for text, annotation in TRAIN_DATA:
-        doc = nlp.tokenizer(text)
-
-        # set entities so the evaluation is independent of the NER step
-        # all the examples contain 'Russ Cochran' as the first two tokens in the sentence
-        rc_ent = Span(doc, 0, 2, label=PERSON)
-        doc.ents = [rc_ent]
-
        # apply the entity linker which will now make predictions for the 'Russ Cochran' entities
-        doc = nlp.get_pipe("entity_linker")(doc)
-
+        doc = nlp(text)
        print()
        print("Entities", [(ent.text, ent.label_, ent.kb_id_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_kb_id_) for t in doc])
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -531,6 +531,9 @@ class Errors(object):
            "{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.")
    E186 = ("'{tok_a}' and '{tok_b}' are different texts.")
    E187 = ("Only unicode strings are supported as labels.")
+    E188 = ("Could not match the gold entity links to entities in the doc - "
+            "make sure the gold EL data refers to valid results of the "
+            "named entity recognizer in the `nlp` pipeline.")


@add_codes
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@ -1220,13 +1220,20 @@ class EntityLinker(Pipe):
            for entity, kb_dict in gold.links.items():
                start, end = entity
                mention = doc.text[start:end]
+
                # the gold annotations should link to proper entities - if this fails, the dataset is likely corrupt
+                if not (start, end) in ents_by_offset:
+                    raise RuntimeError(Errors.E188)
                ent = ents_by_offset[(start, end)]

                for kb_id, value in kb_dict.items():
                    # Currently only training on the positive instances
                    if value:
+                        try:
                            sentence_docs.append(ent.sent.as_doc())
+                        except AttributeError:
+                            # Catch the exception when ent.sent is None and raise a user-friendly error instead
+                            raise RuntimeError(Errors.E030)

        sentence_encodings, bp_context = self.model.begin_update(sentence_docs, drop=drop)
        loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds, docs=None)