only evaluate named entities for NEL if there is a corresponding gold span (#7074)

2025-12-22 09:34:23 +03:00 · 2021-02-22 01:06:50 +01:00 · 2021-02-22 01:06:50 +01:00 · 113e8d082b
commit 113e8d082b
parent 264862c67a
2 changed files with 76 additions and 21 deletions
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -531,6 +531,7 @@ class Scorer:
                gold_span = gold_ent_by_offset.get(
                    (pred_ent.start_char, pred_ent.end_char), None
                )
+                if gold_span is not None:
                    label = gold_span.label_
                    if label not in f_per_type:
                        f_per_type[label] = PRFScore()
--- a/spacy/tests/regression/test_issue7062.py
+++ b/spacy/tests/regression/test_issue7062.py
@@ -0,0 +1,54 @@
+from spacy.kb import KnowledgeBase
+from spacy.training import Example
+from spacy.lang.en import English
+
+
+# fmt: off
+TRAIN_DATA = [
+    ("Russ Cochran his reprints include EC Comics.",
+        {"links": {(0, 12): {"Q2146908": 1.0}},
+         "entities": [(0, 12, "PERSON")],
+         "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0]})
+]
+# fmt: on
+
+
+def test_partial_links():
+    # Evaluation must not crash when the doc has predicted entities that lack gold links
+    nlp = English()
+    vector_length = 3
+    train_examples = []
+    for text, annotation in TRAIN_DATA:
+        doc = nlp(text)
+        train_examples.append(Example.from_dict(doc, annotation))
+
+    def create_kb(vocab):
+        # Build a small artificial KB holding a single entity and one alias pointing to it
+        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+        mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
+        mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9])
+        return mykb
+
+    # Create the entity linker and train it for two update steps
+    entity_linker = nlp.add_pipe("entity_linker", last=True)
+    entity_linker.set_kb(create_kb)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+    for i in range(2):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+
+    # Add the additional components that are required for the entity_linker
+    nlp.add_pipe("sentencizer", first=True)
+    patterns = [
+        {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]},
+        {"label": "ORG", "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}]}
+    ]
+    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
+    ruler.add_patterns(patterns)
+
+    # Running the pipeline on the examples should not crash; ORG has no gold link, so it must be absent from the NEL scores
+    results = nlp.evaluate(train_examples)
+    assert "PERSON" in results["ents_per_type"]
+    assert "PERSON" in results["nel_f_per_type"]
+    assert "ORG" in results["ents_per_type"]
+    assert "ORG" not in results["nel_f_per_type"]