From 113e8d082b87f9a394a76dc8faced2a94023dc7a Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Mon, 22 Feb 2021 01:06:50 +0100
Subject: [PATCH] only evaluate named entities for NEL if there is a
 corresponding gold span (#7074)

---
 spacy/scorer.py                          | 43 ++++++++++---------
 spacy/tests/regression/test_issue7062.py | 54 ++++++++++++++++++++++++
 2 files changed, 76 insertions(+), 21 deletions(-)
 create mode 100644 spacy/tests/regression/test_issue7062.py

diff --git a/spacy/scorer.py b/spacy/scorer.py
index f20a0d786..f10824fd6 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -531,27 +531,28 @@ class Scorer:
                 gold_span = gold_ent_by_offset.get(
                     (pred_ent.start_char, pred_ent.end_char), None
                 )
-                label = gold_span.label_
-                if label not in f_per_type:
-                    f_per_type[label] = PRFScore()
-                gold = gold_span.kb_id_
-                # only evaluating entities that overlap between gold and pred,
-                # to disentangle the performance of the NEL from the NER
-                if gold is not None:
-                    pred = pred_ent.kb_id_
-                    if gold in negative_labels and pred in negative_labels:
-                        # ignore true negatives
-                        pass
-                    elif gold == pred:
-                        f_per_type[label].tp += 1
-                    elif gold in negative_labels:
-                        f_per_type[label].fp += 1
-                    elif pred in negative_labels:
-                        f_per_type[label].fn += 1
-                    else:
-                        # a wrong prediction (e.g. Q42 != Q3) counts as both a FP as well as a FN
-                        f_per_type[label].fp += 1
-                        f_per_type[label].fn += 1
+                if gold_span is not None:
+                    label = gold_span.label_
+                    if label not in f_per_type:
+                        f_per_type[label] = PRFScore()
+                    gold = gold_span.kb_id_
+                    # only evaluating entities that overlap between gold and pred,
+                    # to disentangle the performance of the NEL from the NER
+                    if gold is not None:
+                        pred = pred_ent.kb_id_
+                        if gold in negative_labels and pred in negative_labels:
+                            # ignore true negatives
+                            pass
+                        elif gold == pred:
+                            f_per_type[label].tp += 1
+                        elif gold in negative_labels:
+                            f_per_type[label].fp += 1
+                        elif pred in negative_labels:
+                            f_per_type[label].fn += 1
+                        else:
+                            # a wrong prediction (e.g. Q42 != Q3) counts as both a FP as well as a FN
+                            f_per_type[label].fp += 1
+                            f_per_type[label].fn += 1
         micro_prf = PRFScore()
         for label_prf in f_per_type.values():
             micro_prf.tp += label_prf.tp
diff --git a/spacy/tests/regression/test_issue7062.py b/spacy/tests/regression/test_issue7062.py
new file mode 100644
index 000000000..88e5d2520
--- /dev/null
+++ b/spacy/tests/regression/test_issue7062.py
@@ -0,0 +1,54 @@
+from spacy.kb import KnowledgeBase
+from spacy.training import Example
+from spacy.lang.en import English
+
+
+# fmt: off
+TRAIN_DATA = [
+    ("Russ Cochran his reprints include EC Comics.",
+        {"links": {(0, 12): {"Q2146908": 1.0}},
+         "entities": [(0, 12, "PERSON")],
+         "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0]})
+]
+# fmt: on
+
+
+def test_partial_links():
+    # Test that having some entities on the doc without gold links, doesn't crash
+    nlp = English()
+    vector_length = 3
+    train_examples = []
+    for text, annotation in TRAIN_DATA:
+        doc = nlp(text)
+        train_examples.append(Example.from_dict(doc, annotation))
+
+    def create_kb(vocab):
+        # create artificial KB
+        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+        mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
+        mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9])
+        return mykb
+
+    # Create and train the Entity Linker
+    entity_linker = nlp.add_pipe("entity_linker", last=True)
+    entity_linker.set_kb(create_kb)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+    for i in range(2):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+
+    # adding additional components that are required for the entity_linker
+    nlp.add_pipe("sentencizer", first=True)
+    patterns = [
+        {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]},
+        {"label": "ORG", "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}]}
+    ]
+    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
+    ruler.add_patterns(patterns)
+
+    # this will run the pipeline on the examples and shouldn't crash
+    results = nlp.evaluate(train_examples)
+    assert "PERSON" in results["ents_per_type"]
+    assert "PERSON" in results["nel_f_per_type"]
+    assert "ORG" in results["ents_per_type"]
+    assert "ORG" not in results["nel_f_per_type"]
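
The sketch below (not part of the patch) isolates the per-entity decision the new scorer code makes: predicted entities without an overlapping gold span are skipped, so NER misses no longer distort the NEL score. "Counts" and "GoldSpan" are stand-ins invented here for spaCy's PRFScore and Span; a minimal illustration only, not the library's API.

    from dataclasses import dataclass
    from typing import Dict, Optional


    @dataclass
    class Counts:
        # stand-in for spaCy's PRFScore
        tp: int = 0
        fp: int = 0
        fn: int = 0


    @dataclass
    class GoldSpan:
        # stand-in for a gold Span with a label and a KB identifier
        label_: str
        kb_id_: str


    def score_pred_entity(
        gold_span: Optional[GoldSpan],
        pred_kb_id: str,
        negative_labels: set,
        f_per_type: Dict[str, Counts],
    ) -> None:
        if gold_span is None:
            # No overlapping gold span: the miss is an NER error, so it is
            # left out of the NEL score (the behaviour this patch introduces).
            return
        counts = f_per_type.setdefault(gold_span.label_, Counts())
        gold = gold_span.kb_id_
        if gold in negative_labels and pred_kb_id in negative_labels:
            pass  # true negative, ignored
        elif gold == pred_kb_id:
            counts.tp += 1
        elif gold in negative_labels:
            counts.fp += 1
        elif pred_kb_id in negative_labels:
            counts.fn += 1
        else:
            # a wrong KB id (e.g. Q42 != Q3) counts as both a FP and a FN
            counts.fp += 1
            counts.fn += 1


    f_per_type: Dict[str, Counts] = {}
    score_pred_entity(GoldSpan("PERSON", "Q2146908"), "Q2146908", {"NIL"}, f_per_type)
    score_pred_entity(None, "Q42", {"NIL"}, f_per_type)  # skipped: no gold span
    print(f_per_type)  # {'PERSON': Counts(tp=1, fp=0, fn=0)}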