Fix scorer bug for NER, related to ambiguity between missing annotations and misaligned tokens

2025-10-18 09:44:16 +03:00 · 2017-03-16 09:38:28 -05:00 · 2017-03-16 09:38:28 -05:00 · 2611ac2a89
commit 2611ac2a89
parent 3d0833c3df
3 changed files with 4 additions and 4 deletions
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -272,8 +272,8 @@ cdef class GoldParse:
        self.words = [None] * len(doc)
        self.tags = [None] * len(doc)
        self.heads = [None] * len(doc)
-        self.labels = [''] * len(doc)
-        self.ner = ['-'] * len(doc)
+        self.labels = [None] * len(doc)
+        self.ner = [None] * len(doc)

        self.cand_to_gold = align([t.orth_ for t in doc], words)
        self.gold_to_cand = align(words, [t.orth_ for t in doc])
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -87,7 +87,7 @@ class Scorer(object):
        gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot]))
        for id_, word, tag, head, dep, ner in gold.orig_annot:
            gold_tags.add((id_, tag))
-            if dep is not None and dep.lower() not in punct_labels:
+            if dep not in (None, "") and dep.lower() not in punct_labels:
                gold_deps.add((id_, head, dep.lower()))
        cand_deps = set()
        cand_tags = set()
--- a/spacy/syntax/ner.pyx
+++ b/spacy/syntax/ner.pyx
@@ -106,7 +106,7 @@ cdef class BiluoPushDown(TransitionSystem):
                self.freqs[ENT_TYPE][0] += 1

    cdef Transition lookup_transition(self, object name) except *:
-        if name == '-':
+        if name == '-' or name == None:
            move_str = 'M'
            label = 0
        elif '-' in name: