Fix scorer bug for NER, related to ambiguity between missing annotations and misaligned tokens

This commit is contained in:
Matthew Honnibal 2017-03-16 09:38:28 -05:00
parent 3d0833c3df
commit 2611ac2a89
3 changed files with 4 additions and 4 deletions

View File

@@ -272,8 +272,8 @@ cdef class GoldParse:
self.words = [None] * len(doc) self.words = [None] * len(doc)
self.tags = [None] * len(doc) self.tags = [None] * len(doc)
self.heads = [None] * len(doc) self.heads = [None] * len(doc)
self.labels = [''] * len(doc) self.labels = [None] * len(doc)
self.ner = ['-'] * len(doc) self.ner = [None] * len(doc)
self.cand_to_gold = align([t.orth_ for t in doc], words) self.cand_to_gold = align([t.orth_ for t in doc], words)
self.gold_to_cand = align(words, [t.orth_ for t in doc]) self.gold_to_cand = align(words, [t.orth_ for t in doc])

View File

@@ -87,7 +87,7 @@ class Scorer(object):
gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot])) gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot]))
for id_, word, tag, head, dep, ner in gold.orig_annot: for id_, word, tag, head, dep, ner in gold.orig_annot:
gold_tags.add((id_, tag)) gold_tags.add((id_, tag))
if dep is not None and dep.lower() not in punct_labels: if dep not in (None, "") and dep.lower() not in punct_labels:
gold_deps.add((id_, head, dep.lower())) gold_deps.add((id_, head, dep.lower()))
cand_deps = set() cand_deps = set()
cand_tags = set() cand_tags = set()

View File

@@ -106,7 +106,7 @@ cdef class BiluoPushDown(TransitionSystem):
self.freqs[ENT_TYPE][0] += 1 self.freqs[ENT_TYPE][0] += 1
cdef Transition lookup_transition(self, object name) except *: cdef Transition lookup_transition(self, object name) except *:
if name == '-': if name == '-' or name == None:
move_str = 'M' move_str = 'M'
label = 0 label = 0
elif '-' in name: elif '-' in name: