Fix NER gold-standard around whitespace

This commit is contained in:
Matthew Honnibal 2019-08-29 14:33:07 +02:00
parent 216f63a987
commit 6511e1d8d3

View File

@@ -635,7 +635,7 @@ cdef class GoldParse:
self.tags[i] = "_SP"
self.heads[i] = None
self.labels[i] = None
self.ner[i] = "O"
self.ner[i] = None
self.morphology[i] = set()
if gold_i is None:
if i in i2j_multi:
@@ -686,9 +686,20 @@ cdef class GoldParse:
self.labels[i] = deps[gold_i]
self.ner[i] = entities[gold_i]
# Prevent whitespace that isn't within entities from being tagged as
# an entity.
for i in range(len(self.ner)):
if self.tags[i] == "_SP":
prev_ner = self.ner[i-1] if i >= 1 else None
next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None
if prev_ner == "O" or next_ner == "O":
self.ner[i] = "O"
cycle = nonproj.contains_cycle(self.heads)
if cycle is not None:
raise ValueError(Errors.E069.format(cycle=cycle, cycle_tokens=" ".join(["'{}'".format(self.words[tok_id]) for tok_id in cycle]), doc_tokens=" ".join(words[:50])))
raise ValueError(Errors.E069.format(cycle=cycle,
cycle_tokens=" ".join(["'{}'".format(self.words[tok_id]) for tok_id in cycle]),
doc_tokens=" ".join(words[:50])))
def __len__(self):
"""Get the number of gold-standard tokens.