Fix many-to-one IOB codes

This commit is contained in:
svlandeg 2020-06-15 23:06:22 +02:00
parent 12886b787b
commit a28f8f369e
2 changed files with 18 additions and 4 deletions

View File

@ -95,6 +95,20 @@ cdef class Example:
else:
output.append(gold_values[gold_i])
if field in ["ENT_IOB"]:
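# Fix many-to-one IOB codes: when consecutive output positions are mapped
# by i2j_multi to the same j and both carry B (3), only the first keeps B;
# each later one is rewritten to I (1)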
prev_j = -1
prev_value = -1
for i, value in enumerate(output):
if i in i2j_multi:
j = i2j_multi[i]
if j == prev_j and prev_value == value == 3:
output[i] = 1 # set B to I
prev_j = j
else:
prev_j = -1
prev_value = value
if field in ["ENT_IOB", "ENT_TYPE"]:
# Assign O/- for one-to-many O/- NER tags
for j, cand_j in enumerate(gold_to_cand):

View File

@ -166,10 +166,10 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
spaces = [True, True, True, True, True, False, False]
doc = Doc(en_vocab, words=words, spaces=spaces)
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
gp = GoldParse(
doc, words=["I", "flew to", "San Francisco Valley", "."], entities=entities
)
assert gp.ner == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
gold_words =["I", "flew to", "San Francisco Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
assert example.get_aligned("ENT_IOB") == [2, 2, 2, 3, 1, 1, 2]
assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "LOC", "LOC", "LOC", ""]
# misaligned
words = ["I flew", "to", "San Francisco", "Valley", "."]