mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-11 17:10:36 +03:00
Fix many-to-one IOB codes
This commit is contained in:
parent
12886b787b
commit
a28f8f369e
|
@ -95,6 +95,20 @@ cdef class Example:
|
||||||
else:
|
else:
|
||||||
output.append(gold_values[gold_i])
|
output.append(gold_values[gold_i])
|
||||||
|
|
||||||
|
if field in ["ENT_IOB"]:
|
||||||
|
# Fix many-to-one IOB codes
|
||||||
|
prev_j = -1
|
||||||
|
prev_value = -1
|
||||||
|
for i, value in enumerate(output):
|
||||||
|
if i in i2j_multi:
|
||||||
|
j = i2j_multi[i]
|
||||||
|
if j == prev_j and prev_value == value == 3:
|
||||||
|
output[i] = 1 # set B to I
|
||||||
|
prev_j = j
|
||||||
|
else:
|
||||||
|
prev_j = -1
|
||||||
|
prev_value = value
|
||||||
|
|
||||||
if field in ["ENT_IOB", "ENT_TYPE"]:
|
if field in ["ENT_IOB", "ENT_TYPE"]:
|
||||||
# Assign O/- for one-to-many O/- NER tags
|
# Assign O/- for one-to-many O/- NER tags
|
||||||
for j, cand_j in enumerate(gold_to_cand):
|
for j, cand_j in enumerate(gold_to_cand):
|
||||||
|
|
|
@ -166,10 +166,10 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
|
||||||
spaces = [True, True, True, True, True, False, False]
|
spaces = [True, True, True, True, True, False, False]
|
||||||
doc = Doc(en_vocab, words=words, spaces=spaces)
|
doc = Doc(en_vocab, words=words, spaces=spaces)
|
||||||
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
|
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
|
||||||
gp = GoldParse(
|
gold_words =["I", "flew to", "San Francisco Valley", "."]
|
||||||
doc, words=["I", "flew to", "San Francisco Valley", "."], entities=entities
|
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
||||||
)
|
assert example.get_aligned("ENT_IOB") == [2, 2, 2, 3, 1, 1, 2]
|
||||||
assert gp.ner == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
|
assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "LOC", "LOC", "LOC", ""]
|
||||||
|
|
||||||
# misaligned
|
# misaligned
|
||||||
words = ["I flew", "to", "San Francisco", "Valley", "."]
|
words = ["I flew", "to", "San Francisco", "Valley", "."]
|
||||||
|
|
Loading…
Reference in New Issue
Block a user