mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-11 09:00:36 +03:00
Fix many-to-one IOB codes
This commit is contained in:
parent
12886b787b
commit
a28f8f369e
|
@ -95,6 +95,20 @@ cdef class Example:
|
|||
else:
|
||||
output.append(gold_values[gold_i])
|
||||
|
||||
if field in ["ENT_IOB"]:
|
||||
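# Fix many-to-one IOB codes: when consecutive tokens align to the same gold token and both carry B (3), rewrite the later one to I (1)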
|
||||
prev_j = -1
|
||||
prev_value = -1
|
||||
for i, value in enumerate(output):
|
||||
if i in i2j_multi:
|
||||
j = i2j_multi[i]
|
||||
if j == prev_j and prev_value == value == 3:
|
||||
output[i] = 1 # set B to I
|
||||
prev_j = j
|
||||
else:
|
||||
prev_j = -1
|
||||
prev_value = value
|
||||
|
||||
if field in ["ENT_IOB", "ENT_TYPE"]:
|
||||
# Assign O/- for one-to-many O/- NER tags
|
||||
for j, cand_j in enumerate(gold_to_cand):
|
||||
|
|
|
@ -166,10 +166,10 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
|
|||
spaces = [True, True, True, True, True, False, False]
|
||||
doc = Doc(en_vocab, words=words, spaces=spaces)
|
||||
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
|
||||
gp = GoldParse(
|
||||
doc, words=["I", "flew to", "San Francisco Valley", "."], entities=entities
|
||||
)
|
||||
assert gp.ner == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
|
||||
gold_words =["I", "flew to", "San Francisco Valley", "."]
|
||||
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
||||
assert example.get_aligned("ENT_IOB") == [2, 2, 2, 3, 1, 1, 2]
|
||||
assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "LOC", "LOC", "LOC", ""]
|
||||
|
||||
# misaligned
|
||||
words = ["I flew", "to", "San Francisco", "Valley", "."]
|
||||
|
|
Loading…
Reference in New Issue
Block a user