From a28f8f369ebd18da025a40f833942dd168361d8d Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 15 Jun 2020 23:06:22 +0200 Subject: [PATCH] Fix many-to-one IOB codes --- spacy/gold/example.pyx | 14 ++++++++++++++ spacy/tests/test_gold.py | 8 ++++---- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx index 425254320..99a1bbafa 100644 --- a/spacy/gold/example.pyx +++ b/spacy/gold/example.pyx @@ -95,6 +95,20 @@ cdef class Example: else: output.append(gold_values[gold_i]) + if field in ["ENT_IOB"]: + # Fix many-to-one IOB codes + prev_j = -1 + prev_value = -1 + for i, value in enumerate(output): + if i in i2j_multi: + j = i2j_multi[i] + if j == prev_j and prev_value == value == 3: + output[i] = 1 # set B to I + prev_j = j + else: + prev_j = -1 + prev_value = value + if field in ["ENT_IOB", "ENT_TYPE"]: # Assign O/- for one-to-many O/- NER tags for j, cand_j in enumerate(gold_to_cand): diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 5cf3b4c01..f604f4b53 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -166,10 +166,10 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer): spaces = [True, True, True, True, True, False, False] doc = Doc(en_vocab, words=words, spaces=spaces) entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] - gp = GoldParse( - doc, words=["I", "flew to", "San Francisco Valley", "."], entities=entities - ) - assert gp.ner == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"] + gold_words =["I", "flew to", "San Francisco Valley", "."] + example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) + assert example.get_aligned("ENT_IOB") == [2, 2, 2, 3, 1, 1, 2] + assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "LOC", "LOC", "LOC", ""] # misaligned words = ["I flew", "to", "San Francisco", "Valley", "."]