From 44a0f9c2c82859578393bd3652de4e8b35bc0154 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Tue, 16 Jun 2020 15:21:20 +0200
Subject: [PATCH] test_gold_biluo_different_tokenization works

---
 spacy/gold/example.pyx   | 15 ++++++++-------
 spacy/tests/test_gold.py | 34 +++++++++++++++-------------------
 2 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx
index adae9335b..09a32ee4d 100644
--- a/spacy/gold/example.pyx
+++ b/spacy/gold/example.pyx
@@ -85,17 +85,17 @@ cdef class Example:
 
         vocab = self.reference.vocab
         gold_values = self.reference.to_array([field])
-        output = []
+        output = [None] * len(self.predicted)
         for i, gold_i in enumerate(cand_to_gold):
             if self.predicted[i].text.isspace():
-                output.append(None)
-            elif gold_i is None:
+                output[i] = None
+            if gold_i is None:
                 if i in i2j_multi:
-                    output.append(gold_values[i2j_multi[i]])
+                    output[i] = gold_values[i2j_multi[i]]
                 else:
-                    output.append(None)
+                    output[i] = None
             else:
-                output.append(gold_values[gold_i])
+                output[i] = gold_values[gold_i]
 
         if field in ["ENT_IOB"]:
             # Fix many-to-one IOB codes
@@ -117,7 +117,8 @@ cdef class Example:
             if cand_j is None:
                 if j in j2i_multi:
                     i = j2i_multi[j]
-                    output[i] = gold_values[j]
+                    if output[i] is None:
+                        output[i] = gold_values[j]
 
         if as_string:
             output = [vocab.strings[o] if o is not None else o for o in output]
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index 5f92a476c..ea9c460ac 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -158,7 +158,7 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
     entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
     gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
-    assert example.get_aligned("ENT_IOB") == [2, 2, 1, 2]
+    assert example.get_aligned("ENT_IOB") == [2, 2, 3, 2]
     assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "LOC", ""]
 
     # many-to-one
@@ -195,25 +195,21 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
     assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "", "LOC", "LOC", ""]
 
     # from issue #4791
-    data = (
-        "I'll return the ₹54 amount",
-        {
-            "words": ["I", "'ll", "return", "the", "₹", "54", "amount"],
-            "entities": [(16, 19, "MONEY")],
-        },
-    )
-    gp = GoldParse(en_tokenizer(data[0]), **data[1])
-    assert gp.ner == ["O", "O", "O", "O", "U-MONEY", "O"]
+    doc = en_tokenizer("I'll return the ₹54 amount")
+    gold_words = ["I", "'ll", "return", "the", "₹", "54", "amount"]
+    gold_spaces = [False, True, True, True, False, True, False]
+    entities = [(16, 19, "MONEY")]
+    example = Example.from_dict(doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities})
+    assert example.get_aligned("ENT_IOB") == [2, 2, 2, 2, 3, 2]
+    assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "", "MONEY", ""]
 
-    data = (
-        "I'll return the $54 amount",
-        {
-            "words": ["I", "'ll", "return", "the", "$", "54", "amount"],
-            "entities": [(16, 19, "MONEY")],
-        },
-    )
-    gp = GoldParse(en_tokenizer(data[0]), **data[1])
-    assert gp.ner == ["O", "O", "O", "O", "B-MONEY", "L-MONEY", "O"]
+    doc = en_tokenizer("I'll return the $54 amount")
+    gold_words = ["I", "'ll", "return", "the", "$", "54", "amount"]
+    gold_spaces = [False, True, True, True, False, True, False]
+    entities = [(16, 19, "MONEY")]
+    example = Example.from_dict(doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities})
+    assert example.get_aligned("ENT_IOB") == [2, 2, 2, 2, 3, 1, 2]
+    assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "", "MONEY", "MONEY", ""]
 
 
 def test_roundtrip_offsets_biluo_conversion(en_tokenizer):