mirror of https://github.com/explosion/spaCy.git
test_gold_biluo_different_tokenization works
parent 1c35b8efcd
commit 44a0f9c2c8

@@ -85,17 +85,17 @@ cdef class Example:
         vocab = self.reference.vocab
         gold_values = self.reference.to_array([field])
-        output = []
+        output = [None] * len(self.predicted)
         for i, gold_i in enumerate(cand_to_gold):
             if self.predicted[i].text.isspace():
-                output.append(None)
-            elif gold_i is None:
+                output[i] = None
+            if gold_i is None:
                 if i in i2j_multi:
-                    output.append(gold_values[i2j_multi[i]])
+                    output[i] = gold_values[i2j_multi[i]]
                 else:
-                    output.append(None)
+                    output[i] = None
             else:
-                output.append(gold_values[gold_i])
+                output[i] = gold_values[gold_i]
 
         if field in ["ENT_IOB"]:
             # Fix many-to-one IOB codes
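
Illustrative sketch of the pattern introduced above (not spaCy's actual implementation; `cand_to_gold` and `i2j_multi` are stand-in plain lists and dicts mirroring the variables in the diff): preallocating `output` and assigning by index keeps each slot tied to its predicted token, so positions left unresolved here can still be filled by the later many-to-one pass without shifting other values, which `append()` could not guarantee.

# Standalone sketch of the indexed-assignment pattern, with made-up inputs.
# cand_to_gold[i] is the gold index for predicted token i (None if unaligned);
# i2j_multi maps a predicted token to the gold token it partially covers.
def align_values(predicted_texts, gold_values, cand_to_gold, i2j_multi):
    # One slot per predicted token; unresolved slots stay None instead of
    # silently shifting later values, as append() would.
    output = [None] * len(predicted_texts)
    for i, gold_i in enumerate(cand_to_gold):
        if predicted_texts[i].isspace():
            output[i] = None
        if gold_i is None:
            if i in i2j_multi:
                output[i] = gold_values[i2j_multi[i]]
            # no else needed: the slot is already None from preallocation
        else:
            output[i] = gold_values[gold_i]
    return output

# Predicted tokenization splits "Francisco" differently than the gold one.
print(align_values(["San", "Fran", "cisco"], ["B", "I"], [0, None, None], {1: 1, 2: 1}))
# -> ['B', 'I', 'I']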
@@ -117,7 +117,8 @@ cdef class Example:
                 if cand_j is None:
                     if j in j2i_multi:
                         i = j2i_multi[j]
-                        output[i] = gold_values[j]
+                        if output[i] is None:
+                            output[i] = gold_values[j]
 
         if as_string:
             output = [vocab.strings[o] if o is not None else o for o in output]
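
A hedged illustration of the guard added above, again with made-up inputs rather than spaCy's internals: when several gold tokens collapse into one predicted token, only the first gold value should land in that slot, so an entity-initial code written earlier is not overwritten by the values of the gold tokens that follow it.

# Sketch of the many-to-one guard, with stand-in alignment data.
# gold_to_cand[j] is the predicted index for gold token j (None if that gold
# token is merged into a larger predicted token); j2i_multi resolves those.
def fix_many_to_one(output, gold_values, gold_to_cand, j2i_multi):
    for j, cand_j in enumerate(gold_to_cand):
        if cand_j is None:
            if j in j2i_multi:
                i = j2i_multi[j]
                if output[i] is None:  # keep the first gold value only
                    output[i] = gold_values[j]
    return output

# Gold "San", "Francisco", "Valley" all collapse into predicted token 2.
iob = fix_many_to_one([None, None, None, None], ["B", "I", "I"], [None, None, None], {0: 2, 1: 2, 2: 2})
print(iob)  # -> [None, None, 'B', None]: the later 'I' codes no longer clobber 'B'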
@@ -158,7 +158,7 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
     entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
     gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
-    assert example.get_aligned("ENT_IOB") == [2, 2, 1, 2]
+    assert example.get_aligned("ENT_IOB") == [2, 2, 3, 2]
     assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "LOC", ""]
 
     # many-to-one
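
The expected value changes from 1 to 3 because `Doc.to_array` encodes `ENT_IOB` numerically: 1 is I (inside), 2 is O (outside), 3 is B (begins), and 0 means unset. A predicted token that covers the whole gold entity "San Francisco Valley" should therefore carry the entity-initial code 3, not the inside code 1. A small decoding sketch (the predicted token texts are assumed, not taken from the diff):

# Decode spaCy's numeric ENT_IOB codes for readability (0 = unset shown as "").
IOB_CODES = {0: "", 1: "I", 2: "O", 3: "B"}

aligned_iob = [2, 2, 3, 2]                               # the new expected value above
tokens = ["I", "flew to", "San Francisco Valley", "."]   # assumed predicted tokens
for token, code in zip(tokens, aligned_iob):
    print(f"{token}\t{IOB_CODES[code]}")
# "San Francisco Valley" begins the LOC entity (B) rather than continuing one (I).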
@@ -195,25 +195,21 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
     assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "", "LOC", "LOC", ""]
 
     # from issue #4791
-    data = (
-        "I'll return the ₹54 amount",
-        {
-            "words": ["I", "'ll", "return", "the", "₹", "54", "amount"],
-            "entities": [(16, 19, "MONEY")],
-        },
-    )
-    gp = GoldParse(en_tokenizer(data[0]), **data[1])
-    assert gp.ner == ["O", "O", "O", "O", "U-MONEY", "O"]
+    doc = en_tokenizer("I'll return the ₹54 amount")
+    gold_words = ["I", "'ll", "return", "the", "₹", "54", "amount"]
+    gold_spaces = [False, True, True, True, False, True, False]
+    entities = [(16, 19, "MONEY")]
+    example = Example.from_dict(doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities})
+    assert example.get_aligned("ENT_IOB") == [2, 2, 2, 2, 3, 2]
+    assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "", "MONEY", ""]
 
-    data = (
-        "I'll return the $54 amount",
-        {
-            "words": ["I", "'ll", "return", "the", "$", "54", "amount"],
-            "entities": [(16, 19, "MONEY")],
-        },
-    )
-    gp = GoldParse(en_tokenizer(data[0]), **data[1])
-    assert gp.ner == ["O", "O", "O", "O", "B-MONEY", "L-MONEY", "O"]
+    doc = en_tokenizer("I'll return the $54 amount")
+    gold_words = ["I", "'ll", "return", "the", "$", "54", "amount"]
+    gold_spaces = [False, True, True, True, False, True, False]
+    entities = [(16, 19, "MONEY")]
+    example = Example.from_dict(doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities})
+    assert example.get_aligned("ENT_IOB") == [2, 2, 2, 2, 3, 1, 2]
+    assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "", "MONEY", "MONEY", ""]
 
 
 def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
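
The removed `GoldParse` assertions and the new `get_aligned` assertions encode the same expectations: BILUO's U-MONEY corresponds to a lone 3 (B), and B-MONEY, L-MONEY correspond to 3, 1 (B, I). A small helper, not part of spaCy, makes the correspondence explicit:

# Map BILUO tags to spaCy's numeric ENT_IOB codes (1 = I, 2 = O, 3 = B).
# U (unit) and B both open an entity -> 3; I and L are inside -> 1; O -> 2.
def biluo_to_ent_iob(tags):
    mapping = {"B": 3, "I": 1, "L": 1, "U": 3, "O": 2}
    return [mapping[tag.split("-")[0]] for tag in tags]

# Old GoldParse expectations from the removed lines vs. the new assertions:
assert biluo_to_ent_iob(["O", "O", "O", "O", "U-MONEY", "O"]) == [2, 2, 2, 2, 3, 2]
assert biluo_to_ent_iob(["O", "O", "O", "O", "B-MONEY", "L-MONEY", "O"]) == [2, 2, 2, 2, 3, 1, 2]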