fix ENT_IOB conversion and enable unit test

This commit is contained in:
svlandeg 2020-06-12 11:30:24 +02:00
parent 6a67a11682
commit 3aed177a35
5 changed files with 36 additions and 5 deletions

View File

@ -581,6 +581,8 @@ class Errors(object):
# TODO: fix numbering after merging develop into master
E985 = ("The 'ent_iob' attribute of a Token should be an integer indexing "
"into {values}, but found {value}.")
E986 = ("Could not create any training batches: check your input. "
"Perhaps discard_oversize should be set to False ?")
E987 = ("The text of an example training instance is either a Doc or "

View File

@ -1,4 +1,6 @@
import numpy
from ..tokens import Token
from ..tokens.doc cimport Doc
from ..attrs import IDS
from .align cimport Alignment
@ -97,6 +99,13 @@ def _annot2array(strings, tok_annot, doc_annot):
elif key == "SENT_START":
attrs.append(key)
values.append(value)
elif key == "ENT_IOB":
iob_strings = Token.iob_strings()
attrs.append(key)
try:
values.append([iob_strings.index(v) for v in value])
except ValueError:
raise ValueError(Errors.E985.format(values=iob_strings, value=values))
else:
attrs.append(key)
values.append([strings.add(v) for v in value])

View File

@ -29,7 +29,7 @@ def test_Example_from_dict_invalid(annots):
vocab = Vocab()
predicted = Doc(vocab, words=annots["words"])
with pytest.raises(ValueError):
eg = Example.from_dict(predicted, annots)
Example.from_dict(predicted, annots)
@pytest.mark.parametrize("annots", [{"words": ["ice", "cream"], "tags": ["NN", "NN"]}])
@ -46,8 +46,8 @@ def test_Example_from_dict_with_tags(annots):
"annots",
[
{
"words": ["I", "like", "London", "and", "Berlin", "."],
"entities": [(7, 13, "LOC"), (18, 24, "LOC")],
"words": ["I", "like", "New", "York", "and", "Berlin", "."],
"entities": [(7, 15, "LOC"), (20, 26, "LOC")],
}
],
)
@ -56,6 +56,16 @@ def test_Example_from_dict_with_entities(annots):
predicted = Doc(vocab, words=annots["words"])
eg = Example.from_dict(predicted, annots)
assert len(list(eg.reference.ents)) == 2
assert eg.reference[0].ent_iob_ == "O"
assert eg.reference[1].ent_iob_ == "O"
assert eg.reference[2].ent_iob_ == "B"
assert eg.reference[3].ent_iob_ == "I"
assert eg.reference[4].ent_iob_ == "O"
assert eg.reference[5].ent_iob_ == "B"
assert eg.reference[6].ent_iob_ == "O"
assert eg.reference[2].ent_type_ == "LOC"
assert eg.reference[3].ent_type_ == "LOC"
assert eg.reference[5].ent_type_ == "LOC"
@pytest.mark.parametrize(

View File

@ -825,6 +825,13 @@ cdef class Doc:
for i in range(length):
if array[i, col] != 0:
self.vocab.morphology.assign_tag(&tokens[i], array[i, col])
# Verify ENT_IOB are proper integers
if ENT_IOB in attrs:
iob_strings = Token.iob_strings()
col = attrs.index(ENT_IOB)
for i in range(length):
if array[i, col] not in range(0, len(iob_strings)):
raise ValueError(Errors.E985.format(values=iob_strings, value=array[i, col]))
# Now load the data
for i in range(length):
token = &self.c[i]

View File

@ -778,6 +778,10 @@ cdef class Token:
"""
return self.c.ent_iob
@classmethod
def iob_strings(cls):
    """Return the IOB code strings, ordered by their integer value.

    The position of each string in the tuple is the integer it corresponds
    to, so indexing the tuple turns an integer into its code and `.index()`
    turns a code back into its integer.

    RETURNS (tuple): The strings "", "I", "O" and "B", in that order.
    """
    # NOTE(review): the empty string at index 0 presumably stands for
    # "no IOB value set" -- confirm against the ent_iob documentation.
    codes = ("", "I", "O", "B")
    return codes
@property
def ent_iob_(self):
"""IOB code of named entity tag. "B" means the token begins an entity,
@ -787,8 +791,7 @@ cdef class Token:
RETURNS (str): IOB code of named entity tag.
"""
iob_strings = ("", "I", "O", "B")
return iob_strings[self.c.ent_iob]
return self.iob_strings()[self.c.ent_iob]
property ent_id:
"""RETURNS (uint64): ID of the entity the token is an instance of,