fix ENT_IOB conversion and enable unit test

This commit is contained in:
svlandeg 2020-06-12 11:30:24 +02:00
parent 6a67a11682
commit 3aed177a35
5 changed files with 36 additions and 5 deletions

View File

@ -581,6 +581,8 @@ class Errors(object):
# TODO: fix numbering after merging develop into master # TODO: fix numbering after merging develop into master
E985 = ("The 'ent_iob' attribute of a Token should be an integer indexing "
"into {values}, but found {value}.")
E986 = ("Could not create any training batches: check your input. " E986 = ("Could not create any training batches: check your input. "
"Perhaps discard_oversize should be set to False ?") "Perhaps discard_oversize should be set to False ?")
E987 = ("The text of an example training instance is either a Doc or " E987 = ("The text of an example training instance is either a Doc or "

View File

@ -1,4 +1,6 @@
import numpy import numpy
from ..tokens import Token
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from ..attrs import IDS from ..attrs import IDS
from .align cimport Alignment from .align cimport Alignment
@ -97,6 +99,13 @@ def _annot2array(strings, tok_annot, doc_annot):
elif key == "SENT_START": elif key == "SENT_START":
attrs.append(key) attrs.append(key)
values.append(value) values.append(value)
elif key == "ENT_IOB":
iob_strings = Token.iob_strings()
attrs.append(key)
try:
values.append([iob_strings.index(v) for v in value])
except ValueError:
raise ValueError(Errors.E985.format(values=iob_strings, value=values))
else: else:
attrs.append(key) attrs.append(key)
values.append([strings.add(v) for v in value]) values.append([strings.add(v) for v in value])

View File

@ -29,7 +29,7 @@ def test_Example_from_dict_invalid(annots):
vocab = Vocab() vocab = Vocab()
predicted = Doc(vocab, words=annots["words"]) predicted = Doc(vocab, words=annots["words"])
with pytest.raises(ValueError): with pytest.raises(ValueError):
eg = Example.from_dict(predicted, annots) Example.from_dict(predicted, annots)
@pytest.mark.parametrize("annots", [{"words": ["ice", "cream"], "tags": ["NN", "NN"]}]) @pytest.mark.parametrize("annots", [{"words": ["ice", "cream"], "tags": ["NN", "NN"]}])
@ -46,8 +46,8 @@ def test_Example_from_dict_with_tags(annots):
"annots", "annots",
[ [
{ {
"words": ["I", "like", "London", "and", "Berlin", "."], "words": ["I", "like", "New", "York", "and", "Berlin", "."],
"entities": [(7, 13, "LOC"), (18, 24, "LOC")], "entities": [(7, 15, "LOC"), (20, 26, "LOC")],
} }
], ],
) )
@ -56,6 +56,16 @@ def test_Example_from_dict_with_entities(annots):
predicted = Doc(vocab, words=annots["words"]) predicted = Doc(vocab, words=annots["words"])
eg = Example.from_dict(predicted, annots) eg = Example.from_dict(predicted, annots)
assert len(list(eg.reference.ents)) == 2 assert len(list(eg.reference.ents)) == 2
assert eg.reference[0].ent_iob_ == "O"
assert eg.reference[1].ent_iob_ == "O"
assert eg.reference[2].ent_iob_ == "B"
assert eg.reference[3].ent_iob_ == "I"
assert eg.reference[4].ent_iob_ == "O"
assert eg.reference[5].ent_iob_ == "B"
assert eg.reference[6].ent_iob_ == "O"
assert eg.reference[2].ent_type_ == "LOC"
assert eg.reference[3].ent_type_ == "LOC"
assert eg.reference[5].ent_type_ == "LOC"
@pytest.mark.parametrize( @pytest.mark.parametrize(

View File

@ -825,6 +825,13 @@ cdef class Doc:
for i in range(length): for i in range(length):
if array[i, col] != 0: if array[i, col] != 0:
self.vocab.morphology.assign_tag(&tokens[i], array[i, col]) self.vocab.morphology.assign_tag(&tokens[i], array[i, col])
# Verify that each ENT_IOB value in the array is a valid integer index into
# Token.iob_strings(); raise ValueError (E985) on the first one out of range
if ENT_IOB in attrs:
iob_strings = Token.iob_strings()
col = attrs.index(ENT_IOB)
for i in range(length):
if array[i, col] not in range(0, len(iob_strings)):
raise ValueError(Errors.E985.format(values=iob_strings, value=array[i, col]))
# Now load the data # Now load the data
for i in range(length): for i in range(length):
token = &self.c[i] token = &self.c[i]

View File

@ -778,6 +778,10 @@ cdef class Token:
""" """
return self.c.ent_iob return self.c.ent_iob
@classmethod
def iob_strings(cls):
"""Return the IOB string codes, ordered by their integer value.

The position of each string in the tuple is the integer code stored in
a token's `ent_iob` attribute, so `iob_strings()[token.ent_iob]` gives
the string form returned by `ent_iob_`.

RETURNS (tuple): The strings "", "I", "O" and "B", in that order.
"""
return ("", "I", "O", "B")
@property @property
def ent_iob_(self): def ent_iob_(self):
"""IOB code of named entity tag. "B" means the token begins an entity, """IOB code of named entity tag. "B" means the token begins an entity,
@ -787,8 +791,7 @@ cdef class Token:
RETURNS (str): IOB code of named entity tag. RETURNS (str): IOB code of named entity tag.
""" """
iob_strings = ("", "I", "O", "B") return self.iob_strings()[self.c.ent_iob]
return iob_strings[self.c.ent_iob]
property ent_id: property ent_id:
"""RETURNS (uint64): ID of the entity the token is an instance of, """RETURNS (uint64): ID of the entity the token is an instance of,