mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-11 00:50:33 +03:00
fix ENT_IOB conversion and enable unit test
This commit is contained in:
parent
6a67a11682
commit
3aed177a35
|
@ -581,6 +581,8 @@ class Errors(object):
|
|||
|
||||
# TODO: fix numbering after merging develop into master
|
||||
|
||||
E985 = ("The 'ent_iob' attribute of a Token should be an integer indexing "
|
||||
"into {values}, but found {value}.")
|
||||
E986 = ("Could not create any training batches: check your input. "
|
||||
"Perhaps discard_oversize should be set to False ?")
|
||||
E987 = ("The text of an example training instance is either a Doc or "
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
import numpy
|
||||
|
||||
from ..tokens import Token
|
||||
from ..tokens.doc cimport Doc
|
||||
from ..attrs import IDS
|
||||
from .align cimport Alignment
|
||||
|
@ -97,6 +99,13 @@ def _annot2array(strings, tok_annot, doc_annot):
|
|||
elif key == "SENT_START":
|
||||
attrs.append(key)
|
||||
values.append(value)
|
||||
elif key == "ENT_IOB":
|
||||
iob_strings = Token.iob_strings()
|
||||
attrs.append(key)
|
||||
try:
    # Translate each IOB string ("", "I", "O", "B") into its integer index
    # within the tuple returned by Token.iob_strings().
    values.append([iob_strings.index(v) for v in value])
except ValueError:
    # tuple.index raises ValueError for a tag that is not a known IOB
    # string. Report the offending annotation (`value`) rather than
    # `values`, the accumulator of previously converted columns, so the
    # error message shows the input that actually failed.
    raise ValueError(Errors.E985.format(values=iob_strings, value=value))
|
||||
else:
|
||||
attrs.append(key)
|
||||
values.append([strings.add(v) for v in value])
|
||||
|
|
|
@ -29,7 +29,7 @@ def test_Example_from_dict_invalid(annots):
|
|||
vocab = Vocab()
|
||||
predicted = Doc(vocab, words=annots["words"])
|
||||
with pytest.raises(ValueError):
|
||||
eg = Example.from_dict(predicted, annots)
|
||||
Example.from_dict(predicted, annots)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("annots", [{"words": ["ice", "cream"], "tags": ["NN", "NN"]}])
|
||||
|
@ -46,8 +46,8 @@ def test_Example_from_dict_with_tags(annots):
|
|||
"annots",
|
||||
[
|
||||
{
|
||||
"words": ["I", "like", "London", "and", "Berlin", "."],
|
||||
"entities": [(7, 13, "LOC"), (18, 24, "LOC")],
|
||||
"words": ["I", "like", "New", "York", "and", "Berlin", "."],
|
||||
"entities": [(7, 15, "LOC"), (20, 26, "LOC")],
|
||||
}
|
||||
],
|
||||
)
|
||||
|
@ -56,6 +56,16 @@ def test_Example_from_dict_with_entities(annots):
|
|||
predicted = Doc(vocab, words=annots["words"])
|
||||
eg = Example.from_dict(predicted, annots)
|
||||
assert len(list(eg.reference.ents)) == 2
|
||||
assert eg.reference[0].ent_iob_ == "O"
|
||||
assert eg.reference[1].ent_iob_ == "O"
|
||||
assert eg.reference[2].ent_iob_ == "B"
|
||||
assert eg.reference[3].ent_iob_ == "I"
|
||||
assert eg.reference[4].ent_iob_ == "O"
|
||||
assert eg.reference[5].ent_iob_ == "B"
|
||||
assert eg.reference[6].ent_iob_ == "O"
|
||||
assert eg.reference[2].ent_type_ == "LOC"
|
||||
assert eg.reference[3].ent_type_ == "LOC"
|
||||
assert eg.reference[5].ent_type_ == "LOC"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
|
|
@ -825,6 +825,13 @@ cdef class Doc:
|
|||
for i in range(length):
|
||||
if array[i, col] != 0:
|
||||
self.vocab.morphology.assign_tag(&tokens[i], array[i, col])
|
||||
# Verify ENT_IOB are proper integers
|
||||
if ENT_IOB in attrs:
|
||||
iob_strings = Token.iob_strings()
|
||||
col = attrs.index(ENT_IOB)
|
||||
for i in range(length):
|
||||
if array[i, col] not in range(0, len(iob_strings)):
|
||||
raise ValueError(Errors.E985.format(values=iob_strings, value=array[i, col]))
|
||||
# Now load the data
|
||||
for i in range(length):
|
||||
token = &self.c[i]
|
||||
|
|
|
@ -778,6 +778,10 @@ cdef class Token:
|
|||
"""
|
||||
return self.c.ent_iob
|
||||
|
||||
@classmethod
def iob_strings(cls):
    """Get the string codes used for the IOB entity annotation.

    The position of each string in the tuple is the integer code it
    stands for: `ent_iob_` looks its value up by indexing this tuple with
    the token's integer `ent_iob`.

    RETURNS (tuple): The IOB strings, in integer-code order.
    """
    return ("", "I", "O", "B")
|
||||
|
||||
@property
|
||||
def ent_iob_(self):
|
||||
"""IOB code of named entity tag. "B" means the token begins an entity,
|
||||
|
@ -787,8 +791,7 @@ cdef class Token:
|
|||
|
||||
RETURNS (str): IOB code of named entity tag.
|
||||
"""
|
||||
iob_strings = ("", "I", "O", "B")
|
||||
return iob_strings[self.c.ent_iob]
|
||||
return self.iob_strings()[self.c.ent_iob]
|
||||
|
||||
property ent_id:
|
||||
"""RETURNS (uint64): ID of the entity the token is an instance of,
|
||||
|
|
Loading…
Reference in New Issue
Block a user