mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-11 09:00:36 +03:00
fix ENT_IOB conversion and enable unit test
This commit is contained in:
parent
6a67a11682
commit
3aed177a35
|
@ -581,6 +581,8 @@ class Errors(object):
|
||||||
|
|
||||||
# TODO: fix numbering after merging develop into master
|
# TODO: fix numbering after merging develop into master
|
||||||
|
|
||||||
|
E985 = ("The 'ent_iob' attribute of a Token should be an integer indexing "
|
||||||
|
"into {values}, but found {value}.")
|
||||||
E986 = ("Could not create any training batches: check your input. "
|
E986 = ("Could not create any training batches: check your input. "
|
||||||
"Perhaps discard_oversize should be set to False ?")
|
"Perhaps discard_oversize should be set to False ?")
|
||||||
E987 = ("The text of an example training instance is either a Doc or "
|
E987 = ("The text of an example training instance is either a Doc or "
|
||||||
|
|
|
@ -1,4 +1,6 @@
|
||||||
import numpy
|
import numpy
|
||||||
|
|
||||||
|
from ..tokens import Token
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
from ..attrs import IDS
|
from ..attrs import IDS
|
||||||
from .align cimport Alignment
|
from .align cimport Alignment
|
||||||
|
@ -97,6 +99,13 @@ def _annot2array(strings, tok_annot, doc_annot):
|
||||||
elif key == "SENT_START":
|
elif key == "SENT_START":
|
||||||
attrs.append(key)
|
attrs.append(key)
|
||||||
values.append(value)
|
values.append(value)
|
||||||
|
elif key == "ENT_IOB":
|
||||||
|
iob_strings = Token.iob_strings()
|
||||||
|
attrs.append(key)
|
||||||
|
try:
|
||||||
|
values.append([iob_strings.index(v) for v in value])
|
||||||
|
except ValueError:
|
||||||
|
# Report the offending ENT_IOB input (`value`), not the `values` accumulator:
# the failed column was never appended, so `values` holds only other attributes.
raise ValueError(Errors.E985.format(values=iob_strings, value=value))
|
||||||
else:
|
else:
|
||||||
attrs.append(key)
|
attrs.append(key)
|
||||||
values.append([strings.add(v) for v in value])
|
values.append([strings.add(v) for v in value])
|
||||||
|
|
|
@ -29,7 +29,7 @@ def test_Example_from_dict_invalid(annots):
|
||||||
vocab = Vocab()
|
vocab = Vocab()
|
||||||
predicted = Doc(vocab, words=annots["words"])
|
predicted = Doc(vocab, words=annots["words"])
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
eg = Example.from_dict(predicted, annots)
|
Example.from_dict(predicted, annots)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("annots", [{"words": ["ice", "cream"], "tags": ["NN", "NN"]}])
|
@pytest.mark.parametrize("annots", [{"words": ["ice", "cream"], "tags": ["NN", "NN"]}])
|
||||||
|
@ -46,8 +46,8 @@ def test_Example_from_dict_with_tags(annots):
|
||||||
"annots",
|
"annots",
|
||||||
[
|
[
|
||||||
{
|
{
|
||||||
"words": ["I", "like", "London", "and", "Berlin", "."],
|
"words": ["I", "like", "New", "York", "and", "Berlin", "."],
|
||||||
"entities": [(7, 13, "LOC"), (18, 24, "LOC")],
|
"entities": [(7, 15, "LOC"), (20, 26, "LOC")],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
@ -56,6 +56,16 @@ def test_Example_from_dict_with_entities(annots):
|
||||||
predicted = Doc(vocab, words=annots["words"])
|
predicted = Doc(vocab, words=annots["words"])
|
||||||
eg = Example.from_dict(predicted, annots)
|
eg = Example.from_dict(predicted, annots)
|
||||||
assert len(list(eg.reference.ents)) == 2
|
assert len(list(eg.reference.ents)) == 2
|
||||||
|
assert eg.reference[0].ent_iob_ == "O"
|
||||||
|
assert eg.reference[1].ent_iob_ == "O"
|
||||||
|
assert eg.reference[2].ent_iob_ == "B"
|
||||||
|
assert eg.reference[3].ent_iob_ == "I"
|
||||||
|
assert eg.reference[4].ent_iob_ == "O"
|
||||||
|
assert eg.reference[5].ent_iob_ == "B"
|
||||||
|
assert eg.reference[6].ent_iob_ == "O"
|
||||||
|
assert eg.reference[2].ent_type_ == "LOC"
|
||||||
|
assert eg.reference[3].ent_type_ == "LOC"
|
||||||
|
assert eg.reference[5].ent_type_ == "LOC"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
|
|
|
@ -825,6 +825,13 @@ cdef class Doc:
|
||||||
for i in range(length):
|
for i in range(length):
|
||||||
if array[i, col] != 0:
|
if array[i, col] != 0:
|
||||||
self.vocab.morphology.assign_tag(&tokens[i], array[i, col])
|
self.vocab.morphology.assign_tag(&tokens[i], array[i, col])
|
||||||
|
# Verify ENT_IOB are proper integers
|
||||||
|
if ENT_IOB in attrs:
|
||||||
|
iob_strings = Token.iob_strings()
|
||||||
|
col = attrs.index(ENT_IOB)
|
||||||
|
for i in range(length):
|
||||||
|
if array[i, col] not in range(0, len(iob_strings)):
|
||||||
|
raise ValueError(Errors.E985.format(values=iob_strings, value=array[i, col]))
|
||||||
# Now load the data
|
# Now load the data
|
||||||
for i in range(length):
|
for i in range(length):
|
||||||
token = &self.c[i]
|
token = &self.c[i]
|
||||||
|
|
|
@ -778,6 +778,10 @@ cdef class Token:
|
||||||
"""
|
"""
|
||||||
return self.c.ent_iob
|
return self.c.ent_iob
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def iob_strings(cls):
|
||||||
|
# Position in this tuple is the integer ent_iob code: ent_iob_ indexes it
# with self.c.ent_iob, and _annot2array maps strings back via .index().
return ("", "I", "O", "B")
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def ent_iob_(self):
|
def ent_iob_(self):
|
||||||
"""IOB code of named entity tag. "B" means the token begins an entity,
|
"""IOB code of named entity tag. "B" means the token begins an entity,
|
||||||
|
@ -787,8 +791,7 @@ cdef class Token:
|
||||||
|
|
||||||
RETURNS (str): IOB code of named entity tag.
|
RETURNS (str): IOB code of named entity tag.
|
||||||
"""
|
"""
|
||||||
iob_strings = ("", "I", "O", "B")
|
return self.iob_strings()[self.c.ent_iob]
|
||||||
return iob_strings[self.c.ent_iob]
|
|
||||||
|
|
||||||
property ent_id:
|
property ent_id:
|
||||||
"""RETURNS (uint64): ID of the entity the token is an instance of,
|
"""RETURNS (uint64): ID of the entity the token is an instance of,
|
||||||
|
|
Loading…
Reference in New Issue
Block a user