From 3aed177a35ced290cd6eee9773cd73d012202745 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Fri, 12 Jun 2020 11:30:24 +0200
Subject: [PATCH] fix ENT_IOB conversion and enable unit test

Convert "ENT_IOB" token annotations in _annot2array() from their string
form to integer indices into Token.iob_strings(), instead of letting them
fall through to the generic branch that interns every value with
strings.add().

* errors.py: add E985 for an 'ent_iob' value that is not a valid index
  into the IOB strings.
* new_example.pyx: add the "ENT_IOB" branch; an annotation that is not one
  of Token.iob_strings() raises ValueError(E985).
* doc.pyx: when ENT_IOB is among the attrs, check every value in that
  column is within range(0, len(Token.iob_strings())) before the data is
  loaded, and raise ValueError(E985) otherwise.
* token.pyx: expose the ("", "I", "O", "B") tuple as the classmethod
  Token.iob_strings() and make ent_iob_ use it.
* test_new_example.py: use a two-token entity ("New York") and assert the
  per-token ent_iob_ / ent_type_ values of the reference doc.
---
Notes for reviewers:

* The E985 message raised from _annot2array() formats `value` (the
  "ENT_IOB" annotation list being converted). Formatting `values` there
  would print the accumulator of already-converted columns rather than
  the offending input.
* The new_example.pyx hunk relies on `Errors` already being in scope in
  that module; the import is not visible in this diff -- please confirm.
* Context-line indentation in this patch was re-derived from the code's
  nesting; verify with `git apply --check` before merging.

 spacy/errors.py                 |  2 ++
 spacy/gold/new_example.pyx      |  9 +++++++++
 spacy/tests/test_new_example.py | 16 +++++++++++++---
 spacy/tokens/doc.pyx            |  7 +++++++
 spacy/tokens/token.pyx          |  7 +++++--
 5 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 94a0218a7..8efef8333 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -581,6 +581,8 @@ class Errors(object):
 
 
     # TODO: fix numbering after merging develop into master
+    E985 = ("The 'ent_iob' attribute of a Token should be an integer indexing "
+            "into {values}, but found {value}.")
     E986 = ("Could not create any training batches: check your input. "
             "Perhaps discard_oversize should be set to False ?")
     E987 = ("The text of an example training instance is either a Doc or "
diff --git a/spacy/gold/new_example.pyx b/spacy/gold/new_example.pyx
index fa50e4369..51007e8c3 100644
--- a/spacy/gold/new_example.pyx
+++ b/spacy/gold/new_example.pyx
@@ -1,4 +1,6 @@
 import numpy
+
+from ..tokens import Token
 from ..tokens.doc cimport Doc
 from ..attrs import IDS
 from .align cimport Alignment
@@ -97,6 +99,13 @@ def _annot2array(strings, tok_annot, doc_annot):
         elif key == "SENT_START":
             attrs.append(key)
             values.append(value)
+        elif key == "ENT_IOB":
+            iob_strings = Token.iob_strings()
+            attrs.append(key)
+            try:
+                values.append([iob_strings.index(v) for v in value])
+            except ValueError:
+                raise ValueError(Errors.E985.format(values=iob_strings, value=value))
         else:
             attrs.append(key)
             values.append([strings.add(v) for v in value])
diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py
index a8651dfee..7a43cd9a6 100644
--- a/spacy/tests/test_new_example.py
+++ b/spacy/tests/test_new_example.py
@@ -29,7 +29,7 @@ def test_Example_from_dict_invalid(annots):
     vocab = Vocab()
     predicted = Doc(vocab, words=annots["words"])
     with pytest.raises(ValueError):
-        eg = Example.from_dict(predicted, annots)
+        Example.from_dict(predicted, annots)
 
 
 @pytest.mark.parametrize("annots", [{"words": ["ice", "cream"], "tags": ["NN", "NN"]}])
@@ -46,8 +46,8 @@ def test_Example_from_dict_with_tags(annots):
     "annots",
     [
         {
-            "words": ["I", "like", "London", "and", "Berlin", "."],
-            "entities": [(7, 13, "LOC"), (18, 24, "LOC")],
+            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
+            "entities": [(7, 15, "LOC"), (20, 26, "LOC")],
         }
     ],
 )
@@ -56,6 +56,16 @@ def test_Example_from_dict_with_entities(annots):
     predicted = Doc(vocab, words=annots["words"])
     eg = Example.from_dict(predicted, annots)
     assert len(list(eg.reference.ents)) == 2
+    assert eg.reference[0].ent_iob_ == "O"
+    assert eg.reference[1].ent_iob_ == "O"
+    assert eg.reference[2].ent_iob_ == "B"
+    assert eg.reference[3].ent_iob_ == "I"
+    assert eg.reference[4].ent_iob_ == "O"
+    assert eg.reference[5].ent_iob_ == "B"
+    assert eg.reference[6].ent_iob_ == "O"
+    assert eg.reference[2].ent_type_ == "LOC"
+    assert eg.reference[3].ent_type_ == "LOC"
+    assert eg.reference[5].ent_type_ == "LOC"
 
 
 @pytest.mark.parametrize(
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 81cef4492..c4581d0a8 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -825,6 +825,13 @@ cdef class Doc:
             for i in range(length):
                 if array[i, col] != 0:
                     self.vocab.morphology.assign_tag(&tokens[i], array[i, col])
+        # Verify ENT_IOB are proper integers
+        if ENT_IOB in attrs:
+            iob_strings = Token.iob_strings()
+            col = attrs.index(ENT_IOB)
+            for i in range(length):
+                if array[i, col] not in range(0, len(iob_strings)):
+                    raise ValueError(Errors.E985.format(values=iob_strings, value=array[i, col]))
         # Now load the data
         for i in range(length):
             token = &self.c[i]
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 320cfaad5..f85a17d69 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -778,6 +778,10 @@ cdef class Token:
         """
         return self.c.ent_iob
 
+    @classmethod
+    def iob_strings(cls):
+        return ("", "I", "O", "B")
+
     @property
     def ent_iob_(self):
         """IOB code of named entity tag. "B" means the token begins an entity,
@@ -787,8 +791,7 @@ cdef class Token:
 
         RETURNS (str): IOB code of named entity tag.
         """
-        iob_strings = ("", "I", "O", "B")
-        return iob_strings[self.c.ent_iob]
+        return self.iob_strings()[self.c.ent_iob]
 
     property ent_id:
         """RETURNS (uint64): ID of the entity the token is an instance of,