fix ENT_IOB conversion and enable unit test

2025-07-15 02:32:37 +03:00 · 2020-06-12 11:30:24 +02:00 · 2020-06-12 11:30:24 +02:00 · 3aed177a35
commit 3aed177a35
parent 6a67a11682
5 changed files with 36 additions and 5 deletions
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -581,6 +581,8 @@ class Errors(object):

    # TODO: fix numbering after merging develop into master

+    E985 = ("The 'ent_iob' attribute of a Token should be an integer indexing "
+            "into {values}, but found {value}.")
    E986 = ("Could not create any training batches: check your input. "
            "Perhaps discard_oversize should be set to False ?")
    E987 = ("The text of an example training instance is either a Doc or "
--- a/spacy/gold/new_example.pyx
+++ b/spacy/gold/new_example.pyx
@ -1,4 +1,6 @@
 import numpy
+
+from ..tokens import Token
 from ..tokens.doc cimport Doc
 from ..attrs import IDS
 from .align cimport Alignment
@ -97,6 +99,13 @@ def _annot2array(strings, tok_annot, doc_annot):
        elif key == "SENT_START":
            attrs.append(key)
            values.append(value)
+        elif key == "ENT_IOB":  # IOB labels are stored as their index in Token.iob_strings()
+            iob_strings = Token.iob_strings()
+            attrs.append(key)
+            try:
+                values.append([iob_strings.index(v) for v in value])
+            except ValueError:  # raised by .index() when a label is not in iob_strings
+                raise ValueError(Errors.E985.format(values=iob_strings, value=value))
        else:
            attrs.append(key)
            values.append([strings.add(v) for v in value])
--- a/spacy/tests/test_new_example.py
+++ b/spacy/tests/test_new_example.py
@ -29,7 +29,7 @@ def test_Example_from_dict_invalid(annots):
    vocab = Vocab()
    predicted = Doc(vocab, words=annots["words"])
    with pytest.raises(ValueError):
-        eg = Example.from_dict(predicted, annots)
+        Example.from_dict(predicted, annots)


@pytest.mark.parametrize("annots", [{"words": ["ice", "cream"], "tags": ["NN", "NN"]}])
@ -46,8 +46,8 @@ def test_Example_from_dict_with_tags(annots):
    "annots",
    [
        {
-            "words": ["I", "like", "London", "and", "Berlin", "."],
-            "entities": [(7, 13, "LOC"), (18, 24, "LOC")],
+            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
+            "entities": [(7, 15, "LOC"), (20, 26, "LOC")],
        }
    ],
 )
@ -56,6 +56,16 @@ def test_Example_from_dict_with_entities(annots):
    predicted = Doc(vocab, words=annots["words"])
    eg = Example.from_dict(predicted, annots)
    assert len(list(eg.reference.ents)) == 2
+    assert eg.reference[0].ent_iob_ == "O"
+    assert eg.reference[1].ent_iob_ == "O"
+    assert eg.reference[2].ent_iob_ == "B"
+    assert eg.reference[3].ent_iob_ == "I"
+    assert eg.reference[4].ent_iob_ == "O"
+    assert eg.reference[5].ent_iob_ == "B"
+    assert eg.reference[6].ent_iob_ == "O"
+    assert eg.reference[2].ent_type_ == "LOC"
+    assert eg.reference[3].ent_type_ == "LOC"
+    assert eg.reference[5].ent_type_ == "LOC"


@pytest.mark.parametrize(
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -825,6 +825,13 @@ cdef class Doc:
            for i in range(length):
                if array[i, col] != 0:
                    self.vocab.morphology.assign_tag(&tokens[i], array[i, col])
+        # Verify that every ENT_IOB value is a valid index into Token.iob_strings()
+        if ENT_IOB in attrs:
+            iob_strings = Token.iob_strings()
+            col = attrs.index(ENT_IOB)
+            for i in range(length):
+                if array[i, col] not in range(0, len(iob_strings)):
+                    raise ValueError(Errors.E985.format(values=iob_strings, value=array[i, col]))
        # Now load the data
        for i in range(length):
            token = &self.c[i]
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@ -778,6 +778,10 @@ cdef class Token:
        """
        return self.c.ent_iob

+    @classmethod
+    def iob_strings(cls):
+        return ("", "I", "O", "B")
+
    @property
    def ent_iob_(self):
        """IOB code of named entity tag. "B" means the token begins an entity,
@ -787,8 +791,7 @@ cdef class Token:

        RETURNS (str): IOB code of named entity tag.
        """
-        iob_strings = ("", "I", "O", "B")
-        return iob_strings[self.c.ent_iob]
+        return self.iob_strings()[self.c.ent_iob]

    property ent_id:
        """RETURNS (uint64): ID of the entity the token is an instance of,