From 3aed177a35ced290cd6eee9773cd73d012202745 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Fri, 12 Jun 2020 11:30:24 +0200
Subject: [PATCH] fix ENT_IOB conversion and enable unit test

---
 spacy/errors.py                 |  2 ++
 spacy/gold/new_example.pyx      |  9 +++++++++
 spacy/tests/test_new_example.py | 16 +++++++++++++---
 spacy/tokens/doc.pyx            |  7 +++++++
 spacy/tokens/token.pyx          |  7 +++++--
 5 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 94a0218a7..8efef8333 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -581,6 +581,8 @@ class Errors(object):
 
     # TODO: fix numbering after merging develop into master
 
+    E985 = ("The 'ent_iob' attribute of a Token should be an integer indexing "
+            "into {values}, but found {value}.")
     E986 = ("Could not create any training batches: check your input. "
             "Perhaps discard_oversize should be set to False ?")
     E987 = ("The text of an example training instance is either a Doc or "
diff --git a/spacy/gold/new_example.pyx b/spacy/gold/new_example.pyx
index fa50e4369..51007e8c3 100644
--- a/spacy/gold/new_example.pyx
+++ b/spacy/gold/new_example.pyx
@@ -1,4 +1,6 @@
 import numpy
+
+from ..tokens import Token
 from ..tokens.doc cimport Doc
 from ..attrs import IDS
 from .align cimport Alignment
@@ -97,6 +99,13 @@ def _annot2array(strings, tok_annot, doc_annot):
         elif key == "SENT_START":
             attrs.append(key)
             values.append(value)
+        elif key == "ENT_IOB":
+            iob_strings = Token.iob_strings()
+            attrs.append(key)
+            try:
+                values.append([iob_strings.index(v) for v in value])
+            except ValueError:  # raised by .index() for a string not in iob_strings
+                raise ValueError(Errors.E985.format(values=iob_strings, value=value))
         else:
             attrs.append(key)
             values.append([strings.add(v) for v in value])
diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py
index a8651dfee..7a43cd9a6 100644
--- a/spacy/tests/test_new_example.py
+++ b/spacy/tests/test_new_example.py
@@ -29,7 +29,7 @@ def test_Example_from_dict_invalid(annots):
     vocab = Vocab()
     predicted = Doc(vocab, words=annots["words"])
     with pytest.raises(ValueError):
-        eg = Example.from_dict(predicted, annots)
+        Example.from_dict(predicted, annots)
 
 
 @pytest.mark.parametrize("annots", [{"words": ["ice", "cream"], "tags": ["NN", "NN"]}])
@@ -46,8 +46,8 @@ def test_Example_from_dict_with_tags(annots):
     "annots",
     [
         {
-            "words": ["I", "like", "London", "and", "Berlin", "."],
-            "entities": [(7, 13, "LOC"), (18, 24, "LOC")],
+            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
+            "entities": [(7, 15, "LOC"), (20, 26, "LOC")],
         }
     ],
 )
@@ -56,6 +56,16 @@ def test_Example_from_dict_with_entities(annots):
     predicted = Doc(vocab, words=annots["words"])
     eg = Example.from_dict(predicted, annots)
     assert len(list(eg.reference.ents)) == 2
+    assert eg.reference[0].ent_iob_ == "O"
+    assert eg.reference[1].ent_iob_ == "O"
+    assert eg.reference[2].ent_iob_ == "B"
+    assert eg.reference[3].ent_iob_ == "I"
+    assert eg.reference[4].ent_iob_ == "O"
+    assert eg.reference[5].ent_iob_ == "B"
+    assert eg.reference[6].ent_iob_ == "O"
+    assert eg.reference[2].ent_type_ == "LOC"
+    assert eg.reference[3].ent_type_ == "LOC"
+    assert eg.reference[5].ent_type_ == "LOC"
 
 
 @pytest.mark.parametrize(
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 81cef4492..c4581d0a8 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -825,6 +825,13 @@ cdef class Doc:
             for i in range(length):
                 if array[i, col] != 0:
                     self.vocab.morphology.assign_tag(&tokens[i], array[i, col])
+        # Verify ENT_IOB are proper integers
+        if ENT_IOB in attrs:
+            iob_strings = Token.iob_strings()
+            col = attrs.index(ENT_IOB)
+            for i in range(length):
+                if array[i, col] not in range(0, len(iob_strings)):
+                    raise ValueError(Errors.E985.format(values=iob_strings, value=array[i, col]))
         # Now load the data
         for i in range(length):
             token = &self.c[i]
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 320cfaad5..f85a17d69 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -778,6 +778,10 @@ cdef class Token:
         """
         return self.c.ent_iob
 
+    @classmethod
+    def iob_strings(cls):
+        return ("", "I", "O", "B")  # position in the tuple is the integer ent_iob code
+
     @property
     def ent_iob_(self):
         """IOB code of named entity tag. "B" means the token begins an entity,
@@ -787,8 +791,7 @@ cdef class Token:
 
         RETURNS (str): IOB code of named entity tag.
         """
-        iob_strings = ("", "I", "O", "B")
-        return iob_strings[self.c.ent_iob]
+        return self.iob_strings()[self.c.ent_iob]
 
     property ent_id:
         """RETURNS (uint64): ID of the entity the token is an instance of,