From 44a0f9c2c82859578393bd3652de4e8b35bc0154 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Tue, 16 Jun 2020 15:21:20 +0200
Subject: [PATCH] test_gold_biluo_different_tokenization works

---
 spacy/gold/example.pyx   | 15 ++++++++-------
 spacy/tests/test_gold.py | 34 +++++++++++++++-------------------
 2 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx
index adae9335b..09a32ee4d 100644
--- a/spacy/gold/example.pyx
+++ b/spacy/gold/example.pyx
@@ -85,17 +85,17 @@ cdef class Example:
 
         vocab = self.reference.vocab
         gold_values = self.reference.to_array([field])
-        output = []
+        output = [None] * len(self.predicted)
         for i, gold_i in enumerate(cand_to_gold):
             if self.predicted[i].text.isspace():
-                output.append(None)
-            elif gold_i is None:
+                output[i] = None
+            if gold_i is None:
                 if i in i2j_multi:
-                    output.append(gold_values[i2j_multi[i]])
+                    output[i] = gold_values[i2j_multi[i]]
                 else:
-                    output.append(None)
+                    output[i] = None
             else:
-                output.append(gold_values[gold_i])
+                output[i] = gold_values[gold_i]
 
         if field in ["ENT_IOB"]:
             # Fix many-to-one IOB codes
@@ -117,7 +117,8 @@ cdef class Example:
             if cand_j is None:
                 if j in j2i_multi:
                     i = j2i_multi[j]
-                    output[i] = gold_values[j]
+                    if output[i] is None:
+                        output[i] = gold_values[j]
 
         if as_string:
             output = [vocab.strings[o] if o is not None else o for o in output]
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index 5f92a476c..ea9c460ac 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -158,7 +158,7 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
     entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
     gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
-    assert example.get_aligned("ENT_IOB") == [2, 2, 1, 2]
+    assert example.get_aligned("ENT_IOB") == [2, 2, 3, 2]
     assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "LOC", ""]
 
     # many-to-one
@@ -195,25 +195,21 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
     assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "", "LOC", "LOC", ""]
 
     # from issue #4791
-    data = (
-        "I'll return the ₹54 amount",
-        {
-            "words": ["I", "'ll", "return", "the", "₹", "54", "amount"],
-            "entities": [(16, 19, "MONEY")],
-        },
-    )
-    gp = GoldParse(en_tokenizer(data[0]), **data[1])
-    assert gp.ner == ["O", "O", "O", "O", "U-MONEY", "O"]
+    doc = en_tokenizer("I'll return the ₹54 amount")
+    gold_words = ["I", "'ll", "return", "the", "₹", "54", "amount"]
+    gold_spaces = [False, True, True, True, False, True, False]
+    entities = [(16, 19, "MONEY")]
+    example = Example.from_dict(doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities})
+    assert example.get_aligned("ENT_IOB") == [2, 2, 2, 2, 3, 2]
+    assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "", "MONEY", ""]
 
-    data = (
-        "I'll return the $54 amount",
-        {
-            "words": ["I", "'ll", "return", "the", "$", "54", "amount"],
-            "entities": [(16, 19, "MONEY")],
-        },
-    )
-    gp = GoldParse(en_tokenizer(data[0]), **data[1])
-    assert gp.ner == ["O", "O", "O", "O", "B-MONEY", "L-MONEY", "O"]
+    doc = en_tokenizer("I'll return the $54 amount")
+    gold_words = ["I", "'ll", "return", "the", "$", "54", "amount"]
+    gold_spaces = [False, True, True, True, False, True, False]
+    entities = [(16, 19, "MONEY")]
+    example = Example.from_dict(doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities})
+    assert example.get_aligned("ENT_IOB") == [2, 2, 2, 2, 3, 1, 2]
+    assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "", "MONEY", "MONEY", ""]
 
 
 def test_roundtrip_offsets_biluo_conversion(en_tokenizer):