From a28f8f369ebd18da025a40f833942dd168361d8d Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Mon, 15 Jun 2020 23:06:22 +0200
Subject: [PATCH] Fix many-to-one IOB codes

---
 spacy/gold/example.pyx   | 14 ++++++++++++++
 spacy/tests/test_gold.py |  8 ++++----
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx
index 425254320..99a1bbafa 100644
--- a/spacy/gold/example.pyx
+++ b/spacy/gold/example.pyx
@@ -95,6 +95,20 @@ cdef class Example:
             else:
                 output.append(gold_values[gold_i])
 
+        if field in ["ENT_IOB"]:
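+            # Fix many-to-one IOB codes: when consecutive tokens map to the same j in i2j_multi and both carry B (3), keep the first B and turn the later ones into I (1)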
+            prev_j = -1
+            prev_value = -1
+            for i, value in enumerate(output):
+                if i in i2j_multi:
+                    j = i2j_multi[i]
+                    if j == prev_j and prev_value == value == 3:
+                        output[i] = 1  # set B to I
+                    prev_j = j
+                else:
+                    prev_j = -1
+                prev_value = value
+
         if field in ["ENT_IOB", "ENT_TYPE"]:
             # Assign O/- for one-to-many O/- NER tags
             for j, cand_j in enumerate(gold_to_cand):
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index 5cf3b4c01..f604f4b53 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -166,10 +166,10 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
     spaces = [True, True, True, True, True, False, False]
     doc = Doc(en_vocab, words=words, spaces=spaces)
     entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
-    gp = GoldParse(
-        doc, words=["I", "flew to", "San Francisco Valley", "."], entities=entities
-    )
-    assert gp.ner == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
+    gold_words =["I", "flew to", "San Francisco Valley", "."]
+    example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
+    assert example.get_aligned("ENT_IOB") == [2, 2, 2, 3, 1, 1, 2]
+    assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "LOC", "LOC", "LOC", ""]
 
     # misaligned
     words = ["I flew", "to", "San Francisco", "Valley", "."]