From a3d0cb15d314b66fef73d051fd34d44323a6934c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 26 Mar 2018 07:16:06 +0200
Subject: [PATCH] Fix ent_iob tags in doc.merge to avoid inconsistent sequences

---
 spacy/tokens/doc.pyx | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index bde0719ab..e3fbb4552 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -955,6 +955,13 @@ cdef class Doc:
                 self.vocab.morphology.assign_tag(token, attr_value)
             else:
                 Token.set_struct_attr(token, attr_name, attr_value)
+        # Keep ent_iob consistent: no I (1) token right after an unset/O (0/2) merged token
+        if self.c[end].ent_iob == 1 and token.ent_iob in (0, 2):
+            if token.ent_type == self.c[end].ent_type:
+                token.ent_iob = 3
+            else:
+                # If they're not the same entity type, let them be two entities
+                self.c[end].ent_iob = 3
         # Begin by setting all the head indices to absolute token positions
         # This is easier to work with for now than the offsets
         # Before thinking of something simpler, beware the case where a
@@ -980,8 +987,6 @@ cdef class Doc:
                 self.c[i].head = start
             elif head_idx >= end:
                 self.c[i].head -= offset
-        token.ent_iob = span[0].ent_iob
-        token.ent_type = span[0].ent_type
         # Now compress the token array
         for i in range(end, self.length):
             self.c[i - offset] = self.c[i]
@@ -992,7 +997,6 @@ cdef class Doc:
         for i in range(self.length):
             # ...And, set heads back to a relative position
             self.c[i].head -= i
-        # TODO: Fix entity IOB
         # Set the left/right children, left/right edges
         set_children_from_heads(self.c, self.length)
         # Clear the cached Python objects