From a3d0cb15d314b66fef73d051fd34d44323a6934c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 26 Mar 2018 07:16:06 +0200 Subject: [PATCH] Fix ent_iob tags in doc.merge to avoid inconsistent sequences --- spacy/tokens/doc.pyx | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index bde0719ab..e3fbb4552 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -955,6 +955,13 @@ cdef class Doc: self.vocab.morphology.assign_tag(token, attr_value) else: Token.set_struct_attr(token, attr_name, attr_value) + # Make sure ent_iob remains consistent + if self.c[end].ent_iob == 1 and token.ent_iob in (0, 2): + if token.ent_type == self.c[end].ent_type: + token.ent_iob = 3 + else: + # If they're not the same entity type, let them be two entities + self.c[end].ent_iob = 3 # Begin by setting all the head indices to absolute token positions # This is easier to work with for now than the offsets # Before thinking of something simpler, beware the case where a @@ -980,8 +987,6 @@ cdef class Doc: self.c[i].head = start elif head_idx >= end: self.c[i].head -= offset - token.ent_iob = span[0].ent_iob - token.ent_type = span[0].ent_type # Now compress the token array for i in range(end, self.length): self.c[i - offset] = self.c[i] @@ -992,7 +997,6 @@ cdef class Doc: for i in range(self.length): # ...And, set heads back to a relative position self.c[i].head -= i - # TODO: Fix entity IOB # Set the left/right children, left/right edges set_children_from_heads(self.c, self.length) # Clear the cached Python objects