From 6906af3d8f8777e48c5897d1ca3fbd69c32e9ee4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 24 Mar 2023 15:18:31 +0100
Subject: [PATCH] NER: Ensure zero-cost sequence with sentence split in entity

If we use a sentence splitter as one of the annotating components during
training, an entity can become split in the predicted `Doc`. Before this
change, training would fail, because no zero-cost transition sequence
could be found.

This fixes two scenarios:

1. When the gold action is `B` and a split occurs after the current
   token, the `BEGIN` action is invalid. However, this was the only
   possible zero-cost action. This change makes `OUT` a zero-cost
   action in this case.
2. When the gold action is `I` and a split occurs after the current
   token, the `IN` action is invalid, removing the only zero-cost
   action. This change makes `LAST` a zero-cost action, so that the
   entity can be properly closed.
---
 spacy/pipeline/_parser_internals/ner.pyx | 18 +++++++++++------
 spacy/tests/parser/test_ner.py           | 25 ++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index fab872f00..883b48ccf 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -120,6 +120,10 @@ cdef bint _entity_is_sunk(const StateC* state, Transition* golds) nogil:
         return False
 
 
+cdef bint _next_is_sent_start(const StateC* state) nogil:
+    return state.B(1) != -1 and state.B_(1).sent_start == 1
+
+
 cdef class BiluoPushDown(TransitionSystem):
     def __init__(self, *args, **kwargs):
         TransitionSystem.__init__(self, *args, **kwargs)
@@ -388,7 +392,7 @@ cdef class Begin:
         elif st.B_(1).ent_iob == 3:
             # If the next word is B, we can't B now
             return False
-        elif st.B_(1).sent_start == 1:
+        elif _next_is_sent_start(st):
             # Don't allow entities to extend across sentence boundaries
             return False
         # Don't allow entities to start on whitespace
@@ -466,7 +470,7 @@ cdef class In:
                 # Otherwise, force acceptance, even if we're across a sentence
                 # boundary or the token is whitespace.
                 return True
-        elif st.B(1) != -1 and st.B_(1).sent_start == 1:
+        elif _next_is_sent_start(st):
             # Don't allow entities to extend across sentence boundaries
             return False
         else:
@@ -558,8 +562,9 @@ cdef class Last:
             # L, Gold B --> True
             pass
         elif g_act == IN:
-            # L, Gold I --> True iff this entity sunk
-            cost += not _entity_is_sunk(s, gold.ner)
+            # L, Gold I --> True iff this entity sunk or there is sentence
+            #               sentence break after the first buffer token.
+            cost += not (_entity_is_sunk(s, gold.ner) or _next_is_sent_start(s))
         elif g_act == LAST:
             # L, Gold L --> True
             pass
@@ -674,8 +679,9 @@ cdef class Out:
         if g_act == MISSING:
             pass
         elif g_act == BEGIN:
-            # O, Gold B --> False
-            cost += 1
+            # O, Gold B --> False, unless there is a sentence break after
+            #               the next buffer token.
+            cost += not _next_is_sent_start(s)
         elif g_act == IN:
             # O, Gold I --> True
             pass
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 030182a63..eb3e67740 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -816,6 +816,31 @@ def test_ner_warns_no_lookups(caplog):
         assert "W033" not in caplog.text
 
 
+def test_train_sent_split_in_entity():
+    # Check that we can train on inputs when entities are sentence-split
+    # by an annotating component.
+    nlp = English()
+    ner = nlp.add_pipe("ner", config={"update_with_oracle_cut_size": 3})
+
+    eg = Example.from_dict(
+        nlp.make_doc("I like the Kinesis Advantage2 LF very much."),
+        {"entities": [(11, 32, "MISC")]},
+    )
+
+    # Go bezerk, put a boundary on every combination of tokens.
+    train_examples = []
+    for i in range(1, len(eg.predicted)):
+        for j in range(1, len(eg.predicted)):
+            eg_ij = eg.copy()
+            eg_ij.predicted[i].is_sent_start = True
+            eg_ij.predicted[j].is_sent_start = True
+            train_examples.append(eg_ij)
+
+    ner.add_label("MISC")
+    nlp.initialize()
+    nlp.update(train_examples, sgd=False, annotates=[])
+
+
 @Language.factory("blocker")
 class BlockerComponent1:
     def __init__(self, nlp, start, end, name="my_blocker"):