From 178feae00ab71a27657250bff95b53aba1a9a4e8 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Mon, 4 Jul 2022 19:37:42 +0900
Subject: [PATCH] Add checks and tests to give up on whitespace differences

Docs in Examples are allowed to have arbitrarily different whitespace.
Handling that properly would be nice but isn't required, so for now
check for it and raise an error.
---
 spacy/pipeline/coref.py                     |  9 ++++++++-
 spacy/pipeline/span_predictor.py            |  7 +++++++
 spacy/tests/pipeline/test_coref.py          | 17 +++++++++++++++++
 spacy/tests/pipeline/test_span_predictor.py | 18 +++++++++++++++++-
 4 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py
index 1e11a0417..af40d9b06 100644
--- a/spacy/pipeline/coref.py
+++ b/spacy/pipeline/coref.py
@@ -218,6 +218,13 @@ class CoreferenceResolver(TrainablePipe):
         total_loss = 0
 
         for eg in examples:
+            if eg.x.text != eg.y.text:
+                # TODO assign error number
+                raise ValueError(
+                    """Text, including whitespace, must match between reference and
+                    predicted docs in coref training.
+                    """
+                )
             # TODO check this causes no issues (in practice it runs)
             preds, backprop = self.model.begin_update([eg.predicted])
             score_matrix, mention_idx = preds
@@ -277,7 +284,7 @@ class CoreferenceResolver(TrainablePipe):
                 if span is None:
                     # TODO log more details
                     raise IndexError(Errors.E1043)
-                cc.append( (span.start, span.end) )
+                cc.append((span.start, span.end))
             clusters.append(cc)
 
         span_idxs = create_head_span_idxs(ops, len(example.predicted))
diff --git a/spacy/pipeline/span_predictor.py b/spacy/pipeline/span_predictor.py
index c9343a97e..aee11ba8e 100644
--- a/spacy/pipeline/span_predictor.py
+++ b/spacy/pipeline/span_predictor.py
@@ -178,6 +178,13 @@ class SpanPredictor(TrainablePipe):
 
         total_loss = 0
         for eg in examples:
+            if eg.x.text != eg.y.text:
+                # TODO assign error number
+                raise ValueError(
+                    """Text, including whitespace, must match between reference and
+                    predicted docs in span predictor training.
+                    """
+                )
             span_scores, backprop = self.model.begin_update([eg.predicted])
             # FIXME, this only happens once in the first 1000 docs of OntoNotes
             # and I'm not sure yet why.
diff --git a/spacy/tests/pipeline/test_coref.py b/spacy/tests/pipeline/test_coref.py
index 9a969acdd..7fc4864a3 100644
--- a/spacy/tests/pipeline/test_coref.py
+++ b/spacy/tests/pipeline/test_coref.py
@@ -218,3 +218,20 @@ def test_sentence_map(snlp):
     doc = snlp("I like text. This is text.")
     sm = get_sentence_ids(doc)
     assert sm == [0, 0, 0, 0, 1, 1, 1, 1]
+
+
+@pytest.mark.skipif(not has_torch, reason="Torch not available")
+def test_whitespace_mismatch(nlp):  # coref: update() must reject whitespace-mismatched examples
+    train_examples = []
+    for text, annot in TRAIN_DATA:
+        eg = Example.from_dict(nlp.make_doc(text), annot)
+        eg.predicted = nlp.make_doc("  " + text)  # same text, two extra leading spaces
+        train_examples.append(eg)
+
+    nlp.add_pipe("coref", config=CONFIG)
+    optimizer = nlp.initialize()
+    test_text = TRAIN_DATA[0][0]
+    doc = nlp(test_text)  # NOTE(review): `doc` is never read below; confirm this call is needed
+
+    with pytest.raises(ValueError, match="whitespace"):  # message must mention "whitespace"
+        nlp.update(train_examples, sgd=optimizer)
diff --git a/spacy/tests/pipeline/test_span_predictor.py b/spacy/tests/pipeline/test_span_predictor.py
index 3a3111bd4..a79756d88 100644
--- a/spacy/tests/pipeline/test_span_predictor.py
+++ b/spacy/tests/pipeline/test_span_predictor.py
@@ -106,7 +106,7 @@ def test_overfitting_IO(nlp):
         pred = eg.predicted
         for key, spans in ref.spans.items():
             if key.startswith("coref_head_clusters"):
-                pred.spans[key] = [pred[span.start:span.end] for span in spans]
+                pred.spans[key] = [pred[span.start : span.end] for span in spans]
 
         train_examples.append(eg)
     nlp.add_pipe("span_predictor", config=CONFIG)
@@ -209,3 +209,19 @@ def test_tokenization_mismatch(nlp):
     assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs2[0])
     assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs3[0])
 
+
+@pytest.mark.skipif(not has_torch, reason="Torch not available")
+def test_whitespace_mismatch(nlp):  # span_predictor: update() must reject whitespace mismatches
+    train_examples = []
+    for text, annot in TRAIN_DATA:
+        eg = Example.from_dict(nlp.make_doc(text), annot)
+        eg.predicted = nlp.make_doc("  " + text)  # same text, two extra leading spaces
+        train_examples.append(eg)
+
+    nlp.add_pipe("span_predictor", config=CONFIG)
+    optimizer = nlp.initialize()
+    test_text = TRAIN_DATA[0][0]
+    doc = nlp(test_text)  # NOTE(review): `doc` is never read below; confirm this call is needed
+
+    with pytest.raises(ValueError, match="whitespace"):  # message must mention "whitespace"
+        nlp.update(train_examples, sgd=optimizer)