From af6d5ae2fee7a2a95192720a22fb4c4864886a7f Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 28 Jun 2022 19:04:24 +0900 Subject: [PATCH 01/17] Initial test of mismatched tokenization This runs, but the results are nonsense because the indices are off. --- spacy/tests/pipeline/test_coref.py | 56 ++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/spacy/tests/pipeline/test_coref.py b/spacy/tests/pipeline/test_coref.py index 53f0b2011..358da6b03 100644 --- a/spacy/tests/pipeline/test_coref.py +++ b/spacy/tests/pipeline/test_coref.py @@ -152,6 +152,62 @@ def test_overfitting_IO(nlp): # assert_equal(batch_deps_1, batch_deps_2) # assert_equal(batch_deps_1, no_batch_deps) +@pytest.mark.skipif(not has_torch, reason="Torch not available") +def test_tokenization_mismatch(nlp): + train_examples = [] + for text, annot in TRAIN_DATA: + eg = Example.from_dict(nlp.make_doc(text), annot) + ref = eg.reference + char_spans = {} + for key, cluster in ref.spans.items(): + char_spans[key] = [] + for span in cluster: + char_spans[key].append( (span[0].idx, span[-1].idx + len(span[-1])) ) + with ref.retokenize() as retokenizer: + # merge "many friends" + retokenizer.merge(ref[5:7]) + + # Note this works because it's the same doc and we know the keys + for key, _ in ref.spans.items(): + spans = char_spans[key] + ref.spans[key] = [ref.char_span(*span) for span in spans] + + train_examples.append(eg) + + nlp.add_pipe("coref") + optimizer = nlp.initialize() + test_text = TRAIN_DATA[0][0] + doc = nlp(test_text) + + for i in range(15): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + doc = nlp(test_text) + print(i, doc.spans) + + # test the trained model + doc = nlp(test_text) + + # Also test the results are still the same after IO + with make_tempdir() as tmp_dir: + nlp.to_disk(tmp_dir) + nlp2 = util.load_model_from_path(tmp_dir) + doc2 = nlp2(test_text) + + # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions + texts = [ + test_text, + "I noticed many friends around me", + "They received it. They received the SMS.", + ] + + # save the docs so they don't get garbage collected + docs = list(nlp.pipe(texts)) + batch_deps_1 = [doc.spans for doc in docs] + docs = list(nlp.pipe(texts)) + batch_deps_2 = [doc.spans for doc in docs] + docs = [nlp(text) for text in texts] + no_batch_deps = [doc.spans for doc in docs] @pytest.mark.skipif(not has_torch, reason="Torch not available") def test_crossing_spans(): From ef5762d78eca97f487efd93dfeb233b4fcea0d30 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 28 Jun 2022 19:06:13 +0900 Subject: [PATCH 02/17] Bad hack to get tests to run This changes the tok2vec size in coref to hardcoded 64 to get tests to run. This should be reverted and hopefully replaced with proper shape inference. 
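For reference, the proper shape inference this hack stands in for could look roughly like the following. A minimal sketch, not part of this patch: it assumes the tok2vec layer is available as a thinc Model at init time, and infer_tok2vec_size is a hypothetical helper name.

    from thinc.api import Model

    def infer_tok2vec_size(tok2vec: Model, default: int = 768) -> int:
        # "nO" is thinc's output-width dimension; it is typically unset
        # until the model has been initialized with sample data, so fall
        # back to a default rather than failing.
        dim = tok2vec.maybe_get_dim("nO")
        return dim if dim is not None else default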
--- spacy/ml/models/coref.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/ml/models/coref.py b/spacy/ml/models/coref.py index a8c880a39..5fa75800e 100644 --- a/spacy/ml/models/coref.py +++ b/spacy/ml/models/coref.py @@ -22,7 +22,7 @@ def build_wl_coref_model( tok2vec_size: int = 768, # tok2vec size ): # TODO add model return types - # dim = tok2vec.maybe_get_dim("n0") + tok2vec_size = 64 with Model.define_operators({">>": chain}): coref_clusterer = PyTorchWrapper( From d1ff933e9b77b723b7ed326a6c339340fd47d318 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 28 Jun 2022 19:15:33 +0900 Subject: [PATCH 03/17] Test works This may not be done yet, as the test is just for consistency, and not overfitting correctly yet. --- spacy/errors.py | 1 + spacy/ml/models/coref_util.py | 12 +++++---- spacy/pipeline/coref.py | 14 +++++++++- spacy/tests/pipeline/test_coref.py | 41 +++++++++++++++++------------- 4 files changed, 45 insertions(+), 23 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index c82ffe882..837bfd740 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -919,6 +919,7 @@ class Errors(metaclass=ErrorsWithCodes): E1035 = ("Token index {i} out of bounds ({length})") E1036 = ("Cannot index into NoneNode") E1037 = ("Invalid attribute value '{attr}'.") + E1038 = ("Misalignment in coref. Head token has no match in training doc.") # Deprecated model shortcuts, only used in errors and warnings diff --git a/spacy/ml/models/coref_util.py b/spacy/ml/models/coref_util.py index a004a69d7..bd577e65f 100644 --- a/spacy/ml/models/coref_util.py +++ b/spacy/ml/models/coref_util.py @@ -143,16 +143,18 @@ def create_head_span_idxs(ops, doclen: int): def get_clusters_from_doc(doc) -> List[List[Tuple[int, int]]]: - """Given a Doc, convert the cluster spans to simple int tuple lists.""" + """Given a Doc, convert the cluster spans to simple int tuple lists. The + ints are char spans, to be tokenization independent. + """ out = [] for key, val in doc.spans.items(): cluster = [] for span in val: - # TODO check that there isn't an off-by-one error here - # cluster.append((span.start, span.end)) - # TODO This conversion should be happening earlier in processing + head_i = span.root.i - cluster.append((head_i, head_i + 1)) + head = doc[head_i] + char_span = (head.idx, head.idx + len(head)) + cluster.append(char_span) # don't want duplicates cluster = list(set(cluster)) diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py index cd07f80e8..630502f6d 100644 --- a/spacy/pipeline/coref.py +++ b/spacy/pipeline/coref.py @@ -267,7 +267,19 @@ class CoreferenceResolver(TrainablePipe): example = list(examples)[0] cidx = mention_idx - clusters = get_clusters_from_doc(example.reference) + clusters_by_char = get_clusters_from_doc(example.reference) + # convert to token clusters, and give up if necessary + clusters = [] + for cluster in clusters_by_char: + cc = [] + for start_char, end_char in cluster: + span = example.predicted.char_span(start_char, end_char) + if span is None: + # TODO log more details + raise IndexError(Errors.E1038) + cc.append( (span.start, span.end) ) + clusters.append(cc) + span_idxs = create_head_span_idxs(ops, len(example.predicted)) gscores = create_gold_scores(span_idxs, clusters) # TODO fix type here. This is bools but asarray2f wants ints. 
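As an aside, here is a minimal sketch (not from this patch, using a blank English pipeline) of why the char-offset representation survives retokenization while token indices go stale:

    from spacy.lang.en import English

    nlp = English()
    doc = nlp("I noticed many friends around me")
    start, end = doc[2:4].start_char, doc[2:4].end_char  # "many friends"
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[2:4])  # token indices 2..4 are now stale
    # The char offsets still recover the mention on the new tokenization.
    # If the offsets no longer land on token boundaries, char_span returns
    # None -- the misalignment case that E1038 above guards against.
    assert doc.char_span(start, end).text == "many friends"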
diff --git a/spacy/tests/pipeline/test_coref.py b/spacy/tests/pipeline/test_coref.py index 358da6b03..584db99b8 100644 --- a/spacy/tests/pipeline/test_coref.py +++ b/spacy/tests/pipeline/test_coref.py @@ -34,6 +34,15 @@ TRAIN_DATA = [ ] # fmt: on +def spans2ints(doc): + """Convert doc.spans to nested list of ints for comparison. + + This is useful for checking consistency of predictions. + """ + out = [] + for key, cluster in doc.spans.items(): + out.append( [(ss.start, ss.end) for ss in cluster] ) + return out @pytest.fixture def nlp(): @@ -108,7 +117,7 @@ def test_coref_serialization(nlp): @pytest.mark.skipif(not has_torch, reason="Torch not available") def test_overfitting_IO(nlp): - # Simple test to try and quickly overfit the senter - ensuring the ML models work correctly + # Simple test to try and quickly overfit - ensuring the ML models work correctly train_examples = [] for text, annot in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(text), annot)) @@ -117,25 +126,21 @@ def test_overfitting_IO(nlp): optimizer = nlp.initialize() test_text = TRAIN_DATA[0][0] doc = nlp(test_text) - print("BEFORE", doc.spans) - for i in range(5): + # Needs ~12 epochs to converge + for i in range(15): losses = {} nlp.update(train_examples, sgd=optimizer, losses=losses) doc = nlp(test_text) - print(i, doc.spans) - print(losses["coref"]) # < 0.001 # test the trained model doc = nlp(test_text) - print("AFTER", doc.spans) # Also test the results are still the same after IO with make_tempdir() as tmp_dir: nlp.to_disk(tmp_dir) nlp2 = util.load_model_from_path(tmp_dir) doc2 = nlp2(test_text) - print("doc2", doc2.spans) # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions texts = [ @@ -143,12 +148,16 @@ def test_overfitting_IO(nlp): "I noticed many friends around me", "They received it. They received the SMS.", ] - batch_deps_1 = [doc.spans for doc in nlp.pipe(texts)] + docs = list(nlp.pipe(texts)) + batch_deps_1 = [doc.spans for doc in docs] print(batch_deps_1) - batch_deps_2 = [doc.spans for doc in nlp.pipe(texts)] + docs = list(nlp.pipe(texts)) + batch_deps_2 = [doc.spans for doc in docs] print(batch_deps_2) - no_batch_deps = [doc.spans for doc in [nlp(text) for text in texts]] + docs = [nlp(text) for text in texts] + no_batch_deps = [doc.spans for doc in docs] print(no_batch_deps) + print("FINISH") # assert_equal(batch_deps_1, batch_deps_2) # assert_equal(batch_deps_1, no_batch_deps) @@ -183,7 +192,6 @@ def test_tokenization_mismatch(nlp): losses = {} nlp.update(train_examples, sgd=optimizer, losses=losses) doc = nlp(test_text) - print(i, doc.spans) # test the trained model doc = nlp(test_text) @@ -202,12 +210,11 @@ def test_tokenization_mismatch(nlp): ] # save the docs so they don't get garbage collected - docs = list(nlp.pipe(texts)) - batch_deps_1 = [doc.spans for doc in docs] - docs = list(nlp.pipe(texts)) - batch_deps_2 = [doc.spans for doc in docs] - docs = [nlp(text) for text in texts] - no_batch_deps = [doc.spans for doc in docs] + docs1 = list(nlp.pipe(texts)) + docs2 = list(nlp.pipe(texts)) + docs3 = [nlp(text) for text in texts] + assert spans2ints(docs1[0]) == spans2ints(docs2[0]) + assert spans2ints(docs1[0]) == spans2ints(docs3[0]) @pytest.mark.skipif(not has_torch, reason="Torch not available") def test_crossing_spans(): From 5192ac16170c51e4a3ed0c8d930a4988853c4dd2 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Sun, 3 Jul 2022 14:48:42 +0900 Subject: [PATCH 04/17] Clean tests. 
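The cleanup below leans on the spans2ints helper from the previous patch. As the removed comment in the serialization test notes, spans from different docs do not compare equal because docs use object identity for equality, which is why the test previously compared stringified span groups; reducing clusters to plain int tuples makes the comparison value-based. A rough usage sketch, assuming spans2ints is in scope and with a made-up cluster key:

    from spacy.lang.en import English

    nlp = English()
    doc1 = nlp("They received it. They received the SMS.")
    doc2 = nlp("They received it. They received the SMS.")
    for doc in (doc1, doc2):
        doc.spans["coref_clusters_1"] = [doc[0:1], doc[4:5]]  # "They" ... "They"
    # plain (start, end) token tuples compare by value across Docs
    assert spans2ints(doc1) == spans2ints(doc2) == [[(0, 1), (4, 5)]]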
--- spacy/tests/pipeline/test_coref.py | 31 +++++++++--------------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/spacy/tests/pipeline/test_coref.py b/spacy/tests/pipeline/test_coref.py index 584db99b8..73c09b48e 100644 --- a/spacy/tests/pipeline/test_coref.py +++ b/spacy/tests/pipeline/test_coref.py @@ -35,7 +35,8 @@ TRAIN_DATA = [ # fmt: on def spans2ints(doc): - """Convert doc.spans to nested list of ints for comparison. + """Convert doc.spans to nested list of ints for comparison. + The ints are token indices. This is useful for checking consistency of predictions. """ @@ -98,21 +99,14 @@ def test_coref_serialization(nlp): assert nlp.pipe_names == ["coref"] text = "She gave me her pen." doc = nlp(text) - spans_result = doc.spans with make_tempdir() as tmp_dir: nlp.to_disk(tmp_dir) nlp2 = spacy.load(tmp_dir) assert nlp2.pipe_names == ["coref"] doc2 = nlp2(text) - spans_result2 = doc2.spans - print(1, [(k, len(v)) for k, v in spans_result.items()]) - print(2, [(k, len(v)) for k, v in spans_result2.items()]) - # Note: spans do not compare equal because docs are different and docs - # use object identity for equality - for k, v in spans_result.items(): - assert str(spans_result[k]) == str(spans_result2[k]) - # assert spans_result == spans_result2 + + assert spans2ints(doc) == spans2ints(doc2) @pytest.mark.skipif(not has_torch, reason="Torch not available") @@ -148,18 +142,11 @@ def test_overfitting_IO(nlp): "I noticed many friends around me", "They received it. They received the SMS.", ] - docs = list(nlp.pipe(texts)) - batch_deps_1 = [doc.spans for doc in docs] - print(batch_deps_1) - docs = list(nlp.pipe(texts)) - batch_deps_2 = [doc.spans for doc in docs] - print(batch_deps_2) - docs = [nlp(text) for text in texts] - no_batch_deps = [doc.spans for doc in docs] - print(no_batch_deps) - print("FINISH") - # assert_equal(batch_deps_1, batch_deps_2) - # assert_equal(batch_deps_1, no_batch_deps) + docs1 = list(nlp.pipe(texts)) + docs2 = list(nlp.pipe(texts)) + docs3 = [nlp(text) for text in texts] + assert spans2ints(docs1[0]) == spans2ints(docs2[0]) + assert spans2ints(docs1[0]) == spans2ints(docs3[0]) @pytest.mark.skipif(not has_torch, reason="Torch not available") def test_tokenization_mismatch(nlp): From 1dacecbbfbf1648ec0d6a44d0c53d722de0c2c40 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Sun, 3 Jul 2022 14:49:02 +0900 Subject: [PATCH 05/17] Run black --- spacy/tests/pipeline/test_coref.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/spacy/tests/pipeline/test_coref.py b/spacy/tests/pipeline/test_coref.py index 73c09b48e..4b8ca1653 100644 --- a/spacy/tests/pipeline/test_coref.py +++ b/spacy/tests/pipeline/test_coref.py @@ -34,17 +34,19 @@ TRAIN_DATA = [ ] # fmt: on + def spans2ints(doc): - """Convert doc.spans to nested list of ints for comparison. + """Convert doc.spans to nested list of ints for comparison. The ints are token indices. This is useful for checking consistency of predictions. 
""" out = [] for key, cluster in doc.spans.items(): - out.append( [(ss.start, ss.end) for ss in cluster] ) + out.append([(ss.start, ss.end) for ss in cluster]) return out + @pytest.fixture def nlp(): return English() @@ -70,6 +72,7 @@ def test_not_initialized(nlp): with pytest.raises(ValueError, match="E109"): nlp(text) + @pytest.mark.skipif(not has_torch, reason="Torch not available") def test_initialized(nlp): nlp.add_pipe("coref") @@ -148,6 +151,7 @@ def test_overfitting_IO(nlp): assert spans2ints(docs1[0]) == spans2ints(docs2[0]) assert spans2ints(docs1[0]) == spans2ints(docs3[0]) + @pytest.mark.skipif(not has_torch, reason="Torch not available") def test_tokenization_mismatch(nlp): train_examples = [] @@ -158,7 +162,7 @@ def test_tokenization_mismatch(nlp): for key, cluster in ref.spans.items(): char_spans[key] = [] for span in cluster: - char_spans[key].append( (span[0].idx, span[-1].idx + len(span[-1])) ) + char_spans[key].append((span[0].idx, span[-1].idx + len(span[-1]))) with ref.retokenize() as retokenizer: # merge "many friends" retokenizer.merge(ref[5:7]) @@ -203,6 +207,7 @@ def test_tokenization_mismatch(nlp): assert spans2ints(docs1[0]) == spans2ints(docs2[0]) assert spans2ints(docs1[0]) == spans2ints(docs3[0]) + @pytest.mark.skipif(not has_torch, reason="Torch not available") def test_crossing_spans(): starts = [6, 10, 0, 1, 0, 1, 0, 1, 2, 2, 2] @@ -215,6 +220,7 @@ def test_crossing_spans(): guess = sorted(guess) assert gold == guess + @pytest.mark.skipif(not has_torch, reason="Torch not available") def test_sentence_map(snlp): doc = snlp("I like text. This is text.") From 201731df2d16d33c74dac248261f0a35808eb32d Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Sun, 3 Jul 2022 15:12:53 +0900 Subject: [PATCH 06/17] Move spans2ints to util --- spacy/ml/models/coref_util.py | 14 +++++++++++++- spacy/tests/pipeline/test_coref.py | 11 +---------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/spacy/ml/models/coref_util.py b/spacy/ml/models/coref_util.py index bd577e65f..9281ad0c7 100644 --- a/spacy/ml/models/coref_util.py +++ b/spacy/ml/models/coref_util.py @@ -143,7 +143,7 @@ def create_head_span_idxs(ops, doclen: int): def get_clusters_from_doc(doc) -> List[List[Tuple[int, int]]]: - """Given a Doc, convert the cluster spans to simple int tuple lists. The + """Given a Doc, convert the cluster spans to simple int tuple lists. The ints are char spans, to be tokenization independent. """ out = [] @@ -203,3 +203,15 @@ def create_gold_scores( # caller needs to convert to array, and add placeholder return out + + +def spans2ints(doc): + """Convert doc.spans to nested list of ints for comparison. + The ints are token indices. + + This is useful for checking consistency of predictions. + """ + out = [] + for key, cluster in doc.spans.items(): + out.append([(ss.start, ss.end) for ss in cluster]) + return out diff --git a/spacy/tests/pipeline/test_coref.py b/spacy/tests/pipeline/test_coref.py index 4b8ca1653..3bde6ad34 100644 --- a/spacy/tests/pipeline/test_coref.py +++ b/spacy/tests/pipeline/test_coref.py @@ -9,6 +9,7 @@ from spacy.ml.models.coref_util import ( DEFAULT_CLUSTER_PREFIX, select_non_crossing_spans, get_sentence_ids, + spans2ints, ) from thinc.util import has_torch @@ -35,16 +36,6 @@ TRAIN_DATA = [ # fmt: on -def spans2ints(doc): - """Convert doc.spans to nested list of ints for comparison. - The ints are token indices. - - This is useful for checking consistency of predictions. 
- """ - out = [] - for key, cluster in doc.spans.items(): - out.append([(ss.start, ss.end) for ss in cluster]) - return out @pytest.fixture From 1a4dbb702d6da012db691f2a346588476f27ca4d Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Sun, 3 Jul 2022 15:13:15 +0900 Subject: [PATCH 07/17] Add basic span predictor tests --- spacy/tests/pipeline/test_span_predictor.py | 129 ++++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 spacy/tests/pipeline/test_span_predictor.py diff --git a/spacy/tests/pipeline/test_span_predictor.py b/spacy/tests/pipeline/test_span_predictor.py new file mode 100644 index 000000000..1adaecd3f --- /dev/null +++ b/spacy/tests/pipeline/test_span_predictor.py @@ -0,0 +1,129 @@ +import pytest +import spacy + +from spacy import util +from spacy.training import Example +from spacy.lang.en import English +from spacy.tests.util import make_tempdir +from spacy.ml.models.coref_util import ( + DEFAULT_CLUSTER_PREFIX, + select_non_crossing_spans, + get_sentence_ids, + spans2ints, +) + +from thinc.util import has_torch + +# fmt: off +TRAIN_DATA = [ + ( + "John Smith picked up the red ball and he threw it away.", + { + "spans": { + f"{DEFAULT_CLUSTER_PREFIX}_1": [ + (0, 11, "MENTION"), # John Smith + (38, 41, "MENTION"), # he + + ], + f"{DEFAULT_CLUSTER_PREFIX}_2": [ + (25, 33, "MENTION"), # red ball + (47, 50, "MENTION"), # it + ], + f"coref_head_clusters_1": [ + (5, 11, "MENTION"), # Smith + (38, 41, "MENTION"), # he + + ], + f"coref_head_clusters_2": [ + (29, 33, "MENTION"), # red ball + (47, 50, "MENTION"), # it + ] + } + }, + ), +] +# fmt: on + + +@pytest.fixture +def nlp(): + return English() + + +@pytest.fixture +def snlp(): + en = English() + en.add_pipe("sentencizer") + return en + + +@pytest.mark.skipif(not has_torch, reason="Torch not available") +def test_add_pipe(nlp): + nlp.add_pipe("span_predictor") + assert nlp.pipe_names == ["span_predictor"] + + +@pytest.mark.skipif(not has_torch, reason="Torch not available") +def test_not_initialized(nlp): + nlp.add_pipe("span_predictor") + text = "She gave me her pen." + with pytest.raises(ValueError, match="E109"): + nlp(text) + + +@pytest.mark.skipif(not has_torch, reason="Torch not available") +def test_span_predictor_serialization(nlp): + # Test that the span predictor component can be serialized + nlp.add_pipe("span_predictor", last=True) + nlp.initialize() + assert nlp.pipe_names == ["span_predictor"] + text = "She gave me her pen." 
+ doc = nlp(text) + + with make_tempdir() as tmp_dir: + nlp.to_disk(tmp_dir) + nlp2 = spacy.load(tmp_dir) + assert nlp2.pipe_names == ["span_predictor"] + doc2 = nlp2(text) + + assert spans2ints(doc) == spans2ints(doc2) + + +@pytest.mark.skipif(not has_torch, reason="Torch not available") +def test_overfitting_IO(nlp): + # Simple test to try and quickly overfit - ensuring the ML models work correctly + train_examples = [] + for text, annot in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(text), annot)) + + nlp.add_pipe("span_predictor") + optimizer = nlp.initialize() + test_text = TRAIN_DATA[0][0] + doc = nlp(test_text) + + # Needs ~12 epochs to converge + for i in range(15): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + doc = nlp(test_text) + + # test the trained model + doc = nlp(test_text) + + # Also test the results are still the same after IO + with make_tempdir() as tmp_dir: + nlp.to_disk(tmp_dir) + nlp2 = util.load_model_from_path(tmp_dir) + doc2 = nlp2(test_text) + + # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions + texts = [ + test_text, + "I noticed many friends around me", + "They received it. They received the SMS.", + ] + docs1 = list(nlp.pipe(texts)) + docs2 = list(nlp.pipe(texts)) + docs3 = [nlp(text) for text in texts] + assert spans2ints(docs1[0]) == spans2ints(docs2[0]) + assert spans2ints(docs1[0]) == spans2ints(docs3[0]) From 619b1102e66c68a8ecb9db31e1959764b29035ab Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Sun, 3 Jul 2022 15:32:35 +0900 Subject: [PATCH 08/17] Use config to specify tok2vec_size --- spacy/ml/models/coref.py | 1 - spacy/tests/pipeline/test_coref.py | 11 ++++++----- spacy/tests/pipeline/test_span_predictor.py | 6 ++++-- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/spacy/ml/models/coref.py b/spacy/ml/models/coref.py index 1963a4127..22234390e 100644 --- a/spacy/ml/models/coref.py +++ b/spacy/ml/models/coref.py @@ -25,7 +25,6 @@ def build_wl_coref_model( tok2vec_size: int = 768, # tok2vec size ): # TODO add model return types - tok2vec_size = 64 with Model.define_operators({">>": chain}): coref_clusterer = PyTorchWrapper( diff --git a/spacy/tests/pipeline/test_coref.py b/spacy/tests/pipeline/test_coref.py index 3bde6ad34..89906c87b 100644 --- a/spacy/tests/pipeline/test_coref.py +++ b/spacy/tests/pipeline/test_coref.py @@ -36,6 +36,7 @@ TRAIN_DATA = [ # fmt: on +CONFIG = {"model": {"@architectures": "spacy.Coref.v1", "tok2vec_size": 64}} @pytest.fixture @@ -66,7 +67,7 @@ def test_not_initialized(nlp): @pytest.mark.skipif(not has_torch, reason="Torch not available") def test_initialized(nlp): - nlp.add_pipe("coref") + nlp.add_pipe("coref", config=CONFIG) nlp.initialize() assert nlp.pipe_names == ["coref"] text = "She gave me her pen." @@ -78,7 +79,7 @@ def test_initialized(nlp): @pytest.mark.skipif(not has_torch, reason="Torch not available") def test_initialized_short(nlp): - nlp.add_pipe("coref") + nlp.add_pipe("coref", config=CONFIG) nlp.initialize() assert nlp.pipe_names == ["coref"] text = "Hi there" @@ -88,7 +89,7 @@ def test_initialized_short(nlp): @pytest.mark.skipif(not has_torch, reason="Torch not available") def test_coref_serialization(nlp): # Test that the coref component can be serialized - nlp.add_pipe("coref", last=True) + nlp.add_pipe("coref", last=True, config=CONFIG) nlp.initialize() assert nlp.pipe_names == ["coref"] text = "She gave me her pen." 
@@ -110,7 +111,7 @@ def test_overfitting_IO(nlp): for text, annot in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(text), annot)) - nlp.add_pipe("coref") + nlp.add_pipe("coref", config=CONFIG) optimizer = nlp.initialize() test_text = TRAIN_DATA[0][0] doc = nlp(test_text) @@ -165,7 +166,7 @@ def test_tokenization_mismatch(nlp): train_examples.append(eg) - nlp.add_pipe("coref") + nlp.add_pipe("coref", config=CONFIG) optimizer = nlp.initialize() test_text = TRAIN_DATA[0][0] doc = nlp(test_text) diff --git a/spacy/tests/pipeline/test_span_predictor.py b/spacy/tests/pipeline/test_span_predictor.py index 1adaecd3f..7d7a75279 100644 --- a/spacy/tests/pipeline/test_span_predictor.py +++ b/spacy/tests/pipeline/test_span_predictor.py @@ -44,6 +44,8 @@ TRAIN_DATA = [ ] # fmt: on +CONFIG = {"model": {"@architectures": "spacy.SpanPredictor.v1", "tok2vec_size": 64}} + @pytest.fixture def nlp(): return English() @@ -74,7 +76,7 @@ def test_not_initialized(nlp): @pytest.mark.skipif(not has_torch, reason="Torch not available") def test_span_predictor_serialization(nlp): # Test that the span predictor component can be serialized - nlp.add_pipe("span_predictor", last=True) + nlp.add_pipe("span_predictor", last=True, config=CONFIG) nlp.initialize() assert nlp.pipe_names == ["span_predictor"] text = "She gave me her pen." @@ -96,7 +98,7 @@ def test_overfitting_IO(nlp): for text, annot in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(text), annot)) - nlp.add_pipe("span_predictor") + nlp.add_pipe("span_predictor", config=CONFIG) optimizer = nlp.initialize() test_text = TRAIN_DATA[0][0] doc = nlp(test_text) From a46bc03abb0857ee8cd11bc83257e3c70aeed705 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Sun, 3 Jul 2022 16:01:27 +0900 Subject: [PATCH 09/17] Add failing test with tokenization mismatch This test only fails due to the explicit assert False at the moment, but the debug output shows that the learned spans are all off by one due to misalignment. So the code still needs fixing.
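For anyone checking the corrected character offsets in the diff below: span end offsets are exclusive, so the fixed values can be verified by slicing the training text directly. A quick sanity check, not part of the patch:

    text = "John Smith picked up the red ball and he threw it away."
    assert text[0:10] == "John Smith"
    assert text[5:10] == "Smith"
    assert text[25:33] == "red ball"
    assert text[38:40] == "he"
    assert text[47:49] == "it"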
--- spacy/tests/pipeline/test_span_predictor.py | 84 +++++++++++++++++++-- 1 file changed, 78 insertions(+), 6 deletions(-) diff --git a/spacy/tests/pipeline/test_span_predictor.py b/spacy/tests/pipeline/test_span_predictor.py index 7d7a75279..9281df354 100644 --- a/spacy/tests/pipeline/test_span_predictor.py +++ b/spacy/tests/pipeline/test_span_predictor.py @@ -21,22 +21,22 @@ TRAIN_DATA = [ { "spans": { f"{DEFAULT_CLUSTER_PREFIX}_1": [ - (0, 11, "MENTION"), # John Smith - (38, 41, "MENTION"), # he + (0, 10, "MENTION"), # John Smith + (38, 40, "MENTION"), # he ], f"{DEFAULT_CLUSTER_PREFIX}_2": [ (25, 33, "MENTION"), # red ball - (47, 50, "MENTION"), # it + (47, 49, "MENTION"), # it ], f"coref_head_clusters_1": [ - (5, 11, "MENTION"), # Smith - (38, 41, "MENTION"), # he + (5, 10, "MENTION"), # Smith + (38, 40, "MENTION"), # he ], f"coref_head_clusters_2": [ (29, 33, "MENTION"), # red ball - (47, 50, "MENTION"), # it + (47, 49, "MENTION"), # it ] } }, @@ -129,3 +129,75 @@ def test_overfitting_IO(nlp): docs3 = [nlp(text) for text in texts] assert spans2ints(docs1[0]) == spans2ints(docs2[0]) assert spans2ints(docs1[0]) == spans2ints(docs3[0]) + + +@pytest.mark.skipif(not has_torch, reason="Torch not available") +def test_tokenization_mismatch(nlp): + train_examples = [] + for text, annot in TRAIN_DATA: + eg = Example.from_dict(nlp.make_doc(text), annot) + ref = eg.reference + char_spans = {} + for key, cluster in ref.spans.items(): + char_spans[key] = [] + for span in cluster: + char_spans[key].append((span[0].idx, span[-1].idx + len(span[-1]))) + with ref.retokenize() as retokenizer: + # merge "picked up" + retokenizer.merge(ref[2:4]) + + # Note this works because it's the same doc and we know the keys + for key, _ in ref.spans.items(): + spans = char_spans[key] + ref.spans[key] = [ref.char_span(*span) for span in spans] + + # Finally, copy over the head spans to the pred + pred = eg.predicted + for key, val in ref.spans.items(): + if key.startswith("coref_head_clusters"): + spans = char_spans[key] + pred.spans[key] = [pred.char_span(*span) for span in spans] + + train_examples.append(eg) + + nlp.add_pipe("span_predictor", config=CONFIG) + optimizer = nlp.initialize() + test_text = TRAIN_DATA[0][0] + doc = nlp(test_text) + + for i in range(100): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + doc = nlp(test_text) + + # test the trained model; need to use doc with head spans on it already + test_doc = train_examples[0].predicted + doc = nlp(test_doc) + + # XXX DEBUG + print("SPANS", len(doc.spans)) + for key, val in doc.spans.items(): + print(key, val) + print("...") + + # Also test the results are still the same after IO + with make_tempdir() as tmp_dir: + nlp.to_disk(tmp_dir) + nlp2 = util.load_model_from_path(tmp_dir) + doc2 = nlp2(test_text) + + # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions + texts = [ + test_text, + "I noticed many friends around me", + "They received it. 
They received the SMS.", + ] + + # save the docs so they don't get garbage collected + docs1 = list(nlp.pipe(texts)) + docs2 = list(nlp.pipe(texts)) + docs3 = [nlp(text) for text in texts] + assert spans2ints(docs1[0]) == spans2ints(docs2[0]) + assert spans2ints(docs1[0]) == spans2ints(docs3[0]) + assert False + From fd574a89c4ab45dd2b317bc2348a958712efe531 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Sun, 3 Jul 2022 19:34:15 +0900 Subject: [PATCH 10/17] Update overfitting test --- spacy/tests/pipeline/test_span_predictor.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/spacy/tests/pipeline/test_span_predictor.py b/spacy/tests/pipeline/test_span_predictor.py index 9281df354..4434b6651 100644 --- a/spacy/tests/pipeline/test_span_predictor.py +++ b/spacy/tests/pipeline/test_span_predictor.py @@ -98,19 +98,29 @@ def test_overfitting_IO(nlp): for text, annot in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(text), annot)) + train_examples = [] + for text, annot in TRAIN_DATA: + eg = Example.from_dict(nlp.make_doc(text), annot) + ref = eg.reference + # Finally, copy over the head spans to the pred + pred = eg.predicted + for key, spans in ref.spans.items(): + if key.startswith("coref_head_clusters"): + pred.spans[key] = [pred[span.start:span.end] for span in spans] + + train_examples.append(eg) nlp.add_pipe("span_predictor", config=CONFIG) optimizer = nlp.initialize() test_text = TRAIN_DATA[0][0] doc = nlp(test_text) - # Needs ~12 epochs to converge - for i in range(15): + for i in range(1500): losses = {} nlp.update(train_examples, sgd=optimizer, losses=losses) doc = nlp(test_text) - # test the trained model - doc = nlp(test_text) + # test the trained model, using the pred since it has heads + doc = nlp(train_examples[0].predicted) # Also test the results are still the same after IO with make_tempdir() as tmp_dir: From cf33b48fe06a4469be6079f9712deae04c254137 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Sun, 3 Jul 2022 20:10:53 +0900 Subject: [PATCH 11/17] Update tests --- spacy/ml/models/coref_util.py | 8 +++++--- spacy/tests/pipeline/test_span_predictor.py | 16 +++++++--------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/spacy/ml/models/coref_util.py b/spacy/ml/models/coref_util.py index 9281ad0c7..00d501f80 100644 --- a/spacy/ml/models/coref_util.py +++ b/spacy/ml/models/coref_util.py @@ -207,11 +207,13 @@ def create_gold_scores( def spans2ints(doc): """Convert doc.spans to nested list of ints for comparison. - The ints are token indices. + The ints are character indices, and the spans groups are sorted by key first. This is useful for checking consistency of predictions. 
""" out = [] - for key, cluster in doc.spans.items(): - out.append([(ss.start, ss.end) for ss in cluster]) + keys = sorted([key for key in doc.spans]) + for key in keys: + cluster = doc.spans[key] + out.append([(ss.start_char, ss.end_char) for ss in cluster]) return out diff --git a/spacy/tests/pipeline/test_span_predictor.py b/spacy/tests/pipeline/test_span_predictor.py index 4434b6651..3d88b9548 100644 --- a/spacy/tests/pipeline/test_span_predictor.py +++ b/spacy/tests/pipeline/test_span_predictor.py @@ -114,13 +114,15 @@ def test_overfitting_IO(nlp): test_text = TRAIN_DATA[0][0] doc = nlp(test_text) - for i in range(1500): + for i in range(15): losses = {} nlp.update(train_examples, sgd=optimizer, losses=losses) doc = nlp(test_text) # test the trained model, using the pred since it has heads doc = nlp(train_examples[0].predicted) + # XXX This actually tests that it can overfit + assert spans2ints(doc) == spans2ints(train_examples[0].reference) # Also test the results are still the same after IO with make_tempdir() as tmp_dir: @@ -134,6 +136,7 @@ def test_overfitting_IO(nlp): "I noticed many friends around me", "They received it. They received the SMS.", ] + # XXX Note these have no predictions because they have no input spans docs1 = list(nlp.pipe(texts)) docs2 = list(nlp.pipe(texts)) docs3 = [nlp(text) for text in texts] @@ -175,7 +178,7 @@ def test_tokenization_mismatch(nlp): test_text = TRAIN_DATA[0][0] doc = nlp(test_text) - for i in range(100): + for i in range(15): losses = {} nlp.update(train_examples, sgd=optimizer, losses=losses) doc = nlp(test_text) @@ -183,12 +186,8 @@ def test_tokenization_mismatch(nlp): # test the trained model; need to use doc with head spans on it already test_doc = train_examples[0].predicted doc = nlp(test_doc) - - # XXX DEBUG - print("SPANS", len(doc.spans)) - for key, val in doc.spans.items(): - print(key, val) - print("...") + # XXX This actually tests that it can overfit + assert spans2ints(doc) == spans2ints(train_examples[0].reference) # Also test the results are still the same after IO with make_tempdir() as tmp_dir: @@ -209,5 +208,4 @@ def test_tokenization_mismatch(nlp): docs3 = [nlp(text) for text in texts] assert spans2ints(docs1[0]) == spans2ints(docs2[0]) assert spans2ints(docs1[0]) == spans2ints(docs3[0]) - assert False From b09bbc7f5eb8f04e2441f43f063492d3e2fc1d22 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Sun, 3 Jul 2022 20:11:03 +0900 Subject: [PATCH 12/17] Fix alignment issues I believe this resolves issues with tokenization mismatches. 
--- spacy/pipeline/span_predictor.py | 33 ++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/spacy/pipeline/span_predictor.py b/spacy/pipeline/span_predictor.py index d7e96a4b2..c9343a97e 100644 --- a/spacy/pipeline/span_predictor.py +++ b/spacy/pipeline/span_predictor.py @@ -231,16 +231,29 @@ class SpanPredictor(TrainablePipe): for eg in examples: starts = [] ends = [] + keeps = [] + sidx = 0 for key, sg in eg.reference.spans.items(): if key.startswith(self.output_prefix): - for mention in sg: - starts.append(mention.start) - ends.append(mention.end) + for ii, mention in enumerate(sg): + sidx += 1 + # convert to span in pred + sch, ech = (mention.start_char, mention.end_char) + span = eg.predicted.char_span(sch, ech) + # TODO add to errors.py + if span is None: + warnings.warn("Could not align gold span in span predictor, skipping") + continue + starts.append(span.start) + ends.append(span.end) + keeps.append(sidx - 1) starts = self.model.ops.xp.asarray(starts) ends = self.model.ops.xp.asarray(ends) - start_scores = span_scores[:, :, 0] - end_scores = span_scores[:, :, 1] + start_scores = span_scores[:, :, 0][keeps] + end_scores = span_scores[:, :, 1][keeps] + + n_classes = start_scores.shape[1] start_probs = ops.softmax(start_scores, axis=1) end_probs = ops.softmax(end_scores, axis=1) @@ -248,7 +261,14 @@ class SpanPredictor(TrainablePipe): end_targets = to_categorical(ends, n_classes) start_grads = start_probs - start_targets end_grads = end_probs - end_targets - grads = ops.xp.stack((start_grads, end_grads), axis=2) + # now return to original shape, with 0s + final_start_grads = ops.alloc2f(*span_scores[:, :, 0].shape) + final_start_grads[keeps] = start_grads + final_end_grads = ops.alloc2f(*final_start_grads.shape) + final_end_grads[keeps] = end_grads + # XXX Note this only works with fake batching + grads = ops.xp.stack((final_start_grads, final_end_grads), axis=2) + loss = float((grads**2).sum()) return loss, grads @@ -267,6 +287,7 @@ class SpanPredictor(TrainablePipe): if not ex.predicted.spans: # set placeholder for shape inference doc = ex.predicted + # TODO should be able to check if there are some valid docs in the batch assert len(doc) > 2, "Coreference requires at least two tokens" doc.spans[f"{self.input_prefix}_0"] = [doc[0:1], doc[1:2]] X.append(ex.predicted) From c7f333d5938e665b93ac175e89de4f2050284780 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 4 Jul 2022 19:28:35 +0900 Subject: [PATCH 13/17] Rename spans2ints > _spans_to_offsets --- spacy/ml/models/coref_util.py | 2 +- spacy/tests/pipeline/test_coref.py | 12 ++++++------ spacy/tests/pipeline/test_span_predictor.py | 16 ++++++++-------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/spacy/ml/models/coref_util.py b/spacy/ml/models/coref_util.py index 00d501f80..772306dec 100644 --- a/spacy/ml/models/coref_util.py +++ b/spacy/ml/models/coref_util.py @@ -205,7 +205,7 @@ def create_gold_scores( return out -def spans2ints(doc): +def _spans_to_offsets(doc): """Convert doc.spans to nested list of ints for comparison. The ints are character indices, and the spans groups are sorted by key first. 
diff --git a/spacy/tests/pipeline/test_coref.py b/spacy/tests/pipeline/test_coref.py index 89906c87b..9a969acdd 100644 --- a/spacy/tests/pipeline/test_coref.py +++ b/spacy/tests/pipeline/test_coref.py @@ -9,7 +9,7 @@ from spacy.ml.models.coref_util import ( DEFAULT_CLUSTER_PREFIX, select_non_crossing_spans, get_sentence_ids, - spans2ints, + _spans_to_offsets, ) from thinc.util import has_torch @@ -101,7 +101,7 @@ def test_coref_serialization(nlp): assert nlp2.pipe_names == ["coref"] doc2 = nlp2(text) - assert spans2ints(doc) == spans2ints(doc2) + assert _spans_to_offsets(doc) == _spans_to_offsets(doc2) @pytest.mark.skipif(not has_torch, reason="Torch not available") @@ -140,8 +140,8 @@ def test_overfitting_IO(nlp): docs1 = list(nlp.pipe(texts)) docs2 = list(nlp.pipe(texts)) docs3 = [nlp(text) for text in texts] - assert spans2ints(docs1[0]) == spans2ints(docs2[0]) - assert spans2ints(docs1[0]) == spans2ints(docs3[0]) + assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs2[0]) + assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs3[0]) @pytest.mark.skipif(not has_torch, reason="Torch not available") @@ -196,8 +196,8 @@ def test_tokenization_mismatch(nlp): docs1 = list(nlp.pipe(texts)) docs2 = list(nlp.pipe(texts)) docs3 = [nlp(text) for text in texts] - assert spans2ints(docs1[0]) == spans2ints(docs2[0]) - assert spans2ints(docs1[0]) == spans2ints(docs3[0]) + assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs2[0]) + assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs3[0]) @pytest.mark.skipif(not has_torch, reason="Torch not available") diff --git a/spacy/tests/pipeline/test_span_predictor.py b/spacy/tests/pipeline/test_span_predictor.py index 3d88b9548..3a3111bd4 100644 --- a/spacy/tests/pipeline/test_span_predictor.py +++ b/spacy/tests/pipeline/test_span_predictor.py @@ -9,7 +9,7 @@ from spacy.ml.models.coref_util import ( DEFAULT_CLUSTER_PREFIX, select_non_crossing_spans, get_sentence_ids, - spans2ints, + _spans_to_offsets, ) from thinc.util import has_torch @@ -88,7 +88,7 @@ def test_span_predictor_serialization(nlp): assert nlp2.pipe_names == ["span_predictor"] doc2 = nlp2(text) - assert spans2ints(doc) == spans2ints(doc2) + assert _spans_to_offsets(doc) == _spans_to_offsets(doc2) @pytest.mark.skipif(not has_torch, reason="Torch not available") @@ -122,7 +122,7 @@ def test_overfitting_IO(nlp): # test the trained model, using the pred since it has heads doc = nlp(train_examples[0].predicted) # XXX This actually tests that it can overfit - assert spans2ints(doc) == spans2ints(train_examples[0].reference) + assert _spans_to_offsets(doc) == _spans_to_offsets(train_examples[0].reference) # Also test the results are still the same after IO with make_tempdir() as tmp_dir: @@ -140,8 +140,8 @@ def test_overfitting_IO(nlp): docs1 = list(nlp.pipe(texts)) docs2 = list(nlp.pipe(texts)) docs3 = [nlp(text) for text in texts] - assert spans2ints(docs1[0]) == spans2ints(docs2[0]) - assert spans2ints(docs1[0]) == spans2ints(docs3[0]) + assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs2[0]) + assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs3[0]) @pytest.mark.skipif(not has_torch, reason="Torch not available") @@ -187,7 +187,7 @@ def test_tokenization_mismatch(nlp): test_doc = train_examples[0].predicted doc = nlp(test_doc) # XXX This actually tests that it can overfit - assert spans2ints(doc) == spans2ints(train_examples[0].reference) + assert _spans_to_offsets(doc) == _spans_to_offsets(train_examples[0].reference) # Also test the results are still 
the same after IO with make_tempdir() as tmp_dir: @@ -206,6 +206,6 @@ def test_tokenization_mismatch(nlp): docs1 = list(nlp.pipe(texts)) docs2 = list(nlp.pipe(texts)) docs3 = [nlp(text) for text in texts] - assert spans2ints(docs1[0]) == spans2ints(docs2[0]) - assert spans2ints(docs1[0]) == spans2ints(docs3[0]) + assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs2[0]) + assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs3[0]) From 178feae00ab71a27657250bff95b53aba1a9a4e8 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 4 Jul 2022 19:37:42 +0900 Subject: [PATCH 14/17] Add tests to give up with whitespace differences Docs in Examples are allowed to have arbitrarily different whitespace. Handling that properly would be nice but isn't required, but for now check for it and blow up. --- spacy/pipeline/coref.py | 9 ++++++++- spacy/pipeline/span_predictor.py | 7 +++++++ spacy/tests/pipeline/test_coref.py | 17 +++++++++++++++++ spacy/tests/pipeline/test_span_predictor.py | 18 +++++++++++++++++- 4 files changed, 49 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py index 1e11a0417..af40d9b06 100644 --- a/spacy/pipeline/coref.py +++ b/spacy/pipeline/coref.py @@ -218,6 +218,13 @@ class CoreferenceResolver(TrainablePipe): total_loss = 0 for eg in examples: + if eg.x.text != eg.y.text: + # TODO assign error number + raise ValueError( + """Text, including whitespace, must match between reference and + predicted docs in coref training. + """ + ) # TODO check this causes no issues (in practice it runs) preds, backprop = self.model.begin_update([eg.predicted]) score_matrix, mention_idx = preds @@ -277,7 +284,7 @@ class CoreferenceResolver(TrainablePipe): if span is None: # TODO log more details raise IndexError(Errors.E1043) - cc.append( (span.start, span.end) ) + cc.append((span.start, span.end)) clusters.append(cc) span_idxs = create_head_span_idxs(ops, len(example.predicted)) diff --git a/spacy/pipeline/span_predictor.py b/spacy/pipeline/span_predictor.py index c9343a97e..aee11ba8e 100644 --- a/spacy/pipeline/span_predictor.py +++ b/spacy/pipeline/span_predictor.py @@ -178,6 +178,13 @@ class SpanPredictor(TrainablePipe): total_loss = 0 for eg in examples: + if eg.x.text != eg.y.text: + # TODO assign error number + raise ValueError( + """Text, including whitespace, must match between reference and + predicted docs in span predictor training. + """ + ) span_scores, backprop = self.model.begin_update([eg.predicted]) # FIXME, this only happens once in the first 1000 docs of OntoNotes # and I'm not sure yet why. diff --git a/spacy/tests/pipeline/test_coref.py b/spacy/tests/pipeline/test_coref.py index 9a969acdd..7fc4864a3 100644 --- a/spacy/tests/pipeline/test_coref.py +++ b/spacy/tests/pipeline/test_coref.py @@ -218,3 +218,20 @@ def test_sentence_map(snlp): doc = snlp("I like text. 
This is text.") sm = get_sentence_ids(doc) assert sm == [0, 0, 0, 0, 1, 1, 1, 1] + + +@pytest.mark.skipif(not has_torch, reason="Torch not available") +def test_whitespace_mismatch(nlp): + train_examples = [] + for text, annot in TRAIN_DATA: + eg = Example.from_dict(nlp.make_doc(text), annot) + eg.predicted = nlp.make_doc(" " + text) + train_examples.append(eg) + + nlp.add_pipe("coref", config=CONFIG) + optimizer = nlp.initialize() + test_text = TRAIN_DATA[0][0] + doc = nlp(test_text) + + with pytest.raises(ValueError, match="whitespace"): + nlp.update(train_examples, sgd=optimizer) diff --git a/spacy/tests/pipeline/test_span_predictor.py b/spacy/tests/pipeline/test_span_predictor.py index 3a3111bd4..a79756d88 100644 --- a/spacy/tests/pipeline/test_span_predictor.py +++ b/spacy/tests/pipeline/test_span_predictor.py @@ -106,7 +106,7 @@ def test_overfitting_IO(nlp): pred = eg.predicted for key, spans in ref.spans.items(): if key.startswith("coref_head_clusters"): - pred.spans[key] = [pred[span.start:span.end] for span in spans] + pred.spans[key] = [pred[span.start : span.end] for span in spans] train_examples.append(eg) nlp.add_pipe("span_predictor", config=CONFIG) @@ -209,3 +209,19 @@ def test_tokenization_mismatch(nlp): assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs2[0]) assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs3[0]) + +@pytest.mark.skipif(not has_torch, reason="Torch not available") +def test_whitespace_mismatch(nlp): + train_examples = [] + for text, annot in TRAIN_DATA: + eg = Example.from_dict(nlp.make_doc(text), annot) + eg.predicted = nlp.make_doc(" " + text) + train_examples.append(eg) + + nlp.add_pipe("span_predictor", config=CONFIG) + optimizer = nlp.initialize() + test_text = TRAIN_DATA[0][0] + doc = nlp(test_text) + + with pytest.raises(ValueError, match="whitespace"): + nlp.update(train_examples, sgd=optimizer) From 63e27b5e443626038782e35b1928c964ca806c00 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Wed, 6 Jul 2022 13:46:02 +0900 Subject: [PATCH 15/17] Update spacy/ml/models/coref_util.py Co-authored-by: kadarakos --- spacy/ml/models/coref_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/ml/models/coref_util.py b/spacy/ml/models/coref_util.py index 772306dec..38af629d5 100644 --- a/spacy/ml/models/coref_util.py +++ b/spacy/ml/models/coref_util.py @@ -143,7 +143,7 @@ def create_head_span_idxs(ops, doclen: int): def get_clusters_from_doc(doc) -> List[List[Tuple[int, int]]]: - """Given a Doc, convert the cluster spans to simple int tuple lists. The + """Convert the span clusters in a Doc to simple integer tuple lists. The ints are char spans, to be tokenization independent. """ out = [] From 8f598d7b01745c86069da5a409787c933ef39883 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Wed, 6 Jul 2022 14:03:09 +0900 Subject: [PATCH 16/17] Feedback from code review --- spacy/ml/models/coref_util.py | 2 +- spacy/tests/pipeline/test_span_predictor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/ml/models/coref_util.py b/spacy/ml/models/coref_util.py index 38af629d5..3be0bd835 100644 --- a/spacy/ml/models/coref_util.py +++ b/spacy/ml/models/coref_util.py @@ -205,7 +205,7 @@ def create_gold_scores( return out -def _spans_to_offsets(doc): +def _spans_to_offsets(doc: Doc) -> List[List[Tuple[int, int]]]: """Convert doc.spans to nested list of ints for comparison. The ints are character indices, and the spans groups are sorted by key first. 
diff --git a/spacy/tests/pipeline/test_span_predictor.py b/spacy/tests/pipeline/test_span_predictor.py index a79756d88..c0e59e914 100644 --- a/spacy/tests/pipeline/test_span_predictor.py +++ b/spacy/tests/pipeline/test_span_predictor.py @@ -154,7 +154,7 @@ def test_tokenization_mismatch(nlp): for key, cluster in ref.spans.items(): char_spans[key] = [] for span in cluster: - char_spans[key].append((span[0].idx, span[-1].idx + len(span[-1]))) + char_spans[key].append((span.start_char, span.end_char)) with ref.retokenize() as retokenizer: # merge "picked up" retokenizer.merge(ref[2:4]) From 6f5cf838ecb45e5d8ea85aa99a199525f30df1c5 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Wed, 6 Jul 2022 14:05:05 +0900 Subject: [PATCH 17/17] Remove _spans_to_offsets Basically the same as get_clusters_from_doc --- spacy/ml/models/coref_util.py | 14 -------------- spacy/tests/pipeline/test_coref.py | 12 ++++++------ spacy/tests/pipeline/test_span_predictor.py | 16 ++++++++-------- 3 files changed, 14 insertions(+), 28 deletions(-) diff --git a/spacy/ml/models/coref_util.py b/spacy/ml/models/coref_util.py index 3be0bd835..1a6bc6364 100644 --- a/spacy/ml/models/coref_util.py +++ b/spacy/ml/models/coref_util.py @@ -203,17 +203,3 @@ def create_gold_scores( # caller needs to convert to array, and add placeholder return out - - -def _spans_to_offsets(doc: Doc) -> List[List[Tuple[int, int]]]: - """Convert doc.spans to nested list of ints for comparison. - The ints are character indices, and the spans groups are sorted by key first. - - This is useful for checking consistency of predictions. - """ - out = [] - keys = sorted([key for key in doc.spans]) - for key in keys: - cluster = doc.spans[key] - out.append([(ss.start_char, ss.end_char) for ss in cluster]) - return out diff --git a/spacy/tests/pipeline/test_coref.py b/spacy/tests/pipeline/test_coref.py index 7fc4864a3..3e297ddcd 100644 --- a/spacy/tests/pipeline/test_coref.py +++ b/spacy/tests/pipeline/test_coref.py @@ -9,7 +9,7 @@ from spacy.ml.models.coref_util import ( DEFAULT_CLUSTER_PREFIX, select_non_crossing_spans, get_sentence_ids, - _spans_to_offsets, + get_clusters_from_doc, ) from thinc.util import has_torch @@ -101,7 +101,7 @@ def test_coref_serialization(nlp): assert nlp2.pipe_names == ["coref"] doc2 = nlp2(text) - assert _spans_to_offsets(doc) == _spans_to_offsets(doc2) + assert get_clusters_from_doc(doc) == get_clusters_from_doc(doc2) @pytest.mark.skipif(not has_torch, reason="Torch not available") @@ -140,8 +140,8 @@ def test_overfitting_IO(nlp): docs1 = list(nlp.pipe(texts)) docs2 = list(nlp.pipe(texts)) docs3 = [nlp(text) for text in texts] - assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs2[0]) - assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs3[0]) + assert get_clusters_from_doc(docs1[0]) == get_clusters_from_doc(docs2[0]) + assert get_clusters_from_doc(docs1[0]) == get_clusters_from_doc(docs3[0]) @pytest.mark.skipif(not has_torch, reason="Torch not available") @@ -196,8 +196,8 @@ def test_tokenization_mismatch(nlp): docs1 = list(nlp.pipe(texts)) docs2 = list(nlp.pipe(texts)) docs3 = [nlp(text) for text in texts] - assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs2[0]) - assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs3[0]) + assert get_clusters_from_doc(docs1[0]) == get_clusters_from_doc(docs2[0]) + assert get_clusters_from_doc(docs1[0]) == get_clusters_from_doc(docs3[0]) @pytest.mark.skipif(not has_torch, reason="Torch not available") diff --git 
a/spacy/tests/pipeline/test_span_predictor.py b/spacy/tests/pipeline/test_span_predictor.py index c0e59e914..8a6c62011 100644 --- a/spacy/tests/pipeline/test_span_predictor.py +++ b/spacy/tests/pipeline/test_span_predictor.py @@ -9,7 +9,7 @@ from spacy.ml.models.coref_util import ( DEFAULT_CLUSTER_PREFIX, select_non_crossing_spans, get_sentence_ids, - _spans_to_offsets, + get_clusters_from_doc, ) from thinc.util import has_torch @@ -88,7 +88,7 @@ def test_span_predictor_serialization(nlp): assert nlp2.pipe_names == ["span_predictor"] doc2 = nlp2(text) - assert _spans_to_offsets(doc) == _spans_to_offsets(doc2) + assert get_clusters_from_doc(doc) == get_clusters_from_doc(doc2) @pytest.mark.skipif(not has_torch, reason="Torch not available") @@ -122,7 +122,7 @@ def test_overfitting_IO(nlp): # test the trained model, using the pred since it has heads doc = nlp(train_examples[0].predicted) # XXX This actually tests that it can overfit - assert _spans_to_offsets(doc) == _spans_to_offsets(train_examples[0].reference) + assert get_clusters_from_doc(doc) == get_clusters_from_doc(train_examples[0].reference) # Also test the results are still the same after IO with make_tempdir() as tmp_dir: @@ -140,8 +140,8 @@ def test_overfitting_IO(nlp): docs1 = list(nlp.pipe(texts)) docs2 = list(nlp.pipe(texts)) docs3 = [nlp(text) for text in texts] - assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs2[0]) - assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs3[0]) + assert get_clusters_from_doc(docs1[0]) == get_clusters_from_doc(docs2[0]) + assert get_clusters_from_doc(docs1[0]) == get_clusters_from_doc(docs3[0]) @pytest.mark.skipif(not has_torch, reason="Torch not available") @@ -187,7 +187,7 @@ def test_tokenization_mismatch(nlp): test_doc = train_examples[0].predicted doc = nlp(test_doc) # XXX This actually tests that it can overfit - assert _spans_to_offsets(doc) == _spans_to_offsets(train_examples[0].reference) + assert get_clusters_from_doc(doc) == get_clusters_from_doc(train_examples[0].reference) # Also test the results are still the same after IO with make_tempdir() as tmp_dir: @@ -206,8 +206,8 @@ def test_tokenization_mismatch(nlp): docs1 = list(nlp.pipe(texts)) docs2 = list(nlp.pipe(texts)) docs3 = [nlp(text) for text in texts] - assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs2[0]) - assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs3[0]) + assert get_clusters_from_doc(docs1[0]) == get_clusters_from_doc(docs2[0]) + assert get_clusters_from_doc(docs1[0]) == get_clusters_from_doc(docs3[0]) @pytest.mark.skipif(not has_torch, reason="Torch not available")
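One nuance behind this consolidation, since the two helpers are basically but not exactly the same: get_clusters_from_doc reduces each mention to the char span of its head token, while _spans_to_offsets kept full-span offsets and sorted the keys; for the consistency checks above the distinction does not matter. A hedged sketch of the surviving helper's output, using single-token mentions (so the head is trivially the token itself) and sorting because the per-cluster de-duplication goes through a set:

    from spacy.lang.en import English
    from spacy.ml.models.coref_util import get_clusters_from_doc

    nlp = English()
    doc = nlp("I noticed many friends around me")
    doc.spans["coref_clusters_1"] = [doc[0:1], doc[5:6]]  # "I" ... "me"
    # each mention becomes (head.idx, head.idx + len(head))
    assert sorted(get_clusters_from_doc(doc)[0]) == [(0, 1), (30, 32)]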