From 6f5cf838ecb45e5d8ea85aa99a199525f30df1c5 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Wed, 6 Jul 2022 14:05:05 +0900
Subject: [PATCH] Remove _spans_to_offsets

_spans_to_offsets is basically the same as get_clusters_from_doc, so remove it and use get_clusters_from_doc in the tests instead.
---
 spacy/ml/models/coref_util.py               | 14 --------------
 spacy/tests/pipeline/test_coref.py          | 12 ++++++------
 spacy/tests/pipeline/test_span_predictor.py | 16 ++++++++--------
 3 files changed, 14 insertions(+), 28 deletions(-)

diff --git a/spacy/ml/models/coref_util.py b/spacy/ml/models/coref_util.py
index 3be0bd835..1a6bc6364 100644
--- a/spacy/ml/models/coref_util.py
+++ b/spacy/ml/models/coref_util.py
@@ -203,17 +203,3 @@ def create_gold_scores(
 
     # caller needs to convert to array, and add placeholder
     return out
-
-
-def _spans_to_offsets(doc: Doc) -> List[List[Tuple[int, int]]]:
-    """Convert doc.spans to nested list of ints for comparison.
-    The ints are character indices, and the spans groups are sorted by key first.
-
-    This is useful for checking consistency of predictions.
-    """
-    out = []
-    keys = sorted([key for key in doc.spans])
-    for key in keys:
-        cluster = doc.spans[key]
-        out.append([(ss.start_char, ss.end_char) for ss in cluster])
-    return out
diff --git a/spacy/tests/pipeline/test_coref.py b/spacy/tests/pipeline/test_coref.py
index 7fc4864a3..3e297ddcd 100644
--- a/spacy/tests/pipeline/test_coref.py
+++ b/spacy/tests/pipeline/test_coref.py
@@ -9,7 +9,7 @@ from spacy.ml.models.coref_util import (
     DEFAULT_CLUSTER_PREFIX,
     select_non_crossing_spans,
     get_sentence_ids,
-    _spans_to_offsets,
+    get_clusters_from_doc,
 )
 
 from thinc.util import has_torch
@@ -101,7 +101,7 @@ def test_coref_serialization(nlp):
         assert nlp2.pipe_names == ["coref"]
         doc2 = nlp2(text)
 
-        assert _spans_to_offsets(doc) == _spans_to_offsets(doc2)
+        assert get_clusters_from_doc(doc) == get_clusters_from_doc(doc2)
 
 
 @pytest.mark.skipif(not has_torch, reason="Torch not available")
@@ -140,8 +140,8 @@ def test_overfitting_IO(nlp):
     docs1 = list(nlp.pipe(texts))
     docs2 = list(nlp.pipe(texts))
     docs3 = [nlp(text) for text in texts]
-    assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs2[0])
-    assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs3[0])
+    assert get_clusters_from_doc(docs1[0]) == get_clusters_from_doc(docs2[0])
+    assert get_clusters_from_doc(docs1[0]) == get_clusters_from_doc(docs3[0])
 
 
 @pytest.mark.skipif(not has_torch, reason="Torch not available")
@@ -196,8 +196,8 @@ def test_tokenization_mismatch(nlp):
     docs1 = list(nlp.pipe(texts))
     docs2 = list(nlp.pipe(texts))
     docs3 = [nlp(text) for text in texts]
-    assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs2[0])
-    assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs3[0])
+    assert get_clusters_from_doc(docs1[0]) == get_clusters_from_doc(docs2[0])
+    assert get_clusters_from_doc(docs1[0]) == get_clusters_from_doc(docs3[0])
 
 
 @pytest.mark.skipif(not has_torch, reason="Torch not available")
diff --git a/spacy/tests/pipeline/test_span_predictor.py b/spacy/tests/pipeline/test_span_predictor.py
index c0e59e914..8a6c62011 100644
--- a/spacy/tests/pipeline/test_span_predictor.py
+++ b/spacy/tests/pipeline/test_span_predictor.py
@@ -9,7 +9,7 @@ from spacy.ml.models.coref_util import (
     DEFAULT_CLUSTER_PREFIX,
     select_non_crossing_spans,
     get_sentence_ids,
-    _spans_to_offsets,
+    get_clusters_from_doc,
 )
 
 from thinc.util import has_torch
@@ -88,7 +88,7 @@ def test_span_predictor_serialization(nlp):
         assert nlp2.pipe_names == ["span_predictor"]
         doc2 = nlp2(text)
 
-        assert _spans_to_offsets(doc) == _spans_to_offsets(doc2)
+        assert get_clusters_from_doc(doc) == get_clusters_from_doc(doc2)
 
 
 @pytest.mark.skipif(not has_torch, reason="Torch not available")
@@ -122,7 +122,7 @@ def test_overfitting_IO(nlp):
     # test the trained model, using the pred since it has heads
     doc = nlp(train_examples[0].predicted)
     # XXX This actually tests that it can overfit
-    assert _spans_to_offsets(doc) == _spans_to_offsets(train_examples[0].reference)
+    assert get_clusters_from_doc(doc) == get_clusters_from_doc(train_examples[0].reference)
 
     # Also test the results are still the same after IO
     with make_tempdir() as tmp_dir:
@@ -140,8 +140,8 @@ def test_overfitting_IO(nlp):
     docs1 = list(nlp.pipe(texts))
     docs2 = list(nlp.pipe(texts))
     docs3 = [nlp(text) for text in texts]
-    assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs2[0])
-    assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs3[0])
+    assert get_clusters_from_doc(docs1[0]) == get_clusters_from_doc(docs2[0])
+    assert get_clusters_from_doc(docs1[0]) == get_clusters_from_doc(docs3[0])
 
 
 @pytest.mark.skipif(not has_torch, reason="Torch not available")
@@ -187,7 +187,7 @@ def test_tokenization_mismatch(nlp):
     test_doc = train_examples[0].predicted
     doc = nlp(test_doc)
     # XXX This actually tests that it can overfit
-    assert _spans_to_offsets(doc) == _spans_to_offsets(train_examples[0].reference)
+    assert get_clusters_from_doc(doc) == get_clusters_from_doc(train_examples[0].reference)
 
     # Also test the results are still the same after IO
     with make_tempdir() as tmp_dir:
@@ -206,8 +206,8 @@ def test_tokenization_mismatch(nlp):
     docs1 = list(nlp.pipe(texts))
     docs2 = list(nlp.pipe(texts))
     docs3 = [nlp(text) for text in texts]
-    assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs2[0])
-    assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs3[0])
+    assert get_clusters_from_doc(docs1[0]) == get_clusters_from_doc(docs2[0])
+    assert get_clusters_from_doc(docs1[0]) == get_clusters_from_doc(docs3[0])
 
 
 @pytest.mark.skipif(not has_torch, reason="Torch not available")