From 6f5cf838ecb45e5d8ea85aa99a199525f30df1c5 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Wed, 6 Jul 2022 14:05:05 +0900 Subject: [PATCH] Remove _spans_to_offsets Basically the same as get_clusters_from_doc --- spacy/ml/models/coref_util.py | 14 -------------- spacy/tests/pipeline/test_coref.py | 12 ++++++------ spacy/tests/pipeline/test_span_predictor.py | 16 ++++++++-------- 3 files changed, 14 insertions(+), 28 deletions(-) diff --git a/spacy/ml/models/coref_util.py b/spacy/ml/models/coref_util.py index 3be0bd835..1a6bc6364 100644 --- a/spacy/ml/models/coref_util.py +++ b/spacy/ml/models/coref_util.py @@ -203,17 +203,3 @@ def create_gold_scores( # caller needs to convert to array, and add placeholder return out - - -def _spans_to_offsets(doc: Doc) -> List[List[Tuple[int, int]]]: - """Convert doc.spans to nested list of ints for comparison. - The ints are character indices, and the spans groups are sorted by key first. - - This is useful for checking consistency of predictions. - """ - out = [] - keys = sorted([key for key in doc.spans]) - for key in keys: - cluster = doc.spans[key] - out.append([(ss.start_char, ss.end_char) for ss in cluster]) - return out diff --git a/spacy/tests/pipeline/test_coref.py b/spacy/tests/pipeline/test_coref.py index 7fc4864a3..3e297ddcd 100644 --- a/spacy/tests/pipeline/test_coref.py +++ b/spacy/tests/pipeline/test_coref.py @@ -9,7 +9,7 @@ from spacy.ml.models.coref_util import ( DEFAULT_CLUSTER_PREFIX, select_non_crossing_spans, get_sentence_ids, - _spans_to_offsets, + get_clusters_from_doc, ) from thinc.util import has_torch @@ -101,7 +101,7 @@ def test_coref_serialization(nlp): assert nlp2.pipe_names == ["coref"] doc2 = nlp2(text) - assert _spans_to_offsets(doc) == _spans_to_offsets(doc2) + assert get_clusters_from_doc(doc) == get_clusters_from_doc(doc2) @pytest.mark.skipif(not has_torch, reason="Torch not available") @@ -140,8 +140,8 @@ def test_overfitting_IO(nlp): docs1 = list(nlp.pipe(texts)) docs2 = list(nlp.pipe(texts)) docs3 = [nlp(text) for text in texts] - assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs2[0]) - assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs3[0]) + assert get_clusters_from_doc(docs1[0]) == get_clusters_from_doc(docs2[0]) + assert get_clusters_from_doc(docs1[0]) == get_clusters_from_doc(docs3[0]) @pytest.mark.skipif(not has_torch, reason="Torch not available") @@ -196,8 +196,8 @@ def test_tokenization_mismatch(nlp): docs1 = list(nlp.pipe(texts)) docs2 = list(nlp.pipe(texts)) docs3 = [nlp(text) for text in texts] - assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs2[0]) - assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs3[0]) + assert get_clusters_from_doc(docs1[0]) == get_clusters_from_doc(docs2[0]) + assert get_clusters_from_doc(docs1[0]) == get_clusters_from_doc(docs3[0]) @pytest.mark.skipif(not has_torch, reason="Torch not available") diff --git a/spacy/tests/pipeline/test_span_predictor.py b/spacy/tests/pipeline/test_span_predictor.py index c0e59e914..8a6c62011 100644 --- a/spacy/tests/pipeline/test_span_predictor.py +++ b/spacy/tests/pipeline/test_span_predictor.py @@ -9,7 +9,7 @@ from spacy.ml.models.coref_util import ( DEFAULT_CLUSTER_PREFIX, select_non_crossing_spans, get_sentence_ids, - _spans_to_offsets, + get_clusters_from_doc, ) from thinc.util import has_torch @@ -88,7 +88,7 @@ def test_span_predictor_serialization(nlp): assert nlp2.pipe_names == ["span_predictor"] doc2 = nlp2(text) - assert _spans_to_offsets(doc) == _spans_to_offsets(doc2) + assert get_clusters_from_doc(doc) == get_clusters_from_doc(doc2) @pytest.mark.skipif(not has_torch, reason="Torch not available") @@ -122,7 +122,7 @@ def test_overfitting_IO(nlp): # test the trained model, using the pred since it has heads doc = nlp(train_examples[0].predicted) # XXX This actually tests that it can overfit - assert _spans_to_offsets(doc) == _spans_to_offsets(train_examples[0].reference) + assert get_clusters_from_doc(doc) == get_clusters_from_doc(train_examples[0].reference) # Also test the results are still the same after IO with make_tempdir() as tmp_dir: @@ -140,8 +140,8 @@ def test_overfitting_IO(nlp): docs1 = list(nlp.pipe(texts)) docs2 = list(nlp.pipe(texts)) docs3 = [nlp(text) for text in texts] - assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs2[0]) - assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs3[0]) + assert get_clusters_from_doc(docs1[0]) == get_clusters_from_doc(docs2[0]) + assert get_clusters_from_doc(docs1[0]) == get_clusters_from_doc(docs3[0]) @pytest.mark.skipif(not has_torch, reason="Torch not available") @@ -187,7 +187,7 @@ def test_tokenization_mismatch(nlp): test_doc = train_examples[0].predicted doc = nlp(test_doc) # XXX This actually tests that it can overfit - assert _spans_to_offsets(doc) == _spans_to_offsets(train_examples[0].reference) + assert get_clusters_from_doc(doc) == get_clusters_from_doc(train_examples[0].reference) # Also test the results are still the same after IO with make_tempdir() as tmp_dir: @@ -206,8 +206,8 @@ def test_tokenization_mismatch(nlp): docs1 = list(nlp.pipe(texts)) docs2 = list(nlp.pipe(texts)) docs3 = [nlp(text) for text in texts] - assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs2[0]) - assert _spans_to_offsets(docs1[0]) == _spans_to_offsets(docs3[0]) + assert get_clusters_from_doc(docs1[0]) == get_clusters_from_doc(docs2[0]) + assert get_clusters_from_doc(docs1[0]) == get_clusters_from_doc(docs3[0]) @pytest.mark.skipif(not has_torch, reason="Torch not available")