From 513b6370f4d1cbfe5a4f3376ab6195f86d23208d Mon Sep 17 00:00:00 2001
From: Raphael Mitsch <r.mitsch@outlook.com>
Date: Thu, 1 Dec 2022 15:02:16 +0100
Subject: [PATCH] Add failing test for span-sentence mapping.

---
 spacy/tests/doc/test_span.py | 39 ++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 3676b35af..24b421457 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -6,6 +6,7 @@ from spacy.attrs import ORTH, LENGTH
 from spacy.lang.en import English
 from spacy.tokens import Doc, Span, Token
 from spacy.vocab import Vocab
+from spacy import load
 from spacy.util import filter_spans
 from thinc.api import get_current_ops
 
@@ -680,3 +681,41 @@ def test_span_group_copy(doc):
     assert len(doc.spans["test"]) == 3
     # check that the copy spans were not modified and this is an isolated doc
     assert len(doc_copy.spans["test"]) == 2
+
+
+@pytest.mark.xfail
+@pytest.mark.parametrize("use_double_space", [False, True])
+def test_span_sentence_mapping(use_double_space: bool):
+    """Tests correct mapping of spans to sentences. This is currently failing due to some issue with the
+    span-to-sentence mapping.
+    use_double_space (bool): Whether to use double space after end of first sentence.
+    """
+    nlp = load("en_core_web_sm")
+    space = " " if use_double_space else ""
+    raw_sents = [
+        "Well, you're taking your eyes off the road,\" said a governmental affairs representative for Sprint. "
+        + space,
+        "New Jersey, New York, and the District of Columbia already ban holding a cell phone while driving; a "
+        '"hands-free" cell phone is legal.',
+    ]
+    doc = nlp("".join(raw_sents))
+
+    # Ensure sentence splitting works as expected before testing span-to-sentence mapping.
+    # Note that sentence splitting already differs when double spaces are used, which shouldn't be the case.
+    sents = list(doc.sents)
+    assert len(sents) == 2
+    assert sents[0].text == (raw_sents[0] if use_double_space else raw_sents[0][:-1])
+    assert sents[1].text == raw_sents[1]
+
+    # Select span for test.
+    start = 100
+    end = 111 if use_double_space else 110
+    span = doc.char_span(start, end)
+    assert span.text == doc.text[start:end] == space + "New Jersey"
+
+    # Test span-to-sentence mapping. Since the span in question doesn't cross sentence boundaries, there should only be
+    # one sentence.
+    span_sents = list(span.sents)
+    span_sent = span.sent
+    assert len(span_sents) == 1
+    assert span_sent.text == sents[1].text