From 513b6370f4d1cbfe5a4f3376ab6195f86d23208d Mon Sep 17 00:00:00 2001
From: Raphael Mitsch
Date: Thu, 1 Dec 2022 15:02:16 +0100
Subject: [PATCH] Add failing test for span-sentence mapping.

---
 spacy/tests/doc/test_span.py | 47 ++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 3676b35af..24b421457 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -6,6 +6,7 @@ from spacy.attrs import ORTH, LENGTH
 from spacy.lang.en import English
 from spacy.tokens import Doc, Span, Token
 from spacy.vocab import Vocab
+from spacy import load
 from spacy.util import filter_spans
 from thinc.api import get_current_ops
 
@@ -680,3 +681,49 @@ def test_span_group_copy(doc):
     assert len(doc.spans["test"]) == 3
     # check that the copy spans were not modified and this is an isolated doc
     assert len(doc_copy.spans["test"]) == 2
+
+
+@pytest.mark.xfail
+@pytest.mark.parametrize("use_double_space", [False, True])
+def test_span_sentence_mapping(use_double_space: bool):
+    """Tests correct mapping of spans to sentences. This is currently failing due to some issue with the
+    span-to-sentence mapping, hence the xfail marker.
+    use_double_space (bool): Whether to use double space after end of first sentence.
+    """
+    nlp = load("en_core_web_sm")
+    space = " " if use_double_space else ""
+    raw_sents = [
+        "Well, you're taking your eyes off the road,\" said a governmental affairs representative for Sprint. "
+        + space,
+        "New Jersey, New York, and the District of Columbia already ban holding a cell phone while driving; a "
+        '"hands-free" cell phone is legal.',
+    ]
+    doc = nlp("".join(raw_sents))
+
+    # Ensure sentence splitting works as expected before testing span-to-sentence mapping.
+    # Note that the sentence splitting behavior is already different when using double spaces, which shouldn't be the
+    # case: with a double space the first sentence is expected to keep its trailing whitespace, otherwise to end
+    # right after the period.
+    sents = list(doc.sents)
+    assert len(sents) == 2
+    # The conditional must be evaluated before the comparison; written inline without parentheses, the assert would
+    # only compare in the double-space case and trivially pass otherwise.
+    expected_first_sent = raw_sents[0] if use_double_space else raw_sents[0][:-1]
+    assert sents[0].text == expected_first_sent
+    assert sents[1].text == raw_sents[1]
+
+    # Select span for test. The first sentence including its single trailing space is 100 characters long, so the
+    # span starts at the optional extra space and ends after "New Jersey".
+    start = 100
+    end = 111 if use_double_space else 110
+    span = doc.char_span(start, end)
+    # Fail with a clear assertion rather than an AttributeError if no span could be created for these offsets.
+    assert span is not None
+    assert span.text == doc.text[start:end] == space + "New Jersey"
+
+    # Test span-to-sentence mapping. Since the span in question doesn't cross sentence boundaries, there should only be
+    # one sentence.
+    span_sents = list(span.sents)
+    span_sent = span.sent
+    assert len(span_sents) == 1
+    assert span_sent.text == sents[1].text