diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index a56e3ebe1..3536ee7ee 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -6,7 +6,6 @@ from spacy.attrs import ORTH, LENGTH
 from spacy.lang.en import English
 from spacy.tokens import Doc, Span, Token
 from spacy.vocab import Vocab
-from spacy import load
 from spacy.util import filter_spans
 from thinc.api import get_current_ops
 
@@ -16,13 +15,39 @@ from .test_underscore import clean_underscore  # noqa: F401
 
 @pytest.fixture
 def doc(en_tokenizer):
-    # fmt: off
     text = "This is a sentence. This is another sentence. And a third."
     heads = [1, 1, 3, 1, 1, 6, 6, 8, 6, 6, 12, 12, 12, 12]
-    deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det",
-            "attr", "punct", "ROOT", "det", "npadvmod", "punct"]
-    ents = ["O", "O", "B-ENT", "I-ENT", "I-ENT", "I-ENT", "I-ENT", "O", "O",
-            "O", "O", "O", "O", "O"]
-    # fmt: on
+    deps = [
+        "nsubj",
+        "ROOT",
+        "det",
+        "attr",
+        "punct",
+        "nsubj",
+        "ROOT",
+        "det",
+        "attr",
+        "punct",
+        "ROOT",
+        "det",
+        "npadvmod",
+        "punct",
+    ]
+    ents = [
+        "O",
+        "O",
+        "B-ENT",
+        "I-ENT",
+        "I-ENT",
+        "I-ENT",
+        "I-ENT",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+    ]
     tokens = en_tokenizer(text)
     lemmas = [t.text for t in tokens]  # this is not correct, just a placeholder
@@ -683,39 +708,48 @@ def test_span_group_copy(doc):
     assert len(doc_copy.spans["test"]) == 2
 
 
-@pytest.mark.xfail
-@pytest.mark.parametrize("use_double_space", [False, True])
-def test_span_sentence_mapping(use_double_space: bool):
+# @pytest.mark.xfail
+@pytest.mark.parametrize("use_double_space", [True, False])
+def test_span_sentence_mapping(en_tokenizer, use_double_space: bool):
     """Tests correct mapping of spans to sentences. This is currently failing due to some issue with the
     span-to-sentence mapping.
     use_double_space (bool): Whether to use double space after end of first sentence.
     """
-    nlp = load("en_core_web_sm")
     space = " " if use_double_space else ""
     raw_sents = [
-        "Well, you're taking your eyes off the road,\" said a governmental affairs representative for Sprint. "
-        + space,
-        "New Jersey, New York, and the District of Columbia already ban holding a cell phone while driving; a "
-        '"hands-free" cell phone is legal.',
+        "This is a sentence. " + space,
+        "This is another sentence. ",
+        "And a third.",
     ]
-    doc = nlp("".join(raw_sents))
+    text = "".join(raw_sents)
+    tokens = en_tokenizer(text)
+    spaces = [bool(t.whitespace_) for t in tokens]
 
-    # Ensure sentence splitting works as expected before testing span-to-sentence mapping.
-    # Note that the sentence splitting behavior is already different when using double spaces, which shouldn't be the
-    sents = list(doc.sents)
-    assert len(sents) == 2
-    assert sents[0].text == raw_sents[0] if use_double_space else raw_sents[0][:-1]
-    assert sents[1].text == raw_sents[1]
+    doc = Doc(
+        tokens.vocab,
+        words=[t.text for t in tokens],
+        spaces=spaces,
+        # Per-sentence token counts: the double space adds an extra
+        # whitespace token to the end of the first sentence.
+        sent_starts=[
+            1,
+            *[0] * (5 if use_double_space else 4),
+            1,
+            *[0] * 4,
+            1,
+            *[0] * 3,
+        ],
+    )
 
     # Select span for test.
-    start = 100
-    end = 111 if use_double_space else 110
+    start = 20
+    end = 25 if use_double_space else 24
     span = doc.char_span(start, end)
-    assert span.text == doc.text[start:end] == space + "New Jersey"
+    assert span.text == doc.text[start:end] == space + "This"
 
     # Test span-to-sentence mapping. Since the span in question doesn't cross sentence boundaries, there should only be
     # one sentence.
     span_sents = list(span.sents)  # type: ignore
     span_sent = span.sent
     assert len(span_sents) == 1
-    assert span_sent.text == sents[1].text
+    assert span_sent.text == list(doc.sents)[1].text
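
For reference, the pattern the rewritten test depends on (building a Doc directly with explicit words, spaces, and sent_starts, then mapping a char_span back to its sentence) can be exercised on its own. The sketch below assumes spaCy v3+, where Doc.__init__ accepts sent_starts; the text and character offsets are illustrative rather than copied from the test.

import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")  # tokenizer only; no trained pipeline required
tokens = nlp("This is a sentence. This is another sentence.")

# Rebuild the Doc with hand-set sentence boundaries: 1 marks a sentence
# start, 0 leaves the token inside the current sentence (the same values
# the test above uses).
doc = Doc(
    tokens.vocab,
    words=[t.text for t in tokens],
    spaces=[bool(t.whitespace_) for t in tokens],
    sent_starts=[1, 0, 0, 0, 0, 1, 0, 0, 0, 0],
)

# "This is a sentence. " occupies chars 0-19, so chars 20-23 are the "This"
# that opens the second sentence.
span = doc.char_span(20, 24)
assert span.text == "This"
assert span.sent.text == "This is another sentence."

Note that char_span with the default alignment_mode="strict" returns None when the offsets do not line up with token boundaries, which is why the test shifts start and end by one character when the double space introduces an extra whitespace token.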