Add failing test for span-sentence mapping.

This commit is contained in:
Raphael Mitsch 2022-12-01 15:02:16 +01:00
parent 6f9d630f7e
commit 513b6370f4

View File

@ -6,6 +6,7 @@ from spacy.attrs import ORTH, LENGTH
from spacy.lang.en import English from spacy.lang.en import English
from spacy.tokens import Doc, Span, Token from spacy.tokens import Doc, Span, Token
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy import load
from spacy.util import filter_spans from spacy.util import filter_spans
from thinc.api import get_current_ops from thinc.api import get_current_ops
@ -680,3 +681,41 @@ def test_span_group_copy(doc):
assert len(doc.spans["test"]) == 3 assert len(doc.spans["test"]) == 3
# check that the copy spans were not modified and this is an isolated doc # check that the copy spans were not modified and this is an isolated doc
assert len(doc_copy.spans["test"]) == 2 assert len(doc_copy.spans["test"]) == 2
@pytest.mark.xfail
@pytest.mark.parametrize("use_double_space", [False, True])
def test_span_sentence_mapping(use_double_space: bool):
    """Tests correct mapping of spans to sentences. This is currently failing due to some issue with the
    span-to-sentence mapping, hence the xfail marker.

    use_double_space (bool): Whether to use double space after end of first sentence.
    """
    nlp = load("en_core_web_sm")
    # The first raw sentence always ends with one space; `space` optionally adds a second one.
    space = " " if use_double_space else ""
    raw_sents = [
        "Well, you're taking your eyes off the road,\" said a governmental affairs representative for Sprint. "
        + space,
        "New Jersey, New York, and the District of Columbia already ban holding a cell phone while driving; a "
        '"hands-free" cell phone is legal.',
    ]
    doc = nlp("".join(raw_sents))

    # Ensure sentence splitting works as expected before testing span-to-sentence mapping.
    # Note that the sentence splitting behavior is already different when using double spaces, which shouldn't be the
    # case.
    sents = list(doc.sents)
    assert len(sents) == 2
    # The conditional must be parenthesized: `==` binds tighter than `if ... else`, so without the parentheses the
    # single-space case would merely assert a non-empty string and could never fail.
    # In the single-space case the sentence text is expected to omit the trailing space of the raw sentence.
    assert sents[0].text == (raw_sents[0] if use_double_space else raw_sents[0][:-1])
    assert sents[1].text == raw_sents[1]

    # Select span for test. With a double space, the span additionally covers the extra space preceding
    # "New Jersey", hence the longer character range.
    start = 100
    end = 111 if use_double_space else 110
    span = doc.char_span(start, end)
    assert span.text == doc.text[start:end] == space + "New Jersey"

    # Test span-to-sentence mapping. Since the span in question doesn't cross sentence boundaries, there should only be
    # one sentence.
    span_sents = list(span.sents)
    span_sent = span.sent
    assert len(span_sents) == 1
    assert span_sent.text == sents[1].text