Create doc manually instead of generating from pipeline.

Raphael Mitsch 2022-12-01 16:49:21 +01:00
parent af47a4ad46
commit 5349c1811c
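
In short: instead of running a trained pipeline (spacy.load("en_core_web_sm")) and relying on its parser for sentence boundaries, the test below now builds its Doc directly from tokens with explicit sent_starts. A minimal standalone sketch of that pattern — not part of the commit, and using a blank English tokenizer in place of the test suite's en_tokenizer fixture, with an example sentence of my own:

from spacy.lang.en import English
from spacy.tokens import Doc

# Tokenize with a blank pipeline's tokenizer -- no trained model needed.
tokens = English().tokenizer("This is a sentence. And a second one.")
# Build the Doc by hand; 1 marks a token that opens a sentence, 0 one that doesn't.
doc = Doc(
    tokens.vocab,
    words=[t.text for t in tokens],
    spaces=[bool(t.whitespace_) for t in tokens],
    sent_starts=[1, 0, 0, 0, 0, 1, 0, 0, 0, 0],
)
assert [sent.text for sent in doc.sents] == [
    "This is a sentence.",
    "And a second one.",
]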


@@ -6,7 +6,6 @@ from spacy.attrs import ORTH, LENGTH
 from spacy.lang.en import English
 from spacy.tokens import Doc, Span, Token
 from spacy.vocab import Vocab
-from spacy import load
 from spacy.util import filter_spans
 from thinc.api import get_current_ops

@@ -16,13 +15,40 @@ from .test_underscore import clean_underscore  # noqa: F401

 @pytest.fixture
 def doc(en_tokenizer):
-    # fmt: off
     text = "This is a sentence. This is another sentence. And a third."
     heads = [1, 1, 3, 1, 1, 6, 6, 8, 6, 6, 12, 12, 12, 12]
-    deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det",
-            "attr", "punct", "ROOT", "det", "npadvmod", "punct"]
-    ents = ["O", "O", "B-ENT", "I-ENT", "I-ENT", "I-ENT", "I-ENT", "O", "O",
-            "O", "O", "O", "O", "O"]
+    deps = [
+        "nsubj",
+        "ROOT",
+        "det",
+        "attr",
+        "punct",
+        "nsubj",
+        "ROOT",
+        "det",
+        "attr",
+        "punct",
+        "ROOT",
+        "det",
+        "npadvmod",
+        "punct",
+    ]
+    ents = [
+        "O",
+        "O",
+        "B-ENT",
+        "I-ENT",
+        "I-ENT",
+        "I-ENT",
+        "I-ENT",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+    ]
     # fmt: on
     tokens = en_tokenizer(text)
     lemmas = [t.text for t in tokens]  # this is not correct, just a placeholder
@@ -683,39 +709,46 @@ def test_span_group_copy(doc):
     assert len(doc_copy.spans["test"]) == 2


-@pytest.mark.xfail
-@pytest.mark.parametrize("use_double_space", [False, True])
-def test_span_sentence_mapping(use_double_space: bool):
+# @pytest.mark.xfail
+@pytest.mark.parametrize("use_double_space", [True, False])
+def test_span_sentence_mapping(en_tokenizer, use_double_space: bool):
     """Tests correct mapping of spans to sentences. This is currently failing due to some issue with the
     span-to-sentence mapping.
     use_double_space (bool): Whether to use double space after end of first sentence.
     """
-    nlp = load("en_core_web_sm")
     space = " " if use_double_space else ""
     raw_sents = [
-        "Well, you're taking your eyes off the road,\" said a governmental affairs representative for Sprint. "
-        + space,
-        "New Jersey, New York, and the District of Columbia already ban holding a cell phone while driving; a "
-        '"hands-free" cell phone is legal.',
+        "This is a sentence. " + space,
+        "This is another sentence. ",
+        "And a third.",
     ]
-    doc = nlp("".join(raw_sents))
-
-    # Ensure sentence splitting works as expected before testing span-to-sentence mapping.
-    # Note that the sentence splitting behavior is already different when using double spaces, which shouldn't be the
-    # case.
-    sents = list(doc.sents)
-    assert len(sents) == 2
-    assert sents[0].text == raw_sents[0] if use_double_space else raw_sents[0][:-1]
-    assert sents[1].text == raw_sents[1]
+    text = "".join(raw_sents)
+    tokens = en_tokenizer(text)
+    spaces = [bool(t.whitespace_) for t in tokens]
+    doc = Doc(
+        tokens.vocab,
+        words=[t.text for t in tokens],
+        spaces=spaces,
+        sent_starts=[
+            1,
+            *[0] * (5 if use_double_space else 4),
+            1,
+            *[0] * 4,
+            1,
+            *[0] * 3,
+        ],
+    )

     # Select span for test.
-    start = 100
-    end = 111 if use_double_space else 110
+    start = 20
+    end = 25 if use_double_space else 24
     span = doc.char_span(start, end)
-    assert span.text == doc.text[start:end] == space + "New Jersey"
+    assert span.text == doc.text[start:end] == space + "This"

     # Test span-to-sentence mapping. Since the span in question doesn't cross sentence boundaries, there should only be
     # one sentence.
     span_sents = list(span.sents)  # type: ignore
     span_sent = span.sent
     assert len(span_sents) == 1
-    assert span_sent.text == sents[1].text
+    assert span_sent.text == list(doc.sents)[1].text
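
A note on the hard-coded constants in the new test (a consistency check, not part of the commit): "This is a sentence. " is exactly 20 characters, so the second "This" starts at character offset 20; with the doubled space the span grows by one character and picks up the leading space, hence end = 25 instead of 24. The doubled space also surfaces as an extra whitespace token in spaCy's non-destructive tokenization, which is why the first sentence contributes one more non-initial token to sent_starts (*[0] * 5 instead of *[0] * 4):

from spacy.lang.en import English

first = "This is a sentence. "
assert len(first) == 20  # second sentence starts at character 20

single = first + "This is another sentence. " + "And a third."
double = first + " " + "This is another sentence. " + "And a third."
assert single[20:24] == "This"   # end = 24 without the double space
assert double[20:25] == " This"  # end = 25: the span includes the extra space

# The extra space becomes its own token, giving the first sentence six
# tokens instead of five -- matching *[0] * (5 if use_double_space else 4).
tokenizer = English().tokenizer
assert len(tokenizer(double)) == len(tokenizer(single)) + 1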