Mirror of https://github.com/explosion/spaCy.git (synced 2025-08-07 05:40:20 +03:00)
Create doc manually instead of generating from pipeline.

parent af47a4ad46
commit 5349c1811c
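The commit swaps a doc produced by a trained pipeline for one built directly with Doc.__init__. A minimal sketch of that pattern, assuming only spaCy's blank English tokenizer (illustrative only, not the commit's exact code):

from spacy.lang.en import English
from spacy.tokens import Doc

nlp = English()  # blank pipeline: tokenizer only, no trained components required
tokens = nlp.tokenizer("This is a sentence. This is another sentence.")

# Construct the Doc by hand instead of running text through a trained model.
doc = Doc(
    tokens.vocab,
    words=[t.text for t in tokens],
    spaces=[bool(t.whitespace_) for t in tokens],
    sent_starts=[1, 0, 0, 0, 0, 1, 0, 0, 0, 0],  # 1 marks a sentence start
)
assert len(list(doc.sents)) == 2

Building the Doc this way keeps the test deterministic and removes the dependency on a downloadable model such as en_core_web_sm, whose loading this commit deletes below.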
@@ -6,7 +6,6 @@ from spacy.attrs import ORTH, LENGTH
 from spacy.lang.en import English
 from spacy.tokens import Doc, Span, Token
 from spacy.vocab import Vocab
-from spacy import load
 from spacy.util import filter_spans
 from thinc.api import get_current_ops
 
@@ -16,13 +15,40 @@ from .test_underscore import clean_underscore  # noqa: F401
 
 @pytest.fixture
 def doc(en_tokenizer):
-    # fmt: off
     text = "This is a sentence. This is another sentence. And a third."
     heads = [1, 1, 3, 1, 1, 6, 6, 8, 6, 6, 12, 12, 12, 12]
-    deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det",
-            "attr", "punct", "ROOT", "det", "npadvmod", "punct"]
-    ents = ["O", "O", "B-ENT", "I-ENT", "I-ENT", "I-ENT", "I-ENT", "O", "O",
-            "O", "O", "O", "O", "O"]
+    deps = [
+        "nsubj",
+        "ROOT",
+        "det",
+        "attr",
+        "punct",
+        "nsubj",
+        "ROOT",
+        "det",
+        "attr",
+        "punct",
+        "ROOT",
+        "det",
+        "npadvmod",
+        "punct",
+    ]
+    ents = [
+        "O",
+        "O",
+        "B-ENT",
+        "I-ENT",
+        "I-ENT",
+        "I-ENT",
+        "I-ENT",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+    ]
     # fmt: on
     tokens = en_tokenizer(text)
     lemmas = [t.text for t in tokens]  # this is not correct, just a placeholder
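In the reformatted fixture, heads, deps, and ents are parallel lists with one entry per token of text (14 tokens). A hedged sketch of how such lists are typically passed to Doc.__init__ (the rest of the fixture is outside this hunk, so the exact call shown here is an assumption):

from spacy.lang.en import English
from spacy.tokens import Doc

nlp = English()
tokens = nlp.tokenizer("This is a sentence. This is another sentence. And a third.")
doc = Doc(
    tokens.vocab,
    words=[t.text for t in tokens],
    spaces=[bool(t.whitespace_) for t in tokens],
    # heads are absolute token indices; deps and ents align one-to-one with words
    heads=[1, 1, 3, 1, 1, 6, 6, 8, 6, 6, 12, 12, 12, 12],
    deps=["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det",
          "attr", "punct", "ROOT", "det", "npadvmod", "punct"],
    ents=["O", "O", "B-ENT", "I-ENT", "I-ENT", "I-ENT", "I-ENT", "O", "O",
          "O", "O", "O", "O", "O"],
)
# With a dependency parse attached, sentence boundaries follow the three ROOT tokens.
assert len(list(doc.sents)) == 3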
@@ -683,39 +709,46 @@ def test_span_group_copy(doc):
     assert len(doc_copy.spans["test"]) == 2
 
 
-@pytest.mark.xfail
-@pytest.mark.parametrize("use_double_space", [False, True])
-def test_span_sentence_mapping(use_double_space: bool):
+# @pytest.mark.xfail
+@pytest.mark.parametrize("use_double_space", [True, False])
+def test_span_sentence_mapping(en_tokenizer, use_double_space: bool):
     """Tests correct mapping of spans to sentences. This is currently failing due to some issue with the
     span-to-sentence mapping.
     use_double_space (bool): Whether to use double space after end of first sentence.
     """
-    nlp = load("en_core_web_sm")
     space = " " if use_double_space else ""
     raw_sents = [
-        "Well, you're taking your eyes off the road,\" said a governmental affairs representative for Sprint. "
-        + space,
-        "New Jersey, New York, and the District of Columbia already ban holding a cell phone while driving; a "
-        '"hands-free" cell phone is legal.',
+        "This is a sentence. " + space,
+        "This is another sentence. ",
+        "And a third.",
     ]
-    doc = nlp("".join(raw_sents))
+    text = "".join(raw_sents)
+    tokens = en_tokenizer(text)
+    spaces = [bool(t.whitespace_) for t in tokens]
 
-    # Ensure sentence splitting works as expected before testing span-to-sentence mapping.
-    # Note that the sentence splitting behavior is already different when using double spaces, which shouldn't be the
-    sents = list(doc.sents)
-    assert len(sents) == 2
-    assert sents[0].text == raw_sents[0] if use_double_space else raw_sents[0][:-1]
-    assert sents[1].text == raw_sents[1]
+    doc = Doc(
+        tokens.vocab,
+        words=[t.text for t in tokens],
+        spaces=spaces,
+        sent_starts=[
+            1,
+            *[0] * (5 if use_double_space else 4),
+            1,
+            *[0] * 4,
+            1,
+            *[0] * 3,
+        ],
+    )
 
     # Select span for test.
-    start = 100
-    end = 111 if use_double_space else 110
+    start = 20
+    end = 25 if use_double_space else 24
     span = doc.char_span(start, end)
-    assert span.text == doc.text[start:end] == space + "New Jersey"
+    assert span.text == doc.text[start:end] == space + "This"
 
     # Test span-to-sentence mapping. Since the span in question doesn't cross sentence boundaries, there should only be
     # one sentence.
     span_sents = list(span.sents)  # type: ignore
     span_sent = span.sent
     assert len(span_sents) == 1
-    assert span_sent.text == sents[1].text
+    assert span_sent.text == list(doc.sents)[1].text
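The rewritten test pins sentence boundaries explicitly via sent_starts, then checks that a character span maps back to the sentence containing it. A standalone sketch of that mapping in the single-space case (illustrative; it reuses the fixture's three-sentence text rather than the test's raw_sents):

from spacy.lang.en import English
from spacy.tokens import Doc

nlp = English()
tokens = nlp.tokenizer("This is a sentence. This is another sentence. And a third.")
doc = Doc(
    tokens.vocab,
    words=[t.text for t in tokens],
    spaces=[bool(t.whitespace_) for t in tokens],
    sent_starts=[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0],  # starts at tokens 0, 5, 10
)
span = doc.char_span(20, 24)  # the token "This" that opens the second sentence
assert span.text == "This"
assert span.sent.text == "This is another sentence."
assert len(list(span.sents)) == 1  # the span does not cross a sentence boundary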