Create doc manually instead of generating from pipeline.

Raphael Mitsch 2022-12-01 16:49:21 +01:00
parent af47a4ad46
commit 5349c1811c
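
In short: instead of running a trained pipeline (spacy.load("en_core_web_sm")) and relying on its parser for sentence boundaries, the test below now builds its Doc directly from tokens with explicit sent_starts. A minimal standalone sketch of that pattern — not part of the commit, and using a blank English tokenizer in place of the test suite's en_tokenizer fixture, with an example sentence of my own:

from spacy.lang.en import English
from spacy.tokens import Doc

# Tokenize with a blank pipeline's tokenizer -- no trained model needed.
tokens = English().tokenizer("This is a sentence. And a second one.")
# Build the Doc by hand; 1 marks a token that opens a sentence, 0 one that doesn't.
doc = Doc(
    tokens.vocab,
    words=[t.text for t in tokens],
    spaces=[bool(t.whitespace_) for t in tokens],
    sent_starts=[1, 0, 0, 0, 0, 1, 0, 0, 0, 0],
)
assert [sent.text for sent in doc.sents] == [
    "This is a sentence.",
    "And a second one.",
]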


@@ -6,7 +6,6 @@ from spacy.attrs import ORTH, LENGTH
 from spacy.lang.en import English
 from spacy.tokens import Doc, Span, Token
 from spacy.vocab import Vocab
-from spacy import load
 from spacy.util import filter_spans
 from thinc.api import get_current_ops

@@ -16,13 +15,40 @@ from .test_underscore import clean_underscore  # noqa: F401

 @pytest.fixture
 def doc(en_tokenizer):
-    # fmt: off
     text = "This is a sentence. This is another sentence. And a third."
     heads = [1, 1, 3, 1, 1, 6, 6, 8, 6, 6, 12, 12, 12, 12]
-    deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det",
-            "attr", "punct", "ROOT", "det", "npadvmod", "punct"]
-    ents = ["O", "O", "B-ENT", "I-ENT", "I-ENT", "I-ENT", "I-ENT", "O", "O",
-            "O", "O", "O", "O", "O"]
+    deps = [
+        "nsubj",
+        "ROOT",
+        "det",
+        "attr",
+        "punct",
+        "nsubj",
+        "ROOT",
+        "det",
+        "attr",
+        "punct",
+        "ROOT",
+        "det",
+        "npadvmod",
+        "punct",
+    ]
+    ents = [
+        "O",
+        "O",
+        "B-ENT",
+        "I-ENT",
+        "I-ENT",
+        "I-ENT",
+        "I-ENT",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+    ]
     # fmt: on
     tokens = en_tokenizer(text)
     lemmas = [t.text for t in tokens]  # this is not correct, just a placeholder
@@ -683,39 +709,46 @@ def test_span_group_copy(doc):
     assert len(doc_copy.spans["test"]) == 2


-@pytest.mark.xfail
-@pytest.mark.parametrize("use_double_space", [False, True])
-def test_span_sentence_mapping(use_double_space: bool):
+# @pytest.mark.xfail
+@pytest.mark.parametrize("use_double_space", [True, False])
+def test_span_sentence_mapping(en_tokenizer, use_double_space: bool):
     """Tests correct mapping of spans to sentences. This is currently failing due to some issue with the
     span-to-sentence mapping.
     use_double_space (bool): Whether to use double space after end of first sentence.
     """
-    nlp = load("en_core_web_sm")
     space = " " if use_double_space else ""
     raw_sents = [
-        "Well, you're taking your eyes off the road,\" said a governmental affairs representative for Sprint. "
-        + space,
-        "New Jersey, New York, and the District of Columbia already ban holding a cell phone while driving; a "
-        '"hands-free" cell phone is legal.',
+        "This is a sentence. " + space,
+        "This is another sentence. ",
+        "And a third.",
     ]
-    doc = nlp("".join(raw_sents))
-
-    # Ensure sentence splitting works as expected before testing span-to-sentence mapping.
-    # Note that the sentence splitting behavior is already different when using double spaces, which shouldn't be the
-    # case.
-    sents = list(doc.sents)
-    assert len(sents) == 2
-    assert sents[0].text == raw_sents[0] if use_double_space else raw_sents[0][:-1]
-    assert sents[1].text == raw_sents[1]
+    text = "".join(raw_sents)
+    tokens = en_tokenizer(text)
+    spaces = [bool(t.whitespace_) for t in tokens]
+    doc = Doc(
+        tokens.vocab,
+        words=[t.text for t in tokens],
+        spaces=spaces,
+        sent_starts=[
+            1,
+            *[0] * (5 if use_double_space else 4),
+            1,
+            *[0] * 4,
+            1,
+            *[0] * 3,
+        ],
+    )

     # Select span for test.
-    start = 100
-    end = 111 if use_double_space else 110
+    start = 20
+    end = 25 if use_double_space else 24
     span = doc.char_span(start, end)
-    assert span.text == doc.text[start:end] == space + "New Jersey"
+    assert span.text == doc.text[start:end] == space + "This"

     # Test span-to-sentence mapping. Since the span in question doesn't cross sentence boundaries, there should only be
     # one sentence.
     span_sents = list(span.sents)  # type: ignore
     span_sent = span.sent
     assert len(span_sents) == 1
-    assert span_sent.text == sents[1].text
+    assert span_sent.text == list(doc.sents)[1].text
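
A note on the hard-coded constants in the new test (a consistency check, not part of the commit): "This is a sentence. " is exactly 20 characters, so the second "This" starts at character offset 20; with the doubled space the span grows by one character and picks up the leading space, hence end = 25 instead of 24. The doubled space also surfaces as an extra whitespace token in spaCy's non-destructive tokenization, which is why the first sentence contributes one more non-initial token to sent_starts (*[0] * 5 instead of *[0] * 4):

from spacy.lang.en import English

first = "This is a sentence. "
assert len(first) == 20  # second sentence starts at character 20

single = first + "This is another sentence. " + "And a third."
double = first + " " + "This is another sentence. " + "And a third."
assert single[20:24] == "This"   # end = 24 without the double space
assert double[20:25] == " This"  # end = 25: the span includes the extra space

# The extra space becomes its own token, giving the first sentence six
# tokens instead of five -- matching *[0] * (5 if use_double_space else 4).
tokenizer = English().tokenizer
assert len(tokenizer(double)) == len(tokenizer(single)) + 1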