Change span lemmas to use original whitespace (fix #8368) (#8391)

* Change span lemmas to use original whitespace (fix #8368)

This is a redo of #8371 based off master.

The test for this required some changes to existing tests. I don't think
the changes were significant but I'd like someone to check them.

* Remove mystery docstring

This sentence was left unfinished for years, and now we will never know
how it ends.
This commit is contained in:
Paul O'Leary McCann 2021-06-15 20:24:54 +09:00 committed by GitHub
parent 2c105cdbce
commit 94e1346f44
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 21 additions and 6 deletions

View File

@ -20,7 +20,17 @@ def doc(en_tokenizer):
"O", "O", "O", "O", "O"]
# fmt: on
tokens = en_tokenizer(text)
return Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps, ents=ents)
lemmas = [t.text for t in tokens] # this is not correct, just a placeholder
spaces = [bool(t.whitespace_) for t in tokens]
return Doc(
tokens.vocab,
words=[t.text for t in tokens],
spaces=spaces,
heads=heads,
deps=deps,
ents=ents,
lemmas=lemmas,
)
@pytest.fixture
@ -84,7 +94,7 @@ def test_spans_span_sent(doc, doc_not_parsed):
"""Test span.sent property"""
assert len(list(doc.sents))
assert doc[:2].sent.root.text == "is"
assert doc[:2].sent.text == "This is a sentence ."
assert doc[:2].sent.text == "This is a sentence."
assert doc[6:7].sent.root.left_edge.text == "This"
# test on manual sbd
doc_not_parsed[0].is_sent_start = True
@ -249,7 +259,7 @@ def test_span_as_doc(doc):
@pytest.mark.usefixtures("clean_underscore")
def test_span_as_doc_user_data(doc):
"""Test that the user_data can be preserved (but not by default). """
"""Test that the user_data can be preserved (but not by default)."""
my_key = "my_info"
my_value = 342
doc.user_data[my_key] = my_value
@ -286,7 +296,6 @@ def test_span_attrs_writable(doc):
def test_span_ents_property(doc):
"""Test span.ents for the """
doc.ents = [
(doc.vocab.strings["PRODUCT"], 0, 1),
(doc.vocab.strings["PRODUCT"], 7, 8),
@ -308,7 +317,7 @@ def test_span_ents_property(doc):
assert sentences[1].ents[0].start == 7
assert sentences[1].ents[0].end == 8
# Third sentence ents, Also tests end of sentence
assert sentences[2].ents[0].text == "a third ."
assert sentences[2].ents[0].text == "a third."
assert sentences[2].ents[0].label_ == "PRODUCT"
assert sentences[2].ents[0].start == 11
assert sentences[2].ents[0].end == 14
@ -361,6 +370,12 @@ def test_span_boundaries(doc):
span[5]
def test_span_lemma(doc):
    """The span's lemma string should preserve the span's original spacing,
    so it splits into the same number of space-separated pieces as the text.
    """
    span = doc[1:5]
    text_pieces = span.text.split(" ")
    lemma_pieces = span.lemma_.split(" ")
    assert len(text_pieces) == len(lemma_pieces)
def test_sent(en_tokenizer):
doc = en_tokenizer("Check span.sent raises error if doc is not sentencized.")
span = doc[1:3]

View File

@ -754,7 +754,7 @@ cdef class Span:
@property
def lemma_(self):
    """RETURNS (str): The span's lemma.

    Each token's lemma is joined with that token's trailing whitespace so
    the lemma string mirrors the span's original spacing (fix for spaCy
    issue #8368); outer whitespace is stripped.
    """
    # NOTE: the old implementation joined lemmas with a single " ", which
    # lost the original whitespace; the stale line is removed here so the
    # whitespace-preserving version is actually reachable.
    return "".join([t.lemma_ + t.whitespace_ for t in self]).strip()
property label_:
"""RETURNS (str): The span's label."""