Change span lemmas to use original whitespace (fix #8368) (#8391)

* Change span lemmas to use original whitespace (fix #8368)

This is a redo of #8371, based on master.

Testing this required changes to the shared `doc` fixture used by the
existing tests (it now passes explicit spaces and placeholder lemmas). I
don't think the changes are significant, but I'd like someone to check them.

* Remove mystery docstring

This sentence was left unfinished for years, and now we will never know how
it ends.
This commit is contained in:
Paul O'Leary McCann 2021-06-15 20:24:54 +09:00 committed by GitHub
parent 2c105cdbce
commit 94e1346f44
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 21 additions and 6 deletions

View File

@ -20,7 +20,17 @@ def doc(en_tokenizer):
"O", "O", "O", "O", "O"] "O", "O", "O", "O", "O"]
# fmt: on # fmt: on
tokens = en_tokenizer(text) tokens = en_tokenizer(text)
return Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps, ents=ents) lemmas = [t.text for t in tokens] # this is not correct, just a placeholder
spaces = [bool(t.whitespace_) for t in tokens]
return Doc(
tokens.vocab,
words=[t.text for t in tokens],
spaces=spaces,
heads=heads,
deps=deps,
ents=ents,
lemmas=lemmas,
)
@pytest.fixture @pytest.fixture
@ -286,7 +296,6 @@ def test_span_attrs_writable(doc):
def test_span_ents_property(doc): def test_span_ents_property(doc):
"""Test span.ents for the """
doc.ents = [ doc.ents = [
(doc.vocab.strings["PRODUCT"], 0, 1), (doc.vocab.strings["PRODUCT"], 0, 1),
(doc.vocab.strings["PRODUCT"], 7, 8), (doc.vocab.strings["PRODUCT"], 7, 8),
@ -361,6 +370,12 @@ def test_span_boundaries(doc):
span[5] span[5]
def test_span_lemma(doc):
    """Check that a span's lemma has as many space-separated pieces as its text.

    Takes tokens 1-4 of the ``doc`` fixture, splits both ``text`` and
    ``lemma_`` on single spaces, and compares the number of pieces.
    """
    span = doc[1:5]
    text_pieces = span.text.split(" ")
    lemma_pieces = span.lemma_.split(" ")
    # Equal piece counts mean both strings contain the same number of spaces.
    assert len(text_pieces) == len(lemma_pieces)
def test_sent(en_tokenizer): def test_sent(en_tokenizer):
doc = en_tokenizer("Check span.sent raises error if doc is not sentencized.") doc = en_tokenizer("Check span.sent raises error if doc is not sentencized.")
span = doc[1:3] span = doc[1:3]

View File

@ -754,7 +754,7 @@ cdef class Span:
@property @property
def lemma_(self): def lemma_(self):
"""RETURNS (str): The span's lemma.""" """RETURNS (str): The span's lemma."""
return " ".join([t.lemma_ for t in self]).strip() return "".join([t.lemma_ + t.whitespace_ for t in self]).strip()
property label_: property label_:
"""RETURNS (str): The span's label.""" """RETURNS (str): The span's label."""