mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
* Change span lemmas to use original whitespace (fix #8368): This is a redo of #8371, rebased on master. The test for this required some changes to existing tests. I don't think the changes were significant, but I'd like someone to check them. * Remove mystery docstring: This sentence was left unfinished for years, and now we will never know how it ends.
This commit is contained in:
parent
2c105cdbce
commit
94e1346f44
|
@ -20,7 +20,17 @@ def doc(en_tokenizer):
|
|||
"O", "O", "O", "O", "O"]
|
||||
# fmt: on
|
||||
tokens = en_tokenizer(text)
|
||||
return Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps, ents=ents)
|
||||
lemmas = [t.text for t in tokens] # this is not correct, just a placeholder
|
||||
spaces = [bool(t.whitespace_) for t in tokens]
|
||||
return Doc(
|
||||
tokens.vocab,
|
||||
words=[t.text for t in tokens],
|
||||
spaces=spaces,
|
||||
heads=heads,
|
||||
deps=deps,
|
||||
ents=ents,
|
||||
lemmas=lemmas,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
@ -84,7 +94,7 @@ def test_spans_span_sent(doc, doc_not_parsed):
|
|||
"""Test span.sent property"""
|
||||
assert len(list(doc.sents))
|
||||
assert doc[:2].sent.root.text == "is"
|
||||
assert doc[:2].sent.text == "This is a sentence ."
|
||||
assert doc[:2].sent.text == "This is a sentence."
|
||||
assert doc[6:7].sent.root.left_edge.text == "This"
|
||||
# test on manual sbd
|
||||
doc_not_parsed[0].is_sent_start = True
|
||||
|
@ -249,7 +259,7 @@ def test_span_as_doc(doc):
|
|||
|
||||
@pytest.mark.usefixtures("clean_underscore")
|
||||
def test_span_as_doc_user_data(doc):
|
||||
"""Test that the user_data can be preserved (but not by default). """
|
||||
"""Test that the user_data can be preserved (but not by default)."""
|
||||
my_key = "my_info"
|
||||
my_value = 342
|
||||
doc.user_data[my_key] = my_value
|
||||
|
@ -286,7 +296,6 @@ def test_span_attrs_writable(doc):
|
|||
|
||||
|
||||
def test_span_ents_property(doc):
|
||||
"""Test span.ents for the """
|
||||
doc.ents = [
|
||||
(doc.vocab.strings["PRODUCT"], 0, 1),
|
||||
(doc.vocab.strings["PRODUCT"], 7, 8),
|
||||
|
@ -308,7 +317,7 @@ def test_span_ents_property(doc):
|
|||
assert sentences[1].ents[0].start == 7
|
||||
assert sentences[1].ents[0].end == 8
|
||||
# Third sentence ents, Also tests end of sentence
|
||||
assert sentences[2].ents[0].text == "a third ."
|
||||
assert sentences[2].ents[0].text == "a third."
|
||||
assert sentences[2].ents[0].label_ == "PRODUCT"
|
||||
assert sentences[2].ents[0].start == 11
|
||||
assert sentences[2].ents[0].end == 14
|
||||
|
@ -361,6 +370,12 @@ def test_span_boundaries(doc):
|
|||
span[5]
|
||||
|
||||
|
||||
def test_span_lemma(doc):
|
||||
# span lemmas should have the same number of spaces as the span
|
||||
sp = doc[1:5]
|
||||
assert len(sp.text.split(" ")) == len(sp.lemma_.split(" "))
|
||||
|
||||
|
||||
def test_sent(en_tokenizer):
|
||||
doc = en_tokenizer("Check span.sent raises error if doc is not sentencized.")
|
||||
span = doc[1:3]
|
||||
|
|
|
@ -754,7 +754,7 @@ cdef class Span:
|
|||
@property
|
||||
def lemma_(self):
|
||||
"""RETURNS (str): The span's lemma."""
|
||||
return " ".join([t.lemma_ for t in self]).strip()
|
||||
return "".join([t.lemma_ + t.whitespace_ for t in self]).strip()
|
||||
|
||||
property label_:
|
||||
"""RETURNS (str): The span's label."""
|
||||
|
|
Loading…
Reference in New Issue
Block a user