mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
Fix Sentencizer.pipe() for empty doc (#4940)
This commit is contained in:
parent
7ad000fce7
commit
a938566b62
|
@ -1492,20 +1492,21 @@ class Sentencizer(object):
|
|||
return guesses
|
||||
guesses = []
|
||||
for doc in docs:
|
||||
start = 0
|
||||
seen_period = False
|
||||
doc_guesses = [False] * len(doc)
|
||||
doc_guesses[0] = True
|
||||
for i, token in enumerate(doc):
|
||||
is_in_punct_chars = token.text in self.punct_chars
|
||||
if seen_period and not token.is_punct and not is_in_punct_chars:
|
||||
if len(doc) > 0:
|
||||
start = 0
|
||||
seen_period = False
|
||||
doc_guesses[0] = True
|
||||
for i, token in enumerate(doc):
|
||||
is_in_punct_chars = token.text in self.punct_chars
|
||||
if seen_period and not token.is_punct and not is_in_punct_chars:
|
||||
doc_guesses[start] = True
|
||||
start = token.i
|
||||
seen_period = False
|
||||
elif is_in_punct_chars:
|
||||
seen_period = True
|
||||
if start < len(doc):
|
||||
doc_guesses[start] = True
|
||||
start = token.i
|
||||
seen_period = False
|
||||
elif is_in_punct_chars:
|
||||
seen_period = True
|
||||
if start < len(doc):
|
||||
doc_guesses[start] = True
|
||||
guesses.append(doc_guesses)
|
||||
return guesses
|
||||
|
||||
|
|
|
@ -29,6 +29,22 @@ def test_sentencizer_pipe():
|
|||
assert len(list(doc.sents)) == 2
|
||||
|
||||
|
||||
def test_sentencizer_empty_docs():
    """Run the sentencizer over batches containing empty texts via nlp.pipe().

    Covers a single empty text, several empty texts, and empty texts mixed
    with non-empty ones. Every resulting doc must be flagged as sentenced;
    an empty doc yields no sentence-start flags, a non-empty doc yields at
    least one.
    """
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    batches = (
        [""],
        ["", "", ""],
        ["hi", "", "This is a test. Here are two sentences.", ""],
    )
    for batch in batches:
        for doc in nlp.pipe(batch):
            assert doc.is_sentenced
            sent_starts = [token.is_sent_start for token in doc]
            if len(doc) == 0:
                # Empty doc: there are no tokens, so no flags at all.
                assert sent_starts == []
            else:
                assert len(sent_starts) > 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"words,sent_starts,n_sents",
|
||||
[
|
||||
|
|
Loading…
Reference in New Issue
Block a user