Fix Sentencizer.pipe() for empty doc (#4940)

2025-12-22 09:34:23 +03:00 · 2020-01-28 11:36:49 +01:00 · 2020-01-28 11:36:49 +01:00 · a938566b62
commit a938566b62
parent 7ad000fce7
2 changed files with 29 additions and 12 deletions
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1492,9 +1492,10 @@ class Sentencizer(object):
            return guesses
        guesses = []
        for doc in docs:
            doc_guesses = [False] * len(doc)
            if len(doc) > 0:
                start = 0
                seen_period = False
            doc_guesses = [False] * len(doc)
                doc_guesses[0] = True
                for i, token in enumerate(doc):
                    is_in_punct_chars = token.text in self.punct_chars
--- a/spacy/tests/pipeline/test_sentencizer.py
+++ b/spacy/tests/pipeline/test_sentencizer.py
@@ -29,6 +29,22 @@ def test_sentencizer_pipe():
        assert len(list(doc.sents)) == 2
 def test_sentencizer_empty_docs():
    """Regression test (#4940): the sentencizer must cope with empty docs.

    Pipes three batches through an English pipeline that contains only the
    sentencizer: a single empty text, several empty texts, and a mix of
    empty and non-empty texts. Every resulting doc must report
    ``is_sentenced``; an empty doc must expose no sentence-start flags and
    a non-empty doc must expose at least one.
    """
    pipeline = English()
    pipeline.add_pipe(pipeline.create_pipe("sentencizer"))
    # Each batch goes through its own nlp.pipe() call so that the
    # all-empty and mixed cases are exercised independently.
    batches = (
        [""],
        ["", "", ""],
        ["hi", "", "This is a test. Here are two sentences.", ""],
    )
    for batch in batches:
        for doc in pipeline.pipe(batch):
            assert doc.is_sentenced
            starts = [token.is_sent_start for token in doc]
            if len(doc) > 0:
                assert len(starts) > 0
            else:
                # No tokens means no per-token sentence-start flags.
                assert starts == []
@pytest.mark.parametrize(
    "words,sent_starts,n_sents",
    [