From a938566b62fe74c6bacac55e658cd271cc5d1e29 Mon Sep 17 00:00:00 2001
From: adrianeboyd <adrianeboyd@gmail.com>
Date: Tue, 28 Jan 2020 11:36:49 +0100
Subject: [PATCH] Fix Sentencizer.pipe() for empty doc (#4940)

---
 spacy/pipeline/pipes.pyx                 | 25 ++++++++++++------------
 spacy/tests/pipeline/test_sentencizer.py | 16 +++++++++++++++
 2 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index b51520777..b4fecf5cb 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1492,20 +1492,21 @@ class Sentencizer(object):
             return guesses
         guesses = []
         for doc in docs:
-            start = 0
-            seen_period = False
             doc_guesses = [False] * len(doc)
-            doc_guesses[0] = True
-            for i, token in enumerate(doc):
-                is_in_punct_chars = token.text in self.punct_chars
-                if seen_period and not token.is_punct and not is_in_punct_chars:
+            if len(doc) > 0:
+                start = 0
+                seen_period = False
+                doc_guesses[0] = True
+                for i, token in enumerate(doc):
+                    is_in_punct_chars = token.text in self.punct_chars
+                    if seen_period and not token.is_punct and not is_in_punct_chars:
+                        doc_guesses[start] = True
+                        start = token.i
+                        seen_period = False
+                    elif is_in_punct_chars:
+                        seen_period = True
+                if start < len(doc):
                     doc_guesses[start] = True
-                    start = token.i
-                    seen_period = False
-                elif is_in_punct_chars:
-                    seen_period = True
-            if start < len(doc):
-                doc_guesses[start] = True
             guesses.append(doc_guesses)
         return guesses
 
diff --git a/spacy/tests/pipeline/test_sentencizer.py b/spacy/tests/pipeline/test_sentencizer.py
index 359552c5b..d690958cc 100644
--- a/spacy/tests/pipeline/test_sentencizer.py
+++ b/spacy/tests/pipeline/test_sentencizer.py
@@ -29,6 +29,22 @@ def test_sentencizer_pipe():
         assert len(list(doc.sents)) == 2
 
 
+def test_sentencizer_empty_docs():  # regression test for #4940: piping empty docs through the sentencizer
+    one_empty_text = [""]  # batch holding a single empty text
+    many_empty_texts = ["", "", ""]  # batch holding only empty texts
+    some_empty_texts = ["hi", "", "This is a test. Here are two sentences.", ""]  # empty and non-empty texts mixed
+    nlp = English()
+    nlp.add_pipe(nlp.create_pipe("sentencizer"))  # the sentencizer is the only component added here
+    for texts in [one_empty_text, many_empty_texts, some_empty_texts]:
+        for doc in nlp.pipe(texts):  # goes through the batched pipe() path
+            assert doc.is_sentenced  # asserted for every doc, empty ones included
+            sent_starts = [t.is_sent_start for t in doc]  # one flag per token
+            if len(doc) == 0:
+                assert sent_starts == []  # no tokens, hence no sentence-start flags
+            else:
+                assert len(sent_starts) > 0  # only checks that flags exist, not their values
+
+
 @pytest.mark.parametrize(
     "words,sent_starts,n_sents",
     [