From a938566b62fe74c6bacac55e658cd271cc5d1e29 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Tue, 28 Jan 2020 11:36:49 +0100
Subject: [PATCH] Fix Sentencizer.pipe() for empty doc (#4940)

---
 spacy/pipeline/pipes.pyx                 | 25 ++++++++++++------------
 spacy/tests/pipeline/test_sentencizer.py | 16 +++++++++++++++
 2 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index b51520777..b4fecf5cb 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1492,20 +1492,21 @@ class Sentencizer(object):
             return guesses
         guesses = []
         for doc in docs:
-            start = 0
-            seen_period = False
             doc_guesses = [False] * len(doc)
-            doc_guesses[0] = True
-            for i, token in enumerate(doc):
-                is_in_punct_chars = token.text in self.punct_chars
-                if seen_period and not token.is_punct and not is_in_punct_chars:
+            if len(doc) > 0:
+                start = 0
+                seen_period = False
+                doc_guesses[0] = True
+                for i, token in enumerate(doc):
+                    is_in_punct_chars = token.text in self.punct_chars
+                    if seen_period and not token.is_punct and not is_in_punct_chars:
+                        doc_guesses[start] = True
+                        start = token.i
+                        seen_period = False
+                    elif is_in_punct_chars:
+                        seen_period = True
+                if start < len(doc):
                     doc_guesses[start] = True
-                    start = token.i
-                    seen_period = False
-                elif is_in_punct_chars:
-                    seen_period = True
-            if start < len(doc):
-                doc_guesses[start] = True
             guesses.append(doc_guesses)
         return guesses
 
diff --git a/spacy/tests/pipeline/test_sentencizer.py b/spacy/tests/pipeline/test_sentencizer.py
index 359552c5b..d690958cc 100644
--- a/spacy/tests/pipeline/test_sentencizer.py
+++ b/spacy/tests/pipeline/test_sentencizer.py
@@ -29,6 +29,22 @@ def test_sentencizer_pipe():
         assert len(list(doc.sents)) == 2
 
 
+def test_sentencizer_empty_docs():
+    one_empty_text = [""]
+    many_empty_texts = ["", "", ""]
+    some_empty_texts = ["hi", "", "This is a test. Here are two sentences.", ""]
+    nlp = English()
+    nlp.add_pipe(nlp.create_pipe("sentencizer"))
+    for texts in [one_empty_text, many_empty_texts, some_empty_texts]:
+        for doc in nlp.pipe(texts):
+            assert doc.is_sentenced
+            sent_starts = [t.is_sent_start for t in doc]
+            if len(doc) == 0:
+                assert sent_starts == []
+            else:
+                assert len(sent_starts) > 0
+
+
 @pytest.mark.parametrize(
     "words,sent_starts,n_sents",
     [