diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 5ca651077..4f0f2469e 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1711,12 +1711,24 @@ class Sentencizer(Pipe):
             return example
         return doc

-    def pipe(self, stream, batch_size=128, n_threads=-1):
-        for docs in util.minibatch(stream, size=batch_size):
-            docs = list(docs)
-            tag_ids = self.predict(docs)
-            self.set_annotations(docs, tag_ids)
-            yield from docs
+    def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
+        for examples in util.minibatch(stream, size=batch_size):
+            docs = [self._get_doc(ex) for ex in examples]
+            predictions = self.predict(docs)
+            if isinstance(predictions, tuple) and len(predictions) == 2:
+                scores, tensors = predictions
+                self.set_annotations(docs, scores, tensors=tensors)
+            else:
+                self.set_annotations(docs, predictions)
+
+            if as_example:
+                annotated_examples = []
+                for ex, doc in zip(examples, docs):
+                    ex.doc = doc
+                    annotated_examples.append(ex)
+                yield from annotated_examples
+            else:
+                yield from docs

     def predict(self, docs):
         """Apply the pipeline's model to a batch of docs, without
diff --git a/spacy/tests/pipeline/test_sentencizer.py b/spacy/tests/pipeline/test_sentencizer.py
index 78ab6d2d1..5f9c55dbb 100644
--- a/spacy/tests/pipeline/test_sentencizer.py
+++ b/spacy/tests/pipeline/test_sentencizer.py
@@ -24,6 +24,12 @@ def test_sentencizer_pipe():
         sent_starts = [t.is_sent_start for t in doc]
         assert sent_starts == [True, False, True, False, False, False, False]
         assert len(list(doc.sents)) == 2
+    for ex in nlp.pipe(texts, as_example=True):
+        doc = ex.doc
+        assert doc.is_sentenced
+        sent_starts = [t.is_sent_start for t in doc]
+        assert sent_starts == [True, False, True, False, False, False, False]
+        assert len(list(doc.sents)) == 2


 @pytest.mark.parametrize(
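
For context, a minimal usage sketch of the new keyword (not part of the patch; it assumes an English pipeline with the sentencizer added, and the sample `texts` below are chosen only for illustration). With `as_example=True` the generator yields Example objects whose `.doc` carries the sentence annotations, while the default continues to yield plain Doc objects:

    # Illustrative sketch only; the sample texts are hypothetical, the API calls
    # mirror the test added in the diff above.
    from spacy.lang.en import English

    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    texts = ["Hello! This is a test.", "Hi! This is a test."]

    # Default behaviour: plain Doc objects, as before.
    for doc in nlp.pipe(texts):
        assert doc.is_sentenced

    # New behaviour: Example objects wrapping the annotated Doc.
    for ex in nlp.pipe(texts, as_example=True):
        doc = ex.doc
        print([t.is_sent_start for t in doc], len(list(doc.sents)))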