Re-refactor Sentencizer with Pipe API (#7176)

Reapply the refactoring (#4721) so that `Sentencizer` uses the faster
`predict` and `set_annotations` for both `__call__` and `pipe`.
This commit is contained in:
Adriane Boyd 2021-02-26 09:48:14 +01:00 committed by GitHub
parent 592678fb7d
commit 10c930cc96
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -66,26 +66,12 @@ class Sentencizer(Pipe):
"""
error_handler = self.get_error_handler()
try:
self._call(doc)
tags = self.predict([doc])
self.set_annotations([doc], tags)
return doc
except Exception as e:
error_handler(self.name, self, [doc], e)
def _call(self, doc):
    """Flag sentence starts on the tokens of `doc`, in place.

    Each token's `is_sent_start` is first reset so that only the token at
    position 0 is flagged. After a token whose text is in
    `self.punct_chars` has been seen, the next token that is neither in
    `self.punct_chars` nor marked `is_punct` closes the pending sentence:
    the pending start token is flagged and that token becomes the new
    pending start.

    doc: The sequence of tokens to annotate. It is indexed, iterated and
        measured with `len`; nothing is returned.
    """
    sent_begin = 0
    after_terminator = False
    for idx, tok in enumerate(doc):
        ends_sentence = tok.text in self.punct_chars
        # Reset first; a start found later is re-flagged through `doc[...]`.
        tok.is_sent_start = idx == 0
        if ends_sentence:
            after_terminator = True
        elif after_terminator and not tok.is_punct:
            # First ordinary token after a terminator: commit the pending
            # sentence start and begin tracking a new one at this token.
            doc[sent_begin].is_sent_start = True
            sent_begin = tok.i
            after_terminator = False
    # Flag the start of the final sentence; the bound check skips an empty doc.
    if len(doc) > sent_begin:
        doc[sent_begin].is_sent_start = True
def predict(self, docs):
"""Apply the pipe to a batch of docs, without modifying them.