From 5e6a9e7dcc6e03ebab39adfbbdb163eb33c28631 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sat, 2 Sep 2017 12:53:38 +0200
Subject: [PATCH] Add rule-based SBD

---
 spacy/pipeline.pyx | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index 8c3759778..9413fb6bb 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -46,6 +46,43 @@ from ._ml import build_text_classifier, build_tagger_model
 from .parts_of_speech import X
 
 
+class SentenceSegmenter(object):
+    '''A simple spaCy hook, to allow custom sentence boundary detection logic
+    (that doesn't require the dependency parse). Calling the component on a
+    doc installs the current strategy as `doc.user_hooks['sents']`.
+
+    To change the strategy, pass a generator function `strategy` on
+    initialization (None or 'on_punct' selects `split_on_punct`), or assign
+    a new strategy to the .strategy attribute.
+
+    Strategies take a `Doc` object and yield a `Span` for each sentence.
+    '''
+    name = 'sbd'
+
+    def __init__(self, vocab, strategy=None):
+        self.vocab = vocab  # stored only; not read by the methods of this class
+        if strategy is None or strategy == 'on_punct':  # default / built-in by name
+            strategy = self.split_on_punct
+        self.strategy = strategy  # any other value is used as the strategy as-is
+
+    def __call__(self, doc):  # modifies doc in place; nothing is returned
+        doc.user_hooks['sents'] = self.strategy
+
+    @staticmethod
+    def split_on_punct(doc):  # generator yielding one doc[start:end] slice per sentence
+        start = 0  # index of the first token of the sentence being built
+        seen_period = False  # True once '.', '!' or '?' was seen in this sentence
+        for i, word in enumerate(doc):
+            if seen_period and not word.is_punct:  # first non-punct token after a terminator
+                yield doc[start : word.i]  # punctuation after the terminator stays in this slice
+                start = word.i
+                seen_period = False
+            elif word.text in ['.', '!', '?']:
+                seen_period = True
+        if start < len(doc):  # flush the remaining tokens; an empty doc yields nothing
+            yield doc[start : len(doc)]
+
+
 class BaseThincComponent(object):
     name = None