mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-06 04:43:17 +03:00
Add rule-based SBD
This commit is contained in:
parent
a824cf8f9a
commit
5e6a9e7dcc
|
@ -46,6 +46,43 @@ from ._ml import build_text_classifier, build_tagger_model
|
||||||
from .parts_of_speech import X
|
from .parts_of_speech import X
|
||||||
|
|
||||||
|
|
||||||
|
class SentenceSegmenter(object):
    '''A simple spaCy hook, to allow custom sentence boundary detection logic
    (that doesn't require the dependency parse).

    To change the sentence boundary detection strategy, pass a generator
    function `strategy` on initialization, or assign a new strategy to
    the .strategy attribute.

    Sentence detection strategies should be generators that take `Doc` objects
    and yield `Span` objects for each sentence.
    '''
    name = 'sbd'

    def __init__(self, vocab, strategy=None):
        '''Create the segmenter.

        vocab: Stored on the instance as `self.vocab`; not otherwise used
            by this class.
        strategy: A generator function taking a doc and yielding one slice
            per sentence. `None` or the string 'on_punct' selects the
            built-in `split_on_punct` strategy.
        '''
        self.vocab = vocab
        if strategy is None or strategy == 'on_punct':
            strategy = self.split_on_punct
        self.strategy = strategy

    def __call__(self, doc):
        '''Install the current strategy as the doc's 'sents' user hook.

        doc: The document to modify; must expose a `user_hooks` mapping.
        RETURNS: The same `doc` object, so callers that chain components
            via `doc = component(doc)` keep a valid document.
        '''
        doc.user_hooks['sents'] = self.strategy
        return doc

    @staticmethod
    def split_on_punct(doc):
        '''Yield sentence slices of `doc`, splitting after '.', '!' or '?'.

        A sentence ends once a terminator has been seen and the next
        non-punctuation token is reached, so trailing punctuation (e.g.
        '!!' or a closing quote) stays attached to the sentence it ends.

        doc: A sequence of tokens exposing `.i`, `.text` and `.is_punct`,
            and supporting slicing.
        YIELDS: `doc[start : end]` slices, one per detected sentence.
        '''
        terminators = ('.', '!', '?')
        start = 0
        seen_period = False
        for word in doc:
            if seen_period and not word.is_punct:
                # First non-punctuation token after a terminator: close the
                # previous sentence and start a new one at this token.
                yield doc[start : word.i]
                start = word.i
                seen_period = False
            elif word.text in terminators:
                seen_period = True
        # Emit whatever remains after the last boundary (nothing for an
        # empty doc).
        if start < len(doc):
            yield doc[start : len(doc)]
|
||||||
|
|
||||||
|
|
||||||
class BaseThincComponent(object):
|
class BaseThincComponent(object):
|
||||||
name = None
|
name = None
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user