💫 Add better and serializable sentencizer (#3471)

* Add better serializable sentencizer component

* Replace default factory

* Add tests

* Tidy up

* Pass test

* Update docs
Ines Montani 2019-03-23 15:45:02 +01:00 committed by Matthew Honnibal
parent d9a07a7f6e
commit 06bf130890
17 changed files with 386 additions and 182 deletions

View File

@@ -43,8 +43,9 @@ redirects = [
{from = "/usage/lightning-tour", to = "/usage/spacy-101#lightning-tour"},
{from = "/usage/linguistic-features#rule-based-matching", to = "/usage/rule-based-matching"},
{from = "/models/comparison", to = "/models"},
-{from = "/api/#section-cython", to = "/api/cython"},
-{from = "/api/#cython", to = "/api/cython"},
+{from = "/api/#section-cython", to = "/api/cython", force = true},
+{from = "/api/#cython", to = "/api/cython", force = true},
+{from = "/api/sentencesegmenter", to="/api/sentencizer"},
{from = "/universe", to = "/universe/project/:id", query = {id = ":id"}, force = true},
{from = "/universe", to = "/universe/category/:category", query = {category = ":category"}, force = true},
]

View File

@@ -15,7 +15,7 @@ from .tokenizer import Tokenizer
from .vocab import Vocab
from .lemmatizer import Lemmatizer
from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
-from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter
+from .pipeline import SimilarityHook, TextCategorizer, Sentencizer
from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
from .pipeline import EntityRuler
from .compat import izip, basestring_

@@ -119,7 +119,7 @@ class Language(object):
        "ner": lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg),
        "similarity": lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
        "textcat": lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg),
-        "sentencizer": lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
+        "sentencizer": lambda nlp, **cfg: Sentencizer(**cfg),
        "merge_noun_chunks": lambda nlp, **cfg: merge_noun_chunks,
        "merge_entities": lambda nlp, **cfg: merge_entities,
        "merge_subtokens": lambda nlp, **cfg: merge_subtokens,

View File

@@ -2,7 +2,7 @@
from __future__ import unicode_literals

from .pipes import Tagger, DependencyParser, EntityRecognizer
-from .pipes import TextCategorizer, Tensorizer, Pipe
+from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer
from .entityruler import EntityRuler
from .hooks import SentenceSegmenter, SimilarityHook
from .functions import merge_entities, merge_noun_chunks, merge_subtokens

@@ -15,6 +15,7 @@ __all__ = [
    "Tensorizer",
    "Pipe",
    "EntityRuler",
+    "Sentencizer",
    "SentenceSegmenter",
    "SimilarityHook",
    "merge_entities",

View File

@@ -191,7 +191,7 @@ class EntityRuler(object):
        **kwargs: Other config paramters, mostly for consistency.
        RETURNS (EntityRuler): The loaded entity ruler.

-        DOCS: https://spacy.io/api/entityruler
+        DOCS: https://spacy.io/api/entityruler#to_disk
        """
        path = ensure_path(path)
        path = path.with_suffix(".jsonl")

View File

@@ -15,8 +15,6 @@ class SentenceSegmenter(object):
    initialization, or assign a new strategy to the .strategy attribute.
    Sentence detection strategies should be generators that take `Doc` objects
    and yield `Span` objects for each sentence.
-
-    DOCS: https://spacy.io/api/sentencesegmenter
    """

    name = "sentencizer"

@@ -35,12 +33,12 @@
    def split_on_punct(doc):
        start = 0
        seen_period = False
-        for i, word in enumerate(doc):
-            if seen_period and not word.is_punct:
-                yield doc[start : word.i]
-                start = word.i
-                seen_period = False
-            elif word.text in [".", "!", "?"]:
+        for i, token in enumerate(doc):
+            if seen_period and not token.is_punct:
+                yield doc[start : token.i]
+                start = token.i
+                seen_period = False
+            elif token.text in [".", "!", "?"]:
                seen_period = True
        if start < len(doc):
            yield doc[start : len(doc)]

View File

@@ -1058,4 +1058,90 @@ cdef class EntityRecognizer(Parser):
        if move[0] in ("B", "I", "L", "U")))


-__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer"]
+
+class Sentencizer(object):
+    """Segment the Doc into sentences using a rule-based strategy.
+
+    DOCS: https://spacy.io/api/sentencizer
+    """
+
+    name = "sentencizer"
+    default_punct_chars = [".", "!", "?"]
+
+    def __init__(self, punct_chars=None, **kwargs):
+        """Initialize the sentencizer.
+
+        punct_chars (list): Punctuation characters to split on. Will be
+            serialized with the nlp object.
+        RETURNS (Sentencizer): The sentencizer component.
+
+        DOCS: https://spacy.io/api/sentencizer#init
+        """
+        self.punct_chars = punct_chars or self.default_punct_chars
+
+    def __call__(self, doc):
+        """Apply the sentencizer to a Doc and set Token.is_sent_start.
+
+        doc (Doc): The document to process.
+        RETURNS (Doc): The processed Doc.
+
+        DOCS: https://spacy.io/api/sentencizer#call
+        """
+        start = 0
+        seen_period = False
+        for i, token in enumerate(doc):
+            is_in_punct_chars = token.text in self.punct_chars
+            token.is_sent_start = i == 0
+            if seen_period and not token.is_punct and not is_in_punct_chars:
+                doc[start].is_sent_start = True
+                start = token.i
+                seen_period = False
+            elif is_in_punct_chars:
+                seen_period = True
+        if start < len(doc):
+            doc[start].is_sent_start = True
+        return doc
+
+    def to_bytes(self, **kwargs):
+        """Serialize the sentencizer to a bytestring.
+
+        RETURNS (bytes): The serialized object.
+
+        DOCS: https://spacy.io/api/sentencizer#to_bytes
+        """
+        return srsly.msgpack_dumps({"punct_chars": self.punct_chars})
+
+    def from_bytes(self, bytes_data, **kwargs):
+        """Load the sentencizer from a bytestring.
+
+        bytes_data (bytes): The data to load.
+        RETURNS (Sentencizer): The loaded object.
+
+        DOCS: https://spacy.io/api/sentencizer#from_bytes
+        """
+        cfg = srsly.msgpack_loads(bytes_data)
+        self.punct_chars = cfg.get("punct_chars", self.default_punct_chars)
+        return self
+
+    def to_disk(self, path, exclude=tuple(), **kwargs):
+        """Serialize the sentencizer to disk.
+
+        DOCS: https://spacy.io/api/sentencizer#to_disk
+        """
+        path = util.ensure_path(path)
+        path = path.with_suffix(".json")
+        srsly.write_json(path, {"punct_chars": self.punct_chars})
+
+    def from_disk(self, path, exclude=tuple(), **kwargs):
+        """Load the sentencizer from disk.
+
+        DOCS: https://spacy.io/api/sentencizer#from_disk
+        """
+        path = util.ensure_path(path)
+        path = path.with_suffix(".json")
+        cfg = srsly.read_json(path)
+        self.punct_chars = cfg.get("punct_chars", self.default_punct_chars)
+        return self
+
+
+__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "Sentencizer"]

View File

@@ -0,0 +1,87 @@
# coding: utf8
from __future__ import unicode_literals

import pytest
from spacy.pipeline import Sentencizer
from spacy.tokens import Doc


def test_sentencizer(en_vocab):
    doc = Doc(en_vocab, words=["Hello", "!", "This", "is", "a", "test", "."])
    sentencizer = Sentencizer()
    doc = sentencizer(doc)
    assert doc.is_sentenced
    sent_starts = [t.is_sent_start for t in doc]
    assert sent_starts == [True, False, True, False, False, False, False]
    assert len(list(doc.sents)) == 2


@pytest.mark.parametrize(
    "words,sent_starts,n_sents",
    [
        # The expected result here is that the duplicate punctuation gets merged
        # onto the same sentence and no one-token sentence is created for them.
        (
            ["Hello", "!", ".", "Test", ".", ".", "ok"],
            [True, False, False, True, False, False, True],
            3,
        ),
        # We also want to make sure ¡ and ¿ aren't treated as sentence end
        # markers, even though they're punctuation
        (
            ["¡", "Buen", "día", "!", "Hola", ",", "¿", "qué", "tal", "?"],
            [True, False, False, False, True, False, False, False, False, False],
            2,
        ),
        # The Token.is_punct check ensures that quotes are handled as well
        (
            ['"', "Nice", "!", '"', "I", "am", "happy", "."],
            [True, False, False, False, True, False, False, False],
            2,
        ),
    ],
)
def test_sentencizer_complex(en_vocab, words, sent_starts, n_sents):
    doc = Doc(en_vocab, words=words)
    sentencizer = Sentencizer()
    doc = sentencizer(doc)
    assert doc.is_sentenced
    assert [t.is_sent_start for t in doc] == sent_starts
    assert len(list(doc.sents)) == n_sents


@pytest.mark.parametrize(
    "punct_chars,words,sent_starts,n_sents",
    [
        (
            ["~", "?"],
            ["Hello", "world", "~", "A", ".", "B", "."],
            [True, False, False, True, False, False, False],
            2,
        ),
        # Even though it's not common, the punct_chars should be able to
        # handle any tokens
        (
            [".", "ö"],
            ["Hello", ".", "Test", "ö", "Ok", "."],
            [True, False, True, False, True, False],
            3,
        ),
    ],
)
def test_sentencizer_custom_punct(en_vocab, punct_chars, words, sent_starts, n_sents):
    doc = Doc(en_vocab, words=words)
    sentencizer = Sentencizer(punct_chars=punct_chars)
    doc = sentencizer(doc)
    assert doc.is_sentenced
    assert [t.is_sent_start for t in doc] == sent_starts
    assert len(list(doc.sents)) == n_sents


def test_sentencizer_serialize_bytes(en_vocab):
    punct_chars = [".", "~", "+"]
    sentencizer = Sentencizer(punct_chars=punct_chars)
    assert sentencizer.punct_chars == punct_chars
    bytes_data = sentencizer.to_bytes()
    new_sentencizer = Sentencizer().from_bytes(bytes_data)
    assert new_sentencizer.punct_chars == punct_chars
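
A possible companion test for the disk round-trip (not included in the commit; it assumes pytest's `tmp_path` fixture and relies on `to_disk()`/`from_disk()` appending the `.json` suffix):

```python
def test_sentencizer_serialize_disk(en_vocab, tmp_path):
    # Hypothetical test: "sentencizer" is written out as "sentencizer.json"
    punct_chars = [".", "~", "+"]
    sentencizer = Sentencizer(punct_chars=punct_chars)
    sentencizer.to_disk(tmp_path / "sentencizer")
    new_sentencizer = Sentencizer().from_disk(tmp_path / "sentencizer")
    assert new_sentencizer.punct_chars == punct_chars
```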

View File

@@ -6,10 +6,9 @@ from spacy.lang.en import English
from spacy.tokens import Doc


-@pytest.mark.xfail
def test_issue3468():
-    """Test that sentence boundaries are serialized if they're not set by the
-    dependency parser."""
+    """Test that sentence boundaries are set correctly so Doc.is_sentenced can
+    be restored after serialization."""
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    doc = nlp("Hello world")

View File

@@ -230,7 +230,7 @@ cdef class Doc:
        defined as having at least one of the following:

        a) An entry "sents" in doc.user_hooks";
-        b) sent.is_parsed is set to True;
+        b) Doc.is_parsed is set to True;
        c) At least one token other than the first where sent_start is not None.
        """
        if "sents" in self.user_hooks:

View File

@@ -441,6 +441,7 @@ cdef class Token:
    property sent_start:
        def __get__(self):
+            """Deprecated: use Token.is_sent_start instead."""
            # Raising a deprecation warning here causes errors for autocomplete
            # Handle broken backwards compatibility case: doc[0].sent_start
            # was False.

View File

@@ -1,78 +0,0 @@
---
title: SentenceSegmenter
tag: class
source: spacy/pipeline/hooks.py
---
A simple spaCy hook, to allow custom sentence boundary detection logic that
doesn't require the dependency parse. By default, sentence segmentation is
performed by the [`DependencyParser`](/api/dependencyparser), so the
`SentenceSegmenter` lets you implement a simpler, rule-based strategy that
doesn't require a statistical model to be loaded. The component is also
available via the string name `"sentencizer"`. After initialization, it is
typically added to the processing pipeline using
[`nlp.add_pipe`](/api/language#add_pipe).
## SentenceSegmenter.\_\_init\_\_ {#init tag="method"}
Initialize the sentence segmenter. To change the sentence boundary detection
strategy, pass a generator function `strategy` on initialization, or assign a
new strategy to the `.strategy` attribute. Sentence detection strategies should
be generators that take `Doc` objects and yield `Span` objects for each
sentence.
> #### Example
>
> ```python
> # Construction via create_pipe
> sentencizer = nlp.create_pipe("sentencizer")
>
> # Construction from class
> from spacy.pipeline import SentenceSegmenter
> sentencizer = SentenceSegmenter(nlp.vocab)
> ```
| Name | Type | Description |
| ----------- | ------------------- | ----------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `strategy` | unicode / callable | The segmentation strategy to use. Defaults to `"on_punct"`. |
| **RETURNS** | `SentenceSegmenter` | The newly constructed object. |
## SentenceSegmenter.\_\_call\_\_ {#call tag="method"}
Apply the sentence segmenter on a `Doc`. Typically, this happens automatically
after the component has been added to the pipeline using
[`nlp.add_pipe`](/api/language#add_pipe).
> #### Example
>
> ```python
> from spacy.lang.en import English
>
> nlp = English()
> sentencizer = nlp.create_pipe("sentencizer")
> nlp.add_pipe(sentencizer)
> doc = nlp(u"This is a sentence. This is another sentence.")
> assert list(doc.sents) == 2
> ```
| Name | Type | Description |
| ----------- | ----- | ------------------------------------------------------------ |
| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. |
| **RETURNS** | `Doc` | The modified `Doc` with added sentence boundaries. |
## SentenceSegmenter.split_on_punct {#split_on_punct tag="staticmethod"}
Split the `Doc` on punctuation characters `.`, `!` and `?`. This is the default
strategy used by the `SentenceSegmenter.`
| Name | Type | Description |
| ---------- | ------ | ------------------------------ |
| `doc` | `Doc` | The `Doc` object to process. |
| **YIELDS** | `Span` | The sentences in the document. |
## Attributes {#attributes}
| Name | Type | Description |
| ---------- | -------- | ------------------------------------------------------------------- |
| `strategy` | callable | The segmentation strategy. Can be overwritten after initialization. |

View File

@@ -0,0 +1,136 @@
---
title: Sentencizer
tag: class
source: spacy/pipeline/pipes.pyx
---
A simple pipeline component, to allow custom sentence boundary detection logic
that doesn't require the dependency parse. By default, sentence segmentation is
performed by the [`DependencyParser`](/api/dependencyparser), so the
`Sentencizer` lets you implement a simpler, rule-based strategy that doesn't
require a statistical model to be loaded. The component is also available via
the string name `"sentencizer"`. After initialization, it is typically added to
the processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe).
<Infobox title="Important note" variant="warning">
Compared to the previous `SentenceSegmenter` class, the `Sentencizer` component
doesn't add a hook to `doc.user_hooks["sents"]`. Instead, it iterates over the
tokens in the `Doc` and sets the `Token.is_sent_start` property. The
`SentenceSegmenter` is still available if you import it directly:
```python
from spacy.pipeline import SentenceSegmenter
```
</Infobox>
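
To make the difference concrete, a minimal sketch (not taken from the docs source; it uses a blank `Vocab`):

```python
from spacy.pipeline import Sentencizer
from spacy.tokens import Doc
from spacy.vocab import Vocab

doc = Sentencizer()(Doc(Vocab(), words=["Hello", ".", "World", "."]))
assert "sents" not in doc.user_hooks  # no hook is registered
assert [t.is_sent_start for t in doc] == [True, False, True, False]
```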
## Sentencizer.\_\_init\_\_ {#init tag="method"}
Initialize the sentencizer.
> #### Example
>
> ```python
> # Construction via create_pipe
> sentencizer = nlp.create_pipe("sentencizer")
>
> # Construction from class
> from spacy.pipeline import Sentencizer
> sentencizer = Sentencizer()
> ```
| Name | Type | Description |
| ------------- | ------------- | ------------------------------------------------------------------------------------------------------ |
| `punct_chars` | list | Optional custom list of punctuation characters that mark sentence ends. Defaults to `[".", "!", "?"]`. |
| **RETURNS** | `Sentencizer` | The newly constructed object. |
## Sentencizer.\_\_call\_\_ {#call tag="method"}
Apply the sentencizer on a `Doc`. Typically, this happens automatically after
the component has been added to the pipeline using
[`nlp.add_pipe`](/api/language#add_pipe).
> #### Example
>
> ```python
> from spacy.lang.en import English
>
> nlp = English()
> sentencizer = nlp.create_pipe("sentencizer")
> nlp.add_pipe(sentencizer)
> doc = nlp(u"This is a sentence. This is another sentence.")
> assert len(list(doc.sents)) == 2
> ```
| Name | Type | Description |
| ----------- | ----- | ------------------------------------------------------------ |
| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. |
| **RETURNS** | `Doc` | The modified `Doc` with added sentence boundaries. |
## Sentencizer.to_disk {#to_disk tag="method"}
Save the sentencizer settings (punctuation characters) to a directory. Will create
a file `sentencizer.json`. This also happens automatically when you save an
`nlp` object with a sentencizer added to its pipeline.
> #### Example
>
> ```python
> sentencizer = Sentencizer(punct_chars=[".", "?", "!", "。"])
> sentencizer.to_disk("/path/to/sentencizer.json")
> ```
| Name | Type | Description |
| ------ | ---------------- | ---------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
## Sentencizer.from_disk {#from_disk tag="method"}
Load the sentencizer settings from a file. Expects a JSON file. This also
happens automatically when you load an `nlp` object or model with a sentencizer
added to its pipeline.
> #### Example
>
> ```python
> sentencizer = Sentencizer()
> sentencizer.from_disk("/path/to/sentencizer.json")
> ```
| Name | Type | Description |
| ----------- | ---------------- | -------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a JSON file. Paths may be either strings or `Path`-like objects. |
| **RETURNS** | `Sentencizer` | The modified `Sentencizer` object. |
## Sentencizer.to_bytes {#to_bytes tag="method"}
Serialize the sentencizer settings to a bytestring.
> #### Example
>
> ```python
> sentencizer = Sentencizer(punct_chars=[".", "?", "!", "。"])
> sentencizer_bytes = sentencizer.to_bytes()
> ```
| Name | Type | Description |
| ----------- | ----- | -------------------- |
| **RETURNS** | bytes | The serialized data. |
## Sentencizer.from_bytes {#from_bytes tag="method"}
Load the pipe from a bytestring. Modifies the object in place and returns it.
> #### Example
>
> ```python
> sentencizer_bytes = sentencizer.to_bytes()
> sentencizer = Sentencizer()
> sentencizer.from_bytes(sentencizer_bytes)
> ```
| Name | Type | Description |
| ------------ | ------------- | ---------------------------------- |
| `bytes_data` | bytes | The bytestring to load. |
| **RETURNS** | `Sentencizer` | The modified `Sentencizer` object. |
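
Since the settings are serialized with the `nlp` object, a whole-pipeline round-trip should also work. A sketch (not taken from the docs source; the output directory is just an example):

```python
import spacy
from spacy.lang.en import English

nlp = English()
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp.to_disk("/tmp/sentencizer_pipeline")  # writes sentencizer.json next to the other data

nlp2 = spacy.load("/tmp/sentencizer_pipeline")
doc = nlp2("This is a sentence. This is another sentence.")
assert len(list(doc.sents)) == 2
```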

View File

@@ -25,21 +25,21 @@ an **annotated document**. It also orchestrates training and serialization.

### Processing pipeline {#architecture-pipeline}

| Name                                        | Description                                                                                                                    |
| ------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------ |
| [`Language`](/api/language)                 | A text-processing pipeline. Usually you'll load this once per process as `nlp` and pass the instance around your application. |
| [`Tokenizer`](/api/tokenizer)               | Segment text, and create `Doc` objects with the discovered segment boundaries.                                                |
| [`Lemmatizer`](/api/lemmatizer)             | Determine the base forms of words.                                                                                             |
| `Morphology`                                | Assign linguistic features like lemmas, noun case, verb tense etc. based on the word and its part-of-speech tag.              |
| [`Tagger`](/api/tagger)                     | Annotate part-of-speech tags on `Doc` objects.                                                                                 |
| [`DependencyParser`](/api/dependencyparser) | Annotate syntactic dependencies on `Doc` objects.                                                                              |
| [`EntityRecognizer`](/api/entityrecognizer) | Annotate named entities, e.g. persons or products, on `Doc` objects.                                                          |
| [`TextCategorizer`](/api/textcategorizer)   | Assign categories or labels to `Doc` objects.                                                                                  |
| [`Matcher`](/api/matcher)                   | Match sequences of tokens, based on pattern rules, similar to regular expressions.                                            |
| [`PhraseMatcher`](/api/phrasematcher)       | Match sequences of tokens based on phrases.                                                                                    |
| [`EntityRuler`](/api/entityruler)           | Add entity spans to the `Doc` using token-based rules or exact phrase matches.                                                |
-| [`SentenceSegmenter`](/api/sentencesegmenter) | Implement custom sentence boundary detection logic that doesn't require the dependency parse. |
+| [`Sentencizer`](/api/sentencizer)           | Implement custom sentence boundary detection logic that doesn't require the dependency parse.                                 |
| [Other functions](/api/pipeline-functions)  | Automatically apply something to the `Doc`, e.g. to merge spans of tokens.                                                    |

### Other classes {#architecture-other}

View File

@@ -1149,9 +1149,14 @@ but it also means you'll need a **statistical model** and accurate predictions.
If your texts are closer to general-purpose news or web text, this should work
well out-of-the-box. For social media or conversational text that doesn't follow
the same rules, your application may benefit from a custom rule-based
-implementation. You can either plug a rule-based component into your
-[processing pipeline](/usage/processing-pipelines) or use the
-`SentenceSegmenter` component with a custom strategy.
+implementation. You can either use the built-in
+[`Sentencizer`](/api/sentencizer) or plug an entirely custom rule-based function
+into your [processing pipeline](/usage/processing-pipelines).
+
+spaCy's dependency parser respects already set boundaries, so you can preprocess
+your `Doc` using custom rules _before_ it's parsed. Depending on your text, this
+may also improve accuracy, since the parser is constrained to predict parses
+consistent with the sentence boundaries.

### Default: Using the dependency parse {#sbd-parser model="parser"}
@@ -1168,13 +1173,35 @@ for sent in doc.sents:
    print(sent.text)
```

-### Setting boundaries manually {#sbd-manual}
-
-spaCy's dependency parser respects already set boundaries, so you can preprocess
-your `Doc` using custom rules _before_ it's parsed. This can be done by adding a
-[custom pipeline component](/usage/processing-pipelines). Depending on your
-text, this may also improve accuracy, since the parser is constrained to predict
-parses consistent with the sentence boundaries.
+### Rule-based pipeline component {#sbd-component}
+
+The [`Sentencizer`](/api/sentencizer) component is a
+[pipeline component](/usage/processing-pipelines) that splits sentences on
+punctuation like `.`, `!` or `?`. You can plug it into your pipeline if you only
+need sentence boundaries without the dependency parse.
+
+```python
+### {executable="true"}
+import spacy
+from spacy.lang.en import English
+
+nlp = English()  # just the language with no model
+sentencizer = nlp.create_pipe("sentencizer")
+nlp.add_pipe(sentencizer)
+doc = nlp(u"This is a sentence. This is another sentence.")
+for sent in doc.sents:
+    print(sent.text)
+```
+
+### Custom rule-based strategy {id="sbd-custom"}
+
+If you want to implement your own strategy that differs from the default
+rule-based approach of splitting on sentences, you can also create a
+[custom pipeline component](/usage/processing-pipelines#custom-components) that
+takes a `Doc` object and sets the `Token.is_sent_start` attribute on each
+individual token. If set to `False`, the token is explicitly marked as _not_ the
+start of a sentence. If set to `None` (default), it's treated as a missing value
+and can still be overwritten by the parser.

<Infobox title="Important note" variant="warning">
@@ -1187,9 +1214,11 @@ adding it to the pipeline using [`nlp.add_pipe`](/api/language#add_pipe).
Here's an example of a component that implements a pre-processing rule for
splitting on `'...'` tokens. The component is added before the parser, which is
-then used to further segment the text. This approach can be useful if you want
-to implement **additional** rules specific to your data, while still being able
-to take advantage of dependency-based sentence segmentation.
+then used to further segment the text. That's possible, because `is_sent_start`
+is only set to `True` for some of the tokens – all others still specify `None`
+for unset sentence boundaries. This approach can be useful if you want to
+implement **additional** rules specific to your data, while still being able to
+take advantage of dependency-based sentence segmentation.

```python
### {executable="true"}
@@ -1212,62 +1241,6 @@ doc = nlp(text)
print("After:", [sent.text for sent in doc.sents])
```

-### Rule-based pipeline component {#sbd-component}
-
-The `sentencizer` component is a
-[pipeline component](/usage/processing-pipelines) that splits sentences on
-punctuation like `.`, `!` or `?`. You can plug it into your pipeline if you only
-need sentence boundaries without the dependency parse. Note that `Doc.sents`
-will **raise an error** if no sentence boundaries are set.
-
-```python
-### {executable="true"}
-import spacy
-from spacy.lang.en import English
-
-nlp = English()  # just the language with no model
-sentencizer = nlp.create_pipe("sentencizer")
-nlp.add_pipe(sentencizer)
-doc = nlp(u"This is a sentence. This is another sentence.")
-for sent in doc.sents:
-    print(sent.text)
-```
-
-### Custom rule-based strategy {#sbd-custom}
-
-If you want to implement your own strategy that differs from the default
-rule-based approach of splitting on sentences, you can also instantiate the
-`SentenceSegmenter` directly and pass in your own strategy. The strategy should
-be a function that takes a `Doc` object and yields a `Span` for each sentence.
-Here's an example of a custom segmentation strategy for splitting on newlines
-only:
-
-```python
-### {executable="true"}
-from spacy.lang.en import English
-from spacy.pipeline import SentenceSegmenter
-
-def split_on_newlines(doc):
-    start = 0
-    seen_newline = False
-    for word in doc:
-        if seen_newline and not word.is_space:
-            yield doc[start:word.i]
-            start = word.i
-            seen_newline = False
-        elif word.text == '\\n':
-            seen_newline = True
-    if start < len(doc):
-        yield doc[start:len(doc)]
-
-nlp = English()  # Just the language with no model
-sentencizer = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)
-nlp.add_pipe(sentencizer)
-doc = nlp(u"This is a sentence\\n\\nThis is another sentence\\nAnd more")
-for sent in doc.sents:
-    print([token.text for token in sent])
-```

## Rule-based matching {#rule-based-matching hidden="true"}

<div id="rule-based-matching">

View File

@@ -138,7 +138,7 @@ require them in the pipeline settings in your model's `meta.json`.
| `ner` | [`EntityRecognizer`](/api/entityrecognizer) | Assign named entities. |
| `textcat` | [`TextCategorizer`](/api/textcategorizer) | Assign text categories. |
| `entity_ruler` | [`EntityRuler`](/api/entityruler) | Assign named entities based on pattern rules. |
-| `sentencizer` | [`SentenceSegmenter`](/api/sentencesegmenter) | Add rule-based sentence segmentation without the dependency parse. |
+| `sentencizer` | [`Sentencizer`](/api/sentencizer) | Add rule-based sentence segmentation without the dependency parse. |
| `merge_noun_chunks` | [`merge_noun_chunks`](/api/pipeline-functions#merge_noun_chunks) | Merge all noun chunks into a single token. Should be added after the tagger and parser. |
| `merge_entities` | [`merge_entities`](/api/pipeline-functions#merge_entities) | Merge all entities into a single token. Should be added after the entity recognizer. |
| `merge_subtokens` | [`merge_subtokens`](/api/pipeline-functions#merge_subtokens) | Merge subtokens predicted by the parser into single tokens. Should be added after the parser. |

View File

@@ -195,7 +195,7 @@ the existing pages and added some new content:
- **Universe:** [Videos](/universe/category/videos) and
  [Podcasts](/universe/category/podcasts)
- **API:** [`EntityRuler`](/api/entityruler)
-- **API:** [`SentenceSegmenter`](/api/sentencesegmenter)
+- **API:** [`Sentencizer`](/api/sentencizer)
- **API:** [Pipeline functions](/api/pipeline-functions)

## Backwards incompatibilities {#incompat}

View File

@@ -79,7 +79,7 @@
        { "text": "Matcher", "url": "/api/matcher" },
        { "text": "PhraseMatcher", "url": "/api/phrasematcher" },
        { "text": "EntityRuler", "url": "/api/entityruler" },
-        { "text": "SentenceSegmenter", "url": "/api/sentencesegmenter" },
+        { "text": "Sentencizer", "url": "/api/sentencizer" },
        { "text": "Other Functions", "url": "/api/pipeline-functions" }
      ]
    },