mirror of https://github.com/explosion/spaCy.git (synced 2025-10-31 16:07:41 +03:00)
	💫 Add better and serializable sentencizer (#3471)
* Add better serializable sentencizer component
* Replace default factory
* Add tests
* Tidy up
* Pass test
* Update docs
This commit is contained in:
parent d9a07a7f6e
commit 06bf130890
				|  | @ -43,8 +43,9 @@ redirects = [ | |||
|     {from = "/usage/lightning-tour", to = "/usage/spacy-101#lightning-tour"}, | ||||
|     {from = "/usage/linguistic-features#rule-based-matching", to = "/usage/rule-based-matching"}, | ||||
|     {from = "/models/comparison", to = "/models"}, | ||||
|     {from = "/api/#section-cython", to = "/api/cython"}, | ||||
|     {from = "/api/#cython", to = "/api/cython"}, | ||||
|     {from = "/api/#section-cython", to = "/api/cython", force = true}, | ||||
|     {from = "/api/#cython", to = "/api/cython", force = true}, | ||||
|     {from = "/api/sentencesegmenter", to="/api/sentencizer"}, | ||||
|     {from = "/universe", to = "/universe/project/:id", query = {id = ":id"}, force = true}, | ||||
|     {from = "/universe", to = "/universe/category/:category", query = {category = ":category"}, force = true}, | ||||
| ] | ||||
|  |  | |||
|  | @ -15,7 +15,7 @@ from .tokenizer import Tokenizer | |||
| from .vocab import Vocab | ||||
| from .lemmatizer import Lemmatizer | ||||
| from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer | ||||
| from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter | ||||
| from .pipeline import SimilarityHook, TextCategorizer, Sentencizer | ||||
| from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens | ||||
| from .pipeline import EntityRuler | ||||
| from .compat import izip, basestring_ | ||||
|  | @ -119,7 +119,7 @@ class Language(object): | |||
|         "ner": lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg), | ||||
|         "similarity": lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg), | ||||
|         "textcat": lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg), | ||||
|         "sentencizer": lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg), | ||||
|         "sentencizer": lambda nlp, **cfg: Sentencizer(**cfg), | ||||
|         "merge_noun_chunks": lambda nlp, **cfg: merge_noun_chunks, | ||||
|         "merge_entities": lambda nlp, **cfg: merge_entities, | ||||
|         "merge_subtokens": lambda nlp, **cfg: merge_subtokens, | ||||
|  |  | |||
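With the default factory swapped to the new class, `nlp.create_pipe("sentencizer")` now returns a `Sentencizer` rather than the hook-based `SentenceSegmenter`. A minimal sketch of what that looks like from user code (a plain `English()` pipeline with no statistical model; the example text is made up):

```python
from spacy.lang.en import English

nlp = English()  # blank pipeline, no model required
sentencizer = nlp.create_pipe("sentencizer")  # resolves to the new Sentencizer factory
nlp.add_pipe(sentencizer)
doc = nlp(u"Hello world. This is a test.")
print([sent.text for sent in doc.sents])  # two sentences, no parser involved
```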
|  | @ -2,7 +2,7 @@ | |||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from .pipes import Tagger, DependencyParser, EntityRecognizer | ||||
| from .pipes import TextCategorizer, Tensorizer, Pipe | ||||
| from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer | ||||
| from .entityruler import EntityRuler | ||||
| from .hooks import SentenceSegmenter, SimilarityHook | ||||
| from .functions import merge_entities, merge_noun_chunks, merge_subtokens | ||||
|  | @ -15,6 +15,7 @@ __all__ = [ | |||
|     "Tensorizer", | ||||
|     "Pipe", | ||||
|     "EntityRuler", | ||||
|     "Sentencizer", | ||||
|     "SentenceSegmenter", | ||||
|     "SimilarityHook", | ||||
|     "merge_entities", | ||||
|  |  | |||
|  | @ -191,7 +191,7 @@ class EntityRuler(object): | |||
|         **kwargs: Other config parameters, mostly for consistency. | ||||
|         RETURNS (EntityRuler): The loaded entity ruler. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/entityruler | ||||
|         DOCS: https://spacy.io/api/entityruler#to_disk | ||||
|         """ | ||||
|         path = ensure_path(path) | ||||
|         path = path.with_suffix(".jsonl") | ||||
|  |  | |||
|  | @ -15,8 +15,6 @@ class SentenceSegmenter(object): | |||
|     initialization, or assign a new strategy to the .strategy attribute. | ||||
|     Sentence detection strategies should be generators that take `Doc` objects | ||||
|     and yield `Span` objects for each sentence. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/sentencesegmenter | ||||
|     """ | ||||
| 
 | ||||
|     name = "sentencizer" | ||||
|  | @ -35,12 +33,12 @@ class SentenceSegmenter(object): | |||
|     def split_on_punct(doc): | ||||
|         start = 0 | ||||
|         seen_period = False | ||||
|         for i, word in enumerate(doc): | ||||
|             if seen_period and not word.is_punct: | ||||
|                 yield doc[start : word.i] | ||||
|                 start = word.i | ||||
|         for i, token in enumerate(doc): | ||||
|             if seen_period and not token.is_punct: | ||||
|                 yield doc[start : token.i] | ||||
|                 start = token.i | ||||
|                 seen_period = False | ||||
|             elif word.text in [".", "!", "?"]: | ||||
|             elif token.text in [".", "!", "?"]: | ||||
|                 seen_period = True | ||||
|         if start < len(doc): | ||||
|             yield doc[start : len(doc)] | ||||
|  |  | |||
|  | @ -1058,4 +1058,90 @@ cdef class EntityRecognizer(Parser): | |||
|                 if move[0] in ("B", "I", "L", "U"))) | ||||
| 
 | ||||
| 
 | ||||
| __all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer"] | ||||
| class Sentencizer(object): | ||||
|     """Segment the Doc into sentences using a rule-based strategy. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/sentencizer | ||||
|     """ | ||||
| 
 | ||||
|     name = "sentencizer" | ||||
|     default_punct_chars = [".", "!", "?"] | ||||
| 
 | ||||
|     def __init__(self, punct_chars=None, **kwargs): | ||||
|         """Initialize the sentencizer. | ||||
| 
 | ||||
|         punct_chars (list): Punctuation characters to split on. Will be | ||||
|             serialized with the nlp object. | ||||
|         RETURNS (Sentencizer): The sentencizer component. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/sentencizer#init | ||||
|         """ | ||||
|         self.punct_chars = punct_chars or self.default_punct_chars | ||||
| 
 | ||||
|     def __call__(self, doc): | ||||
|         """Apply the sentencizer to a Doc and set Token.is_sent_start. | ||||
| 
 | ||||
|         doc (Doc): The document to process. | ||||
|         RETURNS (Doc): The processed Doc. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/sentencizer#call | ||||
|         """ | ||||
|         start = 0 | ||||
|         seen_period = False | ||||
|         for i, token in enumerate(doc): | ||||
|             is_in_punct_chars = token.text in self.punct_chars | ||||
|             token.is_sent_start = i == 0 | ||||
|             if seen_period and not token.is_punct and not is_in_punct_chars: | ||||
|                 doc[start].is_sent_start = True | ||||
|                 start = token.i | ||||
|                 seen_period = False | ||||
|             elif is_in_punct_chars: | ||||
|                 seen_period = True | ||||
|         if start < len(doc): | ||||
|             doc[start].is_sent_start = True | ||||
|         return doc | ||||
| 
 | ||||
|     def to_bytes(self, **kwargs): | ||||
|         """Serialize the sentencizer to a bytestring. | ||||
| 
 | ||||
|         RETURNS (bytes): The serialized object. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/sentencizer#to_bytes | ||||
|         """ | ||||
|         return srsly.msgpack_dumps({"punct_chars": self.punct_chars}) | ||||
| 
 | ||||
|     def from_bytes(self, bytes_data, **kwargs): | ||||
|         """Load the sentencizer from a bytestring. | ||||
| 
 | ||||
|         bytes_data (bytes): The data to load. | ||||
|         RETURNS (Sentencizer): The loaded object. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/sentencizer#from_bytes | ||||
|         """ | ||||
|         cfg = srsly.msgpack_loads(bytes_data) | ||||
|         self.punct_chars = cfg.get("punct_chars", self.default_punct_chars) | ||||
|         return self | ||||
| 
 | ||||
|     def to_disk(self, path, exclude=tuple(), **kwargs): | ||||
|         """Serialize the sentencizer to disk. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/sentencizer#to_disk | ||||
|         """ | ||||
|         path = util.ensure_path(path) | ||||
|         path = path.with_suffix(".json") | ||||
|         srsly.write_json(path, {"punct_chars": self.punct_chars}) | ||||
| 
 | ||||
| 
 | ||||
|     def from_disk(self, path, exclude=tuple(), **kwargs): | ||||
|         """Load the sentencizer from disk. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/sentencizer#from_disk | ||||
|         """ | ||||
|         path = util.ensure_path(path) | ||||
|         path = path.with_suffix(".json") | ||||
|         cfg = srsly.read_json(path) | ||||
|         self.punct_chars = cfg.get("punct_chars", self.default_punct_chars) | ||||
|         return self | ||||
| 
 | ||||
| 
 | ||||
| __all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "Sentencizer"] | ||||
|  |  | |||
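The `__call__` above first marks every token with `is_sent_start = (i == 0)` and then flips the token that opens each new sentence to `True`, so trailing punctuation stays attached to the preceding sentence. A quick sketch of applying the component directly to a hand-built `Doc` (it mirrors the first test below; `English().vocab` stands in for the `en_vocab` fixture):

```python
from spacy.lang.en import English
from spacy.pipeline import Sentencizer
from spacy.tokens import Doc

nlp = English()
doc = Doc(nlp.vocab, words=["Hello", "!", "This", "is", "a", "test", "."])
doc = Sentencizer()(doc)  # apply the component outside a pipeline
print([t.is_sent_start for t in doc])
# expected: [True, False, True, False, False, False, False]
```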
							
								
								
									
spacy/tests/pipeline/test_sentencizer.py (new file, 87 lines)
|  | @ -0,0 +1,87 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| import pytest | ||||
| from spacy.pipeline import Sentencizer | ||||
| from spacy.tokens import Doc | ||||
| 
 | ||||
| 
 | ||||
| def test_sentencizer(en_vocab): | ||||
|     doc = Doc(en_vocab, words=["Hello", "!", "This", "is", "a", "test", "."]) | ||||
|     sentencizer = Sentencizer() | ||||
|     doc = sentencizer(doc) | ||||
|     assert doc.is_sentenced | ||||
|     sent_starts = [t.is_sent_start for t in doc] | ||||
|     assert sent_starts == [True, False, True, False, False, False, False] | ||||
|     assert len(list(doc.sents)) == 2 | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize( | ||||
|     "words,sent_starts,n_sents", | ||||
|     [ | ||||
|         # The expected result here is that the duplicate punctuation gets merged | ||||
|         # onto the same sentence and no one-token sentence is created for them. | ||||
|         ( | ||||
|             ["Hello", "!", ".", "Test", ".", ".", "ok"], | ||||
|             [True, False, False, True, False, False, True], | ||||
|             3, | ||||
|         ), | ||||
|         # We also want to make sure ¡ and ¿ aren't treated as sentence end | ||||
|         # markers, even though they're punctuation | ||||
|         ( | ||||
|             ["¡", "Buen", "día", "!", "Hola", ",", "¿", "qué", "tal", "?"], | ||||
|             [True, False, False, False, True, False, False, False, False, False], | ||||
|             2, | ||||
|         ), | ||||
|         # The Token.is_punct check ensures that quotes are handled as well | ||||
|         ( | ||||
|             ['"', "Nice", "!", '"', "I", "am", "happy", "."], | ||||
|             [True, False, False, False, True, False, False, False], | ||||
|             2, | ||||
|         ), | ||||
|     ], | ||||
| ) | ||||
| def test_sentencizer_complex(en_vocab, words, sent_starts, n_sents): | ||||
|     doc = Doc(en_vocab, words=words) | ||||
|     sentencizer = Sentencizer() | ||||
|     doc = sentencizer(doc) | ||||
|     assert doc.is_sentenced | ||||
|     assert [t.is_sent_start for t in doc] == sent_starts | ||||
|     assert len(list(doc.sents)) == n_sents | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize( | ||||
|     "punct_chars,words,sent_starts,n_sents", | ||||
|     [ | ||||
|         ( | ||||
|             ["~", "?"], | ||||
|             ["Hello", "world", "~", "A", ".", "B", "."], | ||||
|             [True, False, False, True, False, False, False], | ||||
|             2, | ||||
|         ), | ||||
|         # Even though it's not common, the punct_chars should be able to | ||||
|         # handle any tokens | ||||
|         ( | ||||
|             [".", "ö"], | ||||
|             ["Hello", ".", "Test", "ö", "Ok", "."], | ||||
|             [True, False, True, False, True, False], | ||||
|             3, | ||||
|         ), | ||||
|     ], | ||||
| ) | ||||
| def test_sentencizer_custom_punct(en_vocab, punct_chars, words, sent_starts, n_sents): | ||||
|     doc = Doc(en_vocab, words=words) | ||||
|     sentencizer = Sentencizer(punct_chars=punct_chars) | ||||
|     doc = sentencizer(doc) | ||||
|     assert doc.is_sentenced | ||||
|     assert [t.is_sent_start for t in doc] == sent_starts | ||||
|     assert len(list(doc.sents)) == n_sents | ||||
| 
 | ||||
| 
 | ||||
| def test_sentencizer_serialize_bytes(en_vocab): | ||||
|     punct_chars = [".", "~", "+"] | ||||
|     sentencizer = Sentencizer(punct_chars=punct_chars) | ||||
|     assert sentencizer.punct_chars == punct_chars | ||||
|     bytes_data = sentencizer.to_bytes() | ||||
|     new_sentencizer = Sentencizer().from_bytes(bytes_data) | ||||
|     assert new_sentencizer.punct_chars == punct_chars | ||||
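The new tests cover the bytes round-trip; a disk round-trip would follow the same pattern. A possible companion test, sketched here under the assumption that pytest's built-in `tmp_path` fixture is acceptable (it is not part of this commit), reusing the file's existing imports:

```python
def test_sentencizer_serialize_disk(en_vocab, tmp_path):
    # Hypothetical extra test: mirror the bytes round-trip above on disk.
    punct_chars = [".", "~", "+"]
    sentencizer = Sentencizer(punct_chars=punct_chars)
    file_path = tmp_path / "sentencizer"  # to_disk appends the .json suffix
    sentencizer.to_disk(file_path)
    new_sentencizer = Sentencizer().from_disk(file_path)
    assert new_sentencizer.punct_chars == punct_chars
```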
|  | @ -6,10 +6,9 @@ from spacy.lang.en import English | |||
| from spacy.tokens import Doc | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.xfail | ||||
| def test_issue3468(): | ||||
|     """Test that sentence boundaries are serialized if they're not set by the | ||||
|     dependency parser.""" | ||||
|     """Test that sentence boundaries are set correctly so Doc.is_sentenced can | ||||
|     be restored after serialization.""" | ||||
|     nlp = English() | ||||
|     nlp.add_pipe(nlp.create_pipe("sentencizer")) | ||||
|     doc = nlp("Hello world") | ||||
|  |  | |||
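Removing the `xfail` marker means the round-trip this test describes is expected to pass now that the sentencizer writes `Token.is_sent_start` (which is serialized with the `Doc`) instead of a user hook. The rest of the test body is elided by the diff; the assertions involved look roughly like this sketch:

```python
from spacy.lang.en import English
from spacy.tokens import Doc

nlp = English()
nlp.add_pipe(nlp.create_pipe("sentencizer"))
doc = nlp("Hello world")
new_doc = Doc(nlp.vocab).from_bytes(doc.to_bytes())
assert new_doc.is_sentenced              # boundaries survive serialization
assert len(list(new_doc.sents)) == 1
```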
|  | @ -230,7 +230,7 @@ cdef class Doc: | |||
|         defined as having at least one of the following: | ||||
| 
 | ||||
|         a) An entry "sents" in doc.user_hooks; | ||||
|         b) sent.is_parsed is set to True; | ||||
|         b) Doc.is_parsed is set to True; | ||||
|         c) At least one token other than the first where sent_start is not None. | ||||
|         """ | ||||
|         if "sents" in self.user_hooks: | ||||
|  |  | |||
|  | @ -441,6 +441,7 @@ cdef class Token: | |||
| 
 | ||||
|     property sent_start: | ||||
|         def __get__(self): | ||||
|             """Deprecated: use Token.is_sent_start instead.""" | ||||
|             # Raising a deprecation warning here causes errors for autocomplete | ||||
|             # Handle broken backwards compatibility case: doc[0].sent_start | ||||
|             # was False. | ||||
|  |  | |||
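Since `Token.sent_start` is now documented as deprecated in favor of `Token.is_sent_start`, downstream code can migrate mechanically. A small hedged before/after sketch (any blank `English()` doc will do; token index 3 is arbitrary):

```python
from spacy.lang.en import English

nlp = English()
doc = nlp(u"Hello world. This is a test.")

# value = doc[3].sent_start     # deprecated spelling, kept for backwards compatibility
value = doc[3].is_sent_start    # preferred: True, False, or None when unset
doc[3].is_sent_start = True     # explicitly mark token 3 as a sentence start
```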
|  | @ -1,78 +0,0 @@ | |||
| --- | ||||
| title: SentenceSegmenter | ||||
| tag: class | ||||
| source: spacy/pipeline/hooks.py | ||||
| --- | ||||
| 
 | ||||
| A simple spaCy hook, to allow custom sentence boundary detection logic that | ||||
| doesn't require the dependency parse. By default, sentence segmentation is | ||||
| performed by the [`DependencyParser`](/api/dependencyparser), so the | ||||
| `SentenceSegmenter` lets you implement a simpler, rule-based strategy that | ||||
| doesn't require a statistical model to be loaded. The component is also | ||||
| available via the string name `"sentencizer"`. After initialization, it is | ||||
| typically added to the processing pipeline using | ||||
| [`nlp.add_pipe`](/api/language#add_pipe). | ||||
| 
 | ||||
| ## SentenceSegmenter.\_\_init\_\_ {#init tag="method"} | ||||
| 
 | ||||
| Initialize the sentence segmenter. To change the sentence boundary detection | ||||
| strategy, pass a generator function `strategy` on initialization, or assign a | ||||
| new strategy to the `.strategy` attribute. Sentence detection strategies should | ||||
| be generators that take `Doc` objects and yield `Span` objects for each | ||||
| sentence. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > # Construction via create_pipe | ||||
| > sentencizer = nlp.create_pipe("sentencizer") | ||||
| > | ||||
| > # Construction from class | ||||
| > from spacy.pipeline import SentenceSegmenter | ||||
| > sentencizer = SentenceSegmenter(nlp.vocab) | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type                | Description                                                 | | ||||
| | ----------- | ------------------- | ----------------------------------------------------------- | | ||||
| | `vocab`     | `Vocab`             | The shared vocabulary.                                      | | ||||
| | `strategy`  | unicode / callable  | The segmentation strategy to use. Defaults to `"on_punct"`. | | ||||
| | **RETURNS** | `SentenceSegmenter` | The newly constructed object.                               | | ||||
| 
 | ||||
| ## SentenceSegmenter.\_\_call\_\_ {#call tag="method"} | ||||
| 
 | ||||
| Apply the sentence segmenter on a `Doc`. Typically, this happens automatically | ||||
| after the component has been added to the pipeline using | ||||
| [`nlp.add_pipe`](/api/language#add_pipe). | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > from spacy.lang.en import English | ||||
| > | ||||
| > nlp = English() | ||||
| > sentencizer = nlp.create_pipe("sentencizer") | ||||
| > nlp.add_pipe(sentencizer) | ||||
| > doc = nlp(u"This is a sentence. This is another sentence.") | ||||
| > assert list(doc.sents) == 2 | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type  | Description                                                  | | ||||
| | ----------- | ----- | ------------------------------------------------------------ | | ||||
| | `doc`       | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. | | ||||
| | **RETURNS** | `Doc` | The modified `Doc` with added sentence boundaries.           | | ||||
| 
 | ||||
| ## SentenceSegmenter.split_on_punct {#split_on_punct tag="staticmethod"} | ||||
| 
 | ||||
| Split the `Doc` on punctuation characters `.`, `!` and `?`. This is the default | ||||
| strategy used by the `SentenceSegmenter.` | ||||
| 
 | ||||
| | Name       | Type   | Description                    | | ||||
| | ---------- | ------ | ------------------------------ | | ||||
| | `doc`      | `Doc`  | The `Doc` object to process.   | | ||||
| | **YIELDS** | `Span` | The sentences in the document. | | ||||
| 
 | ||||
| ## Attributes {#attributes} | ||||
| 
 | ||||
| | Name       | Type     | Description                                                         | | ||||
| | ---------- | -------- | ------------------------------------------------------------------- | | ||||
| | `strategy` | callable | The segmentation strategy. Can be overwritten after initialization. | | ||||
							
								
								
									
website/docs/api/sentencizer.md (new file, 136 lines)
|  | @ -0,0 +1,136 @@ | |||
| --- | ||||
| title: Sentencizer | ||||
| tag: class | ||||
| source: spacy/pipeline/pipes.pyx | ||||
| --- | ||||
| 
 | ||||
| A simple pipeline component, to allow custom sentence boundary detection logic | ||||
| that doesn't require the dependency parse. By default, sentence segmentation is | ||||
| performed by the [`DependencyParser`](/api/dependencyparser), so the | ||||
| `Sentencizer` lets you implement a simpler, rule-based strategy that doesn't | ||||
| require a statistical model to be loaded. The component is also available via | ||||
| the string name `"sentencizer"`. After initialization, it is typically added to | ||||
| the processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe). | ||||
| 
 | ||||
| <Infobox title="Important note" variant="warning"> | ||||
| 
 | ||||
| Compared to the previous `SentenceSegmenter` class, the `Sentencizer` component | ||||
| doesn't add a hook to `doc.user_hooks["sents"]`. Instead, it iterates over the | ||||
| tokens in the `Doc` and sets the `Token.is_sent_start` property. The | ||||
| `SentenceSegmenter` is still available if you import it directly: | ||||
| 
 | ||||
| ```python | ||||
| from spacy.pipeline import SentenceSegmenter | ||||
| ``` | ||||
| 
 | ||||
| </Infobox> | ||||
| 
 | ||||
| ## Sentencizer.\_\_init\_\_ {#init tag="method"} | ||||
| 
 | ||||
| Initialize the sentencizer. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > # Construction via create_pipe | ||||
| > sentencizer = nlp.create_pipe("sentencizer") | ||||
| > | ||||
| > # Construction from class | ||||
| > from spacy.pipeline import Sentencizer | ||||
| > sentencizer = Sentencizer() | ||||
| > ``` | ||||
| 
 | ||||
| | Name          | Type          | Description                                                                                            | | ||||
| | ------------- | ------------- | ------------------------------------------------------------------------------------------------------ | | ||||
| | `punct_chars` | list          | Optional custom list of punctuation characters that mark sentence ends. Defaults to `[".", "!", "?"]`. | | ||||
| | **RETURNS**   | `Sentencizer` | The newly constructed object.                                                                          | | ||||
| 
 | ||||
| ## Sentencizer.\_\_call\_\_ {#call tag="method"} | ||||
| 
 | ||||
| Apply the sentencizer on a `Doc`. Typically, this happens automatically after | ||||
| the component has been added to the pipeline using | ||||
| [`nlp.add_pipe`](/api/language#add_pipe). | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > from spacy.lang.en import English | ||||
| > | ||||
| > nlp = English() | ||||
| > sentencizer = nlp.create_pipe("sentencizer") | ||||
| > nlp.add_pipe(sentencizer) | ||||
| > doc = nlp(u"This is a sentence. This is another sentence.") | ||||
| > assert len(list(doc.sents)) == 2 | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type  | Description                                                  | | ||||
| | ----------- | ----- | ------------------------------------------------------------ | | ||||
| | `doc`       | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. | | ||||
| | **RETURNS** | `Doc` | The modified `Doc` with added sentence boundaries.           | | ||||
| 
 | ||||
| ## Sentencizer.to_disk {#to_disk tag="method"} | ||||
| 
 | ||||
| Save the sentencizer settings (punctuation characters) to a directory. Will create | ||||
| a file `sentencizer.json`. This also happens automatically when you save an | ||||
| `nlp` object with a sentencizer added to its pipeline. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > sentencizer = Sentencizer(punct_chars=[".", "?", "!", "。"]) | ||||
| > sentencizer.to_disk("/path/to/sentencizer.json") | ||||
| > ``` | ||||
| 
 | ||||
| | Name   | Type             | Description                                                                                                      | | ||||
| | ------ | ---------------- | ---------------------------------------------------------------------------------------------------------------- | | ||||
| | `path` | unicode / `Path` | A path to a file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | ||||
| 
 | ||||
| ## Sentencizer.from_disk {#from_disk tag="method"} | ||||
| 
 | ||||
| Load the sentencizer settings from a file. Expects a JSON file. This also | ||||
| happens automatically when you load an `nlp` object or model with a sentencizer | ||||
| added to its pipeline. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > sentencizer = Sentencizer() | ||||
| > sentencizer.from_disk("/path/to/sentencizer.json") | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type             | Description                                                                | | ||||
| | ----------- | ---------------- | -------------------------------------------------------------------------- | | ||||
| | `path`      | unicode / `Path` | A path to a JSON file. Paths may be either strings or `Path`-like objects. | | ||||
| | **RETURNS** | `Sentencizer`    | The modified `Sentencizer` object.                                         | | ||||
| 
 | ||||
| ## Sentencizer.to_bytes {#to_bytes tag="method"} | ||||
| 
 | ||||
| Serialize the sentencizer settings to a bytestring. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > sentencizer = Sentencizer(punct_chars=[".", "?", "!", "。"]) | ||||
| > sentencizer_bytes = sentencizer.to_bytes() | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type  | Description          | | ||||
| | ----------- | ----- | -------------------- | | ||||
| | **RETURNS** | bytes | The serialized data. | | ||||
| 
 | ||||
| ## Sentencizer.from_bytes {#from_bytes tag="method"} | ||||
| 
 | ||||
| Load the pipe from a bytestring. Modifies the object in place and returns it. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > sentencizer_bytes = sentencizer.to_bytes() | ||||
| > sentencizer = Sentencizer() | ||||
| > sentencizer.from_bytes(sentencizer_bytes) | ||||
| > ``` | ||||
| 
 | ||||
| | Name         | Type          | Description                        | | ||||
| | ------------ | ------------- | ---------------------------------- | | ||||
| | `bytes_data` | bytes         | The bytestring to load.            | | ||||
| | **RETURNS**  | `Sentencizer` | The modified `Sentencizer` object. | | ||||
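As the `to_disk` section notes, disk serialization also happens automatically when the sentencizer is part of a pipeline that gets saved. A hedged end-to-end sketch (the `/tmp/sentencizer-model` path is just an example):

```python
import spacy
from spacy.lang.en import English

nlp = English()
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp.to_disk("/tmp/sentencizer-model")  # writes sentencizer.json next to the other pipe data

nlp2 = spacy.load("/tmp/sentencizer-model")  # pipeline and punct_chars come back
doc = nlp2(u"This is a sentence. This is another sentence.")
assert len(list(doc.sents)) == 2
```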
|  | @ -26,7 +26,7 @@ an **annotated document**. It also orchestrates training and serialization. | |||
| ### Processing pipeline {#architecture-pipeline} | ||||
| 
 | ||||
| | Name                                        | Description                                                                                                                   | | ||||
| | --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------- | | ||||
| | ------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------- | | ||||
| | [`Language`](/api/language)                 | A text-processing pipeline. Usually you'll load this once per process as `nlp` and pass the instance around your application. | | ||||
| | [`Tokenizer`](/api/tokenizer)               | Segment text, and create `Doc` objects with the discovered segment boundaries.                                                | | ||||
| | [`Lemmatizer`](/api/lemmatizer)             | Determine the base forms of words.                                                                                            | | ||||
|  | @ -38,7 +38,7 @@ an **annotated document**. It also orchestrates training and serialization. | |||
| | [`Matcher`](/api/matcher)                   | Match sequences of tokens, based on pattern rules, similar to regular expressions.                                            | | ||||
| | [`PhraseMatcher`](/api/phrasematcher)       | Match sequences of tokens based on phrases.                                                                                   | | ||||
| | [`EntityRuler`](/api/entityruler)           | Add entity spans to the `Doc` using token-based rules or exact phrase matches.                                                | | ||||
| | [`SentenceSegmenter`](/api/sentencesegmenter) | Implement custom sentence boundary detection logic that doesn't require the dependency parse.                                 | | ||||
| | [`Sentencizer`](/api/sentencizer)           | Implement custom sentence boundary detection logic that doesn't require the dependency parse.                                 | | ||||
| | [Other functions](/api/pipeline-functions)  | Automatically apply something to the `Doc`, e.g. to merge spans of tokens.                                                    | | ||||
| 
 | ||||
| ### Other classes {#architecture-other} | ||||
|  |  | |||
|  | @ -1149,9 +1149,14 @@ but it also means you'll need a **statistical model** and accurate predictions. | |||
| If your texts are closer to general-purpose news or web text, this should work | ||||
| well out-of-the-box. For social media or conversational text that doesn't follow | ||||
| the same rules, your application may benefit from a custom rule-based | ||||
| implementation. You can either plug a rule-based component into your | ||||
| [processing pipeline](/usage/processing-pipelines) or use the | ||||
| `SentenceSegmenter` component with a custom strategy. | ||||
| implementation. You can either use the built-in | ||||
| [`Sentencizer`](/api/sentencizer) or plug an entirely custom rule-based function | ||||
| into your [processing pipeline](/usage/processing-pipelines). | ||||
| 
 | ||||
| spaCy's dependency parser respects already set boundaries, so you can preprocess | ||||
| your `Doc` using custom rules _before_ it's parsed. Depending on your text, this | ||||
| may also improve accuracy, since the parser is constrained to predict parses | ||||
| consistent with the sentence boundaries. | ||||
| 
 | ||||
| ### Default: Using the dependency parse {#sbd-parser model="parser"} | ||||
| 
 | ||||
|  | @ -1168,13 +1173,35 @@ for sent in doc.sents: | |||
|     print(sent.text) | ||||
| ``` | ||||
| 
 | ||||
| ### Setting boundaries manually {#sbd-manual} | ||||
| ### Rule-based pipeline component {#sbd-component} | ||||
| 
 | ||||
| spaCy's dependency parser respects already set boundaries, so you can preprocess | ||||
| your `Doc` using custom rules _before_ it's parsed. This can be done by adding a | ||||
| [custom pipeline component](/usage/processing-pipelines). Depending on your | ||||
| text, this may also improve accuracy, since the parser is constrained to predict | ||||
| parses consistent with the sentence boundaries. | ||||
| The [`Sentencizer`](/api/sentencizer) component is a | ||||
| [pipeline component](/usage/processing-pipelines) that splits sentences on | ||||
| punctuation like `.`, `!` or `?`. You can plug it into your pipeline if you only | ||||
| need sentence boundaries without the dependency parse. | ||||
| 
 | ||||
| ```python | ||||
| ### {executable="true"} | ||||
| import spacy | ||||
| from spacy.lang.en import English | ||||
| 
 | ||||
| nlp = English()  # just the language with no model | ||||
| sentencizer = nlp.create_pipe("sentencizer") | ||||
| nlp.add_pipe(sentencizer) | ||||
| doc = nlp(u"This is a sentence. This is another sentence.") | ||||
| for sent in doc.sents: | ||||
|     print(sent.text) | ||||
| ``` | ||||
| 
 | ||||
| ### Custom rule-based strategy {#sbd-custom} | ||||
| 
 | ||||
| If you want to implement your own strategy that differs from the default | ||||
| rule-based approach of splitting on sentences, you can also create a | ||||
| [custom pipeline component](/usage/processing-pipelines#custom-components) that | ||||
| takes a `Doc` object and sets the `Token.is_sent_start` attribute on each | ||||
| individual token. If set to `False`, the token is explicitly marked as _not_ the | ||||
| start of a sentence. If set to `None` (default), it's treated as a missing value | ||||
| and can still be overwritten by the parser. | ||||
| 
 | ||||
| <Infobox title="Important note" variant="warning"> | ||||
| 
 | ||||
|  | @ -1187,9 +1214,11 @@ adding it to the pipeline using [`nlp.add_pipe`](/api/language#add_pipe). | |||
| 
 | ||||
| Here's an example of a component that implements a pre-processing rule for | ||||
| splitting on `'...'` tokens. The component is added before the parser, which is | ||||
| then used to further segment the text. This approach can be useful if you want | ||||
| to implement **additional** rules specific to your data, while still being able | ||||
| to take advantage of dependency-based sentence segmentation. | ||||
| then used to further segment the text. That's possible because `is_sent_start` | ||||
| is only set to `True` for some of the tokens – all others still specify `None` | ||||
| for unset sentence boundaries. This approach can be useful if you want to | ||||
| implement **additional** rules specific to your data, while still being able to | ||||
| take advantage of dependency-based sentence segmentation. | ||||
| 
 | ||||
| ```python | ||||
| ### {executable="true"} | ||||
|  | @ -1212,62 +1241,6 @@ doc = nlp(text) | |||
| print("After:", [sent.text for sent in doc.sents]) | ||||
| ``` | ||||
| 
 | ||||
| ### Rule-based pipeline component {#sbd-component} | ||||
| 
 | ||||
| The `sentencizer` component is a | ||||
| [pipeline component](/usage/processing-pipelines) that splits sentences on | ||||
| punctuation like `.`, `!` or `?`. You can plug it into your pipeline if you only | ||||
| need sentence boundaries without the dependency parse. Note that `Doc.sents` | ||||
| will **raise an error** if no sentence boundaries are set. | ||||
| 
 | ||||
| ```python | ||||
| ### {executable="true"} | ||||
| import spacy | ||||
| from spacy.lang.en import English | ||||
| 
 | ||||
| nlp = English()  # just the language with no model | ||||
| sentencizer = nlp.create_pipe("sentencizer") | ||||
| nlp.add_pipe(sentencizer) | ||||
| doc = nlp(u"This is a sentence. This is another sentence.") | ||||
| for sent in doc.sents: | ||||
|     print(sent.text) | ||||
| ``` | ||||
| 
 | ||||
| ### Custom rule-based strategy {#sbd-custom} | ||||
| 
 | ||||
| If you want to implement your own strategy that differs from the default | ||||
| rule-based approach of splitting on sentences, you can also instantiate the | ||||
| `SentenceSegmenter` directly and pass in your own strategy. The strategy should | ||||
| be a function that takes a `Doc` object and yields a `Span` for each sentence. | ||||
| Here's an example of a custom segmentation strategy for splitting on newlines | ||||
| only: | ||||
| 
 | ||||
| ```python | ||||
| ### {executable="true"} | ||||
| from spacy.lang.en import English | ||||
| from spacy.pipeline import SentenceSegmenter | ||||
| 
 | ||||
| def split_on_newlines(doc): | ||||
|     start = 0 | ||||
|     seen_newline = False | ||||
|     for word in doc: | ||||
|         if seen_newline and not word.is_space: | ||||
|             yield doc[start:word.i] | ||||
|             start = word.i | ||||
|             seen_newline = False | ||||
|         elif word.text == '\\n': | ||||
|             seen_newline = True | ||||
|     if start < len(doc): | ||||
|         yield doc[start:len(doc)] | ||||
| 
 | ||||
| nlp = English()  # Just the language with no model | ||||
| sentencizer = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines) | ||||
| nlp.add_pipe(sentencizer) | ||||
| doc = nlp(u"This is a sentence\\n\\nThis is another sentence\\nAnd more") | ||||
| for sent in doc.sents: | ||||
|     print([token.text for token in sent]) | ||||
| ``` | ||||
| 
 | ||||
| ## Rule-based matching {#rule-based-matching hidden="true"} | ||||
| 
 | ||||
| <div id="rule-based-matching"> | ||||
|  |  | |||
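The deleted `split_on_newlines` example relied on the removed `SentenceSegmenter` strategy API. Roughly the same behavior can be expressed with the new approach as a custom component that sets `Token.is_sent_start`. This sketch is not part of the commit, the component name is made up, and it assumes the tokenizer emits newline runs as whitespace tokens, as in the original example:

```python
from spacy.lang.en import English

def newline_sentencizer(doc):
    # Start a new sentence after each whitespace token containing a newline;
    # every other token is explicitly marked as not a sentence start.
    seen_newline = False
    for i, token in enumerate(doc):
        doc[i].is_sent_start = i == 0 or seen_newline
        seen_newline = token.is_space and "\n" in token.text
    return doc

nlp = English()  # just the language, no model
nlp.add_pipe(newline_sentencizer, first=True)
doc = nlp("This is a sentence\n\nThis is another sentence\nAnd more")
print([sent.text for sent in doc.sents])
```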
|  | @ -138,7 +138,7 @@ require them in the pipeline settings in your model's `meta.json`. | |||
| | `ner`               | [`EntityRecognizer`](/api/entityrecognizer)                      | Assign named entities.                                                                        | | ||||
| | `textcat`           | [`TextCategorizer`](/api/textcategorizer)                        | Assign text categories.                                                                       | | ||||
| | `entity_ruler`      | [`EntityRuler`](/api/entityruler)                                | Assign named entities based on pattern rules.                                                 | | ||||
| | `sentencizer`       | [`SentenceSegmenter`](/api/sentencesegmenter)                    | Add rule-based sentence segmentation without the dependency parse.                            | | ||||
| | `sentencizer`       | [`Sentencizer`](/api/sentencizer)                                | Add rule-based sentence segmentation without the dependency parse.                            | | ||||
| | `merge_noun_chunks` | [`merge_noun_chunks`](/api/pipeline-functions#merge_noun_chunks) | Merge all noun chunks into a single token. Should be added after the tagger and parser.       | | ||||
| | `merge_entities`    | [`merge_entities`](/api/pipeline-functions#merge_entities)       | Merge all entities into a single token. Should be added after the entity recognizer.          | | ||||
| | `merge_subtokens`   | [`merge_subtokens`](/api/pipeline-functions#merge_subtokens)     | Merge subtokens predicted by the parser into single tokens. Should be added after the parser. | | ||||
|  |  | |||
|  | @ -195,7 +195,7 @@ the existing pages and added some new content: | |||
| - **Universe:** [Videos](/universe/category/videos) and | ||||
|   [Podcasts](/universe/category/podcasts) | ||||
| - **API:** [`EntityRuler`](/api/entityruler) | ||||
| - **API:** [`SentenceSegmenter`](/api/sentencesegmenter) | ||||
| - **API:** [`Sentencizer`](/api/sentencizer) | ||||
| - **API:** [Pipeline functions](/api/pipeline-functions) | ||||
| 
 | ||||
| ## Backwards incompatibilities {#incompat} | ||||
|  |  | |||
|  | @ -79,7 +79,7 @@ | |||
|                     { "text": "Matcher", "url": "/api/matcher" }, | ||||
|                     { "text": "PhraseMatcher", "url": "/api/phrasematcher" }, | ||||
|                     { "text": "EntityRuler", "url": "/api/entityruler" }, | ||||
|                     { "text": "SentenceSegmenter", "url": "/api/sentencesegmenter" }, | ||||
|                     { "text": "Sentencizer", "url": "/api/sentencizer" }, | ||||
|                     { "text": "Other Functions", "url": "/api/pipeline-functions" } | ||||
|                 ] | ||||
|             }, | ||||
|  |  | |||