Mirror of https://github.com/explosion/spaCy.git, synced 2025-11-01 00:17:44 +03:00
* Add better serializable sentencizer component
* Replace default factory
* Add tests
* Tidy up
* Pass test
* Update docs
88 lines · 2.9 KiB · Python
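For orientation before the tests: the Sentencizer is a simple rule-based sentence boundary detector. The first token always opens a sentence; a later token opens one only once a punct_chars token has been seen and the token itself is not punctuation, which is why runs of trailing punctuation stay attached to the previous sentence instead of forming one-token sentences. Below is a minimal standalone sketch of that rule, not part of the test file and not spaCy's actual source; sent_start_flags and _is_punct are hypothetical helpers, and (".", "!", "?") is an assumed default punctuation set for illustration.

import unicodedata


def _is_punct(word):
    # Rough stand-in for Token.is_punct: every character is Unicode punctuation.
    return all(unicodedata.category(ch).startswith("P") for ch in word)


def sent_start_flags(words, punct_chars=(".", "!", "?")):
    flags = []
    seen_period = False
    for i, word in enumerate(words):
        if i == 0:
            flags.append(True)  # the first token always starts a sentence
        elif seen_period and not _is_punct(word) and word not in punct_chars:
            flags.append(True)  # first non-punctuation token after a boundary
            seen_period = False
        else:
            flags.append(False)
        if word in punct_chars:
            seen_period = True
    return flags


# Mirrors the first parametrized case of test_sentencizer_complex below:
assert sent_start_flags(["Hello", "!", ".", "Test", ".", ".", "ok"]) == [
    True, False, False, True, False, False, True,
]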
# coding: utf8
from __future__ import unicode_literals

import pytest
from spacy.pipeline import Sentencizer
from spacy.tokens import Doc


def test_sentencizer(en_vocab):
    doc = Doc(en_vocab, words=["Hello", "!", "This", "is", "a", "test", "."])
    sentencizer = Sentencizer()
    doc = sentencizer(doc)
    assert doc.is_sentenced
    sent_starts = [t.is_sent_start for t in doc]
    assert sent_starts == [True, False, True, False, False, False, False]
    assert len(list(doc.sents)) == 2


@pytest.mark.parametrize(
    "words,sent_starts,n_sents",
    [
        # The expected result here is that the duplicate punctuation gets merged
        # onto the same sentence and no one-token sentence is created for them.
        (
            ["Hello", "!", ".", "Test", ".", ".", "ok"],
            [True, False, False, True, False, False, True],
            3,
        ),
        # We also want to make sure ¡ and ¿ aren't treated as sentence end
        # markers, even though they're punctuation
        (
            ["¡", "Buen", "día", "!", "Hola", ",", "¿", "qué", "tal", "?"],
            [True, False, False, False, True, False, False, False, False, False],
            2,
        ),
        # The Token.is_punct check ensures that quotes are handled as well
        (
            ['"', "Nice", "!", '"', "I", "am", "happy", "."],
            [True, False, False, False, True, False, False, False],
            2,
        ),
    ],
)
def test_sentencizer_complex(en_vocab, words, sent_starts, n_sents):
    doc = Doc(en_vocab, words=words)
    sentencizer = Sentencizer()
    doc = sentencizer(doc)
    assert doc.is_sentenced
    assert [t.is_sent_start for t in doc] == sent_starts
    assert len(list(doc.sents)) == n_sents


@pytest.mark.parametrize(
    "punct_chars,words,sent_starts,n_sents",
    [
        (
            ["~", "?"],
            ["Hello", "world", "~", "A", ".", "B", "."],
            [True, False, False, True, False, False, False],
            2,
        ),
        # Even though it's not common, the punct_chars should be able to
        # handle any tokens
        (
            [".", "ö"],
            ["Hello", ".", "Test", "ö", "Ok", "."],
            [True, False, True, False, True, False],
            3,
        ),
    ],
)
def test_sentencizer_custom_punct(en_vocab, punct_chars, words, sent_starts, n_sents):
    doc = Doc(en_vocab, words=words)
    sentencizer = Sentencizer(punct_chars=punct_chars)
    doc = sentencizer(doc)
    assert doc.is_sentenced
    assert [t.is_sent_start for t in doc] == sent_starts
    assert len(list(doc.sents)) == n_sents


def test_sentencizer_serialize_bytes(en_vocab):
    punct_chars = [".", "~", "+"]
    sentencizer = Sentencizer(punct_chars=punct_chars)
    assert sentencizer.punct_chars == punct_chars
    bytes_data = sentencizer.to_bytes()
    new_sentencizer = Sentencizer().from_bytes(bytes_data)
    assert new_sentencizer.punct_chars == punct_chars
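Beyond the bytes round trip tested above, the practical payoff is that the component can persist as part of a pipeline. A minimal usage sketch, assuming the spaCy v2.x API (spacy.blank, create_pipe, add_pipe) and that the replaced default factory from the commit message returns this Sentencizer:

import spacy

nlp = spacy.blank("en")
# Assumption based on the commit message: the default "sentencizer" factory
# now builds the serializable Sentencizer component.
nlp.add_pipe(nlp.create_pipe("sentencizer"))
doc = nlp("Hello! This is a test.")
print([sent.text for sent in doc.sents])  # ['Hello!', 'This is a test.']

Because the component now serializes with the rest of the pipeline, custom punct_chars should survive a full save/load cycle (nlp.to_disk(...) followed by spacy.load(...)) instead of being lost, which is the point of making the sentencizer serializable.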