* Add better serializable sentencizer component
* Replace default factory
* Add tests
* Tidy up
* Pass test
* Update docs

# coding: utf8
from __future__ import unicode_literals

import pytest
from spacy.pipeline import Sentencizer
from spacy.tokens import Doc


def test_sentencizer(en_vocab):
    doc = Doc(en_vocab, words=["Hello", "!", "This", "is", "a", "test", "."])
    sentencizer = Sentencizer()
    doc = sentencizer(doc)
    assert doc.is_sentenced
    sent_starts = [t.is_sent_start for t in doc]
    assert sent_starts == [True, False, True, False, False, False, False]
    assert len(list(doc.sents)) == 2
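

# A minimal usage sketch, added for illustration (not part of the original
# test file): in spaCy v2.x the same component is usually added to a full
# pipeline through the "sentencizer" factory instead of being called by hand.
def test_sentencizer_pipe_sketch():
    from spacy.lang.en import English

    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    doc = nlp("Hello! This is a test.")
    assert doc.is_sentenced
    assert len(list(doc.sents)) == 2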


@pytest.mark.parametrize(
    "words,sent_starts,n_sents",
    [
        # The expected result here is that the duplicate punctuation gets merged
        # onto the same sentence and no one-token sentence is created for them.
        (
            ["Hello", "!", ".", "Test", ".", ".", "ok"],
            [True, False, False, True, False, False, True],
            3,
        ),
        # We also want to make sure ¡ and ¿ aren't treated as sentence end
        # markers, even though they're punctuation
        (
            ["¡", "Buen", "día", "!", "Hola", ",", "¿", "qué", "tal", "?"],
            [True, False, False, False, True, False, False, False, False, False],
            2,
        ),
        # The Token.is_punct check ensures that quotes are handled as well
        (
            ['"', "Nice", "!", '"', "I", "am", "happy", "."],
            [True, False, False, False, True, False, False, False],
            2,
        ),
    ],
)
def test_sentencizer_complex(en_vocab, words, sent_starts, n_sents):
    doc = Doc(en_vocab, words=words)
    sentencizer = Sentencizer()
    doc = sentencizer(doc)
    assert doc.is_sentenced
    assert [t.is_sent_start for t in doc] == sent_starts
    assert len(list(doc.sents)) == n_sents
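

# A rough reference sketch, added for illustration (an assumption inferred
# only from the assertions above, not the library's actual implementation):
# the first token always opens a sentence, and the first token after a run
# of sentence-final characters opens the next one, provided it is neither
# punctuation nor itself one of the configured characters. That is why
# trailing duplicates and closing quotes stay attached to their sentence.
def naive_sent_starts(words, punct_chars=(".", "!", "?")):
    import unicodedata

    def is_punct(word):
        # Treat a token as punctuation if every character falls in a Unicode
        # punctuation category (a crude stand-in for Token.is_punct).
        return all(unicodedata.category(ch).startswith("P") for ch in word)

    starts = [i == 0 for i in range(len(words))]
    seen_end = False
    for i, word in enumerate(words):
        if seen_end and not is_punct(word) and word not in punct_chars:
            starts[i] = True
            seen_end = False
        elif word in punct_chars:
            seen_end = True
    return starts


def test_naive_sent_starts_sketch():
    # The sketch reproduces the duplicate-punctuation case asserted above.
    words = ["Hello", "!", ".", "Test", ".", ".", "ok"]
    assert naive_sent_starts(words) == [True, False, False, True, False, False, True]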


@pytest.mark.parametrize(
    "punct_chars,words,sent_starts,n_sents",
    [
        (
            ["~", "?"],
            ["Hello", "world", "~", "A", ".", "B", "."],
            [True, False, False, True, False, False, False],
            2,
        ),
        # Even though it's not common, the punct_chars should be able to
        # handle any tokens
        (
            [".", "ö"],
            ["Hello", ".", "Test", "ö", "Ok", "."],
            [True, False, True, False, True, False],
            3,
        ),
    ],
)
def test_sentencizer_custom_punct(en_vocab, punct_chars, words, sent_starts, n_sents):
    doc = Doc(en_vocab, words=words)
    sentencizer = Sentencizer(punct_chars=punct_chars)
    doc = sentencizer(doc)
    assert doc.is_sentenced
    assert [t.is_sent_start for t in doc] == sent_starts
    assert len(list(doc.sents)) == n_sents


def test_sentencizer_serialize_bytes(en_vocab):
    punct_chars = [".", "~", "+"]
    sentencizer = Sentencizer(punct_chars=punct_chars)
    assert sentencizer.punct_chars == punct_chars
    bytes_data = sentencizer.to_bytes()
    new_sentencizer = Sentencizer().from_bytes(bytes_data)
    assert new_sentencizer.punct_chars == punct_chars
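

# A companion sketch, added for illustration (not in the original file):
# the same round-trip through the disk API, assuming the component also
# exposes to_disk/from_disk like other serializable pipeline components.
def test_sentencizer_serialize_disk_sketch(tmpdir):
    punct_chars = [".", "~", "+"]
    sentencizer = Sentencizer(punct_chars=punct_chars)
    path = str(tmpdir.join("sentencizer"))
    sentencizer.to_disk(path)
    new_sentencizer = Sentencizer().from_disk(path)
    assert new_sentencizer.punct_chars == punct_chars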