# English tokenizer exception tests, mirrored from https://github.com/explosion/spaCy.git
import pytest
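# NOTE: in the real test suite the `en_tokenizer` fixture is provided by the
# shared conftest.py rather than by this module. The fixture below is a minimal
# sketch of what it roughly looks like, assuming a blank English pipeline; the
# actual fixture may differ (e.g. session scope, construction via the language
# registry).
from spacy.lang.en import English


@pytest.fixture
def en_tokenizer():
    # Return the rule-based tokenizer of a blank English pipeline; it carries
    # the English tokenizer exceptions these tests exercise.
    return English().tokenizer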


def test_en_tokenizer_handles_basic_contraction(en_tokenizer):
    text = "don't giggle"
    tokens = en_tokenizer(text)
    assert len(tokens) == 3
    assert tokens[1].text == "n't"
    text = "i said don't!"
    tokens = en_tokenizer(text)
    assert len(tokens) == 5
    assert tokens[4].text == "!"


@pytest.mark.parametrize("text", ["`ain't", '''"isn't"''', "can't!"])
def test_en_tokenizer_handles_basic_contraction_punct(en_tokenizer, text):
    tokens = en_tokenizer(text)
    assert len(tokens) == 3


@pytest.mark.parametrize(
    "text_poss,text", [("Robin's", "Robin"), ("Alexis's", "Alexis")]
)
def test_en_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text):
    tokens = en_tokenizer(text_poss)
    assert len(tokens) == 2
    assert tokens[0].text == text
    assert tokens[1].text == "'s"


@pytest.mark.parametrize("text", ["schools'", "Alexis'"])
def test_en_tokenizer_splits_trailing_apos(en_tokenizer, text):
    tokens = en_tokenizer(text)
    assert len(tokens) == 2
    assert tokens[0].text == text.split("'")[0]
    assert tokens[1].text == "'"


@pytest.mark.parametrize("text", ["'em", "nothin'", "ol'"])
def test_en_tokenizer_doesnt_split_apos_exc(en_tokenizer, text):
    tokens = en_tokenizer(text)
    assert len(tokens) == 1
    assert tokens[0].text == text


@pytest.mark.parametrize("text", ["we'll", "You'll", "there'll", "this'll", "those'll"])
def test_en_tokenizer_handles_ll_contraction(en_tokenizer, text):
    tokens = en_tokenizer(text)
    assert len(tokens) == 2
    assert tokens[0].text == text.split("'")[0]
    assert tokens[1].text == "'ll"


@pytest.mark.parametrize(
    "text_lower,text_title", [("can't", "Can't"), ("ain't", "Ain't")]
)
def test_en_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_title):
    tokens_lower = en_tokenizer(text_lower)
    tokens_title = en_tokenizer(text_title)
    assert tokens_title[0].text == tokens_lower[0].text.title()
    assert tokens_lower[0].text == tokens_title[0].text.lower()
    assert tokens_lower[1].text == tokens_title[1].text


@pytest.mark.parametrize("pron", ["I", "You", "He", "She", "It", "We", "They"])
@pytest.mark.parametrize("contraction", ["'ll", "'d"])
def test_en_tokenizer_keeps_title_case(en_tokenizer, pron, contraction):
    tokens = en_tokenizer(pron + contraction)
    assert tokens[0].text == pron
    assert tokens[1].text == contraction


@pytest.mark.parametrize("exc", ["Ill", "ill", "Hell", "hell", "Well", "well"])
def test_en_tokenizer_excludes_ambiguous(en_tokenizer, exc):
    tokens = en_tokenizer(exc)
    assert len(tokens) == 1


@pytest.mark.parametrize(
    "wo_punct,w_punct", [("We've", "`We've"), ("couldn't", "couldn't)")]
)
def test_en_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct):
    tokens = en_tokenizer(wo_punct)
    assert len(tokens) == 2
    tokens = en_tokenizer(w_punct)
    assert len(tokens) == 3


@pytest.mark.parametrize("text", ["e.g.", "p.m.", "Jan.", "Dec.", "Inc."])
def test_en_tokenizer_handles_abbr(en_tokenizer, text):
    tokens = en_tokenizer(text)
    assert len(tokens) == 1


def test_en_tokenizer_handles_exc_in_text(en_tokenizer):
    text = "It's mediocre i.e. bad."
    tokens = en_tokenizer(text)
    assert len(tokens) == 6
    assert tokens[3].text == "i.e."


@pytest.mark.parametrize("text", ["1am", "12a.m.", "11p.m.", "4pm"])
def test_en_tokenizer_handles_times(en_tokenizer, text):
    tokens = en_tokenizer(text)
    assert len(tokens) == 2


@pytest.mark.parametrize(
    "text,norms",
    [
        ("I'm", ["i", "am"]),
        ("shan't", ["shall", "not"]),
        (
            "Many factors cause cancer 'cause it is complex",
            ["many", "factors", "cause", "cancer", "because", "it", "is", "complex"],
        ),
    ],
)
def test_en_tokenizer_norm_exceptions(en_tokenizer, text, norms):
    tokens = en_tokenizer(text)
    assert [token.norm_ for token in tokens] == norms


@pytest.mark.parametrize("text,norm", [("Jan.", "January"), ("'cuz", "because")])
def test_en_lex_attrs_norm_exceptions(en_tokenizer, text, norm):
    tokens = en_tokenizer(text)
    assert tokens[0].norm_ == norm
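

# Illustrative addition (not part of the original suite): the exceptions
# exercised above ship with the English tokenizer, but the same mechanism is
# exposed publicly via Tokenizer.add_special_case. This sketch shows a custom
# exception whose ORTH pieces must concatenate back to the original string and
# whose NORM values surface as token.norm_; the "gimme" example follows the
# spaCy documentation.
def test_en_tokenizer_add_special_case_sketch(en_tokenizer):
    from spacy.attrs import NORM, ORTH

    en_tokenizer.add_special_case("gimme", [{ORTH: "gim", NORM: "give"}, {ORTH: "me"}])
    tokens = en_tokenizer("gimme that")
    assert [t.text for t in tokens] == ["gim", "me", "that"]
    assert tokens[0].norm_ == "give"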