Mirror of https://github.com/explosion/spaCy.git
	Update tokenizer tests for contractions
parent 109f202e8f
commit 550630df73
@@ -1,15 +1,10 @@
 from __future__ import unicode_literals
-from ...en import English
 import pytest
 
 
-@pytest.fixture
-def en_tokenizer():
-    return English.Defaults.create_tokenizer()
 
-@pytest.mark.parametrize('inputs', [("Robin's", "Robin"), ("Alexis's", "Alexis")])
-def test_tokenizer_handles_poss_contraction(en_tokenizer, inputs):
-    text_poss, text = inputs
+@pytest.mark.parametrize('text_poss,text', [("Robin's", "Robin"), ("Alexis's", "Alexis")])
+def test_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text):
     tokens = en_tokenizer(text_poss)
     assert len(tokens) == 2
     assert tokens[0].text == text
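The local en_tokenizer fixture is removed in this hunk, so pytest now has to supply that fixture from a shared location. The commit shown here does not include that file; below is a minimal sketch of a shared conftest.py providing it, reusing the construction from the removed lines. The file name, its location, and the absolute spacy.en import are assumptions, not part of this diff.

    # conftest.py -- hypothetical shared fixture file, not shown in this commit
    import pytest

    from spacy.en import English  # absolute form of the removed "from ...en import English"


    @pytest.fixture
    def en_tokenizer():
        # Same construction as the removed per-file fixture.
        return English.Defaults.create_tokenizer()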
@@ -40,9 +35,8 @@ def test_tokenizer_handles_ll_contraction(en_tokenizer, text):
     assert tokens[1].lemma_ == "will"
 
 
-@pytest.mark.parametrize('inputs', [("can't", "Can't"), ("ain't", "Ain't")])
-def test_tokenizer_handles_capitalization(en_tokenizer, inputs):
-    text_lower, text_title = inputs
+@pytest.mark.parametrize('text_lower,text_title', [("can't", "Can't"), ("ain't", "Ain't")])
+def test_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_title):
     tokens_lower = en_tokenizer(text_lower)
     tokens_title = en_tokenizer(text_title)
     assert tokens_title[0].text == tokens_lower[0].text.title()
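This hunk is the purest instance of the refactor applied throughout the file: the parametrize values are bound to named arguments (text_lower, text_title) instead of a single inputs tuple unpacked inside the test body. A self-contained sketch of the two styles, using plain string capitalisation in place of the real tokenizer so it runs without spaCy:

    import pytest

    CASES = [("can't", "Can't"), ("ain't", "Ain't")]


    # Old style: one opaque 'inputs' parameter, unpacked by hand in the body.
    @pytest.mark.parametrize('inputs', CASES)
    def test_capitalization_old_style(inputs):
        text_lower, text_title = inputs
        assert text_lower.capitalize() == text_title


    # New style: the comma-separated names become test arguments directly,
    # so the unpacking line disappears and each value shows up in the test ID.
    @pytest.mark.parametrize('text_lower,text_title', CASES)
    def test_capitalization_new_style(text_lower, text_title):
        assert text_lower.capitalize() == text_title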
@@ -51,8 +45,8 @@ def test_tokenizer_handles_capitalization(en_tokenizer, inputs):
 
 
 @pytest.mark.parametrize('pron', ["I", "You", "He", "She", "It", "We", "They"])
-def test_tokenizer_keeps_title_case(en_tokenizer, pron):
-    for contraction in ["'ll", "'d"]:
-        tokens = en_tokenizer(pron + contraction)
-        assert tokens[0].text == pron
-        assert tokens[1].text == contraction
+@pytest.mark.parametrize('contraction', ["'ll", "'d"])
+def test_tokenizer_keeps_title_case(en_tokenizer, pron, contraction):
+    tokens = en_tokenizer(pron + contraction)
+    assert tokens[0].text == pron
+    assert tokens[1].text == contraction
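Here the old inner for loop becomes a second stacked parametrize decorator, so pytest generates one test per (pron, contraction) combination: 7 pronouns x 2 contractions = 14 independently reported cases, instead of one test that stops at the first failing contraction. A small self-contained sketch of the same stacking pattern, with a toy tokenizer standing in for en_tokenizer so it runs without spaCy:

    import pytest


    @pytest.fixture
    def toy_tokenizer():
        # Stand-in for en_tokenizer: split a trailing "'ll" or "'d" into its own token.
        def tokenize(text):
            for suffix in ("'ll", "'d"):
                if text.endswith(suffix):
                    return [text[:-len(suffix)], suffix]
            return [text]
        return tokenize


    @pytest.mark.parametrize('pron', ["I", "You", "He"])
    @pytest.mark.parametrize('contraction', ["'ll", "'d"])
    def test_keeps_title_case_cross_product(toy_tokenizer, pron, contraction):
        # One generated test per combination: 3 x 2 = 6 here, 7 x 2 = 14 in the real test above.
        tokens = toy_tokenizer(pron + contraction)
        assert tokens[0] == pron
        assert tokens[1] == contraction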
@@ -64,9 +58,8 @@ def test_tokenizer_excludes_ambiguous(en_tokenizer, exc):
     assert len(tokens) == 1
 
 
-@pytest.mark.parametrize('inputs', [("We've", "``We've"), ("couldn't", "couldn't)")])
-def test_tokenizer_splits_defined_punct(en_tokenizer, inputs):
-    wo_punct, w_punct = inputs
+@pytest.mark.parametrize('wo_punct,w_punct', [("We've", "``We've"), ("couldn't", "couldn't)")])
+def test_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct):
     tokens = en_tokenizer(wo_punct)
     assert len(tokens) == 2
     tokens = en_tokenizer(w_punct)