spaCy (mirror of https://github.com/explosion/spaCy.git)

Commit 7195c07897: Add explicit tokenizer test
Parent: 957e6eaa8a
@@ -72,53 +72,58 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
     assert len(tokens) > 5
 
 
-def test_cnts1(EN):
+def test_cnts1(en_tokenizer):
     text = u"""The U.S. Army likes Shock and Awe."""
-    tokens = EN(text)
+    tokens = en_tokenizer(text)
     assert len(tokens) == 8
 
 
-def test_cnts2(EN):
+def test_cnts2(en_tokenizer):
     text = u"""U.N. regulations are not a part of their concern."""
-    tokens = EN(text)
+    tokens = en_tokenizer(text)
     assert len(tokens) == 10
 
 
-def test_cnts3(EN):
+def test_cnts3(en_tokenizer):
     text = u"“Isn't it?”"
-    tokens = EN(text)
+    tokens = en_tokenizer(text)
     words = [t.orth_ for t in tokens]
     assert len(words) == 6
 
 
-def test_cnts4(EN):
+def test_cnts4(en_tokenizer):
     text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """
-    tokens = EN(text)
+    tokens = en_tokenizer(text)
     words = [t.orth_ for t in tokens]
     assert len(words) == 15
 
 
-def test_cnts5(EN):
+def test_cnts5(en_tokenizer):
     text = """'Me too!', Mr. P. Delaware cried. """
-    tokens = EN(text)
+    tokens = en_tokenizer(text)
     assert len(tokens) == 11
 
 
+def test_mr(en_tokenizer):
+    text = """Mr. Smith"""
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 2
+
+
-def test_cnts6(EN):
+def test_cnts6(en_tokenizer):
     text = u'They ran about 10km.'
-    tokens = EN(text)
+    tokens = en_tokenizer(text)
     words = [t.orth_ for t in tokens]
     assert len(words) == 6
 
 
-def test_bracket_period(EN):
+def test_bracket_period(en_tokenizer):
     text = u'(And a 6a.m. run through Washington Park).'
-    tokens = EN(text)
+    tokens = en_tokenizer(text)
     assert tokens[len(tokens) - 1].orth_ == u'.'
 
 
-def test_ie(EN):
+def test_ie(en_tokenizer):
     text = u"It's mediocre i.e. bad."
-    tokens = EN(text)
+    tokens = en_tokenizer(text)
     assert len(tokens) == 6
     assert tokens[3].orth_ == "i.e."
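Note on the change: the tests now take an en_tokenizer pytest fixture instead of calling the EN object directly, so each test only needs a bare tokenizer rather than a fully loaded pipeline. As a rough sketch (not part of this commit), such a fixture would typically live in the test suite's conftest.py along these lines; the import path and the way the tokenizer is constructed below are assumptions, since the release this commit targets exposed the English tokenizer differently.

    # conftest.py, hypothetical sketch of the fixture the tests rely on
    import pytest
    from spacy.lang.en import English  # import path assumed, not from this commit

    @pytest.fixture
    def en_tokenizer():
        # Return a bare tokenizer so the tests run without any statistical models.
        return English().tokenizer

With a fixture like this, calling en_tokenizer(text) inside a test yields a Doc whose tokens expose orth_, which is all these assertions need.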