mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 01:48:04 +03:00)
	Add test for tokenizer regular expressions
This commit is contained in:
parent e0712d1b32
commit 07f0efb102
@@ -7,6 +7,10 @@ import pickle
 import cloudpickle
 import tempfile
 
+from ... import util
+from ...en.language_data import TOKENIZER_PREFIXES as EN_TOKENIZER_PREFIXES
+
+en_search_prefixes = util.compile_prefix_regex(EN_TOKENIZER_PREFIXES).search
 
 # @pytest.mark.xfail
 # def test_pickle(en_tokenizer):
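For context: util.compile_prefix_regex joins the prefix strings into a single anchored regular expression, so one search call finds whichever prefix starts the string. A minimal sketch of that approach (the helper name and the sample prefix list below are illustrative assumptions, not spaCy's exact implementation):

import re

def compile_prefix_regex_sketch(entries):
    # Escape each prefix, anchor it at the start of the string, and join
    # the alternatives into one pattern, e.g. ^\(|^\[|^"|^'
    expression = '|'.join('^' + re.escape(piece) for piece in entries if piece.strip())
    return re.compile(expression)

# Mirrors the module-level helper added in this commit:
search_prefixes = compile_prefix_regex_sketch(['(', '[', '"', "'"]).search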
@@ -16,6 +20,10 @@ import tempfile
 #     loaded = pickle.load(file_)
 #     assert loaded is not None
 
+def test_pre_punct_regex():
+    string = "(can't"
+    match = en_search_prefixes(string)
+    assert match.group() == "("
 
 def test_no_word(en_tokenizer):
     tokens = en_tokenizer(u'')
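The new test exercises the compiled prefix pattern directly, without going through the tokenizer. With the sketch above, the same behaviour looks like this (sample strings only; search_prefixes is the hypothetical helper defined earlier, not part of the commit):

# The opening bracket is recognised as a prefix to split off:
match = search_prefixes("(can't")
assert match.group() == "("

# A string that starts with an ordinary letter has no prefix match:
assert search_prefixes("can't") is None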
@@ -57,10 +65,9 @@ def test_contraction(en_tokenizer):
     assert len(tokens) == 5
     assert tokens[4].orth == en_tokenizer.vocab['!'].orth
 
-
 def test_contraction_punct(en_tokenizer):
-    tokens = en_tokenizer("(can't")
-    assert len(tokens) == 3
+    tokens = [w.text for w in en_tokenizer("(can't")]
+    assert tokens == ['(', 'ca', "n't"]
     tokens = en_tokenizer("`ain't")
     assert len(tokens) == 3
     tokens = en_tokenizer('''"isn't''')
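Comparing the list of token texts rather than just the token count makes a failure report show exactly which split went wrong. A sketch of the same check written as a parametrized pytest test (the parametrized form is an illustrative rewrite, not part of the commit; en_tokenizer is the fixture used throughout this file):

import pytest

@pytest.mark.parametrize('text,expected', [
    ("(can't", ['(', 'ca', "n't"]),
])
def test_contraction_punct_texts(en_tokenizer, text, expected):
    # On failure, pytest prints the full list diff instead of a bare length mismatch.
    assert [w.text for w in en_tokenizer(text)] == expected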