mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			27 lines
		
	
	
		
			783 B
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			27 lines
		
	
	
		
			783 B
		
	
	
	
		
			Python
		
	
	
	
	
	
| # coding: utf8
 | |
| from __future__ import unicode_literals
 | |
| 
 | |
| import regex as re
 | |
| from ...lang.en import English
 | |
| from ...tokenizer import Tokenizer
 | |
| 
 | |
| 
 | |
def test_issue1488():
    """Regression test for spaCy issue #1488: a custom Tokenizer built with
    a token_match callback must not emit empty tokens."""
    # Compile all tokenization patterns up front, keyed by role.
    compiled = {
        "prefix": re.compile(r'''[\[\("']'''),
        "suffix": re.compile(r'''[\]\)"']'''),
        "infix": re.compile(r'''[-~\.]'''),
        "url": re.compile(r'''^https?://'''),
    }

    def build_tokenizer(pipeline):
        # Bare Tokenizer (no special-case rules) wired to the patterns above.
        return Tokenizer(
            pipeline.vocab,
            {},
            prefix_search=compiled["prefix"].search,
            suffix_search=compiled["suffix"].search,
            infix_finditer=compiled["infix"].finditer,
            token_match=compiled["url"].match,
        )

    nlp = English()
    nlp.tokenizer = build_tokenizer(nlp)
    doc = nlp("This is a test.")
    # Every token produced by the custom tokenizer must have non-empty text.
    for tok in doc:
        assert tok.text
 |