mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	- added some tests for tokenization issues - fixed some issues with tokenization of words with hyphen infix - rewrote the "tokenizer_exceptions.py" file (stemming from the German version)
		
			
				
	
	
		
			17 lines
		
	
	
		
			459 B
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			17 lines
		
	
	
		
			459 B
		
	
	
	
		
			Python
		
	
	
	
	
	
# coding: utf8
from __future__ import unicode_literals

# TODO
# norm exceptions: find a way to deal with the zillions of spelling
# variants (vläicht = vlaicht, vleicht, viläicht, viläischt, etc. etc.)
# here one could include the most common spelling mistakes

# Non-canonical spelling -> canonical (normalised) form.
_exc = {"dass": "datt", "viläicht": "vläicht"}

# Register every exception under both its lower-case and its
# Title-case spelling, all pointing at the same canonical norm.
NORM_EXCEPTIONS = {
    variant: canonical
    for word, canonical in _exc.items()
    for variant in (word, word.title())
}