mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 21:21:10 +03:00 
			
		
		
		
	updated tokenizer exceptions
This commit is contained in:
		
							parent
							
								
									edec51b1b1
								
							
						
					
					
						commit
						6eee7a7411
					
				|  | @ -1,12 +1,51 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| import regex as re | ||||
| 
 | ||||
| from ._tokenizer_exceptions_list import ID_BASE_EXCEPTIONS | ||||
| from ..tokenizer_exceptions import URL_PATTERN | ||||
| from ..char_classes import ALPHA | ||||
| from ...symbols import ORTH | ||||
| 
 | ||||
| 
 | ||||
| _exc = {} | ||||
| 
 | ||||
| for orth in ID_BASE_EXCEPTIONS + ["etc."]: | ||||
| for orth in ID_BASE_EXCEPTIONS: | ||||
|     _exc[orth] = [{ORTH: orth}] | ||||
| 
 | ||||
| TOKENIZER_EXCEPTIONS = dict(_exc) | ||||
|     orth_title = orth.title() | ||||
|     _exc[orth_title] = [{ORTH: orth_title}] | ||||
| 
 | ||||
|     orth_caps = orth.upper() | ||||
|     _exc[orth_caps] = [{ORTH: orth_caps}] | ||||
| 
 | ||||
|     orth_lower = orth.lower() | ||||
|     _exc[orth_lower] = [{ORTH: orth_lower}] | ||||
| 
 | ||||
|     if '-' in orth: | ||||
|         orth_title = '-'.join([part.title() for part in orth.split('-')]) | ||||
|         _exc[orth_title] = [{ORTH: orth_title}] | ||||
| 
 | ||||
|         orth_caps = '-'.join([part.upper() for part in orth.split('-')]) | ||||
|         _exc[orth_caps] = [{ORTH: orth_caps}] | ||||
| 
 | ||||
| 
 | ||||
| _hyphen_prefix = """Abdur Abdus Abou Aboul Abror Abshar Abu Abubakar Abul | ||||
| Aero Agri Agro Ahmadi Ahmed Air abd abdel abdul ad adz afro al ala ali all | ||||
| amir an antar anti ar as ash asy at ath az bekas ber best bi co di double | ||||
| dual duo e eco eks el era ex full hi high i in inter intra ke kontra korona | ||||
| kuartal lintas m macro makro me mem meng micro mid mikro mini multi neo nge | ||||
| no non on pan pasca pe pem poli poly post pra pre pro re se self serba seri | ||||
| sub super t trans ultra un x """.split() | ||||
| 
 | ||||
| _hyphen_infix = """me-kan me-kannya men-kan men-kannya meng-kannya ke-an | ||||
| ke-annya di-kan di-kannya de-isasi ber-an berke-an""".split() | ||||
| 
 | ||||
| _regular_exp = ['^{p}-*$'.format(p=prefix) for prefix in _hyphen_prefix] | ||||
| _regular_exp += ['^{0}-*-{1}$'.format(*infix.split('-')) for infix in _hyphen_infix] | ||||
| _regular_exp.append(URL_PATTERN) | ||||
| 
 | ||||
| 
 | ||||
| TOKENIZER_EXCEPTIONS = dict(_exc) | ||||
| TOKEN_MATCH = re.compile('|'.join('(?:{})'.format(m) for m in _regular_exp), re.IGNORECASE).match | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user