mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			27 lines
		
	
	
		
			703 B
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			27 lines
		
	
	
		
			703 B
		
	
	
	
		
			Python
		
	
	
	
	
	
# encoding: utf8
 | 
						|
from __future__ import unicode_literals
 | 
						|
 | 
						|
from ._tokenizer_exceptions_list import PL_BASE_EXCEPTIONS
 | 
						|
from ...symbols import POS, ADV, NOUN, ORTH, LEMMA, ADJ
 | 
						|
 | 
						|
 | 
						|
# Tokenizer special cases: each key is the exact surface string, each value is
# a one-element list holding the attribute dict for that single token.
_exc = {}

# Abbreviations that carry an expanded lemma and a coarse part-of-speech tag.
for exc_data in [
    {ORTH: "m.in.", LEMMA: "między innymi", POS: ADV},
    {ORTH: "inż.", LEMMA: "inżynier", POS: NOUN},
    {ORTH: "mgr.", LEMMA: "magister", POS: NOUN},
    {ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
    {ORTH: "tj.", LEMMA: "to jest", POS: ADV},
    {ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ},
]:
    _exc[exc_data[ORTH]] = [exc_data]

# Strings kept as a single token with no attributes beyond ORTH.
# setdefault() is used instead of plain assignment so that a string which is
# already registered above keeps its LEMMA/POS annotation rather than being
# overwritten by a bare {ORTH: ...} entry.
for orth in ["w.", "r."]:
    _exc.setdefault(orth, [{ORTH: orth}])

# Same treatment for the shared base list imported from
# ._tokenizer_exceptions_list.
# NOTE(review): if that list repeats any of the annotated abbreviations, the
# annotated version now wins -- confirm this is the intended precedence.
for orth in PL_BASE_EXCEPTIONS:
    _exc.setdefault(orth, [{ORTH: orth}])

# Public name exported by this module.
TOKENIZER_EXCEPTIONS = _exc
 |