mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			54 lines
		
	
	
		
			1.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			54 lines
		
	
	
		
			1.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
						||
from ...symbols import ORTH, NORM
 | 
						||
from ...util import update_exc
 | 
						||
 | 
						||
 | 
						||
# TODO
 | 
						||
# treat other apostrophes within words as part of the word: [op d'mannst], [fir d'éischt] (= exceptions)
 | 
						||
 | 
						||
_exc = {}
 | 
						||
 | 
						||
# translate / delete what is not necessary
 | 
						||
for exc_data in [
 | 
						||
    {ORTH: "’t", NORM: "et"},
 | 
						||
    {ORTH: "’T", NORM: "et"},
 | 
						||
    {ORTH: "'t", NORM: "et"},
 | 
						||
    {ORTH: "'T", NORM: "et"},
 | 
						||
    {ORTH: "wgl.", NORM: "wannechgelift"},
 | 
						||
    {ORTH: "M.", NORM: "Monsieur"},
 | 
						||
    {ORTH: "Mme.", NORM: "Madame"},
 | 
						||
    {ORTH: "Dr.", NORM: "Dokter"},
 | 
						||
    {ORTH: "Tel.", NORM: "Telefon"},
 | 
						||
    {ORTH: "asw.", NORM: "an sou weider"},
 | 
						||
    {ORTH: "etc.", NORM: "et cetera"},
 | 
						||
    {ORTH: "bzw.", NORM: "bezéiungsweis"},
 | 
						||
    {ORTH: "Jan.", NORM: "Januar"},
 | 
						||
]:
 | 
						||
    _exc[exc_data[ORTH]] = [exc_data]
 | 
						||
 | 
						||
 | 
						||
# to be extended
 | 
						||
for orth in [
 | 
						||
    "z.B.",
 | 
						||
    "Dipl.",
 | 
						||
    "Dr.",
 | 
						||
    "etc.",
 | 
						||
    "i.e.",
 | 
						||
    "o.k.",
 | 
						||
    "O.K.",
 | 
						||
    "p.a.",
 | 
						||
    "p.s.",
 | 
						||
    "P.S.",
 | 
						||
    "phil.",
 | 
						||
    "q.e.d.",
 | 
						||
    "R.I.P.",
 | 
						||
    "rer.",
 | 
						||
    "sen.",
 | 
						||
    "ë.a.",
 | 
						||
    "U.S.",
 | 
						||
    "U.S.A.",
 | 
						||
]:
 | 
						||
    _exc[orth] = [{ORTH: orth}]
 | 
						||
 | 
						||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
 |