# Mirror of https://github.com/explosion/spaCy.git
# (synced 2025-11-04 09:57:26 +03:00)
# 49 lines, 1.5 KiB, Python

from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
						|
from ...symbols import ORTH, NORM
 | 
						|
from ...util import update_exc
 | 
						|
 | 
						|
 | 
						|
# Arabic-specific tokenizer exceptions. Each key is the surface form (ORTH)
# and each value is a one-element list holding that token's attributes.
#
# Time: era and calendar abbreviations (BC, AD, Gregorian, Hijri) and "died".
_exc = {
    orth: [{NORM: norm, ORTH: orth}]
    for orth, norm in (
        ("ق.م", "قبل الميلاد"),
        ("ب. م", "بعد الميلاد"),
        (".م", "ميلادي"),
        (".هـ", "هجري"),
        (".ت", "توفي"),
    )
}
 | 
						|
 | 
						|
# Scientific abbreviations: shorthand found in scholarly and classical texts
# (honorific formula, "et cetera", narration verbs, source references).
# Every entry is stored as a single-token exception keyed by its ORTH.
_exc.update(
    {
        orth: [{NORM: norm, ORTH: orth}]
        for orth, norm in (
            ("صلعم", "صلى الله عليه وسلم"),
            ("الشـ", "الشارح"),
            ("الظـ", "الظاهر"),
            ("أيضـ", "أيضًا"),
            ("إلخ", "إلى آخره"),
            ("اهـ", "انتهى"),
            ("ثنا", "حدّثنا"),
            ("ثنى", "حدثني"),
            ("أنا", "أنبأنا"),
            ("نا", "أخبرنا"),
            ("م. س", "مصدر سابق"),
            ("م. ن", "مصدر نفسه"),
        )
    }
)
 | 
						|
 | 
						|
# Other abbreviations: academic and professional titles
# (doctor, professor doctor, professor).
_exc.update(
    {
        orth: [{NORM: norm, ORTH: orth}]
        for orth, norm in (
            ("د.", "دكتور"),
            ("أ.د", "أستاذ دكتور"),
            ("أ.", "أستاذ"),
            ("ب.", "بروفيسور"),
        )
    }
)
 | 
						|
 | 
						|
# Contact details: telephone and P.O. box abbreviations.
_exc.update(
    {
        orth: [{NORM: norm, ORTH: orth}]
        for orth, norm in (
            ("ت.", "تلفون"),
            ("ص.ب", "صندوق بريد"),
        )
    }
)

# Public exception table: the result of update_exc applied to the shared base
# exceptions and the Arabic entries collected in _exc.
# NOTE(review): update_exc is defined elsewhere; presumably it merges the two
# mappings -- confirm against its definition.
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
 |