mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			98 lines
		
	
	
		
			1.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			98 lines
		
	
	
		
			1.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
# coding: utf8
 | 
						|
from __future__ import unicode_literals
 | 
						|
 | 
						|
from ...symbols import ORTH
 | 
						|
from .punctuation import _make_ro_variants
 | 
						|
 | 
						|
 | 
						|
# Maps each exception string to its tokenization: a one-element list holding
# a single {ORTH: <string>} attribute dict, i.e. the string stays one token.
_exc = {}

# Source: https://en.wiktionary.org/wiki/Category:Romanian_abbreviations
_WIKTIONARY_FORMS = [
    "1-a", "2-a", "3-a", "4-a", "5-a", "6-a",
    "7-a", "8-a", "9-a", "10-a", "11-a", "12-a",
    "1-ul",
    "2-lea", "3-lea", "4-lea", "5-lea", "6-lea", "7-lea",
    "8-lea", "9-lea", "10-lea", "11-lea", "12-lea",
    "d-voastră", "dvs.", "ing.", "dr.", "Rom.", "str.", "nr.", "etc.",
    "d.p.d.v.", "dpdv", "șamd.", "ș.a.m.d.",
]

# Source: UD_Romanian-RRT
_UD_RRT_FORMS = [
    "A.c.", "A.f.", "A.r.", "Al.", "Art.", "Aug.", "Bd.", "Dem.", "Dr.",
    "Fig.", "Fr.", "Gh.", "Gr.", "Lt.", "Nr.", "Obs.", "Prof.", "Sf.",
    "a.m.", "a.r.", "alin.", "art.", "d-l", "d-lui", "d-nei", "ex.",
    "fig.", "ian.", "lit.", "lt.", "p.a.", "p.m.", "pct.", "prep.", "sf.",
    "tel.", "univ.", "îngr.", "într-adevăr", "Șt.", "ș.a.",
]

for _form in _WIKTIONARY_FORMS + _UD_RRT_FORMS:
    # Each form is expanded on its own through _make_ro_variants (defined in
    # .punctuation; presumably yields alternative spellings of the form --
    # confirm there) and every returned variant is registered as-is.
    # NOTE: forms that only make sense capitalized are not treated
    # differently from the rest.
    _exc.update(
        (_variant, [{ORTH: _variant}]) for _variant in _make_ro_variants([_form])
    )


TOKENIZER_EXCEPTIONS = _exc
 |