mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-27 10:26:35 +03:00
56 lines
716 B
Python
56 lines
716 B
Python
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
|
from ...symbols import ORTH
|
|
from ...util import update_exc
|
|
|
|
|
|
_exc = {}
|
|
|
|
|
|
for orth in [
|
|
"Adm.",
|
|
"Art.",
|
|
"art.",
|
|
"Av.",
|
|
"av.",
|
|
"Cia.",
|
|
"dom.",
|
|
"Dr.",
|
|
"dr.",
|
|
"e.g.",
|
|
"E.g.",
|
|
"E.G.",
|
|
"e/ou",
|
|
"ed.",
|
|
"eng.",
|
|
"etc.",
|
|
"Fund.",
|
|
"Gen.",
|
|
"Gov.",
|
|
"i.e.",
|
|
"I.e.",
|
|
"I.E.",
|
|
"Inc.",
|
|
"Jr.",
|
|
"km/h",
|
|
"Ltd.",
|
|
"Mr.",
|
|
"p.m.",
|
|
"Ph.D.",
|
|
"Rep.",
|
|
"Rev.",
|
|
"S/A",
|
|
"Sen.",
|
|
"Sr.",
|
|
"sr.",
|
|
"Sra.",
|
|
"sra.",
|
|
"vs.",
|
|
"tel.",
|
|
"pág.",
|
|
"pag.",
|
|
]:
|
|
_exc[orth] = [{ORTH: orth}]
|
|
|
|
|
|
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|