mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-29 11:26:28 +03:00
84 lines
1.3 KiB
Python
84 lines
1.3 KiB
Python
# coding: utf8
|
|
from __future__ import unicode_literals
|
|
|
|
from ..symbols import *
|
|
from ..language_data import PRON_LEMMA, DET_LEMMA
|
|
|
|
|
|
# Special-case table keyed by the exact surface string. Each value lists, in
# order, the attribute dicts of the token(s) recorded for that string.
TOKENIZER_EXCEPTIONS = {
    # Contractions: two entries each, whose ORTH pieces join back into the key.
    "pal": [
        {ORTH: "pa", LEMMA: "para"},
        {ORTH: "l", LEMMA: DET_LEMMA, NORM: "el"},
    ],
    "pala": [
        {ORTH: "pa", LEMMA: "para"},
        {ORTH: "la", LEMMA: DET_LEMMA, NORM: "la"},
    ],

    # Abbreviations kept whole; LEMMA carries the spelled-out form.
    "aprox.": [{ORTH: "aprox.", LEMMA: "aproximadamente"}],
    "dna.": [{ORTH: "dna.", LEMMA: "docena"}],
    "esq.": [{ORTH: "esq.", LEMMA: "esquina"}],
    "pág.": [{ORTH: "pág.", LEMMA: "página"}],
    "p.ej.": [{ORTH: "p.ej.", LEMMA: "por ejemplo"}],

    # Abbreviated pronouns: LEMMA is PRON_LEMMA, NORM is the full word.
    "Ud.": [{ORTH: "Ud.", LEMMA: PRON_LEMMA, NORM: "usted"}],
    "Vd.": [{ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"}],
    "Uds.": [{ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}],
    "Vds.": [{ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}],
}
|
|
|
|
|
|
# Abbreviations listed by surface form only (no LEMMA/NORM attached).
# NOTE(review): presumably each entry is registered as a single-token
# exception by the code that consumes this list -- confirm against the caller.
#
# Every item, including the last, ends with a comma: adjacent string literals
# without one are silently concatenated by Python, which previously fused
# "q.e.p.d." and "S.A." into the single bogus entry "q.e.p.d.S.A.".
ORTH_ONLY = [
    "a.C.",
    "a.J.C.",
    "apdo.",
    "Av.",
    "Avda.",
    "Cía.",
    "etc.",
    "Gob.",
    "Gral.",
    "Ing.",
    "J.C.",
    "Lic.",
    "m.n.",
    "no.",
    "núm.",
    "P.D.",
    "Prof.",
    "Profa.",
    "q.e.p.d.",
    "S.A.",
    "S.L.",
    "s.s.s.",
    "Sr.",
    "Sra.",
    "Srta.",
]
|