mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-11 12:18:04 +03:00
114 lines
2.0 KiB
Python
114 lines
2.0 KiB
Python
# coding: utf8
|
|
from __future__ import unicode_literals
|
|
|
|
from ..symbols import *
|
|
from ..language_data import PRON_LEMMA, DET_LEMMA
|
|
|
|
|
|
# Special-case table: each key is a surface string, each value is the list of
# per-token attribute dicts that string is split into (a one-element list
# keeps the string as a single token while attaching a lemma/norm).
#
# Invariant: the ORTH values of an entry, concatenated in order, must spell
# the key exactly -- spaCy rejects or mis-aligns special cases that do not.
TOKENIZER_EXCEPTIONS = {
    # --- Preposition + article contractions ---
    "al": [
        {ORTH: "a", LEMMA: "a", TAG: ADP},
        # ORTH is the literal remainder "l" (not "el") so that "a" + "l"
        # rebuilds "al"; the full form is carried by LEMMA, as in "del".
        {ORTH: "l", LEMMA: "el", TAG: DET},
    ],

    # --- "con" + pronoun fused forms; NORM holds the free-standing pronoun ---
    "consigo": [
        {ORTH: "con", LEMMA: "con"},
        {ORTH: "sigo", LEMMA: PRON_LEMMA, NORM: "sí"},
    ],

    "conmigo": [
        {ORTH: "con", LEMMA: "con"},
        {ORTH: "migo", LEMMA: PRON_LEMMA, NORM: "mí"},
    ],

    "contigo": [
        {ORTH: "con", LEMMA: "con"},
        {ORTH: "tigo", LEMMA: PRON_LEMMA, NORM: "ti"},
    ],

    "del": [
        {ORTH: "de", LEMMA: "de", TAG: ADP},
        {ORTH: "l", LEMMA: "el", TAG: DET},
    ],

    "pel": [
        {ORTH: "pe", LEMMA: "per", TAG: ADP},
        {ORTH: "l", LEMMA: "el", TAG: DET},
    ],

    # "pa" is lemmatised to "para" here (shortened "para el" / "para la").
    "pal": [
        {ORTH: "pa", LEMMA: "para"},
        {ORTH: "l", LEMMA: DET_LEMMA, NORM: "el"},
    ],

    "pala": [
        {ORTH: "pa", LEMMA: "para"},
        {ORTH: "la", LEMMA: DET_LEMMA},
    ],

    # --- Abbreviations kept as one token, with the expanded form as lemma ---
    "aprox.": [
        {ORTH: "aprox.", LEMMA: "aproximadamente"},
    ],

    "dna.": [
        {ORTH: "dna.", LEMMA: "docena"},
    ],

    "esq.": [
        {ORTH: "esq.", LEMMA: "esquina"},
    ],

    "pág.": [
        {ORTH: "pág.", LEMMA: "página"},
    ],

    "p.ej.": [
        {ORTH: "p.ej.", LEMMA: "por ejemplo"},
    ],

    # --- Abbreviated formal pronouns; NORM holds the spelled-out pronoun ---
    "Ud.": [
        {ORTH: "Ud.", LEMMA: PRON_LEMMA, NORM: "usted"},
    ],

    "Vd.": [
        {ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"},
    ],

    "Uds.": [
        {ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"},
    ],

    "Vds.": [
        {ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"},
    ],
}
|
|
|
|
|
|
# Abbreviations that carry no extra attributes (no lemma/norm override), only
# their literal text. Presumably each is registered as a single-token
# exception keyed by its own text -- confirm against the code that consumes
# ORTH_ONLY.
#
# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated by Python, which would merge two abbreviations into
# one bogus entry and drop both originals.
ORTH_ONLY = [
    "a.C.",
    "a.J.C.",
    "apdo.",
    "Av.",
    "Avda.",
    "Cía.",
    "etc.",
    "Gob.",
    "Gral.",
    "Ing.",
    "J.C.",
    "Lic.",
    "m.n.",
    "no.",
    "núm.",
    "P.D.",
    "Prof.",
    "Profa.",
    "q.e.p.d.",
    "S.A.",
    "S.L.",
    "s.s.s.",
    "Sr.",
    "Sra.",
    "Srta.",
]
|