Update tokenizer exceptions for Spanish

This commit is contained in:
Ines Montani 2016-12-21 18:06:17 +01:00
parent 920fa0fed2
commit d60380418e

View File

@ -7,312 +7,398 @@ from ..language_data import PRON_LEMMA
# Special-case tokenization rules for Spanish.
#
# Each key is a surface string; its value is the ordered list of sub-tokens
# the string is split into, each described by a dict of token attributes
# (ORTH, LEMMA, TAG, NORM).
#
# Three groups of entries live here:
#   * verb + enclitic pronoun ("decirle" -> "decir" + "le"),
#   * preposition + article contractions ("al", "del", "pel"),
#   * abbreviations that stay a single token but carry a LEMMA/NORM.
#
# NOTE(review): pronoun sub-tokens use the string "PRON" for TAG while the
# other sub-tokens use the bare names AUX / ADP / DET. This is preserved
# as-is; confirm the mix is intentional.
# NOTE(review): for gerund entries (e.g. "asegurándole") the ORTH of the verb
# drops the written accent ("asegurando"), so the ORTH values do not
# concatenate back to the key. Preserved as-is; confirm this is intended.
# NOTE(review): every verb sub-token is tagged AUX, including lexical verbs
# such as "robar". Preserved as-is; confirm against the tag map.
TOKENIZER_EXCEPTIONS = {
    "accidentarse": [
        {ORTH: "accidentar", LEMMA: "accidentar", TAG: AUX},
        {ORTH: "se", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "aceptarlo": [
        {ORTH: "aceptar", LEMMA: "aceptar", TAG: AUX},
        {ORTH: "lo", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "acompañarla": [
        {ORTH: "acompañar", LEMMA: "acompañar", TAG: AUX},
        {ORTH: "la", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "advertirle": [
        {ORTH: "advertir", LEMMA: "advertir", TAG: AUX},
        {ORTH: "le", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "al": [
        {ORTH: "a", LEMMA: "a", TAG: ADP},
        {ORTH: "el", LEMMA: "el", TAG: DET}
    ],

    "anunciarnos": [
        {ORTH: "anunciar", LEMMA: "anunciar", TAG: AUX},
        {ORTH: "nos", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "asegurándole": [
        {ORTH: "asegurando", LEMMA: "asegurar", TAG: AUX},
        {ORTH: "le", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "considerarle": [
        {ORTH: "considerar", LEMMA: "considerar", TAG: AUX},
        {ORTH: "le", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "decirle": [
        {ORTH: "decir", LEMMA: "decir", TAG: AUX},
        {ORTH: "le", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "decirles": [
        {ORTH: "decir", LEMMA: "decir", TAG: AUX},
        {ORTH: "les", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    # ORTH is lowercase to match the lowercase key (was "Decir").
    "decirte": [
        {ORTH: "decir", LEMMA: "decir", TAG: AUX},
        {ORTH: "te", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "dejarla": [
        {ORTH: "dejar", LEMMA: "dejar", TAG: AUX},
        {ORTH: "la", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "dejarnos": [
        {ORTH: "dejar", LEMMA: "dejar", TAG: AUX},
        {ORTH: "nos", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "dejándole": [
        {ORTH: "dejando", LEMMA: "dejar", TAG: AUX},
        {ORTH: "le", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "del": [
        {ORTH: "de", LEMMA: "de", TAG: ADP},
        {ORTH: "el", LEMMA: "el", TAG: DET}
    ],

    "demostrarles": [
        {ORTH: "demostrar", LEMMA: "demostrar", TAG: AUX},
        {ORTH: "les", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "diciéndole": [
        {ORTH: "diciendo", LEMMA: "decir", TAG: AUX},
        {ORTH: "le", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "diciéndoles": [
        {ORTH: "diciendo", LEMMA: "decir", TAG: AUX},
        {ORTH: "les", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    # Clitic lemma uses PRON_LEMMA like every other "se" entry (was "él").
    "diferenciarse": [
        {ORTH: "diferenciar", LEMMA: "diferenciar", TAG: AUX},
        {ORTH: "se", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "divirtiéndome": [
        {ORTH: "divirtiendo", LEMMA: "divertir", TAG: AUX},
        {ORTH: "me", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "ensanchándose": [
        {ORTH: "ensanchando", LEMMA: "ensanchar", TAG: AUX},
        {ORTH: "se", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "explicarles": [
        {ORTH: "explicar", LEMMA: "explicar", TAG: AUX},
        {ORTH: "les", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "haberla": [
        {ORTH: "haber", LEMMA: "haber", TAG: AUX},
        {ORTH: "la", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "haberlas": [
        {ORTH: "haber", LEMMA: "haber", TAG: AUX},
        {ORTH: "las", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "haberlo": [
        {ORTH: "haber", LEMMA: "haber", TAG: AUX},
        {ORTH: "lo", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "haberlos": [
        {ORTH: "haber", LEMMA: "haber", TAG: AUX},
        {ORTH: "los", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "haberme": [
        {ORTH: "haber", LEMMA: "haber", TAG: AUX},
        {ORTH: "me", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "haberse": [
        {ORTH: "haber", LEMMA: "haber", TAG: AUX},
        {ORTH: "se", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "hacerle": [
        {ORTH: "hacer", LEMMA: "hacer", TAG: AUX},
        {ORTH: "le", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "hacerles": [
        {ORTH: "hacer", LEMMA: "hacer", TAG: AUX},
        {ORTH: "les", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "hallarse": [
        {ORTH: "hallar", LEMMA: "hallar", TAG: AUX},
        {ORTH: "se", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "imaginaros": [
        {ORTH: "imaginar", LEMMA: "imaginar", TAG: AUX},
        {ORTH: "os", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "insinuarle": [
        {ORTH: "insinuar", LEMMA: "insinuar", TAG: AUX},
        {ORTH: "le", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "justificarla": [
        {ORTH: "justificar", LEMMA: "justificar", TAG: AUX},
        {ORTH: "la", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "mantenerlas": [
        {ORTH: "mantener", LEMMA: "mantener", TAG: AUX},
        {ORTH: "las", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "mantenerlos": [
        {ORTH: "mantener", LEMMA: "mantener", TAG: AUX},
        {ORTH: "los", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "mantenerme": [
        {ORTH: "mantener", LEMMA: "mantener", TAG: AUX},
        {ORTH: "me", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "pasarte": [
        {ORTH: "pasar", LEMMA: "pasar", TAG: AUX},
        {ORTH: "te", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    # Clitic lemma uses PRON_LEMMA like every other "le" entry (was "él").
    "pedirle": [
        {ORTH: "pedir", LEMMA: "pedir", TAG: AUX},
        {ORTH: "le", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    # NOTE(review): "pel" (per + el) looks Catalan rather than Spanish;
    # confirm it belongs in the Spanish exceptions.
    "pel": [
        {ORTH: "per", LEMMA: "per", TAG: ADP},
        {ORTH: "el", LEMMA: "el", TAG: DET}
    ],

    "pidiéndonos": [
        {ORTH: "pidiendo", LEMMA: "pedir", TAG: AUX},
        {ORTH: "nos", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "poderle": [
        {ORTH: "poder", LEMMA: "poder", TAG: AUX},
        {ORTH: "le", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "preguntarse": [
        {ORTH: "preguntar", LEMMA: "preguntar", TAG: AUX},
        {ORTH: "se", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "preguntándose": [
        {ORTH: "preguntando", LEMMA: "preguntar", TAG: AUX},
        {ORTH: "se", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "presentarla": [
        {ORTH: "presentar", LEMMA: "presentar", TAG: AUX},
        {ORTH: "la", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "pudiéndolo": [
        {ORTH: "pudiendo", LEMMA: "poder", TAG: AUX},
        {ORTH: "lo", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "pudiéndose": [
        {ORTH: "pudiendo", LEMMA: "poder", TAG: AUX},
        {ORTH: "se", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "quererle": [
        {ORTH: "querer", LEMMA: "querer", TAG: AUX},
        {ORTH: "le", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    # ORTH is lowercase to match the lowercase key (was "Rasgar").
    "rasgarse": [
        {ORTH: "rasgar", LEMMA: "rasgar", TAG: AUX},
        {ORTH: "se", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "repetirlo": [
        {ORTH: "repetir", LEMMA: "repetir", TAG: AUX},
        {ORTH: "lo", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "robarle": [
        {ORTH: "robar", LEMMA: "robar", TAG: AUX},
        {ORTH: "le", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "seguirlos": [
        {ORTH: "seguir", LEMMA: "seguir", TAG: AUX},
        {ORTH: "los", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "serle": [
        {ORTH: "ser", LEMMA: "ser", TAG: AUX},
        {ORTH: "le", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "serlo": [
        {ORTH: "ser", LEMMA: "ser", TAG: AUX},
        {ORTH: "lo", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "señalándole": [
        {ORTH: "señalando", LEMMA: "señalar", TAG: AUX},
        {ORTH: "le", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "suplicarle": [
        {ORTH: "suplicar", LEMMA: "suplicar", TAG: AUX},
        {ORTH: "le", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "tenerlos": [
        {ORTH: "tener", LEMMA: "tener", TAG: AUX},
        {ORTH: "los", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "vengarse": [
        {ORTH: "vengar", LEMMA: "vengar", TAG: AUX},
        {ORTH: "se", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "verla": [
        {ORTH: "ver", LEMMA: "ver", TAG: AUX},
        {ORTH: "la", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "verle": [
        {ORTH: "ver", LEMMA: "ver", TAG: AUX},
        {ORTH: "le", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    "volverlo": [
        {ORTH: "volver", LEMMA: "volver", TAG: AUX},
        {ORTH: "lo", LEMMA: PRON_LEMMA, TAG: "PRON"}
    ],

    # Abbreviations kept as a single token, with the expanded form as LEMMA.
    "aprox.": [
        {ORTH: "aprox.", LEMMA: "aproximadamente"}
    ],

    "dna.": [
        {ORTH: "dna.", LEMMA: "docena"}
    ],

    "esq.": [
        {ORTH: "esq.", LEMMA: "esquina"}
    ],

    "pág.": [
        {ORTH: "pág.", LEMMA: "página"}
    ],

    "p.ej.": [
        {ORTH: "p.ej.", LEMMA: "por ejemplo"}
    ],

    # Abbreviated forms of "usted"/"ustedes": pronoun lemma plus a NORM
    # carrying the spelled-out form.
    "Ud.": [
        {ORTH: "Ud.", LEMMA: PRON_LEMMA, NORM: "usted"}
    ],

    "Vd.": [
        {ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"}
    ],

    "Uds.": [
        {ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}
    ],

    "Vds.": [
        {ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}
    ]
}
# Strings that must be kept as a single token (trailing period included)
# and that need no attributes beyond their surface form: single letters
# followed by a period and common Spanish abbreviations.
#
# Every element needs its own trailing comma: Python silently concatenates
# adjacent string literals, so a missing comma merges two abbreviations
# into one bogus entry and drops both real ones.
ORTH_ONLY = [
    "a.",
    "a.C.",
    "a.J.C.",
    "apdo.",
    "Av.",
    "Avda.",
    "b.",
    "c.",
    "Cía.",
    "d.",
    "e.",
    "etc.",
    "f.",
    "g.",
    "Gob.",
    "Gral.",
    "h.",
    "i.",
    "Ing.",
    "j.",
    "J.C.",
    "k.",
    "l.",
    "Lic.",
    "m.",
    "m.n.",
    "n.",
    "no.",
    "núm.",
    "o.",
    "p.",
    "P.D.",
    "Prof.",
    "Profa.",
    "q.",
    "q.e.p.d.",
    "r.",
    "s.",
    "S.A.",
    "S.L.",
    "s.s.s.",
    "Sr.",
    "Sra.",
    "Srta.",
    "t.",
    "u.",
    "v.",
    "w.",
    "x.",
    "y.",
    "z."
]