From 1d64527727795109cd3510fe3788fd3e5857b4ce Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 23 Dec 2016 21:35:03 +0100 Subject: [PATCH] Update Spanish tokenizer Remove reflexive pronouns as they're part of an open class, fix mistakes and add exceptions --- spacy/es/tokenizer_exceptions.py | 303 ++----------------------------- 1 file changed, 19 insertions(+), 284 deletions(-) diff --git a/spacy/es/tokenizer_exceptions.py b/spacy/es/tokenizer_exceptions.py index f31e1bb4d..f9259ce93 100644 --- a/spacy/es/tokenizer_exceptions.py +++ b/spacy/es/tokenizer_exceptions.py @@ -2,313 +2,48 @@ from __future__ import unicode_literals from ..symbols import * -from ..language_data import PRON_LEMMA +from ..language_data import PRON_LEMMA, DET_LEMMA TOKENIZER_EXCEPTIONS = { - "accidentarse": [ - {ORTH: "accidentar", LEMMA: "accidentar", TAG: AUX}, - {ORTH: "se", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "aceptarlo": [ - {ORTH: "aceptar", LEMMA: "aceptar", TAG: AUX}, - {ORTH: "lo", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "acompañarla": [ - {ORTH: "acompañar", LEMMA: "acompañar", TAG: AUX}, - {ORTH: "la", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "advertirle": [ - {ORTH: "advertir", LEMMA: "advertir", TAG: AUX}, - {ORTH: "le", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - "al": [ {ORTH: "a", LEMMA: "a", TAG: ADP}, {ORTH: "el", LEMMA: "el", TAG: DET} ], - "anunciarnos": [ - {ORTH: "anunciar", LEMMA: "anunciar", TAG: AUX}, - {ORTH: "nos", LEMMA: PRON_LEMMA, TAG: "PRON"} + "consigo": [ + {ORTH: "con", LEMMA: "con"}, + {ORTH: "sigo", LEMMA: PRON_LEMMA, NORM: "sí"} ], - "asegurándole": [ - {ORTH: "asegurando", LEMMA: "asegurar", TAG: AUX}, - {ORTH: "le", LEMMA: PRON_LEMMA, TAG: "PRON"} + "conmigo": [ + {ORTH: "con", LEMMA: "con"}, + {ORTH: "migo", LEMMA: PRON_LEMMA, NORM: "mí"} ], - "considerarle": [ - {ORTH: "considerar", LEMMA: "considerar", TAG: AUX}, - {ORTH: "le", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "decirle": [ - {ORTH: "decir", LEMMA: "decir", TAG: AUX}, - {ORTH: "le", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "decirles": [ - {ORTH: "decir", LEMMA: "decir", TAG: AUX}, - {ORTH: "les", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "decirte": [ - {ORTH: "Decir", LEMMA: "decir", TAG: AUX}, - {ORTH: "te", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "dejarla": [ - {ORTH: "dejar", LEMMA: "dejar", TAG: AUX}, - {ORTH: "la", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "dejarnos": [ - {ORTH: "dejar", LEMMA: "dejar", TAG: AUX}, - {ORTH: "nos", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "dejándole": [ - {ORTH: "dejando", LEMMA: "dejar", TAG: AUX}, - {ORTH: "le", LEMMA: PRON_LEMMA, TAG: "PRON"} + "contigo": [ + {ORTH: "con", LEMMA: "con"}, + {ORTH: "tigo", LEMMA: PRON_LEMMA, NORM: "ti"} ], "del": [ {ORTH: "de", LEMMA: "de", TAG: ADP}, - {ORTH: "el", LEMMA: "el", TAG: DET} - ], - - "demostrarles": [ - {ORTH: "demostrar", LEMMA: "demostrar", TAG: AUX}, - {ORTH: "les", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "diciéndole": [ - {ORTH: "diciendo", LEMMA: "decir", TAG: AUX}, - {ORTH: "le", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "diciéndoles": [ - {ORTH: "diciendo", LEMMA: "decir", TAG: AUX}, - {ORTH: "les", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "diferenciarse": [ - {ORTH: "diferenciar", LEMMA: "diferenciar", TAG: AUX}, - {ORTH: "se", LEMMA: "él", TAG: "PRON"} - ], - - "divirtiéndome": [ - {ORTH: "divirtiendo", LEMMA: "divertir", TAG: AUX}, - {ORTH: "me", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "ensanchándose": [ - {ORTH: "ensanchando", LEMMA: "ensanchar", TAG: AUX}, - {ORTH: "se", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "explicarles": [ - {ORTH: "explicar", LEMMA: "explicar", TAG: AUX}, - {ORTH: "les", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "haberla": [ - {ORTH: "haber", LEMMA: "haber", TAG: AUX}, - {ORTH: "la", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "haberlas": [ - {ORTH: "haber", LEMMA: "haber", TAG: AUX}, - {ORTH: "las", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "haberlo": [ - {ORTH: "haber", LEMMA: "haber", TAG: AUX}, - {ORTH: "lo", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "haberlos": [ - {ORTH: "haber", LEMMA: "haber", TAG: AUX}, - {ORTH: "los", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "haberme": [ - {ORTH: "haber", LEMMA: "haber", TAG: AUX}, - {ORTH: "me", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "haberse": [ - {ORTH: "haber", LEMMA: "haber", TAG: AUX}, - {ORTH: "se", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "hacerle": [ - {ORTH: "hacer", LEMMA: "hacer", TAG: AUX}, - {ORTH: "le", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "hacerles": [ - {ORTH: "hacer", LEMMA: "hacer", TAG: AUX}, - {ORTH: "les", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "hallarse": [ - {ORTH: "hallar", LEMMA: "hallar", TAG: AUX}, - {ORTH: "se", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "imaginaros": [ - {ORTH: "imaginar", LEMMA: "imaginar", TAG: AUX}, - {ORTH: "os", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "insinuarle": [ - {ORTH: "insinuar", LEMMA: "insinuar", TAG: AUX}, - {ORTH: "le", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "justificarla": [ - {ORTH: "justificar", LEMMA: "justificar", TAG: AUX}, - {ORTH: "la", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "mantenerlas": [ - {ORTH: "mantener", LEMMA: "mantener", TAG: AUX}, - {ORTH: "las", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "mantenerlos": [ - {ORTH: "mantener", LEMMA: "mantener", TAG: AUX}, - {ORTH: "los", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "mantenerme": [ - {ORTH: "mantener", LEMMA: "mantener", TAG: AUX}, - {ORTH: "me", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "pasarte": [ - {ORTH: "pasar", LEMMA: "pasar", TAG: AUX}, - {ORTH: "te", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "pedirle": [ - {ORTH: "pedir", LEMMA: "pedir", TAG: AUX}, - {ORTH: "le", LEMMA: "él", TAG: "PRON"} + {ORTH: "l", LEMMA: "el", TAG: DET} ], "pel": [ - {ORTH: "per", LEMMA: "per", TAG: ADP}, - {ORTH: "el", LEMMA: "el", TAG: DET} + {ORTH: "pe", LEMMA: "per", TAG: ADP}, + {ORTH: "l", LEMMA: "el", TAG: DET} ], - "pidiéndonos": [ - {ORTH: "pidiendo", LEMMA: "pedir", TAG: AUX}, - {ORTH: "nos", LEMMA: PRON_LEMMA, TAG: "PRON"} + "pal": [ + {ORTH: "pa", LEMMA: "para"}, + {ORTH: "l", LEMMA: DET_LEMMA, NORM: "el"} ], - "poderle": [ - {ORTH: "poder", LEMMA: "poder", TAG: AUX}, - {ORTH: "le", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "preguntarse": [ - {ORTH: "preguntar", LEMMA: "preguntar", TAG: AUX}, - {ORTH: "se", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "preguntándose": [ - {ORTH: "preguntando", LEMMA: "preguntar", TAG: AUX}, - {ORTH: "se", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "presentarla": [ - {ORTH: "presentar", LEMMA: "presentar", TAG: AUX}, - {ORTH: "la", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "pudiéndolo": [ - {ORTH: "pudiendo", LEMMA: "poder", TAG: AUX}, - {ORTH: "lo", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "pudiéndose": [ - {ORTH: "pudiendo", LEMMA: "poder", TAG: AUX}, - {ORTH: "se", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "quererle": [ - {ORTH: "querer", LEMMA: "querer", TAG: AUX}, - {ORTH: "le", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "rasgarse": [ - {ORTH: "Rasgar", LEMMA: "rasgar", TAG: AUX}, - {ORTH: "se", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "repetirlo": [ - {ORTH: "repetir", LEMMA: "repetir", TAG: AUX}, - {ORTH: "lo", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "robarle": [ - {ORTH: "robar", LEMMA: "robar", TAG: AUX}, - {ORTH: "le", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "seguirlos": [ - {ORTH: "seguir", LEMMA: "seguir", TAG: AUX}, - {ORTH: "los", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "serle": [ - {ORTH: "ser", LEMMA: "ser", TAG: AUX}, - {ORTH: "le", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "serlo": [ - {ORTH: "ser", LEMMA: "ser", TAG: AUX}, - {ORTH: "lo", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "señalándole": [ - {ORTH: "señalando", LEMMA: "señalar", TAG: AUX}, - {ORTH: "le", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "suplicarle": [ - {ORTH: "suplicar", LEMMA: "suplicar", TAG: AUX}, - {ORTH: "le", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "tenerlos": [ - {ORTH: "tener", LEMMA: "tener", TAG: AUX}, - {ORTH: "los", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "vengarse": [ - {ORTH: "vengar", LEMMA: "vengar", TAG: AUX}, - {ORTH: "se", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "verla": [ - {ORTH: "ver", LEMMA: "ver", TAG: AUX}, - {ORTH: "la", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "verle": [ - {ORTH: "ver", LEMMA: "ver", TAG: AUX}, - {ORTH: "le", LEMMA: PRON_LEMMA, TAG: "PRON"} - ], - - "volverlo": [ - {ORTH: "volver", LEMMA: "volver", TAG: AUX}, - {ORTH: "lo", LEMMA: PRON_LEMMA, TAG: "PRON"} + "pala": [ + {ORTH: "pa", LEMMA: "para"}, + {ORTH: "la", LEMMA: DET_LEMMA} ], "aprox.": [