diff --git a/spacy/lang/es/examples.py b/spacy/lang/es/examples.py
index 0e31b56af..7ab0a7dfe 100644
--- a/spacy/lang/es/examples.py
+++ b/spacy/lang/es/examples.py
@@ -18,5 +18,9 @@ sentences = [
     "El gato come pescado.",
     "Veo al hombre con el telescopio.",
     "La araña come moscas.",
-    "El pingüino incuba en su nido.",
+    "El pingüino incuba en su nido sobre el hielo.",
+    "¿Dónde estáis?",
+    "¿Quién es el presidente francés?",
+    "¿Dónde se encuentra la capital de Argentina?",
+    "¿Cuándo nació José de San Martín?",
 ]
diff --git a/spacy/lang/es/tokenizer_exceptions.py b/spacy/lang/es/tokenizer_exceptions.py
index 2c2631086..891323705 100644
--- a/spacy/lang/es/tokenizer_exceptions.py
+++ b/spacy/lang/es/tokenizer_exceptions.py
@@ -4,15 +4,16 @@ from __future__ import unicode_literals
 
 from ...symbols import ORTH, LEMMA, NORM, PRON_LEMMA
 
-_exc = {
-    "pal": [{ORTH: "pa", LEMMA: "para"}, {ORTH: "l", LEMMA: "el", NORM: "el"}],
-    "pala": [{ORTH: "pa", LEMMA: "para"}, {ORTH: "la", LEMMA: "la", NORM: "la"}],
-}
+_exc = {}
 
 
 for exc_data in [
+    {ORTH: "n°", LEMMA: "número"},
+    {ORTH: "°C", LEMMA: "grados Celsius"},
     {ORTH: "aprox.", LEMMA: "aproximadamente"},
     {ORTH: "dna.", LEMMA: "docena"},
+    {ORTH: "dpto.", LEMMA: "departamento"},
+    {ORTH: "ej.", LEMMA: "ejemplo"},
     {ORTH: "esq.", LEMMA: "esquina"},
     {ORTH: "pág.", LEMMA: "página"},
     {ORTH: "p.ej.", LEMMA: "por ejemplo"},
@@ -20,6 +21,8 @@ for exc_data in [
     {ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"},
     {ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"},
     {ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"},
+    {ORTH: "vol.", NORM: "volumen"},
+
 ]:
     _exc[exc_data[ORTH]] = [exc_data]
 
@@ -39,10 +42,14 @@ for h in range(1, 12 + 1):
 for orth in [
     "a.C.",
     "a.J.C.",
+    "d.C.",
+    "d.J.C.",
     "apdo.",
     "Av.",
     "Avda.",
     "Cía.",
+    "Dr.",
+    "Dra.",
     "EE.UU.",
     "etc.",
     "fig.",
@@ -58,8 +65,10 @@ for orth in [
     "Prof.",
     "Profa.",
     "q.e.p.d.",
+    "Q.E.P.D.",
     "S.A.",
     "S.L.",
+    "S.R.L.",
     "s.s.s.",
     "Sr.",
     "Sra.",
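For reviewers who want to verify the new entries locally, here is a minimal sanity-check sketch, not part of the change itself. It assumes a spaCy v2.x source checkout (the `LEMMA` field in tokenizer exceptions used in this diff is a v2 API): each new abbreviation should survive tokenization as a single token instead of being split at the period or the degree sign, and the new example sentences should tokenize without errors.

```python
# Sanity-check sketch for this PR, assuming a spaCy v2.x checkout.
from spacy.lang.es import Spanish
from spacy.lang.es.examples import sentences

nlp = Spanish()  # blank Spanish pipeline; tokenizer exceptions apply here

# Each new exception should come out as a single token.
for text in ["n°", "°C", "dpto.", "ej.", "vol.", "d.C.", "Dra.", "Q.E.P.D.", "S.R.L."]:
    doc = nlp(text)
    assert len(doc) == 1, (text, [t.text for t in doc])

# The new example sentences should at least tokenize cleanly.
for sentence in sentences:
    nlp(sentence)

print("All tokenizer exception checks passed.")
```

Note that without the trailing commas added after "Q.E.P.D." and "S.R.L.", Python's implicit string concatenation would silently merge them with the following items ("Q.E.P.D.S.A.", "S.R.L.s.s.s."), so the check above would fail on those two entries.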