Spanish tokenizer exception and examples improvement (#5531)

* Spanish tokenizer exception additions. Added Spanish question examples

* erased slang tokenization examples
This commit is contained in:
Leo 2020-06-01 18:18:34 +02:00 committed by GitHub
parent 67af3a32b0
commit 925e938570
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 18 additions and 5 deletions

View File

@ -18,5 +18,9 @@ sentences = [
"El gato come pescado.", "El gato come pescado.",
"Veo al hombre con el telescopio.", "Veo al hombre con el telescopio.",
"La araña come moscas.", "La araña come moscas.",
"El pingüino incuba en su nido.", "El pingüino incuba en su nido sobre el hielo.",
"¿Dónde estáis?",
"¿Quién es el presidente francés?",
"¿Dónde se encuentra la capital de Argentina?",
"¿Cuándo nació José de San Martín?",
] ]

View File

@ -4,15 +4,16 @@ from __future__ import unicode_literals
from ...symbols import ORTH, LEMMA, NORM, PRON_LEMMA from ...symbols import ORTH, LEMMA, NORM, PRON_LEMMA
_exc = { _exc = {}
"pal": [{ORTH: "pa", LEMMA: "para"}, {ORTH: "l", LEMMA: "el", NORM: "el"}],
"pala": [{ORTH: "pa", LEMMA: "para"}, {ORTH: "la", LEMMA: "la", NORM: "la"}],
}
for exc_data in [ for exc_data in [
{ORTH: "n°", LEMMA: "número"},
{ORTH: "°C", LEMMA: "grados Celsius"},
{ORTH: "aprox.", LEMMA: "aproximadamente"}, {ORTH: "aprox.", LEMMA: "aproximadamente"},
{ORTH: "dna.", LEMMA: "docena"}, {ORTH: "dna.", LEMMA: "docena"},
{ORTH: "dpto.", LEMMA: "departamento"},
{ORTH: "ej.", LEMMA: "ejemplo"},
{ORTH: "esq.", LEMMA: "esquina"}, {ORTH: "esq.", LEMMA: "esquina"},
{ORTH: "pág.", LEMMA: "página"}, {ORTH: "pág.", LEMMA: "página"},
{ORTH: "p.ej.", LEMMA: "por ejemplo"}, {ORTH: "p.ej.", LEMMA: "por ejemplo"},
@ -20,6 +21,8 @@ for exc_data in [
{ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"}, {ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"},
{ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}, {ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"},
{ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}, {ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"},
{ORTH: "vol.", NORM: "volumen"},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
@ -39,10 +42,14 @@ for h in range(1, 12 + 1):
for orth in [ for orth in [
"a.C.", "a.C.",
"a.J.C.", "a.J.C.",
"d.C.",
"d.J.C.",
"apdo.", "apdo.",
"Av.", "Av.",
"Avda.", "Avda.",
"Cía.", "Cía.",
"Dr.",
"Dra.",
"EE.UU.", "EE.UU.",
"etc.", "etc.",
"fig.", "fig.",
@ -58,8 +65,10 @@ for orth in [
"Prof.", "Prof.",
"Profa.", "Profa.",
"q.e.p.d.", "q.e.p.d.",
"Q.E.P.D.",
"S.A.", "S.A.",
"S.L.", "S.L.",
"S.R.L.",
"s.s.s.", "s.s.s.",
"Sr.", "Sr.",
"Sra.", "Sra.",