mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 01:16:28 +03:00
Spanish tokenizer exception and examples improvement (#5531)
* Spanish tokenizer exception additions. Added Spanish question examples * erased slang tokenization examples
This commit is contained in:
parent
67af3a32b0
commit
925e938570
|
@ -18,5 +18,9 @@ sentences = [
|
|||
"El gato come pescado.",
|
||||
"Veo al hombre con el telescopio.",
|
||||
"La araña come moscas.",
|
||||
"El pingüino incuba en su nido.",
|
||||
"El pingüino incuba en su nido sobre el hielo.",
|
||||
"¿Dónde estáis?",
|
||||
"¿Quién es el presidente francés?",
|
||||
"¿Dónde se encuentra la capital de Argentina?",
|
||||
"¿Cuándo nació José de San Martín?",
|
||||
]
|
||||
|
|
|
@ -4,15 +4,16 @@ from __future__ import unicode_literals
|
|||
from ...symbols import ORTH, LEMMA, NORM, PRON_LEMMA
|
||||
|
||||
|
||||
_exc = {
|
||||
"pal": [{ORTH: "pa", LEMMA: "para"}, {ORTH: "l", LEMMA: "el", NORM: "el"}],
|
||||
"pala": [{ORTH: "pa", LEMMA: "para"}, {ORTH: "la", LEMMA: "la", NORM: "la"}],
|
||||
}
|
||||
_exc = {}
|
||||
|
||||
|
||||
for exc_data in [
|
||||
{ORTH: "n°", LEMMA: "número"},
|
||||
{ORTH: "°C", LEMMA: "grados Celsius"},
|
||||
{ORTH: "aprox.", LEMMA: "aproximadamente"},
|
||||
{ORTH: "dna.", LEMMA: "docena"},
|
||||
{ORTH: "dpto.", LEMMA: "departamento"},
|
||||
{ORTH: "ej.", LEMMA: "ejemplo"},
|
||||
{ORTH: "esq.", LEMMA: "esquina"},
|
||||
{ORTH: "pág.", LEMMA: "página"},
|
||||
{ORTH: "p.ej.", LEMMA: "por ejemplo"},
|
||||
|
@ -20,6 +21,8 @@ for exc_data in [
|
|||
{ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"},
|
||||
{ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"},
|
||||
{ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"},
|
||||
{ORTH: "vol.", NORM: "volumen"},
|
||||
|
||||
]:
|
||||
_exc[exc_data[ORTH]] = [exc_data]
|
||||
|
||||
|
@ -39,10 +42,14 @@ for h in range(1, 12 + 1):
|
|||
for orth in [
|
||||
"a.C.",
|
||||
"a.J.C.",
|
||||
"d.C.",
|
||||
"d.J.C.",
|
||||
"apdo.",
|
||||
"Av.",
|
||||
"Avda.",
|
||||
"Cía.",
|
||||
"Dr.",
|
||||
"Dra.",
|
||||
"EE.UU.",
|
||||
"etc.",
|
||||
"fig.",
|
||||
|
@ -58,8 +65,10 @@ for orth in [
|
|||
"Prof.",
|
||||
"Profa.",
|
||||
"q.e.p.d.",
|
||||
"Q.E.P.D.",
|
||||
"S.A.",
|
||||
"S.L.",
|
||||
"S.R.L.",
|
||||
"s.s.s.",
|
||||
"Sr.",
|
||||
"Sra.",
|
||||
|
|
Loading…
Reference in New Issue
Block a user