fix(spanish sentence segmentation): remove tokenizer exceptions that break sentence segmentation. Aligned with training corpus

Francisco Aranda 2017-06-02 08:19:57 +02:00
parent 5b385e7d78
commit 70a2180199


@@ -6,44 +6,15 @@ from ..language_data import PRON_LEMMA, DET_LEMMA
 TOKENIZER_EXCEPTIONS = {
-    "al": [
-        {ORTH: "a", LEMMA: "a", TAG: ADP},
-        {ORTH: "el", LEMMA: "el", TAG: DET}
-    ],
-    "consigo": [
-        {ORTH: "con", LEMMA: "con"},
-        {ORTH: "sigo", LEMMA: PRON_LEMMA, NORM: "sí"}
-    ],
-    "conmigo": [
-        {ORTH: "con", LEMMA: "con"},
-        {ORTH: "migo", LEMMA: PRON_LEMMA, NORM: "mí"}
-    ],
-    "contigo": [
-        {ORTH: "con", LEMMA: "con"},
-        {ORTH: "tigo", LEMMA: PRON_LEMMA, NORM: "ti"}
-    ],
-    "del": [
-        {ORTH: "de", LEMMA: "de", TAG: ADP},
-        {ORTH: "l", LEMMA: "el", TAG: DET}
-    ],
-    "pel": [
-        {ORTH: "pe", LEMMA: "per", TAG: ADP},
-        {ORTH: "l", LEMMA: "el", TAG: DET}
-    ],
     "pal": [
         {ORTH: "pa", LEMMA: "para"},
-        {ORTH: "l", LEMMA: DET_LEMMA, NORM: "el"}
+        {ORTH: "el", LEMMA: DET_LEMMA, NORM: "el"}
     ],
     "pala": [
         {ORTH: "pa", LEMMA: "para"},
-        {ORTH: "la", LEMMA: DET_LEMMA}
+        {ORTH: "la", LEMMA: DET_LEMMA, NORM: "la"}
     ],
     "aprox.": [