From 70a21801994d7c9023f050ecfa2e3ec8a5d52d04 Mon Sep 17 00:00:00 2001
From: Francisco Aranda
Date: Fri, 2 Jun 2017 08:19:57 +0200
Subject: [PATCH] fix(spanish sentence segmentation): remove tokenizer
 exceptions that break sentence segmentation. Aligned with training corpus

---
 spacy/es/tokenizer_exceptions.py | 33 ++------------------------------
 1 file changed, 2 insertions(+), 31 deletions(-)

diff --git a/spacy/es/tokenizer_exceptions.py b/spacy/es/tokenizer_exceptions.py
index e60bcd104..fb274f907 100644
--- a/spacy/es/tokenizer_exceptions.py
+++ b/spacy/es/tokenizer_exceptions.py
@@ -6,44 +6,15 @@ from ..language_data import PRON_LEMMA, DET_LEMMA
 
 
 TOKENIZER_EXCEPTIONS = {
-    "al": [
-        {ORTH: "a", LEMMA: "a", TAG: ADP},
-        {ORTH: "el", LEMMA: "el", TAG: DET}
-    ],
-
-    "consigo": [
-        {ORTH: "con", LEMMA: "con"},
-        {ORTH: "sigo", LEMMA: PRON_LEMMA, NORM: "sí"}
-    ],
-
-    "conmigo": [
-        {ORTH: "con", LEMMA: "con"},
-        {ORTH: "migo", LEMMA: PRON_LEMMA, NORM: "mí"}
-    ],
-
-    "contigo": [
-        {ORTH: "con", LEMMA: "con"},
-        {ORTH: "tigo", LEMMA: PRON_LEMMA, NORM: "ti"}
-    ],
-
-    "del": [
-        {ORTH: "de", LEMMA: "de", TAG: ADP},
-        {ORTH: "l", LEMMA: "el", TAG: DET}
-    ],
-
-    "pel": [
-        {ORTH: "pe", LEMMA: "per", TAG: ADP},
-        {ORTH: "l", LEMMA: "el", TAG: DET}
-    ],
 
     "pal": [
         {ORTH: "pa", LEMMA: "para"},
-        {ORTH: "l", LEMMA: DET_LEMMA, NORM: "el"}
+        {ORTH: "el", LEMMA: DET_LEMMA, NORM: "el"}
     ],
 
     "pala": [
         {ORTH: "pa", LEMMA: "para"},
-        {ORTH: "la", LEMMA: DET_LEMMA}
+        {ORTH: "la", LEMMA: DET_LEMMA, NORM: "la"}
     ],
 
     "aprox.": [
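
Not part of the patch above: a minimal sketch of how the effect of removing these
exceptions could be checked, assuming a spaCy version contemporary with this change
(1.x) and an installed Spanish model that includes a parser. The model name 'es' and
the sample sentence are illustrative assumptions, not taken from the patch.

    # coding: utf8
    from __future__ import unicode_literals
    import spacy

    nlp = spacy.load('es')  # assumed model name; adjust to the installed Spanish model
    doc = nlp("Ayer fuimos al mercado. Compramos pan del dia.")

    # With the "al"/"del" exceptions removed, these contractions should remain
    # single tokens instead of being split into "a" + "el" / "de" + "l".
    print([t.text for t in doc])

    # Sentence boundaries (set by the dependency parser) should now fall where the
    # training corpus expects them, e.g. after "mercado.".
    print([sent.text for sent in doc.sents])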