Mirror of https://github.com/explosion/spaCy.git
Synced 2024-12-28 02:46:35 +03:00
fix(spanish sentence segmentation): remove tokenizer exceptions that break sentence segmentation; aligned with the training corpus
This commit is contained in:
parent 5b385e7d78
commit 70a2180199
```diff
@@ -6,44 +6,15 @@ from ..language_data import PRON_LEMMA, DET_LEMMA
 TOKENIZER_EXCEPTIONS = {
-    "al": [
-        {ORTH: "a", LEMMA: "a", TAG: ADP},
-        {ORTH: "el", LEMMA: "el", TAG: DET}
-    ],
-
-    "consigo": [
-        {ORTH: "con", LEMMA: "con"},
-        {ORTH: "sigo", LEMMA: PRON_LEMMA, NORM: "sí"}
-    ],
-
-    "conmigo": [
-        {ORTH: "con", LEMMA: "con"},
-        {ORTH: "migo", LEMMA: PRON_LEMMA, NORM: "mí"}
-    ],
-
-    "contigo": [
-        {ORTH: "con", LEMMA: "con"},
-        {ORTH: "tigo", LEMMA: PRON_LEMMA, NORM: "ti"}
-    ],
-
-    "del": [
-        {ORTH: "de", LEMMA: "de", TAG: ADP},
-        {ORTH: "l", LEMMA: "el", TAG: DET}
-    ],
-
-    "pel": [
-        {ORTH: "pe", LEMMA: "per", TAG: ADP},
-        {ORTH: "l", LEMMA: "el", TAG: DET}
-    ],
-
     "pal": [
         {ORTH: "pa", LEMMA: "para"},
-        {ORTH: "l", LEMMA: DET_LEMMA, NORM: "el"}
+        {ORTH: "el", LEMMA: DET_LEMMA, NORM: "el"}
     ],
 
     "pala": [
         {ORTH: "pa", LEMMA: "para"},
-        {ORTH: "la", LEMMA: DET_LEMMA}
+        {ORTH: "la", LEMMA: DET_LEMMA, NORM: "la"}
     ],
 
     "aprox.": [
```
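For context, a minimal sketch of what an exceptions table like this does: any surface form that matches a key is rewritten into the listed sub-tokens at tokenization time. This is a simplified illustration, not spaCy's actual tokenizer implementation, and the example sentences are hypothetical. Per the commit message, splits like "al" and "del" disagreed with the training corpus, which is what broke sentence segmentation; dropping the entries keeps those contractions as single tokens.

```python
# Minimal sketch of a dict-based tokenizer-exceptions table.
# Illustrative only; not spaCy's internal implementation.

TOKENIZER_EXCEPTIONS = {
    # Entries like these (removed in this commit) split a
    # contraction into two tokens.
    "al": ["a", "el"],
    "del": ["de", "l"],
}

def tokenize(text):
    """Whitespace-split, then expand any form listed in the exceptions."""
    tokens = []
    for word in text.split():
        tokens.extend(TOKENIZER_EXCEPTIONS.get(word, [word]))
    return tokens

print(tokenize("Voy al cine"))       # ['Voy', 'a', 'el', 'cine']
print(tokenize("Vengo del parque"))  # ['Vengo', 'de', 'l', 'parque']
```

With the entries removed, "al" and "del" pass through unchanged, matching a corpus that does not split these contractions.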