From e935c950d821fed7a4fb14df32e7afe78f31ffbb Mon Sep 17 00:00:00 2001
From: Magnus Burton
Date: Fri, 30 Dec 2016 21:08:44 +0100
Subject: [PATCH 1/2] Added months and days as abbreviations for Swedish

---
Reviewer note (not part of the commit message):
  Replaces the blank body of TOKENIZER_EXCEPTIONS with 19 lowercase,
  period-terminated keys: 12 month abbreviations (with two spellings each
  for februari and september) and 7 weekday abbreviations. Every key maps
  to a single token whose ORTH is the abbreviation itself and whose LEMMA
  is the full Swedish word. No entries are added for mars or maj.

 spacy/sv/tokenizer_exceptions.py | 58 +++++++++++++++++++++++++++++++-
 1 file changed, 57 insertions(+), 1 deletion(-)

diff --git a/spacy/sv/tokenizer_exceptions.py b/spacy/sv/tokenizer_exceptions.py
index 6cf144b44..b2a85eec9 100644
--- a/spacy/sv/tokenizer_exceptions.py
+++ b/spacy/sv/tokenizer_exceptions.py
@@ -6,7 +6,63 @@ from ..language_data import PRON_LEMMA
 
 
 TOKENIZER_EXCEPTIONS = {
-
+    "jan.": [
+        {ORTH: "jan.", LEMMA: "januari"}
+    ],
+    "febr.": [
+        {ORTH: "febr.", LEMMA: "februari"}
+    ],
+    "feb.": [
+        {ORTH: "feb.", LEMMA: "februari"}
+    ],
+    "apr.": [
+        {ORTH: "apr.", LEMMA: "april"}
+    ],
+    "jun.": [
+        {ORTH: "jun.", LEMMA: "juni"}
+    ],
+    "jul.": [
+        {ORTH: "jul.", LEMMA: "juli"}
+    ],
+    "aug.": [
+        {ORTH: "aug.", LEMMA: "augusti"}
+    ],
+    "sept.": [
+        {ORTH: "sept.", LEMMA: "september"}
+    ],
+    "sep.": [
+        {ORTH: "sep.", LEMMA: "september"}
+    ],
+    "okt.": [
+        {ORTH: "okt.", LEMMA: "oktober"}
+    ],
+    "nov.": [
+        {ORTH: "nov.", LEMMA: "november"}
+    ],
+    "dec.": [
+        {ORTH: "dec.", LEMMA: "december"}
+    ],
+    "mån.": [
+        {ORTH: "mån.", LEMMA: "måndag"}
+    ],
+    "tis.": [
+        {ORTH: "tis.", LEMMA: "tisdag"}
+    ],
+    "ons.": [
+        {ORTH: "ons.", LEMMA: "onsdag"}
+    ],
+    "tors.": [
+        {ORTH: "tors.", LEMMA: "torsdag"}
+    ],
+    "fre.": [
+        {ORTH: "fre.", LEMMA: "fredag"}
+    ],
+    "lör.": [
+        {ORTH: "lör.", LEMMA: "lördag"}
+    ],
+    "sön.": [
+        {ORTH: "sön.", LEMMA: "söndag"}
+    ]
 }
 
 

From 56e2219b658993ea90c84c7094bec9d8593fd76f Mon Sep 17 00:00:00 2001
From: Magnus Burton
Date: Fri, 30 Dec 2016 21:17:34 +0100
Subject: [PATCH 2/2] Added Swedish city abbreviations

---
Reviewer note (not part of the commit message):
  Appends two keys after the "sön." entry, "sthlm" and "gbg". Unlike the
  month and weekday keys they carry no trailing period. Each maps to a
  single token whose LEMMA is the capitalised city name (Stockholm,
  Göteborg).

 spacy/sv/tokenizer_exceptions.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/spacy/sv/tokenizer_exceptions.py b/spacy/sv/tokenizer_exceptions.py
index b2a85eec9..d8d4e8823 100644
--- a/spacy/sv/tokenizer_exceptions.py
+++ b/spacy/sv/tokenizer_exceptions.py
@@ -62,6 +62,12 @@ TOKENIZER_EXCEPTIONS = {
     ],
     "sön.": [
         {ORTH: "sön.", LEMMA: "söndag"}
+    ],
+    "sthlm": [
+        {ORTH: "sthlm", LEMMA: "Stockholm"}
+    ],
+    "gbg": [
+        {ORTH: "gbg", LEMMA: "Göteborg"}
     ]
 }
 