Add weekday abbreviations and remove ambiguous month abbreviations for Danish.

This commit is contained in:
Søren Lind Kristiansen 2017-11-24 14:43:29 +01:00
parent 056547e989
commit 0c276ed020
2 changed files with 45 additions and 13 deletions

View File

@ -11,20 +11,47 @@ from ...symbols import ORTH, LEMMA, NORM, TAG, ADP, PUNCT
_exc = {} _exc = {}
# The weekday abbreviations "søn." (for "søndag") as well as "Tor." and "Tors."
# (for "torsdag") are left out because they are ambiguous. The same applies to
# the month abbreviations "jul." and "Jul." (for "juli").
for exc_data in [ for exc_data in [
{ORTH: "Kbh.", LEMMA: "København", NORM: "København"}, {ORTH: "Kbh.", LEMMA: "København", NORM: "København"},
{ORTH: "Jan.", LEMMA: "januar", NORM: "januar"}, {ORTH: "jan.", LEMMA: "januar"},
{ORTH: "Feb.", LEMMA: "februar", NORM: "februar"}, {ORTH: "febr.", LEMMA: "februar"},
{ORTH: "Mar.", LEMMA: "marts", NORM: "marts"}, {ORTH: "feb.", LEMMA: "februar"},
{ORTH: "Apr.", LEMMA: "april", NORM: "april"}, {ORTH: "mar.", LEMMA: "marts"},
{ORTH: "Maj.", LEMMA: "maj", NORM: "maj"}, {ORTH: "apr.", LEMMA: "april"},
{ORTH: "Jun.", LEMMA: "juni", NORM: "juni"}, {ORTH: "jun.", LEMMA: "juni"},
{ORTH: "Jul.", LEMMA: "juli", NORM: "juli"}, {ORTH: "aug.", LEMMA: "august"},
{ORTH: "Aug.", LEMMA: "august", NORM: "august"}, {ORTH: "sept.", LEMMA: "september"},
{ORTH: "Sep.", LEMMA: "september", NORM: "september"}, {ORTH: "sep.", LEMMA: "september"},
{ORTH: "Okt.", LEMMA: "oktober", NORM: "oktober"}, {ORTH: "okt.", LEMMA: "oktober"},
{ORTH: "Nov.", LEMMA: "november", NORM: "november"}, {ORTH: "nov.", LEMMA: "november"},
{ORTH: "Dec.", LEMMA: "december", NORM: "december"}]: {ORTH: "dec.", LEMMA: "december"},
{ORTH: "man.", LEMMA: "mandag"},
{ORTH: "tirs.", LEMMA: "tirsdag"},
{ORTH: "ons.", LEMMA: "onsdag"},
{ORTH: "tor.", LEMMA: "torsdag"},
{ORTH: "tors.", LEMMA: "torsdag"},
{ORTH: "fre.", LEMMA: "fredag"},
{ORTH: "lør.", LEMMA: "lørdag"},
{ORTH: "Jan.", LEMMA: "januar"},
{ORTH: "Febr.", LEMMA: "februar"},
{ORTH: "Feb.", LEMMA: "februar"},
{ORTH: "Mar.", LEMMA: "marts"},
{ORTH: "Apr.", LEMMA: "april"},
{ORTH: "Jun.", LEMMA: "juni"},
{ORTH: "Aug.", LEMMA: "august"},
{ORTH: "Sept.", LEMMA: "september"},
{ORTH: "Sep.", LEMMA: "september"},
{ORTH: "Okt.", LEMMA: "oktober"},
{ORTH: "Nov.", LEMMA: "november"},
{ORTH: "Dec.", LEMMA: "december"},
{ORTH: "Man.", LEMMA: "mandag"},
{ORTH: "Tirs.", LEMMA: "tirsdag"},
{ORTH: "Ons.", LEMMA: "onsdag"},
{ORTH: "Fre.", LEMMA: "fredag"},
{ORTH: "Lør.", LEMMA: "lørdag"}]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
for orth in [ for orth in [

View File

@ -9,6 +9,11 @@ def test_da_tokenizer_handles_abbr(da_tokenizer, text):
tokens = da_tokenizer(text) tokens = da_tokenizer(text)
assert len(tokens) == 1 assert len(tokens) == 1
@pytest.mark.parametrize('text', ["Jul.", "jul.", "Tor.", "Tors."])
def test_da_tokenizer_handles_ambiguous_abbr(da_tokenizer, text):
    """Check that each ambiguous abbreviation yields exactly two tokens.

    Every parametrized string is passed through the ``da_tokenizer``
    fixture, and the length of the result is compared against 2.
    """
    doc = da_tokenizer(text)
    assert len(doc) == 2
def test_da_tokenizer_handles_exc_in_text(da_tokenizer): def test_da_tokenizer_handles_exc_in_text(da_tokenizer):
text = "Det er bl.a. ikke meningen" text = "Det er bl.a. ikke meningen"
tokens = da_tokenizer(text) tokens = da_tokenizer(text)