from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import NORM, ORTH from ...util import update_exc _exc = {} # Verbs for verb_data in [ {ORTH: "driver"}, {ORTH: "kör"}, {ORTH: "hörr"}, {ORTH: "fattar"}, {ORTH: "hajar"}, {ORTH: "lever"}, {ORTH: "serr"}, {ORTH: "fixar"}, ]: verb_data_tc = dict(verb_data) verb_data_tc[ORTH] = verb_data_tc[ORTH].title() for data in [verb_data, verb_data_tc]: _exc[data[ORTH] + "u"] = [data, {ORTH: "u", NORM: "du"}] # Abbreviations for weekdays "sön." (for "söndag" / "söner") # are left out because they are ambiguous. The same is the case # for abbreviations "jul." and "Jul." ("juli" / "jul"). for exc_data in [ {ORTH: "jan.", NORM: "januari"}, {ORTH: "febr.", NORM: "februari"}, {ORTH: "feb.", NORM: "februari"}, {ORTH: "apr.", NORM: "april"}, {ORTH: "jun.", NORM: "juni"}, {ORTH: "aug.", NORM: "augusti"}, {ORTH: "sept.", NORM: "september"}, {ORTH: "sep.", NORM: "september"}, {ORTH: "okt.", NORM: "oktober"}, {ORTH: "nov.", NORM: "november"}, {ORTH: "dec.", NORM: "december"}, {ORTH: "mån.", NORM: "måndag"}, {ORTH: "tis.", NORM: "tisdag"}, {ORTH: "ons.", NORM: "onsdag"}, {ORTH: "tors.", NORM: "torsdag"}, {ORTH: "fre.", NORM: "fredag"}, {ORTH: "lör.", NORM: "lördag"}, {ORTH: "Jan.", NORM: "Januari"}, {ORTH: "Febr.", NORM: "Februari"}, {ORTH: "Feb.", NORM: "Februari"}, {ORTH: "Apr.", NORM: "April"}, {ORTH: "Jun.", NORM: "Juni"}, {ORTH: "Aug.", NORM: "Augusti"}, {ORTH: "Sept.", NORM: "September"}, {ORTH: "Sep.", NORM: "September"}, {ORTH: "Okt.", NORM: "Oktober"}, {ORTH: "Nov.", NORM: "November"}, {ORTH: "Dec.", NORM: "December"}, {ORTH: "Mån.", NORM: "Måndag"}, {ORTH: "Tis.", NORM: "Tisdag"}, {ORTH: "Ons.", NORM: "Onsdag"}, {ORTH: "Tors.", NORM: "Torsdag"}, {ORTH: "Fre.", NORM: "Fredag"}, {ORTH: "Lör.", NORM: "Lördag"}, {ORTH: "sthlm", NORM: "Stockholm"}, {ORTH: "gbg", NORM: "Göteborg"}, ]: _exc[exc_data[ORTH]] = [exc_data] # Specific case abbreviations only for orth in ["AB", "Dr.", "H.M.", "H.K.H.", "m/s", "M/S", "Ph.d.", "S:t", "s:t"]: _exc[orth] = [{ORTH: orth}] ABBREVIATIONS = [ "ang", "anm", "bl.a", "d.v.s", "doc", "dvs", "e.d", "e.kr", "el.", "eng", "etc", "exkl", "ev", "f.", "f.d", "f.kr", "f.n", "f.ö", "fid", "fig", "forts", "fr.o.m", "förf", "inkl", "iofs", "jur.", "kap", "kl", "kor.", "kr", "kungl", "lat", "m.a.o", "m.fl", "m.m", "max", "milj", "min.", "mos", "mt", "mvh", "o.d", "o.s.v", "obs", "osv", "p.g.a", "proc", "prof", "ref", "resp", "s.a.s", "s.k", "s.t", "sid", "t.ex", "t.h", "t.o.m", "t.v", "tel", "ung.", "vol", "v.", "äv", "övers", ] # Add abbreviation for trailing punctuation too. If the abbreviation already has a trailing punctuation - skip it. for abbr in ABBREVIATIONS: if not abbr.endswith("."): ABBREVIATIONS.append(abbr + ".") for orth in ABBREVIATIONS: _exc[orth] = [{ORTH: orth}] capitalized = orth.capitalize() _exc[capitalized] = [{ORTH: capitalized}] # Sentences ending in "i." (as in "... peka i."), "m." (as in "...än 2000 m."), # should be tokenized as two separate tokens. for orth in ["i", "m"]: _exc[orth + "."] = [{ORTH: orth, NORM: orth, NORM: orth}, {ORTH: "."}] TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)