2017-05-10 22:15:12 +03:00
|
|
|
# encoding: utf8
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
2017-11-24 13:16:53 +03:00
|
|
|
from ...symbols import ORTH, LEMMA, NORM, TAG, ADP, PUNCT
|
2017-05-10 22:15:12 +03:00
|
|
|
|
|
|
|
|
|
|
|
# Accumulates every tokenizer exception defined in this module, keyed by the
# exact surface string to match.
_exc = {}


# Abbreviations paired with the full word used as both lemma and norm.
_abbreviation_expansions = (
    ("Kbh.", "København"),
    ("Jan.", "januar"),
    ("Feb.", "februar"),
    ("Mar.", "marts"),
    ("Apr.", "april"),
    ("Maj.", "maj"),
    ("Jun.", "juni"),
    ("Jul.", "juli"),
    ("Aug.", "august"),
    ("Sep.", "september"),
    ("Okt.", "oktober"),
    ("Nov.", "november"),
    ("Dec.", "december"),
)

for _abbrev, _expansion in _abbreviation_expansions:
    # Each abbreviation is kept as a single token carrying its expansion.
    _exc[_abbrev] = [{ORTH: _abbrev, LEMMA: _expansion, NORM: _expansion}]
|
2017-05-10 22:15:12 +03:00
|
|
|
|
|
|
|
# Abbreviations registered as single-token exceptions that map to themselves
# (no lemma or norm override). Only non-empty strings belong here: an empty
# key can never correspond to a real token.
for orth in [
    "A/S", "beg.", "bl.a.", "ca.", "d.s.s.", "dvs.", "f.eks.", "fr.", "hhv.",
    "if.", "iflg.", "m.a.o.", "mht.", "min.", "osv.", "pga.", "resp.", "self.",
    "t.o.m.", "vha.",
]:
    _exc[orth] = [{ORTH: orth}]
|
|
|
|
|
2017-11-24 13:16:53 +03:00
|
|
|
# "i." is split into two tokens: the word "i" followed by a separate ".".
# Presumably this overrides a shared base exception that would otherwise
# keep "i." together -- TODO confirm against the base exceptions.
_i_word = {ORTH: "i", LEMMA: "i", NORM: "i"}
# NOTE(review): PUNCT is used here as the TAG value -- confirm this is intended.
_trailing_period = {ORTH: ".", TAG: PUNCT}

_custom_base_exc = {"i.": [_i_word, _trailing_period]}

# Merge into the module-wide table; an existing "i." entry would be replaced.
_exc.update(_custom_base_exc)
|
|
|
|
|
2017-05-10 22:15:12 +03:00
|
|
|
|
2017-10-31 23:05:29 +03:00
|
|
|
# Public name for the exception table assembled above; it refers to the same
# dict object as _exc (no copy is made).
TOKENIZER_EXCEPTIONS = _exc
|