Remove POS, TAG and LEMMA from tokenizer exceptions

This commit is contained in:
Ines Montani 2020-07-22 23:09:01 +02:00
parent 14d7d46f89
commit a624ae0675
34 changed files with 2173 additions and 4349 deletions

View File

@@ -1,5 +1,5 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA from ...symbols import ORTH, NORM
from ...util import update_exc from ...util import update_exc
@@ -8,41 +8,41 @@ _exc = {}
# Time # Time
for exc_data in [ for exc_data in [
{LEMMA: "قبل الميلاد", ORTH: "ق.م"}, {NORM: "قبل الميلاد", ORTH: "ق.م"},
{LEMMA: "بعد الميلاد", ORTH: "ب. م"}, {NORM: "بعد الميلاد", ORTH: "ب. م"},
{LEMMA: "ميلادي", ORTH: ""}, {NORM: "ميلادي", ORTH: ""},
{LEMMA: "هجري", ORTH: ".هـ"}, {NORM: "هجري", ORTH: ".هـ"},
{LEMMA: "توفي", ORTH: ""}, {NORM: "توفي", ORTH: ""},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
# Scientific abv. # Scientific abv.
for exc_data in [ for exc_data in [
{LEMMA: "صلى الله عليه وسلم", ORTH: "صلعم"}, {NORM: "صلى الله عليه وسلم", ORTH: "صلعم"},
{LEMMA: "الشارح", ORTH: "الشـ"}, {NORM: "الشارح", ORTH: "الشـ"},
{LEMMA: "الظاهر", ORTH: "الظـ"}, {NORM: "الظاهر", ORTH: "الظـ"},
{LEMMA: "أيضًا", ORTH: "أيضـ"}, {NORM: "أيضًا", ORTH: "أيضـ"},
{LEMMA: "إلى آخره", ORTH: "إلخ"}, {NORM: "إلى آخره", ORTH: "إلخ"},
{LEMMA: "انتهى", ORTH: "اهـ"}, {NORM: "انتهى", ORTH: "اهـ"},
{LEMMA: "حدّثنا", ORTH: "ثنا"}, {NORM: "حدّثنا", ORTH: "ثنا"},
{LEMMA: "حدثني", ORTH: "ثنى"}, {NORM: "حدثني", ORTH: "ثنى"},
{LEMMA: "أنبأنا", ORTH: "أنا"}, {NORM: "أنبأنا", ORTH: "أنا"},
{LEMMA: "أخبرنا", ORTH: "نا"}, {NORM: "أخبرنا", ORTH: "نا"},
{LEMMA: "مصدر سابق", ORTH: "م. س"}, {NORM: "مصدر سابق", ORTH: "م. س"},
{LEMMA: "مصدر نفسه", ORTH: "م. ن"}, {NORM: "مصدر نفسه", ORTH: "م. ن"},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
# Other abv. # Other abv.
for exc_data in [ for exc_data in [
{LEMMA: "دكتور", ORTH: "د."}, {NORM: "دكتور", ORTH: "د."},
{LEMMA: "أستاذ دكتور", ORTH: "أ.د"}, {NORM: "أستاذ دكتور", ORTH: "أ.د"},
{LEMMA: "أستاذ", ORTH: "أ."}, {NORM: "أستاذ", ORTH: "أ."},
{LEMMA: "بروفيسور", ORTH: "ب."}, {NORM: "بروفيسور", ORTH: "ب."},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
for exc_data in [{LEMMA: "تلفون", ORTH: "ت."}, {LEMMA: "صندوق بريد", ORTH: "ص.ب"}]: for exc_data in [{NORM: "تلفون", ORTH: "ت."}, {NORM: "صندوق بريد", ORTH: "ص.ب"}]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@@ -1,5 +1,5 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA from ...symbols import ORTH, NORM
from ...util import update_exc from ...util import update_exc
@@ -7,18 +7,18 @@ _exc = {}
for exc_data in [ for exc_data in [
{ORTH: "ডঃ", LEMMA: "ডক্টর"}, {ORTH: "ডঃ", NORM: "ডক্টর"},
{ORTH: "ডাঃ", LEMMA: "ডাক্তার"}, {ORTH: "ডাঃ", NORM: "ডাক্তার"},
{ORTH: "ড.", LEMMA: "ডক্টর"}, {ORTH: "ড.", NORM: "ডক্টর"},
{ORTH: "ডা.", LEMMA: "ডাক্তার"}, {ORTH: "ডা.", NORM: "ডাক্তার"},
{ORTH: "মোঃ", LEMMA: "মোহাম্মদ"}, {ORTH: "মোঃ", NORM: "মোহাম্মদ"},
{ORTH: "মো.", LEMMA: "মোহাম্মদ"}, {ORTH: "মো.", NORM: "মোহাম্মদ"},
{ORTH: "সে.", LEMMA: "সেলসিয়াস"}, {ORTH: "সে.", NORM: "সেলসিয়াস"},
{ORTH: "কি.মি.", LEMMA: "কিলোমিটার"}, {ORTH: "কি.মি.", NORM: "কিলোমিটার"},
{ORTH: "কি.মি", LEMMA: "কিলোমিটার"}, {ORTH: "কি.মি", NORM: "কিলোমিটার"},
{ORTH: "সে.মি.", LEMMA: "সেন্টিমিটার"}, {ORTH: "সে.মি.", NORM: "সেন্টিমিটার"},
{ORTH: "সে.মি", LEMMA: "সেন্টিমিটার"}, {ORTH: "সে.মি", NORM: "সেন্টিমিটার"},
{ORTH: "মি.লি.", LEMMA: "মিলিলিটার"}, {ORTH: "মি.লি.", NORM: "মিলিলিটার"},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]

View File

@@ -1,40 +1,40 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA from ...symbols import ORTH, NORM
from ...util import update_exc from ...util import update_exc
_exc = {} _exc = {}
for exc_data in [ for exc_data in [
{ORTH: "aprox.", LEMMA: "aproximadament"}, {ORTH: "aprox.", NORM: "aproximadament"},
{ORTH: "pàg.", LEMMA: "pàgina"}, {ORTH: "pàg.", NORM: "pàgina"},
{ORTH: "p.ex.", LEMMA: "per exemple"}, {ORTH: "p.ex.", NORM: "per exemple"},
{ORTH: "gen.", LEMMA: "gener"}, {ORTH: "gen.", NORM: "gener"},
{ORTH: "feb.", LEMMA: "febrer"}, {ORTH: "feb.", NORM: "febrer"},
{ORTH: "abr.", LEMMA: "abril"}, {ORTH: "abr.", NORM: "abril"},
{ORTH: "jul.", LEMMA: "juliol"}, {ORTH: "jul.", NORM: "juliol"},
{ORTH: "set.", LEMMA: "setembre"}, {ORTH: "set.", NORM: "setembre"},
{ORTH: "oct.", LEMMA: "octubre"}, {ORTH: "oct.", NORM: "octubre"},
{ORTH: "nov.", LEMMA: "novembre"}, {ORTH: "nov.", NORM: "novembre"},
{ORTH: "dec.", LEMMA: "desembre"}, {ORTH: "dec.", NORM: "desembre"},
{ORTH: "Dr.", LEMMA: "doctor"}, {ORTH: "Dr.", NORM: "doctor"},
{ORTH: "Sr.", LEMMA: "senyor"}, {ORTH: "Sr.", NORM: "senyor"},
{ORTH: "Sra.", LEMMA: "senyora"}, {ORTH: "Sra.", NORM: "senyora"},
{ORTH: "Srta.", LEMMA: "senyoreta"}, {ORTH: "Srta.", NORM: "senyoreta"},
{ORTH: "núm", LEMMA: "número"}, {ORTH: "núm", NORM: "número"},
{ORTH: "St.", LEMMA: "sant"}, {ORTH: "St.", NORM: "sant"},
{ORTH: "Sta.", LEMMA: "santa"}, {ORTH: "Sta.", NORM: "santa"},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
# Times # Times
_exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", LEMMA: "p.m."}] _exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", NORM: "p.m."}]
for h in range(1, 12 + 1): for h in range(1, 12 + 1):
for period in ["a.m.", "am"]: for period in ["a.m.", "am"]:
_exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "a.m."}] _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, NORM: "a.m."}]
for period in ["p.m.", "pm"]: for period in ["p.m.", "pm"]:
_exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "p.m."}] _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, NORM: "p.m."}]
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@@ -3,7 +3,7 @@ Tokenizer Exceptions.
Source: https://forkortelse.dk/ and various others. Source: https://forkortelse.dk/ and various others.
""" """
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, NORM from ...symbols import ORTH, NORM
from ...util import update_exc from ...util import update_exc
@@ -13,44 +13,44 @@ _exc = {}
# (for "torsdag") are left out because they are ambiguous. The same is the case # (for "torsdag") are left out because they are ambiguous. The same is the case
# for abbreviations "jul." and "Jul." ("juli"). # for abbreviations "jul." and "Jul." ("juli").
for exc_data in [ for exc_data in [
{ORTH: "Kbh.", LEMMA: "København", NORM: "København"}, {ORTH: "Kbh.", NORM: "København"},
{ORTH: "jan.", LEMMA: "januar"}, {ORTH: "jan.", NORM: "januar"},
{ORTH: "febr.", LEMMA: "februar"}, {ORTH: "febr.", NORM: "februar"},
{ORTH: "feb.", LEMMA: "februar"}, {ORTH: "feb.", NORM: "februar"},
{ORTH: "mar.", LEMMA: "marts"}, {ORTH: "mar.", NORM: "marts"},
{ORTH: "apr.", LEMMA: "april"}, {ORTH: "apr.", NORM: "april"},
{ORTH: "jun.", LEMMA: "juni"}, {ORTH: "jun.", NORM: "juni"},
{ORTH: "aug.", LEMMA: "august"}, {ORTH: "aug.", NORM: "august"},
{ORTH: "sept.", LEMMA: "september"}, {ORTH: "sept.", NORM: "september"},
{ORTH: "sep.", LEMMA: "september"}, {ORTH: "sep.", NORM: "september"},
{ORTH: "okt.", LEMMA: "oktober"}, {ORTH: "okt.", NORM: "oktober"},
{ORTH: "nov.", LEMMA: "november"}, {ORTH: "nov.", NORM: "november"},
{ORTH: "dec.", LEMMA: "december"}, {ORTH: "dec.", NORM: "december"},
{ORTH: "man.", LEMMA: "mandag"}, {ORTH: "man.", NORM: "mandag"},
{ORTH: "tirs.", LEMMA: "tirsdag"}, {ORTH: "tirs.", NORM: "tirsdag"},
{ORTH: "ons.", LEMMA: "onsdag"}, {ORTH: "ons.", NORM: "onsdag"},
{ORTH: "tor.", LEMMA: "torsdag"}, {ORTH: "tor.", NORM: "torsdag"},
{ORTH: "tors.", LEMMA: "torsdag"}, {ORTH: "tors.", NORM: "torsdag"},
{ORTH: "fre.", LEMMA: "fredag"}, {ORTH: "fre.", NORM: "fredag"},
{ORTH: "lør.", LEMMA: "lørdag"}, {ORTH: "lør.", NORM: "lørdag"},
{ORTH: "Jan.", LEMMA: "januar"}, {ORTH: "Jan.", NORM: "januar"},
{ORTH: "Febr.", LEMMA: "februar"}, {ORTH: "Febr.", NORM: "februar"},
{ORTH: "Feb.", LEMMA: "februar"}, {ORTH: "Feb.", NORM: "februar"},
{ORTH: "Mar.", LEMMA: "marts"}, {ORTH: "Mar.", NORM: "marts"},
{ORTH: "Apr.", LEMMA: "april"}, {ORTH: "Apr.", NORM: "april"},
{ORTH: "Jun.", LEMMA: "juni"}, {ORTH: "Jun.", NORM: "juni"},
{ORTH: "Aug.", LEMMA: "august"}, {ORTH: "Aug.", NORM: "august"},
{ORTH: "Sept.", LEMMA: "september"}, {ORTH: "Sept.", NORM: "september"},
{ORTH: "Sep.", LEMMA: "september"}, {ORTH: "Sep.", NORM: "september"},
{ORTH: "Okt.", LEMMA: "oktober"}, {ORTH: "Okt.", NORM: "oktober"},
{ORTH: "Nov.", LEMMA: "november"}, {ORTH: "Nov.", NORM: "november"},
{ORTH: "Dec.", LEMMA: "december"}, {ORTH: "Dec.", NORM: "december"},
{ORTH: "Man.", LEMMA: "mandag"}, {ORTH: "Man.", NORM: "mandag"},
{ORTH: "Tirs.", LEMMA: "tirsdag"}, {ORTH: "Tirs.", NORM: "tirsdag"},
{ORTH: "Ons.", LEMMA: "onsdag"}, {ORTH: "Ons.", NORM: "onsdag"},
{ORTH: "Fre.", LEMMA: "fredag"}, {ORTH: "Fre.", NORM: "fredag"},
{ORTH: "Lør.", LEMMA: "lørdag"}, {ORTH: "Lør.", NORM: "lørdag"},
{ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller"}, {ORTH: "og/eller", NORM: "og/eller"},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
@@ -550,22 +550,22 @@ for orth in [
_exc[capitalized] = [{ORTH: capitalized}] _exc[capitalized] = [{ORTH: capitalized}]
for exc_data in [ for exc_data in [
{ORTH: "s'gu", LEMMA: "s'gu", NORM: "s'gu"}, {ORTH: "s'gu", NORM: "s'gu"},
{ORTH: "S'gu", LEMMA: "s'gu", NORM: "s'gu"}, {ORTH: "S'gu", NORM: "s'gu"},
{ORTH: "sgu'", LEMMA: "s'gu", NORM: "s'gu"}, {ORTH: "sgu'", NORM: "s'gu"},
{ORTH: "Sgu'", LEMMA: "s'gu", NORM: "s'gu"}, {ORTH: "Sgu'", NORM: "s'gu"},
{ORTH: "sku'", LEMMA: "skal", NORM: "skulle"}, {ORTH: "sku'", NORM: "skulle"},
{ORTH: "ku'", LEMMA: "kan", NORM: "kunne"}, {ORTH: "ku'", NORM: "kunne"},
{ORTH: "Ku'", LEMMA: "kan", NORM: "kunne"}, {ORTH: "Ku'", NORM: "kunne"},
{ORTH: "ka'", LEMMA: "kan", NORM: "kan"}, {ORTH: "ka'", NORM: "kan"},
{ORTH: "Ka'", LEMMA: "kan", NORM: "kan"}, {ORTH: "Ka'", NORM: "kan"},
{ORTH: "gi'", LEMMA: "give", NORM: "giv"}, {ORTH: "gi'", NORM: "giv"},
{ORTH: "Gi'", LEMMA: "give", NORM: "giv"}, {ORTH: "Gi'", NORM: "giv"},
{ORTH: "li'", LEMMA: "lide", NORM: "lide"}, {ORTH: "li'", NORM: "lide"},
{ORTH: "ha'", LEMMA: "have", NORM: "have"}, {ORTH: "ha'", NORM: "have"},
{ORTH: "Ha'", LEMMA: "have", NORM: "have"}, {ORTH: "Ha'", NORM: "have"},
{ORTH: "ik'", LEMMA: "ikke", NORM: "ikke"}, {ORTH: "ik'", NORM: "ikke"},
{ORTH: "Ik'", LEMMA: "ikke", NORM: "ikke"}, {ORTH: "Ik'", NORM: "ikke"},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
@@ -575,7 +575,7 @@ for h in range(1, 31 + 1):
for period in ["."]: for period in ["."]:
_exc[f"{h}{period}"] = [{ORTH: f"{h}."}] _exc[f"{h}{period}"] = [{ORTH: f"{h}."}]
_custom_base_exc = {"i.": [{ORTH: "i", LEMMA: "i", NORM: "i"}, {ORTH: "."}]} _custom_base_exc = {"i.": [{ORTH: "i", NORM: "i"}, {ORTH: "."}]}
_exc.update(_custom_base_exc) _exc.update(_custom_base_exc)
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@@ -1,159 +1,135 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA from ...symbols import ORTH, NORM
from ...util import update_exc from ...util import update_exc
_exc = { _exc = {
"auf'm": [{ORTH: "auf", LEMMA: "auf"}, {ORTH: "'m", LEMMA: "der", NORM: "dem"}], "auf'm": [{ORTH: "auf"}, {ORTH: "'m", NORM: "dem"}],
"du's": [ "du's": [{ORTH: "du"}, {ORTH: "'s", NORM: "es"}],
{ORTH: "du", LEMMA: PRON_LEMMA, TAG: "PPER"}, "er's": [{ORTH: "er"}, {ORTH: "'s", NORM: "es"}],
{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"}, "hinter'm": [{ORTH: "hinter"}, {ORTH: "'m", NORM: "dem"}],
], "ich's": [{ORTH: "ich"}, {ORTH: "'s", NORM: "es"}],
"er's": [ "ihr's": [{ORTH: "ihr"}, {ORTH: "'s", NORM: "es"}],
{ORTH: "er", LEMMA: PRON_LEMMA, TAG: "PPER"}, "sie's": [{ORTH: "sie"}, {ORTH: "'s", NORM: "es"}],
{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"}, "unter'm": [{ORTH: "unter"}, {ORTH: "'m", NORM: "dem"}],
], "vor'm": [{ORTH: "vor"}, {ORTH: "'m", NORM: "dem"}],
"hinter'm": [ "wir's": [{ORTH: "wir"}, {ORTH: "'s", NORM: "es"}],
{ORTH: "hinter", LEMMA: "hinter"}, "über'm": [{ORTH: "über"}, {ORTH: "'m", NORM: "dem"}],
{ORTH: "'m", LEMMA: "der", NORM: "dem"},
],
"ich's": [
{ORTH: "ich", LEMMA: PRON_LEMMA, TAG: "PPER"},
{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"},
],
"ihr's": [
{ORTH: "ihr", LEMMA: PRON_LEMMA, TAG: "PPER"},
{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"},
],
"sie's": [
{ORTH: "sie", LEMMA: PRON_LEMMA, TAG: "PPER"},
{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"},
],
"unter'm": [
{ORTH: "unter", LEMMA: "unter"},
{ORTH: "'m", LEMMA: "der", NORM: "dem"},
],
"vor'm": [{ORTH: "vor", LEMMA: "vor"}, {ORTH: "'m", LEMMA: "der", NORM: "dem"}],
"wir's": [
{ORTH: "wir", LEMMA: PRON_LEMMA, TAG: "PPER"},
{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"},
],
"über'm": [{ORTH: "über", LEMMA: "über"}, {ORTH: "'m", LEMMA: "der", NORM: "dem"}],
} }
for exc_data in [ for exc_data in [
{ORTH: "'S", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"}, {ORTH: "'S", NORM: "'s"},
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"}, {ORTH: "'s", NORM: "'s"},
{ORTH: "S'", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"}, {ORTH: "S'", NORM: "'s"},
{ORTH: "s'", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"}, {ORTH: "s'", NORM: "'s"},
{ORTH: "'n", LEMMA: "ein", NORM: "ein"}, {ORTH: "'n", NORM: "ein"},
{ORTH: "'ne", LEMMA: "eine", NORM: "eine"}, {ORTH: "'ne", NORM: "eine"},
{ORTH: "'nen", LEMMA: "ein", NORM: "einen"}, {ORTH: "'nen", NORM: "einen"},
{ORTH: "'nem", LEMMA: "ein", NORM: "einem"}, {ORTH: "'nem", NORM: "einem"},
{ORTH: "Abb.", LEMMA: "Abbildung", NORM: "Abbildung"}, {ORTH: "Abb.", NORM: "Abbildung"},
{ORTH: "Abk.", LEMMA: "Abkürzung", NORM: "Abkürzung"}, {ORTH: "Abk.", NORM: "Abkürzung"},
{ORTH: "Abt.", LEMMA: "Abteilung", NORM: "Abteilung"}, {ORTH: "Abt.", NORM: "Abteilung"},
{ORTH: "Apr.", LEMMA: "April", NORM: "April"}, {ORTH: "Apr.", NORM: "April"},
{ORTH: "Aug.", LEMMA: "August", NORM: "August"}, {ORTH: "Aug.", NORM: "August"},
{ORTH: "Bd.", LEMMA: "Band", NORM: "Band"}, {ORTH: "Bd.", NORM: "Band"},
{ORTH: "Betr.", LEMMA: "Betreff", NORM: "Betreff"}, {ORTH: "Betr.", NORM: "Betreff"},
{ORTH: "Bf.", LEMMA: "Bahnhof", NORM: "Bahnhof"}, {ORTH: "Bf.", NORM: "Bahnhof"},
{ORTH: "Bhf.", LEMMA: "Bahnhof", NORM: "Bahnhof"}, {ORTH: "Bhf.", NORM: "Bahnhof"},
{ORTH: "Bsp.", LEMMA: "Beispiel", NORM: "Beispiel"}, {ORTH: "Bsp.", NORM: "Beispiel"},
{ORTH: "Dez.", LEMMA: "Dezember", NORM: "Dezember"}, {ORTH: "Dez.", NORM: "Dezember"},
{ORTH: "Di.", LEMMA: "Dienstag", NORM: "Dienstag"}, {ORTH: "Di.", NORM: "Dienstag"},
{ORTH: "Do.", LEMMA: "Donnerstag", NORM: "Donnerstag"}, {ORTH: "Do.", NORM: "Donnerstag"},
{ORTH: "Fa.", LEMMA: "Firma", NORM: "Firma"}, {ORTH: "Fa.", NORM: "Firma"},
{ORTH: "Fam.", LEMMA: "Familie", NORM: "Familie"}, {ORTH: "Fam.", NORM: "Familie"},
{ORTH: "Feb.", LEMMA: "Februar", NORM: "Februar"}, {ORTH: "Feb.", NORM: "Februar"},
{ORTH: "Fr.", LEMMA: "Frau", NORM: "Frau"}, {ORTH: "Fr.", NORM: "Frau"},
{ORTH: "Frl.", LEMMA: "Fräulein", NORM: "Fräulein"}, {ORTH: "Frl.", NORM: "Fräulein"},
{ORTH: "Hbf.", LEMMA: "Hauptbahnhof", NORM: "Hauptbahnhof"}, {ORTH: "Hbf.", NORM: "Hauptbahnhof"},
{ORTH: "Hr.", LEMMA: "Herr", NORM: "Herr"}, {ORTH: "Hr.", NORM: "Herr"},
{ORTH: "Hrn.", LEMMA: "Herr", NORM: "Herrn"}, {ORTH: "Hrn.", NORM: "Herrn"},
{ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"}, {ORTH: "Jan.", NORM: "Januar"},
{ORTH: "Jh.", LEMMA: "Jahrhundert", NORM: "Jahrhundert"}, {ORTH: "Jh.", NORM: "Jahrhundert"},
{ORTH: "Jhd.", LEMMA: "Jahrhundert", NORM: "Jahrhundert"}, {ORTH: "Jhd.", NORM: "Jahrhundert"},
{ORTH: "Jul.", LEMMA: "Juli", NORM: "Juli"}, {ORTH: "Jul.", NORM: "Juli"},
{ORTH: "Jun.", LEMMA: "Juni", NORM: "Juni"}, {ORTH: "Jun.", NORM: "Juni"},
{ORTH: "Mi.", LEMMA: "Mittwoch", NORM: "Mittwoch"}, {ORTH: "Mi.", NORM: "Mittwoch"},
{ORTH: "Mio.", LEMMA: "Million", NORM: "Million"}, {ORTH: "Mio.", NORM: "Million"},
{ORTH: "Mo.", LEMMA: "Montag", NORM: "Montag"}, {ORTH: "Mo.", NORM: "Montag"},
{ORTH: "Mrd.", LEMMA: "Milliarde", NORM: "Milliarde"}, {ORTH: "Mrd.", NORM: "Milliarde"},
{ORTH: "Mrz.", LEMMA: "März", NORM: "März"}, {ORTH: "Mrz.", NORM: "März"},
{ORTH: "MwSt.", LEMMA: "Mehrwertsteuer", NORM: "Mehrwertsteuer"}, {ORTH: "MwSt.", NORM: "Mehrwertsteuer"},
{ORTH: "Mär.", LEMMA: "März", NORM: "März"}, {ORTH: "Mär.", NORM: "März"},
{ORTH: "Nov.", LEMMA: "November", NORM: "November"}, {ORTH: "Nov.", NORM: "November"},
{ORTH: "Nr.", LEMMA: "Nummer", NORM: "Nummer"}, {ORTH: "Nr.", NORM: "Nummer"},
{ORTH: "Okt.", LEMMA: "Oktober", NORM: "Oktober"}, {ORTH: "Okt.", NORM: "Oktober"},
{ORTH: "Orig.", LEMMA: "Original", NORM: "Original"}, {ORTH: "Orig.", NORM: "Original"},
{ORTH: "Pkt.", LEMMA: "Punkt", NORM: "Punkt"}, {ORTH: "Pkt.", NORM: "Punkt"},
{ORTH: "Prof.", LEMMA: "Professor", NORM: "Professor"}, {ORTH: "Prof.", NORM: "Professor"},
{ORTH: "Red.", LEMMA: "Redaktion", NORM: "Redaktion"}, {ORTH: "Red.", NORM: "Redaktion"},
{ORTH: "Sa.", LEMMA: "Samstag", NORM: "Samstag"}, {ORTH: "Sa.", NORM: "Samstag"},
{ORTH: "Sep.", LEMMA: "September", NORM: "September"}, {ORTH: "Sep.", NORM: "September"},
{ORTH: "Sept.", LEMMA: "September", NORM: "September"}, {ORTH: "Sept.", NORM: "September"},
{ORTH: "So.", LEMMA: "Sonntag", NORM: "Sonntag"}, {ORTH: "So.", NORM: "Sonntag"},
{ORTH: "Std.", LEMMA: "Stunde", NORM: "Stunde"}, {ORTH: "Std.", NORM: "Stunde"},
{ORTH: "Str.", LEMMA: "Straße", NORM: "Straße"}, {ORTH: "Str.", NORM: "Straße"},
{ORTH: "Tel.", LEMMA: "Telefon", NORM: "Telefon"}, {ORTH: "Tel.", NORM: "Telefon"},
{ORTH: "Tsd.", LEMMA: "Tausend", NORM: "Tausend"}, {ORTH: "Tsd.", NORM: "Tausend"},
{ORTH: "Univ.", LEMMA: "Universität", NORM: "Universität"}, {ORTH: "Univ.", NORM: "Universität"},
{ORTH: "abzgl.", LEMMA: "abzüglich", NORM: "abzüglich"}, {ORTH: "abzgl.", NORM: "abzüglich"},
{ORTH: "allg.", LEMMA: "allgemein", NORM: "allgemein"}, {ORTH: "allg.", NORM: "allgemein"},
{ORTH: "bspw.", LEMMA: "beispielsweise", NORM: "beispielsweise"}, {ORTH: "bspw.", NORM: "beispielsweise"},
{ORTH: "bzgl.", LEMMA: "bezüglich", NORM: "bezüglich"}, {ORTH: "bzgl.", NORM: "bezüglich"},
{ORTH: "bzw.", LEMMA: "beziehungsweise", NORM: "beziehungsweise"}, {ORTH: "bzw.", NORM: "beziehungsweise"},
{ORTH: "d.h.", LEMMA: "das heißt"}, {ORTH: "d.h."},
{ORTH: "dgl.", LEMMA: "dergleichen", NORM: "dergleichen"}, {ORTH: "dgl.", NORM: "dergleichen"},
{ORTH: "ebd.", LEMMA: "ebenda", NORM: "ebenda"}, {ORTH: "ebd.", NORM: "ebenda"},
{ORTH: "eigtl.", LEMMA: "eigentlich", NORM: "eigentlich"}, {ORTH: "eigtl.", NORM: "eigentlich"},
{ORTH: "engl.", LEMMA: "englisch", NORM: "englisch"}, {ORTH: "engl.", NORM: "englisch"},
{ORTH: "evtl.", LEMMA: "eventuell", NORM: "eventuell"}, {ORTH: "evtl.", NORM: "eventuell"},
{ORTH: "frz.", LEMMA: "französisch", NORM: "französisch"}, {ORTH: "frz.", NORM: "französisch"},
{ORTH: "gegr.", LEMMA: "gegründet", NORM: "gegründet"}, {ORTH: "gegr.", NORM: "gegründet"},
{ORTH: "ggf.", LEMMA: "gegebenenfalls", NORM: "gegebenenfalls"}, {ORTH: "ggf.", NORM: "gegebenenfalls"},
{ORTH: "ggfs.", LEMMA: "gegebenenfalls", NORM: "gegebenenfalls"}, {ORTH: "ggfs.", NORM: "gegebenenfalls"},
{ORTH: "ggü.", LEMMA: "gegenüber", NORM: "gegenüber"}, {ORTH: "ggü.", NORM: "gegenüber"},
{ORTH: "i.O.", LEMMA: "in Ordnung"}, {ORTH: "i.O."},
{ORTH: "i.d.R.", LEMMA: "in der Regel"}, {ORTH: "i.d.R."},
{ORTH: "incl.", LEMMA: "inklusive", NORM: "inklusive"}, {ORTH: "incl.", NORM: "inklusive"},
{ORTH: "inkl.", LEMMA: "inklusive", NORM: "inklusive"}, {ORTH: "inkl.", NORM: "inklusive"},
{ORTH: "insb.", LEMMA: "insbesondere", NORM: "insbesondere"}, {ORTH: "insb.", NORM: "insbesondere"},
{ORTH: "kath.", LEMMA: "katholisch", NORM: "katholisch"}, {ORTH: "kath.", NORM: "katholisch"},
{ORTH: "lt.", LEMMA: "laut", NORM: "laut"}, {ORTH: "lt.", NORM: "laut"},
{ORTH: "max.", LEMMA: "maximal", NORM: "maximal"}, {ORTH: "max.", NORM: "maximal"},
{ORTH: "min.", LEMMA: "minimal", NORM: "minimal"}, {ORTH: "min.", NORM: "minimal"},
{ORTH: "mind.", LEMMA: "mindestens", NORM: "mindestens"}, {ORTH: "mind.", NORM: "mindestens"},
{ORTH: "mtl.", LEMMA: "monatlich", NORM: "monatlich"}, {ORTH: "mtl.", NORM: "monatlich"},
{ORTH: "n.Chr.", LEMMA: "nach Christus"}, {ORTH: "n.Chr."},
{ORTH: "orig.", LEMMA: "original", NORM: "original"}, {ORTH: "orig.", NORM: "original"},
{ORTH: "röm.", LEMMA: "römisch", NORM: "römisch"}, {ORTH: "röm.", NORM: "römisch"},
{ORTH: "s.o.", LEMMA: "siehe oben"}, {ORTH: "s.o."},
{ORTH: "sog.", LEMMA: "so genannt"}, {ORTH: "sog."},
{ORTH: "stellv.", LEMMA: "stellvertretend"}, {ORTH: "stellv."},
{ORTH: "tägl.", LEMMA: "täglich", NORM: "täglich"}, {ORTH: "tägl.", NORM: "täglich"},
{ORTH: "u.U.", LEMMA: "unter Umständen"}, {ORTH: "u.U."},
{ORTH: "u.s.w.", LEMMA: "und so weiter"}, {ORTH: "u.s.w."},
{ORTH: "u.v.m.", LEMMA: "und vieles mehr"}, {ORTH: "u.v.m."},
{ORTH: "usf.", LEMMA: "und so fort"}, {ORTH: "usf."},
{ORTH: "usw.", LEMMA: "und so weiter"}, {ORTH: "usw."},
{ORTH: "uvm.", LEMMA: "und vieles mehr"}, {ORTH: "uvm."},
{ORTH: "v.Chr.", LEMMA: "vor Christus"}, {ORTH: "v.Chr."},
{ORTH: "v.a.", LEMMA: "vor allem"}, {ORTH: "v.a."},
{ORTH: "v.l.n.r.", LEMMA: "von links nach rechts"}, {ORTH: "v.l.n.r."},
{ORTH: "vgl.", LEMMA: "vergleiche", NORM: "vergleiche"}, {ORTH: "vgl.", NORM: "vergleiche"},
{ORTH: "vllt.", LEMMA: "vielleicht", NORM: "vielleicht"}, {ORTH: "vllt.", NORM: "vielleicht"},
{ORTH: "vlt.", LEMMA: "vielleicht", NORM: "vielleicht"}, {ORTH: "vlt.", NORM: "vielleicht"},
{ORTH: "z.B.", LEMMA: "zum Beispiel"}, {ORTH: "z.B."},
{ORTH: "z.Bsp.", LEMMA: "zum Beispiel"}, {ORTH: "z.Bsp."},
{ORTH: "z.T.", LEMMA: "zum Teil"}, {ORTH: "z.T."},
{ORTH: "z.Z.", LEMMA: "zur Zeit"}, {ORTH: "z.Z."},
{ORTH: "z.Zt.", LEMMA: "zur Zeit"}, {ORTH: "z.Zt."},
{ORTH: "z.b.", LEMMA: "zum Beispiel"}, {ORTH: "z.b."},
{ORTH: "zzgl.", LEMMA: "zuzüglich"}, {ORTH: "zzgl."},
{ORTH: "österr.", LEMMA: "österreichisch", NORM: "österreichisch"}, {ORTH: "österr.", NORM: "österreichisch"},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]

View File

@@ -1,130 +1,128 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, NORM from ...symbols import ORTH, NORM
from ...util import update_exc from ...util import update_exc
_exc = {} _exc = {}
for token in ["Απ'", "ΑΠ'", "αφ'", "Αφ'"]: for token in ["Απ'", "ΑΠ'", "αφ'", "Αφ'"]:
_exc[token] = [{ORTH: token, LEMMA: "από", NORM: "από"}] _exc[token] = [{ORTH: token, NORM: "από"}]
for token in ["Αλλ'", "αλλ'"]: for token in ["Αλλ'", "αλλ'"]:
_exc[token] = [{ORTH: token, LEMMA: "αλλά", NORM: "αλλά"}] _exc[token] = [{ORTH: token, NORM: "αλλά"}]
for token in ["παρ'", "Παρ'", "ΠΑΡ'"]: for token in ["παρ'", "Παρ'", "ΠΑΡ'"]:
_exc[token] = [{ORTH: token, LEMMA: "παρά", NORM: "παρά"}] _exc[token] = [{ORTH: token, NORM: "παρά"}]
for token in ["καθ'", "Καθ'"]: for token in ["καθ'", "Καθ'"]:
_exc[token] = [{ORTH: token, LEMMA: "κάθε", NORM: "κάθε"}] _exc[token] = [{ORTH: token, NORM: "κάθε"}]
for token in ["κατ'", "Κατ'"]: for token in ["κατ'", "Κατ'"]:
_exc[token] = [{ORTH: token, LEMMA: "κατά", NORM: "κατά"}] _exc[token] = [{ORTH: token, NORM: "κατά"}]
for token in ["'ΣΟΥΝ", "'ναι", "'ταν", "'τανε", "'μαστε", "'μουνα", "'μουν"]: for token in ["'ΣΟΥΝ", "'ναι", "'ταν", "'τανε", "'μαστε", "'μουνα", "'μουν"]:
_exc[token] = [{ORTH: token, LEMMA: "είμαι", NORM: "είμαι"}] _exc[token] = [{ORTH: token, NORM: "είμαι"}]
for token in ["Επ'", "επ'", "εφ'", "Εφ'"]: for token in ["Επ'", "επ'", "εφ'", "Εφ'"]:
_exc[token] = [{ORTH: token, LEMMA: "επί", NORM: "επί"}] _exc[token] = [{ORTH: token, NORM: "επί"}]
for token in ["Δι'", "δι'"]: for token in ["Δι'", "δι'"]:
_exc[token] = [{ORTH: token, LEMMA: "δια", NORM: "δια"}] _exc[token] = [{ORTH: token, NORM: "δια"}]
for token in ["'χουν", "'χουμε", "'χαμε", "'χα", "'χε", "'χεις", "'χει"]: for token in ["'χουν", "'χουμε", "'χαμε", "'χα", "'χε", "'χεις", "'χει"]:
_exc[token] = [{ORTH: token, LEMMA: "έχω", NORM: "έχω"}] _exc[token] = [{ORTH: token, NORM: "έχω"}]
for token in ["υπ'", "Υπ'"]: for token in ["υπ'", "Υπ'"]:
_exc[token] = [{ORTH: token, LEMMA: "υπό", NORM: "υπό"}] _exc[token] = [{ORTH: token, NORM: "υπό"}]
for token in ["Μετ'", "ΜΕΤ'", "'μετ"]: for token in ["Μετ'", "ΜΕΤ'", "'μετ"]:
_exc[token] = [{ORTH: token, LEMMA: "μετά", NORM: "μετά"}] _exc[token] = [{ORTH: token, NORM: "μετά"}]
for token in ["Μ'", "μ'"]: for token in ["Μ'", "μ'"]:
_exc[token] = [{ORTH: token, LEMMA: "με", NORM: "με"}] _exc[token] = [{ORTH: token, NORM: "με"}]
for token in ["Γι'", "ΓΙ'", "γι'"]: for token in ["Γι'", "ΓΙ'", "γι'"]:
_exc[token] = [{ORTH: token, LEMMA: "για", NORM: "για"}] _exc[token] = [{ORTH: token, NORM: "για"}]
for token in ["Σ'", "σ'"]: for token in ["Σ'", "σ'"]:
_exc[token] = [{ORTH: token, LEMMA: "σε", NORM: "σε"}] _exc[token] = [{ORTH: token, NORM: "σε"}]
for token in ["Θ'", "θ'"]: for token in ["Θ'", "θ'"]:
_exc[token] = [{ORTH: token, LEMMA: "θα", NORM: "θα"}] _exc[token] = [{ORTH: token, NORM: "θα"}]
for token in ["Ν'", "ν'"]: for token in ["Ν'", "ν'"]:
_exc[token] = [{ORTH: token, LEMMA: "να", NORM: "να"}] _exc[token] = [{ORTH: token, NORM: "να"}]
for token in ["Τ'", "τ'"]: for token in ["Τ'", "τ'"]:
_exc[token] = [{ORTH: token, LEMMA: "να", NORM: "να"}] _exc[token] = [{ORTH: token, NORM: "να"}]
for token in ["'γω", "'σένα", "'μεις"]: for token in ["'γω", "'σένα", "'μεις"]:
_exc[token] = [{ORTH: token, LEMMA: "εγώ", NORM: "εγώ"}] _exc[token] = [{ORTH: token, NORM: "εγώ"}]
for token in ["Τ'", "τ'"]: for token in ["Τ'", "τ'"]:
_exc[token] = [{ORTH: token, LEMMA: "το", NORM: "το"}] _exc[token] = [{ORTH: token, NORM: "το"}]
for token in ["Φέρ'", "Φερ'", "φέρ'", "φερ'"]: for token in ["Φέρ'", "Φερ'", "φέρ'", "φερ'"]:
_exc[token] = [{ORTH: token, LEMMA: "φέρνω", NORM: "φέρνω"}] _exc[token] = [{ORTH: token, NORM: "φέρνω"}]
for token in ["'ρθούνε", "'ρθουν", "'ρθει", "'ρθεί", "'ρθε", "'ρχεται"]: for token in ["'ρθούνε", "'ρθουν", "'ρθει", "'ρθεί", "'ρθε", "'ρχεται"]:
_exc[token] = [{ORTH: token, LEMMA: "έρχομαι", NORM: "έρχομαι"}] _exc[token] = [{ORTH: token, NORM: "έρχομαι"}]
for token in ["'πανε", "'λεγε", "'λεγαν", "'πε", "'λεγα"]: for token in ["'πανε", "'λεγε", "'λεγαν", "'πε", "'λεγα"]:
_exc[token] = [{ORTH: token, LEMMA: "λέγω", NORM: "λέγω"}] _exc[token] = [{ORTH: token, NORM: "λέγω"}]
for token in ["Πάρ'", "πάρ'"]: for token in ["Πάρ'", "πάρ'"]:
_exc[token] = [{ORTH: token, LEMMA: "παίρνω", NORM: "παίρνω"}] _exc[token] = [{ORTH: token, NORM: "παίρνω"}]
for token in ["μέσ'", "Μέσ'", "μεσ'"]: for token in ["μέσ'", "Μέσ'", "μεσ'"]:
_exc[token] = [{ORTH: token, LEMMA: "μέσα", NORM: "μέσα"}] _exc[token] = [{ORTH: token, NORM: "μέσα"}]
for token in ["Δέσ'", "Δεσ'", "δεσ'"]: for token in ["Δέσ'", "Δεσ'", "δεσ'"]:
_exc[token] = [{ORTH: token, LEMMA: "δένω", NORM: "δένω"}] _exc[token] = [{ORTH: token, NORM: "δένω"}]
for token in ["'κανε", "Κάν'"]: for token in ["'κανε", "Κάν'"]:
_exc[token] = [{ORTH: token, LEMMA: "κάνω", NORM: "κάνω"}] _exc[token] = [{ORTH: token, NORM: "κάνω"}]
_other_exc = { _other_exc = {
"κι": [{ORTH: "κι", LEMMA: "και", NORM: "και"}], "κι": [{ORTH: "κι", NORM: "και"}],
"Παίξ'": [{ORTH: "Παίξ'", LEMMA: "παίζω", NORM: "παίζω"}], "Παίξ'": [{ORTH: "Παίξ'", NORM: "παίζω"}],
"Αντ'": [{ORTH: "Αντ'", LEMMA: "αντί", NORM: "αντί"}], "Αντ'": [{ORTH: "Αντ'", NORM: "αντί"}],
"ολ'": [{ORTH: "ολ'", LEMMA: "όλος", NORM: "όλος"}], "ολ'": [{ORTH: "ολ'", NORM: "όλος"}],
"ύστερ'": [{ORTH: "ύστερ'", LEMMA: "ύστερα", NORM: "ύστερα"}], "ύστερ'": [{ORTH: "ύστερ'", NORM: "ύστερα"}],
"'πρεπε": [{ORTH: "'πρεπε", LEMMA: "πρέπει", NORM: "πρέπει"}], "'πρεπε": [{ORTH: "'πρεπε", NORM: "πρέπει"}],
"Δύσκολ'": [{ORTH: "Δύσκολ'", LEMMA: "δύσκολος", NORM: "δύσκολος"}], "Δύσκολ'": [{ORTH: "Δύσκολ'", NORM: "δύσκολος"}],
"'θελα": [{ORTH: "'θελα", LEMMA: "θέλω", NORM: "θέλω"}], "'θελα": [{ORTH: "'θελα", NORM: "θέλω"}],
"'γραφα": [{ORTH: "'γραφα", LEMMA: "γράφω", NORM: "γράφω"}], "'γραφα": [{ORTH: "'γραφα", NORM: "γράφω"}],
"'παιρνα": [{ORTH: "'παιρνα", LEMMA: "παίρνω", NORM: "παίρνω"}], "'παιρνα": [{ORTH: "'παιρνα", NORM: "παίρνω"}],
"'δειξε": [{ORTH: "'δειξε", LEMMA: "δείχνω", NORM: "δείχνω"}], "'δειξε": [{ORTH: "'δειξε", NORM: "δείχνω"}],
"όμουρφ'": [{ORTH: "όμουρφ'", LEMMA: "όμορφος", NORM: "όμορφος"}], "όμουρφ'": [{ORTH: "όμουρφ'", NORM: "όμορφος"}],
"κ'τσή": [{ORTH: "κ'τσή", LEMMA: "κουτσός", NORM: "κουτσός"}], "κ'τσή": [{ORTH: "κ'τσή", NORM: "κουτσός"}],
"μηδ'": [{ORTH: "μηδ'", LEMMA: "μήδε", NORM: "μήδε"}], "μηδ'": [{ORTH: "μηδ'", NORM: "μήδε"}],
"'ξομολογήθηκε": [ "'ξομολογήθηκε": [{ORTH: "'ξομολογήθηκε", NORM: "εξομολογούμαι"}],
{ORTH: "'ξομολογήθηκε", LEMMA: "εξομολογούμαι", NORM: "εξομολογούμαι"} "'μας": [{ORTH: "'μας", NORM: "εμάς"}],
], "'ξερες": [{ORTH: "'ξερες", NORM: "ξέρω"}],
"'μας": [{ORTH: "'μας", LEMMA: "εμάς", NORM: "εμάς"}], "έφθασ'": [{ORTH: "έφθασ'", NORM: "φθάνω"}],
"'ξερες": [{ORTH: "'ξερες", LEMMA: "ξέρω", NORM: "ξέρω"}], "εξ'": [{ORTH: "εξ'", NORM: "εκ"}],
"έφθασ'": [{ORTH: "έφθασ'", LEMMA: "φθάνω", NORM: "φθάνω"}], "δώσ'": [{ORTH: "δώσ'", NORM: "δίνω"}],
"εξ'": [{ORTH: "εξ'", LEMMA: "εκ", NORM: "εκ"}], "τίποτ'": [{ORTH: "τίποτ'", NORM: "τίποτα"}],
"δώσ'": [{ORTH: "δώσ'", LEMMA: "δίνω", NORM: "δίνω"}], "Λήξ'": [{ORTH: "Λήξ'", NORM: "λήγω"}],
"τίποτ'": [{ORTH: "τίποτ'", LEMMA: "τίποτα", NORM: "τίποτα"}], "άσ'": [{ORTH: "άσ'", NORM: "αφήνω"}],
"Λήξ'": [{ORTH: "Λήξ'", LEMMA: "λήγω", NORM: "λήγω"}], "Στ'": [{ORTH: "Στ'", NORM: "στο"}],
"άσ'": [{ORTH: "άσ'", LEMMA: "αφήνω", NORM: "αφήνω"}], "Δωσ'": [{ORTH: "Δωσ'", NORM: "δίνω"}],
"Στ'": [{ORTH: "Στ'", LEMMA: "στο", NORM: "στο"}], "Βάψ'": [{ORTH: "Βάψ'", NORM: "βάφω"}],
"Δωσ'": [{ORTH: "Δωσ'", LEMMA: "δίνω", NORM: "δίνω"}], "Αλλ'": [{ORTH: "Αλλ'", NORM: "αλλά"}],
"Βάψ'": [{ORTH: "Βάψ'", LEMMA: "βάφω", NORM: "βάφω"}], "Αμ'": [{ORTH: "Αμ'", NORM: "άμα"}],
"Αλλ'": [{ORTH: "Αλλ'", LEMMA: "αλλά", NORM: "αλλά"}], "Αγόρασ'": [{ORTH: "Αγόρασ'", NORM: "αγοράζω"}],
"Αμ'": [{ORTH: "Αμ'", LEMMA: "άμα", NORM: "άμα"}], "'φύγε": [{ORTH: "'φύγε", NORM: "φεύγω"}],
"Αγόρασ'": [{ORTH: "Αγόρασ'", LEMMA: "αγοράζω", NORM: "αγοράζω"}], "'φερε": [{ORTH: "'φερε", NORM: "φέρνω"}],
"'φύγε": [{ORTH: "'φύγε", LEMMA: "φεύγω", NORM: "φεύγω"}], "'φαγε": [{ORTH: "'φαγε", NORM: "τρώω"}],
"'φερε": [{ORTH: "'φερε", LEMMA: "φέρνω", NORM: "φέρνω"}], "'σπαγαν": [{ORTH: "'σπαγαν", NORM: "σπάω"}],
"'φαγε": [{ORTH: "'φαγε", LEMMA: "τρώω", NORM: "τρώω"}], "'σκασε": [{ORTH: "'σκασε", NORM: "σκάω"}],
"'σπαγαν": [{ORTH: "'σπαγαν", LEMMA: "σπάω", NORM: "σπάω"}], "'σβηνε": [{ORTH: "'σβηνε", NORM: "σβήνω"}],
"'σκασε": [{ORTH: "'σκασε", LEMMA: "σκάω", NORM: "σκάω"}], "'ριξε": [{ORTH: "'ριξε", NORM: "ρίχνω"}],
"'σβηνε": [{ORTH: "'σβηνε", LEMMA: "σβήνω", NORM: "σβήνω"}], "'κλεβε": [{ORTH: "'κλεβε", NORM: "κλέβω"}],
"'ριξε": [{ORTH: "'ριξε", LEMMA: "ρίχνω", NORM: "ρίχνω"}], "'κει": [{ORTH: "'κει", NORM: "εκεί"}],
"'κλεβε": [{ORTH: "'κλεβε", LEMMA: "κλέβω", NORM: "κλέβω"}], "'βλεπε": [{ORTH: "'βλεπε", NORM: "βλέπω"}],
"'κει": [{ORTH: "'κει", LEMMA: "εκεί", NORM: "εκεί"}], "'βγαινε": [{ORTH: "'βγαινε", NORM: "βγαίνω"}],
"'βλεπε": [{ORTH: "'βλεπε", LEMMA: "βλέπω", NORM: "βλέπω"}],
"'βγαινε": [{ORTH: "'βγαινε", LEMMA: "βγαίνω", NORM: "βγαίνω"}],
} }
_exc.update(_other_exc) _exc.update(_other_exc)
@ -134,35 +132,35 @@ for h in range(1, 12 + 1):
for period in ["π.μ.", "πμ"]: for period in ["π.μ.", "πμ"]:
_exc[f"{h}{period}"] = [ _exc[f"{h}{period}"] = [
{ORTH: f"{h}"}, {ORTH: f"{h}"},
{ORTH: period, LEMMA: "π.μ.", NORM: "π.μ."}, {ORTH: period, NORM: "π.μ."},
] ]
for period in ["μ.μ.", "μμ"]: for period in ["μ.μ.", "μμ"]:
_exc[f"{h}{period}"] = [ _exc[f"{h}{period}"] = [
{ORTH: f"{h}"}, {ORTH: f"{h}"},
{ORTH: period, LEMMA: "μ.μ.", NORM: "μ.μ."}, {ORTH: period, NORM: "μ.μ."},
] ]
for exc_data in [ for exc_data in [
{ORTH: "ΑΓΡ.", LEMMA: "Αγροτικός", NORM: "Αγροτικός"}, {ORTH: "ΑΓΡ.", NORM: "Αγροτικός"},
{ORTH: "Αγ. Γρ.", LEMMA: "Αγία Γραφή", NORM: "Αγία Γραφή"}, {ORTH: "Αγ. Γρ.", NORM: "Αγία Γραφή"},
{ORTH: "Αθ.", LEMMA: "Αθανάσιος", NORM: "Αθανάσιος"}, {ORTH: "Αθ.", NORM: "Αθανάσιος"},
{ORTH: "Αλεξ.", LEMMA: "Αλέξανδρος", NORM: "Αλέξανδρος"}, {ORTH: "Αλεξ.", NORM: "Αλέξανδρος"},
{ORTH: "Απρ.", LEMMA: "Απρίλιος", NORM: "Απρίλιος"}, {ORTH: "Απρ.", NORM: "Απρίλιος"},
{ORTH: "Αύγ.", LEMMA: "Αύγουστος", NORM: "Αύγουστος"}, {ORTH: "Αύγ.", NORM: "Αύγουστος"},
{ORTH: "Δεκ.", LEMMA: "Δεκέμβριος", NORM: "Δεκέμβριος"}, {ORTH: "Δεκ.", NORM: "Δεκέμβριος"},
{ORTH: "Δημ.", LEMMA: "Δήμος", NORM: "Δήμος"}, {ORTH: "Δημ.", NORM: "Δήμος"},
{ORTH: "Ιαν.", LEMMA: "Ιανουάριος", NORM: "Ιανουάριος"}, {ORTH: "Ιαν.", NORM: "Ιανουάριος"},
{ORTH: "Ιούλ.", LEMMA: "Ιούλιος", NORM: "Ιούλιος"}, {ORTH: "Ιούλ.", NORM: "Ιούλιος"},
{ORTH: "Ιούν.", LEMMA: "Ιούνιος", NORM: "Ιούνιος"}, {ORTH: "Ιούν.", NORM: "Ιούνιος"},
{ORTH: "Ιωαν.", LEMMA: "Ιωάννης", NORM: "Ιωάννης"}, {ORTH: "Ιωαν.", NORM: "Ιωάννης"},
{ORTH: "Μ. Ασία", LEMMA: "Μικρά Ασία", NORM: "Μικρά Ασία"}, {ORTH: "Μ. Ασία", NORM: "Μικρά Ασία"},
{ORTH: "Μάρτ.", LEMMA: "Μάρτιος", NORM: "Μάρτιος"}, {ORTH: "Μάρτ.", NORM: "Μάρτιος"},
{ORTH: "Μάρτ'", LEMMA: "Μάρτιος", NORM: "Μάρτιος"}, {ORTH: "Μάρτ'", NORM: "Μάρτιος"},
{ORTH: "Νοέμβρ.", LEMMA: "Νοέμβριος", NORM: "Νοέμβριος"}, {ORTH: "Νοέμβρ.", NORM: "Νοέμβριος"},
{ORTH: "Οκτ.", LEMMA: "Οκτώβριος", NORM: "Οκτώβριος"}, {ORTH: "Οκτ.", NORM: "Οκτώβριος"},
{ORTH: "Σεπτ.", LEMMA: "Σεπτέμβριος", NORM: "Σεπτέμβριος"}, {ORTH: "Σεπτ.", NORM: "Σεπτέμβριος"},
{ORTH: "Φεβρ.", LEMMA: "Φεβρουάριος", NORM: "Φεβρουάριος"}, {ORTH: "Φεβρ.", NORM: "Φεβρουάριος"},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]

View File

@ -1,5 +1,5 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA from ...symbols import ORTH, NORM
from ...util import update_exc from ...util import update_exc
@ -28,110 +28,110 @@ _exclude = [
for pron in ["i"]: for pron in ["i"]:
for orth in [pron, pron.title()]: for orth in [pron, pron.title()]:
_exc[orth + "'m"] = [ _exc[orth + "'m"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "'m", LEMMA: "be", NORM: "am", TAG: "VBP"}, {ORTH: "'m", NORM: "am"},
] ]
_exc[orth + "m"] = [ _exc[orth + "m"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1}, {ORTH: "m", "tenspect": 1, "number": 1},
] ]
_exc[orth + "'ma"] = [ _exc[orth + "'ma"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "'m", LEMMA: "be", NORM: "am"}, {ORTH: "'m", NORM: "am"},
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}, {ORTH: "a", NORM: "gonna"},
] ]
_exc[orth + "ma"] = [ _exc[orth + "ma"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "m", LEMMA: "be", NORM: "am"}, {ORTH: "m", NORM: "am"},
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}, {ORTH: "a", NORM: "gonna"},
] ]
for pron in ["i", "you", "he", "she", "it", "we", "they"]: for pron in ["i", "you", "he", "she", "it", "we", "they"]:
for orth in [pron, pron.title()]: for orth in [pron, pron.title()]:
_exc[orth + "'ll"] = [ _exc[orth + "'ll"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}, {ORTH: "'ll", NORM: "will"},
] ]
_exc[orth + "ll"] = [ _exc[orth + "ll"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}, {ORTH: "ll", NORM: "will"},
] ]
_exc[orth + "'ll've"] = [ _exc[orth + "'ll've"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}, {ORTH: "'ll", NORM: "will"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}, {ORTH: "'ve", NORM: "have"},
] ]
_exc[orth + "llve"] = [ _exc[orth + "llve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}, {ORTH: "ll", NORM: "will"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}, {ORTH: "ve", NORM: "have"},
] ]
_exc[orth + "'d"] = [ _exc[orth + "'d"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "'d", NORM: "'d"}, {ORTH: "'d", NORM: "'d"},
] ]
_exc[orth + "d"] = [ _exc[orth + "d"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "d", NORM: "'d"}, {ORTH: "d", NORM: "'d"},
] ]
_exc[orth + "'d've"] = [ _exc[orth + "'d've"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"}, {ORTH: "'d", NORM: "would"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}, {ORTH: "'ve", NORM: "have"},
] ]
_exc[orth + "dve"] = [ _exc[orth + "dve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"}, {ORTH: "d", NORM: "would"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}, {ORTH: "ve", NORM: "have"},
] ]
for pron in ["i", "you", "we", "they"]: for pron in ["i", "you", "we", "they"]:
for orth in [pron, pron.title()]: for orth in [pron, pron.title()]:
_exc[orth + "'ve"] = [ _exc[orth + "'ve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}, {ORTH: "'ve", NORM: "have"},
] ]
_exc[orth + "ve"] = [ _exc[orth + "ve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}, {ORTH: "ve", NORM: "have"},
] ]
for pron in ["you", "we", "they"]: for pron in ["you", "we", "they"]:
for orth in [pron, pron.title()]: for orth in [pron, pron.title()]:
_exc[orth + "'re"] = [ _exc[orth + "'re"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "'re", LEMMA: "be", NORM: "are"}, {ORTH: "'re", NORM: "are"},
] ]
_exc[orth + "re"] = [ _exc[orth + "re"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}, {ORTH: "re", NORM: "are"},
] ]
for pron in ["he", "she", "it"]: for pron in ["he", "she", "it"]:
for orth in [pron, pron.title()]: for orth in [pron, pron.title()]:
_exc[orth + "'s"] = [ _exc[orth + "'s"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "'s", NORM: "'s"}, {ORTH: "'s", NORM: "'s"},
] ]
_exc[orth + "s"] = [ _exc[orth + "s"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "s"}, {ORTH: "s"},
] ]
@ -153,145 +153,145 @@ for word in [
]: ]:
for orth in [word, word.title()]: for orth in [word, word.title()]:
_exc[orth + "'s"] = [ _exc[orth + "'s"] = [
{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: orth, NORM: word},
{ORTH: "'s", NORM: "'s"}, {ORTH: "'s", NORM: "'s"},
] ]
_exc[orth + "s"] = [{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: "s"}] _exc[orth + "s"] = [{ORTH: orth, NORM: word}, {ORTH: "s"}]
_exc[orth + "'ll"] = [ _exc[orth + "'ll"] = [
{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: orth, NORM: word},
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}, {ORTH: "'ll", NORM: "will"},
] ]
_exc[orth + "ll"] = [ _exc[orth + "ll"] = [
{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: orth, NORM: word},
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}, {ORTH: "ll", NORM: "will"},
] ]
_exc[orth + "'ll've"] = [ _exc[orth + "'ll've"] = [
{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: orth, NORM: word},
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}, {ORTH: "'ll", NORM: "will"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}, {ORTH: "'ve", NORM: "have"},
] ]
_exc[orth + "llve"] = [ _exc[orth + "llve"] = [
{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: orth, NORM: word},
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}, {ORTH: "ll", NORM: "will"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}, {ORTH: "ve", NORM: "have"},
] ]
_exc[orth + "'re"] = [ _exc[orth + "'re"] = [
{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: orth, NORM: word},
{ORTH: "'re", LEMMA: "be", NORM: "are"}, {ORTH: "'re", NORM: "are"},
] ]
_exc[orth + "re"] = [ _exc[orth + "re"] = [
{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: orth, NORM: word},
{ORTH: "re", LEMMA: "be", NORM: "are"}, {ORTH: "re", NORM: "are"},
] ]
_exc[orth + "'ve"] = [ _exc[orth + "'ve"] = [
{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: orth, NORM: word},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}, {ORTH: "'ve"},
] ]
_exc[orth + "ve"] = [ _exc[orth + "ve"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}, {ORTH: "ve", NORM: "have"},
] ]
_exc[orth + "'d"] = [ _exc[orth + "'d"] = [
{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: orth, NORM: word},
{ORTH: "'d", NORM: "'d"}, {ORTH: "'d", NORM: "'d"},
] ]
_exc[orth + "d"] = [ _exc[orth + "d"] = [
{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: orth, NORM: word},
{ORTH: "d", NORM: "'d"}, {ORTH: "d", NORM: "'d"},
] ]
_exc[orth + "'d've"] = [ _exc[orth + "'d've"] = [
{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: orth, NORM: word},
{ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"}, {ORTH: "'d", NORM: "would"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}, {ORTH: "'ve", NORM: "have"},
] ]
_exc[orth + "dve"] = [ _exc[orth + "dve"] = [
{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: orth, NORM: word},
{ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"}, {ORTH: "d", NORM: "would"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}, {ORTH: "ve", NORM: "have"},
] ]
# Verbs # Verbs
for verb_data in [ for verb_data in [
{ORTH: "ca", LEMMA: "can", NORM: "can", TAG: "MD"}, {ORTH: "ca", NORM: "can"},
{ORTH: "could", NORM: "could", TAG: "MD"}, {ORTH: "could", NORM: "could"},
{ORTH: "do", LEMMA: "do", NORM: "do"}, {ORTH: "do", NORM: "do"},
{ORTH: "does", LEMMA: "do", NORM: "does"}, {ORTH: "does", NORM: "does"},
{ORTH: "did", LEMMA: "do", NORM: "do", TAG: "VBD"}, {ORTH: "did", NORM: "do"},
{ORTH: "had", LEMMA: "have", NORM: "have", TAG: "VBD"}, {ORTH: "had", NORM: "have"},
{ORTH: "may", NORM: "may", TAG: "MD"}, {ORTH: "may", NORM: "may"},
{ORTH: "might", NORM: "might", TAG: "MD"}, {ORTH: "might", NORM: "might"},
{ORTH: "must", NORM: "must", TAG: "MD"}, {ORTH: "must", NORM: "must"},
{ORTH: "need", NORM: "need"}, {ORTH: "need", NORM: "need"},
{ORTH: "ought", NORM: "ought", TAG: "MD"}, {ORTH: "ought", NORM: "ought"},
{ORTH: "sha", LEMMA: "shall", NORM: "shall", TAG: "MD"}, {ORTH: "sha", NORM: "shall"},
{ORTH: "should", NORM: "should", TAG: "MD"}, {ORTH: "should", NORM: "should"},
{ORTH: "wo", LEMMA: "will", NORM: "will", TAG: "MD"}, {ORTH: "wo", NORM: "will"},
{ORTH: "would", NORM: "would", TAG: "MD"}, {ORTH: "would", NORM: "would"},
]: ]:
verb_data_tc = dict(verb_data) verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title() verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]: for data in [verb_data, verb_data_tc]:
_exc[data[ORTH] + "n't"] = [ _exc[data[ORTH] + "n't"] = [
dict(data), dict(data),
{ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}, {ORTH: "n't", NORM: "not"},
] ]
_exc[data[ORTH] + "nt"] = [ _exc[data[ORTH] + "nt"] = [
dict(data), dict(data),
{ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}, {ORTH: "nt", NORM: "not"},
] ]
_exc[data[ORTH] + "n't've"] = [ _exc[data[ORTH] + "n't've"] = [
dict(data), dict(data),
{ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}, {ORTH: "n't", NORM: "not"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}, {ORTH: "'ve", NORM: "have"},
] ]
_exc[data[ORTH] + "ntve"] = [ _exc[data[ORTH] + "ntve"] = [
dict(data), dict(data),
{ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}, {ORTH: "nt", NORM: "not"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}, {ORTH: "ve", NORM: "have"},
] ]
for verb_data in [ for verb_data in [
{ORTH: "could", NORM: "could", TAG: "MD"}, {ORTH: "could", NORM: "could"},
{ORTH: "might", NORM: "might", TAG: "MD"}, {ORTH: "might", NORM: "might"},
{ORTH: "must", NORM: "must", TAG: "MD"}, {ORTH: "must", NORM: "must"},
{ORTH: "should", NORM: "should", TAG: "MD"}, {ORTH: "should", NORM: "should"},
{ORTH: "would", NORM: "would", TAG: "MD"}, {ORTH: "would", NORM: "would"},
]: ]:
verb_data_tc = dict(verb_data) verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title() verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]: for data in [verb_data, verb_data_tc]:
_exc[data[ORTH] + "'ve"] = [dict(data), {ORTH: "'ve", LEMMA: "have", TAG: "VB"}] _exc[data[ORTH] + "'ve"] = [dict(data), {ORTH: "'ve"}]
_exc[data[ORTH] + "ve"] = [dict(data), {ORTH: "ve", LEMMA: "have", TAG: "VB"}] _exc[data[ORTH] + "ve"] = [dict(data), {ORTH: "ve"}]
for verb_data in [ for verb_data in [
{ORTH: "ai", LEMMA: "be", TAG: "VBP", "number": 2}, {ORTH: "ai", "number": 2},
{ORTH: "are", LEMMA: "be", NORM: "are", TAG: "VBP", "number": 2}, {ORTH: "are", NORM: "are", "number": 2},
{ORTH: "is", LEMMA: "be", NORM: "is", TAG: "VBZ"}, {ORTH: "is", NORM: "is"},
{ORTH: "was", LEMMA: "be", NORM: "was"}, {ORTH: "was", NORM: "was"},
{ORTH: "were", LEMMA: "be", NORM: "were"}, {ORTH: "were", NORM: "were"},
{ORTH: "have", NORM: "have"}, {ORTH: "have", NORM: "have"},
{ORTH: "has", LEMMA: "have", NORM: "has"}, {ORTH: "has", NORM: "has"},
{ORTH: "dare", NORM: "dare"}, {ORTH: "dare", NORM: "dare"},
]: ]:
verb_data_tc = dict(verb_data) verb_data_tc = dict(verb_data)
@ -299,24 +299,24 @@ for verb_data in [
for data in [verb_data, verb_data_tc]: for data in [verb_data, verb_data_tc]:
_exc[data[ORTH] + "n't"] = [ _exc[data[ORTH] + "n't"] = [
dict(data), dict(data),
{ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}, {ORTH: "n't", NORM: "not"},
] ]
_exc[data[ORTH] + "nt"] = [ _exc[data[ORTH] + "nt"] = [
dict(data), dict(data),
{ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}, {ORTH: "nt", NORM: "not"},
] ]
# Other contractions with trailing apostrophe # Other contractions with trailing apostrophe
for exc_data in [ for exc_data in [
{ORTH: "doin", LEMMA: "do", NORM: "doing"}, {ORTH: "doin", NORM: "doing"},
{ORTH: "goin", LEMMA: "go", NORM: "going"}, {ORTH: "goin", NORM: "going"},
{ORTH: "nothin", LEMMA: "nothing", NORM: "nothing"}, {ORTH: "nothin", NORM: "nothing"},
{ORTH: "nuthin", LEMMA: "nothing", NORM: "nothing"}, {ORTH: "nuthin", NORM: "nothing"},
{ORTH: "ol", LEMMA: "old", NORM: "old"}, {ORTH: "ol", NORM: "old"},
{ORTH: "somethin", LEMMA: "something", NORM: "something"}, {ORTH: "somethin", NORM: "something"},
]: ]:
exc_data_tc = dict(exc_data) exc_data_tc = dict(exc_data)
exc_data_tc[ORTH] = exc_data_tc[ORTH].title() exc_data_tc[ORTH] = exc_data_tc[ORTH].title()
@ -331,9 +331,9 @@ for exc_data in [
for exc_data in [ for exc_data in [
{ORTH: "cause", NORM: "because"}, {ORTH: "cause", NORM: "because"},
{ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"}, {ORTH: "em", NORM: "them"},
{ORTH: "ll", LEMMA: "will", NORM: "will"}, {ORTH: "ll", NORM: "will"},
{ORTH: "nuff", LEMMA: "enough", NORM: "enough"}, {ORTH: "nuff", NORM: "enough"},
]: ]:
exc_data_apos = dict(exc_data) exc_data_apos = dict(exc_data)
exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH] exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH]
@ -347,166 +347,131 @@ for h in range(1, 12 + 1):
for period in ["a.m.", "am"]: for period in ["a.m.", "am"]:
_exc[f"{h}{period}"] = [ _exc[f"{h}{period}"] = [
{ORTH: f"{h}"}, {ORTH: f"{h}"},
{ORTH: period, LEMMA: "a.m.", NORM: "a.m."}, {ORTH: period, NORM: "a.m."},
] ]
for period in ["p.m.", "pm"]: for period in ["p.m.", "pm"]:
_exc[f"{h}{period}"] = [ _exc[f"{h}{period}"] = [
{ORTH: f"{h}"}, {ORTH: f"{h}"},
{ORTH: period, LEMMA: "p.m.", NORM: "p.m."}, {ORTH: period, NORM: "p.m."},
] ]
# Rest # Rest
_other_exc = { _other_exc = {
"y'all": [{ORTH: "y'", LEMMA: PRON_LEMMA, NORM: "you"}, {ORTH: "all"}], "y'all": [{ORTH: "y'", NORM: "you"}, {ORTH: "all"}],
"yall": [{ORTH: "y", LEMMA: PRON_LEMMA, NORM: "you"}, {ORTH: "all"}], "yall": [{ORTH: "y", NORM: "you"}, {ORTH: "all"}],
"how'd'y": [ "how'd'y": [{ORTH: "how"}, {ORTH: "'d"}, {ORTH: "'y", NORM: "you"}],
{ORTH: "how", LEMMA: "how"}, "How'd'y": [{ORTH: "How", NORM: "how"}, {ORTH: "'d"}, {ORTH: "'y", NORM: "you"}],
{ORTH: "'d", LEMMA: "do"}, "not've": [{ORTH: "not"}, {ORTH: "'ve", NORM: "have"}],
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}, "notve": [{ORTH: "not"}, {ORTH: "ve", NORM: "have"}],
], "Not've": [{ORTH: "Not", NORM: "not"}, {ORTH: "'ve", NORM: "have"}],
"How'd'y": [ "Notve": [{ORTH: "Not", NORM: "not"}, {ORTH: "ve", NORM: "have"}],
{ORTH: "How", LEMMA: "how", NORM: "how"}, "cannot": [{ORTH: "can"}, {ORTH: "not"}],
{ORTH: "'d", LEMMA: "do"}, "Cannot": [{ORTH: "Can", NORM: "can"}, {ORTH: "not"}],
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}, "gonna": [{ORTH: "gon", NORM: "going"}, {ORTH: "na", NORM: "to"}],
], "Gonna": [{ORTH: "Gon", NORM: "going"}, {ORTH: "na", NORM: "to"}],
"not've": [ "gotta": [{ORTH: "got"}, {ORTH: "ta", NORM: "to"}],
{ORTH: "not", LEMMA: "not", TAG: "RB"}, "Gotta": [{ORTH: "Got", NORM: "got"}, {ORTH: "ta", NORM: "to"}],
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}, "let's": [{ORTH: "let"}, {ORTH: "'s", NORM: "us"}],
], "Let's": [{ORTH: "Let", NORM: "let"}, {ORTH: "'s", NORM: "us"}],
"notve": [ "c'mon": [{ORTH: "c'm", NORM: "come"}, {ORTH: "on"}],
{ORTH: "not", LEMMA: "not", TAG: "RB"}, "C'mon": [{ORTH: "C'm", NORM: "come"}, {ORTH: "on"}],
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"},
],
"Not've": [
{ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"},
],
"Notve": [
{ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"},
],
"cannot": [
{ORTH: "can", LEMMA: "can", TAG: "MD"},
{ORTH: "not", LEMMA: "not", TAG: "RB"},
],
"Cannot": [
{ORTH: "Can", LEMMA: "can", NORM: "can", TAG: "MD"},
{ORTH: "not", LEMMA: "not", TAG: "RB"},
],
"gonna": [
{ORTH: "gon", LEMMA: "go", NORM: "going"},
{ORTH: "na", LEMMA: "to", NORM: "to"},
],
"Gonna": [
{ORTH: "Gon", LEMMA: "go", NORM: "going"},
{ORTH: "na", LEMMA: "to", NORM: "to"},
],
"gotta": [{ORTH: "got"}, {ORTH: "ta", LEMMA: "to", NORM: "to"}],
"Gotta": [{ORTH: "Got", NORM: "got"}, {ORTH: "ta", LEMMA: "to", NORM: "to"}],
"let's": [{ORTH: "let"}, {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}],
"Let's": [
{ORTH: "Let", LEMMA: "let", NORM: "let"},
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"},
],
"c'mon": [{ORTH: "c'm", NORM: "come", LEMMA: "come"}, {ORTH: "on"}],
"C'mon": [{ORTH: "C'm", NORM: "come", LEMMA: "come"}, {ORTH: "on"}],
} }
_exc.update(_other_exc) _exc.update(_other_exc)
for exc_data in [ for exc_data in [
{ORTH: "'S", LEMMA: "'s", NORM: "'s"}, {ORTH: "'S", NORM: "'s"},
{ORTH: "'s", LEMMA: "'s", NORM: "'s"}, {ORTH: "'s", NORM: "'s"},
{ORTH: "\u2018S", LEMMA: "'s", NORM: "'s"}, {ORTH: "\u2018S", NORM: "'s"},
{ORTH: "\u2018s", LEMMA: "'s", NORM: "'s"}, {ORTH: "\u2018s", NORM: "'s"},
{ORTH: "and/or", LEMMA: "and/or", NORM: "and/or", TAG: "CC"}, {ORTH: "and/or", NORM: "and/or"},
{ORTH: "w/o", LEMMA: "without", NORM: "without"}, {ORTH: "w/o", NORM: "without"},
{ORTH: "'re", LEMMA: "be", NORM: "are"}, {ORTH: "'re", NORM: "are"},
{ORTH: "'Cause", LEMMA: "because", NORM: "because"}, {ORTH: "'Cause", NORM: "because"},
{ORTH: "'cause", LEMMA: "because", NORM: "because"}, {ORTH: "'cause", NORM: "because"},
{ORTH: "'cos", LEMMA: "because", NORM: "because"}, {ORTH: "'cos", NORM: "because"},
{ORTH: "'Cos", LEMMA: "because", NORM: "because"}, {ORTH: "'Cos", NORM: "because"},
{ORTH: "'coz", LEMMA: "because", NORM: "because"}, {ORTH: "'coz", NORM: "because"},
{ORTH: "'Coz", LEMMA: "because", NORM: "because"}, {ORTH: "'Coz", NORM: "because"},
{ORTH: "'cuz", LEMMA: "because", NORM: "because"}, {ORTH: "'cuz", NORM: "because"},
{ORTH: "'Cuz", LEMMA: "because", NORM: "because"}, {ORTH: "'Cuz", NORM: "because"},
{ORTH: "'bout", LEMMA: "about", NORM: "about"}, {ORTH: "'bout", NORM: "about"},
{ORTH: "ma'am", LEMMA: "madam", NORM: "madam"}, {ORTH: "ma'am", NORM: "madam"},
{ORTH: "Ma'am", LEMMA: "madam", NORM: "madam"}, {ORTH: "Ma'am", NORM: "madam"},
{ORTH: "o'clock", LEMMA: "o'clock", NORM: "o'clock"}, {ORTH: "o'clock", NORM: "o'clock"},
{ORTH: "O'clock", LEMMA: "o'clock", NORM: "o'clock"}, {ORTH: "O'clock", NORM: "o'clock"},
{ORTH: "lovin'", LEMMA: "love", NORM: "loving"}, {ORTH: "lovin'", NORM: "loving"},
{ORTH: "Lovin'", LEMMA: "love", NORM: "loving"}, {ORTH: "Lovin'", NORM: "loving"},
{ORTH: "lovin", LEMMA: "love", NORM: "loving"}, {ORTH: "lovin", NORM: "loving"},
{ORTH: "Lovin", LEMMA: "love", NORM: "loving"}, {ORTH: "Lovin", NORM: "loving"},
{ORTH: "havin'", LEMMA: "have", NORM: "having"}, {ORTH: "havin'", NORM: "having"},
{ORTH: "Havin'", LEMMA: "have", NORM: "having"}, {ORTH: "Havin'", NORM: "having"},
{ORTH: "havin", LEMMA: "have", NORM: "having"}, {ORTH: "havin", NORM: "having"},
{ORTH: "Havin", LEMMA: "have", NORM: "having"}, {ORTH: "Havin", NORM: "having"},
{ORTH: "doin'", LEMMA: "do", NORM: "doing"}, {ORTH: "doin'", NORM: "doing"},
{ORTH: "Doin'", LEMMA: "do", NORM: "doing"}, {ORTH: "Doin'", NORM: "doing"},
{ORTH: "doin", LEMMA: "do", NORM: "doing"}, {ORTH: "doin", NORM: "doing"},
{ORTH: "Doin", LEMMA: "do", NORM: "doing"}, {ORTH: "Doin", NORM: "doing"},
{ORTH: "goin'", LEMMA: "go", NORM: "going"}, {ORTH: "goin'", NORM: "going"},
{ORTH: "Goin'", LEMMA: "go", NORM: "going"}, {ORTH: "Goin'", NORM: "going"},
{ORTH: "goin", LEMMA: "go", NORM: "going"}, {ORTH: "goin", NORM: "going"},
{ORTH: "Goin", LEMMA: "go", NORM: "going"}, {ORTH: "Goin", NORM: "going"},
{ORTH: "Mt.", LEMMA: "Mount", NORM: "Mount"}, {ORTH: "Mt.", NORM: "Mount"},
{ORTH: "Ak.", LEMMA: "Alaska", NORM: "Alaska"}, {ORTH: "Ak.", NORM: "Alaska"},
{ORTH: "Ala.", LEMMA: "Alabama", NORM: "Alabama"}, {ORTH: "Ala.", NORM: "Alabama"},
{ORTH: "Apr.", LEMMA: "April", NORM: "April"}, {ORTH: "Apr.", NORM: "April"},
{ORTH: "Ariz.", LEMMA: "Arizona", NORM: "Arizona"}, {ORTH: "Ariz.", NORM: "Arizona"},
{ORTH: "Ark.", LEMMA: "Arkansas", NORM: "Arkansas"}, {ORTH: "Ark.", NORM: "Arkansas"},
{ORTH: "Aug.", LEMMA: "August", NORM: "August"}, {ORTH: "Aug.", NORM: "August"},
{ORTH: "Calif.", LEMMA: "California", NORM: "California"}, {ORTH: "Calif.", NORM: "California"},
{ORTH: "Colo.", LEMMA: "Colorado", NORM: "Colorado"}, {ORTH: "Colo.", NORM: "Colorado"},
{ORTH: "Conn.", LEMMA: "Connecticut", NORM: "Connecticut"}, {ORTH: "Conn.", NORM: "Connecticut"},
{ORTH: "Dec.", LEMMA: "December", NORM: "December"}, {ORTH: "Dec.", NORM: "December"},
{ORTH: "Del.", LEMMA: "Delaware", NORM: "Delaware"}, {ORTH: "Del.", NORM: "Delaware"},
{ORTH: "Feb.", LEMMA: "February", NORM: "February"}, {ORTH: "Feb.", NORM: "February"},
{ORTH: "Fla.", LEMMA: "Florida", NORM: "Florida"}, {ORTH: "Fla.", NORM: "Florida"},
{ORTH: "Ga.", LEMMA: "Georgia", NORM: "Georgia"}, {ORTH: "Ga.", NORM: "Georgia"},
{ORTH: "Ia.", LEMMA: "Iowa", NORM: "Iowa"}, {ORTH: "Ia.", NORM: "Iowa"},
{ORTH: "Id.", LEMMA: "Idaho", NORM: "Idaho"}, {ORTH: "Id.", NORM: "Idaho"},
{ORTH: "Ill.", LEMMA: "Illinois", NORM: "Illinois"}, {ORTH: "Ill.", NORM: "Illinois"},
{ORTH: "Ind.", LEMMA: "Indiana", NORM: "Indiana"}, {ORTH: "Ind.", NORM: "Indiana"},
{ORTH: "Jan.", LEMMA: "January", NORM: "January"}, {ORTH: "Jan.", NORM: "January"},
{ORTH: "Jul.", LEMMA: "July", NORM: "July"}, {ORTH: "Jul.", NORM: "July"},
{ORTH: "Jun.", LEMMA: "June", NORM: "June"}, {ORTH: "Jun.", NORM: "June"},
{ORTH: "Kan.", LEMMA: "Kansas", NORM: "Kansas"}, {ORTH: "Kan.", NORM: "Kansas"},
{ORTH: "Kans.", LEMMA: "Kansas", NORM: "Kansas"}, {ORTH: "Kans.", NORM: "Kansas"},
{ORTH: "Ky.", LEMMA: "Kentucky", NORM: "Kentucky"}, {ORTH: "Ky.", NORM: "Kentucky"},
{ORTH: "La.", LEMMA: "Louisiana", NORM: "Louisiana"}, {ORTH: "La.", NORM: "Louisiana"},
{ORTH: "Mar.", LEMMA: "March", NORM: "March"}, {ORTH: "Mar.", NORM: "March"},
{ORTH: "Mass.", LEMMA: "Massachusetts", NORM: "Massachusetts"}, {ORTH: "Mass.", NORM: "Massachusetts"},
{ORTH: "May.", LEMMA: "May", NORM: "May"}, {ORTH: "May.", NORM: "May"},
{ORTH: "Mich.", LEMMA: "Michigan", NORM: "Michigan"}, {ORTH: "Mich.", NORM: "Michigan"},
{ORTH: "Minn.", LEMMA: "Minnesota", NORM: "Minnesota"}, {ORTH: "Minn.", NORM: "Minnesota"},
{ORTH: "Miss.", LEMMA: "Mississippi", NORM: "Mississippi"}, {ORTH: "Miss.", NORM: "Mississippi"},
{ORTH: "N.C.", LEMMA: "North Carolina", NORM: "North Carolina"}, {ORTH: "N.C.", NORM: "North Carolina"},
{ORTH: "N.D.", LEMMA: "North Dakota", NORM: "North Dakota"}, {ORTH: "N.D.", NORM: "North Dakota"},
{ORTH: "N.H.", LEMMA: "New Hampshire", NORM: "New Hampshire"}, {ORTH: "N.H.", NORM: "New Hampshire"},
{ORTH: "N.J.", LEMMA: "New Jersey", NORM: "New Jersey"}, {ORTH: "N.J.", NORM: "New Jersey"},
{ORTH: "N.M.", LEMMA: "New Mexico", NORM: "New Mexico"}, {ORTH: "N.M.", NORM: "New Mexico"},
{ORTH: "N.Y.", LEMMA: "New York", NORM: "New York"}, {ORTH: "N.Y.", NORM: "New York"},
{ORTH: "Neb.", LEMMA: "Nebraska", NORM: "Nebraska"}, {ORTH: "Neb.", NORM: "Nebraska"},
{ORTH: "Nebr.", LEMMA: "Nebraska", NORM: "Nebraska"}, {ORTH: "Nebr.", NORM: "Nebraska"},
{ORTH: "Nev.", LEMMA: "Nevada", NORM: "Nevada"}, {ORTH: "Nev.", NORM: "Nevada"},
{ORTH: "Nov.", LEMMA: "November", NORM: "November"}, {ORTH: "Nov.", NORM: "November"},
{ORTH: "Oct.", LEMMA: "October", NORM: "October"}, {ORTH: "Oct.", NORM: "October"},
{ORTH: "Okla.", LEMMA: "Oklahoma", NORM: "Oklahoma"}, {ORTH: "Okla.", NORM: "Oklahoma"},
{ORTH: "Ore.", LEMMA: "Oregon", NORM: "Oregon"}, {ORTH: "Ore.", NORM: "Oregon"},
{ORTH: "Pa.", LEMMA: "Pennsylvania", NORM: "Pennsylvania"}, {ORTH: "Pa.", NORM: "Pennsylvania"},
{ORTH: "S.C.", LEMMA: "South Carolina", NORM: "South Carolina"}, {ORTH: "S.C.", NORM: "South Carolina"},
{ORTH: "Sep.", LEMMA: "September", NORM: "September"}, {ORTH: "Sep.", NORM: "September"},
{ORTH: "Sept.", LEMMA: "September", NORM: "September"}, {ORTH: "Sept.", NORM: "September"},
{ORTH: "Tenn.", LEMMA: "Tennessee", NORM: "Tennessee"}, {ORTH: "Tenn.", NORM: "Tennessee"},
{ORTH: "Va.", LEMMA: "Virginia", NORM: "Virginia"}, {ORTH: "Va.", NORM: "Virginia"},
{ORTH: "Wash.", LEMMA: "Washington", NORM: "Washington"}, {ORTH: "Wash.", NORM: "Washington"},
{ORTH: "Wis.", LEMMA: "Wisconsin", NORM: "Wisconsin"}, {ORTH: "Wis.", NORM: "Wisconsin"},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]

View File

@ -1,27 +1,27 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, NORM, PRON_LEMMA from ...symbols import ORTH, NORM
from ...util import update_exc from ...util import update_exc
_exc = { _exc = {
"pal": [{ORTH: "pa", LEMMA: "para"}, {ORTH: "l", LEMMA: "el", NORM: "el"}], "pal": [{ORTH: "pa"}, {ORTH: "l", NORM: "el"}],
} }
for exc_data in [ for exc_data in [
{ORTH: "", LEMMA: "número"}, {ORTH: ""},
{ORTH: "°C", LEMMA: "grados Celcius"}, {ORTH: "°C"},
{ORTH: "aprox.", LEMMA: "aproximadamente"}, {ORTH: "aprox."},
{ORTH: "dna.", LEMMA: "docena"}, {ORTH: "dna."},
{ORTH: "dpto.", LEMMA: "departamento"}, {ORTH: "dpto."},
{ORTH: "ej.", LEMMA: "ejemplo"}, {ORTH: "ej."},
{ORTH: "esq.", LEMMA: "esquina"}, {ORTH: "esq."},
{ORTH: "pág.", LEMMA: "página"}, {ORTH: "pág."},
{ORTH: "p.ej.", LEMMA: "por ejemplo"}, {ORTH: "p.ej."},
{ORTH: "Ud.", LEMMA: PRON_LEMMA, NORM: "usted"}, {ORTH: "Ud.", NORM: "usted"},
{ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"}, {ORTH: "Vd.", NORM: "usted"},
{ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}, {ORTH: "Uds.", NORM: "ustedes"},
{ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}, {ORTH: "Vds.", NORM: "ustedes"},
{ORTH: "vol.", NORM: "volúmen"}, {ORTH: "vol.", NORM: "volúmen"},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
@ -29,14 +29,14 @@ for exc_data in [
# Times # Times
_exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", LEMMA: "p.m."}] _exc["12m."] = [{ORTH: "12"}, {ORTH: "m."}]
for h in range(1, 12 + 1): for h in range(1, 12 + 1):
for period in ["a.m.", "am"]: for period in ["a.m.", "am"]:
_exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "a.m."}] _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period}]
for period in ["p.m.", "pm"]: for period in ["p.m.", "pm"]:
_exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "p.m."}] _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period}]
for orth in [ for orth in [

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA from ...symbols import ORTH
from ...util import update_exc from ...util import update_exc
@ -8,74 +8,74 @@ _exc = {}
# Source https://www.cs.tut.fi/~jkorpela/kielenopas/5.5.html # Source https://www.cs.tut.fi/~jkorpela/kielenopas/5.5.html
for exc_data in [ for exc_data in [
{ORTH: "aik.", LEMMA: "aikaisempi"}, {ORTH: "aik."},
{ORTH: "alk.", LEMMA: "alkaen"}, {ORTH: "alk."},
{ORTH: "alv.", LEMMA: "arvonlisävero"}, {ORTH: "alv."},
{ORTH: "ark.", LEMMA: "arkisin"}, {ORTH: "ark."},
{ORTH: "as.", LEMMA: "asunto"}, {ORTH: "as."},
{ORTH: "eaa.", LEMMA: "ennen ajanlaskun alkua"}, {ORTH: "eaa."},
{ORTH: "ed.", LEMMA: "edellinen"}, {ORTH: "ed."},
{ORTH: "esim.", LEMMA: "esimerkki"}, {ORTH: "esim."},
{ORTH: "huom.", LEMMA: "huomautus"}, {ORTH: "huom."},
{ORTH: "jne.", LEMMA: "ja niin edelleen"}, {ORTH: "jne."},
{ORTH: "joht.", LEMMA: "johtaja"}, {ORTH: "joht."},
{ORTH: "k.", LEMMA: "kuollut"}, {ORTH: "k."},
{ORTH: "ks.", LEMMA: "katso"}, {ORTH: "ks."},
{ORTH: "lk.", LEMMA: "luokka"}, {ORTH: "lk."},
{ORTH: "lkm.", LEMMA: "lukumäärä"}, {ORTH: "lkm."},
{ORTH: "lyh.", LEMMA: "lyhenne"}, {ORTH: "lyh."},
{ORTH: "läh.", LEMMA: "lähettäjä"}, {ORTH: "läh."},
{ORTH: "miel.", LEMMA: "mieluummin"}, {ORTH: "miel."},
{ORTH: "milj.", LEMMA: "miljoona"}, {ORTH: "milj."},
{ORTH: "Mm.", LEMMA: "muun muassa"}, {ORTH: "Mm."},
{ORTH: "mm.", LEMMA: "muun muassa"}, {ORTH: "mm."},
{ORTH: "myöh.", LEMMA: "myöhempi"}, {ORTH: "myöh."},
{ORTH: "n.", LEMMA: "noin"}, {ORTH: "n."},
{ORTH: "nimim.", LEMMA: "nimimerkki"}, {ORTH: "nimim."},
{ORTH: "n:o", LEMMA: "numero"}, {ORTH: "n:o"},
{ORTH: "N:o", LEMMA: "numero"}, {ORTH: "N:o"},
{ORTH: "nro", LEMMA: "numero"}, {ORTH: "nro"},
{ORTH: "ns.", LEMMA: "niin sanottu"}, {ORTH: "ns."},
{ORTH: "nyk.", LEMMA: "nykyinen"}, {ORTH: "nyk."},
{ORTH: "oik.", LEMMA: "oikealla"}, {ORTH: "oik."},
{ORTH: "os.", LEMMA: "osoite"}, {ORTH: "os."},
{ORTH: "p.", LEMMA: "päivä"}, {ORTH: "p."},
{ORTH: "par.", LEMMA: "paremmin"}, {ORTH: "par."},
{ORTH: "per.", LEMMA: "perustettu"}, {ORTH: "per."},
{ORTH: "pj.", LEMMA: "puheenjohtaja"}, {ORTH: "pj."},
{ORTH: "puh.joht.", LEMMA: "puheenjohtaja"}, {ORTH: "puh.joht."},
{ORTH: "prof.", LEMMA: "professori"}, {ORTH: "prof."},
{ORTH: "puh.", LEMMA: "puhelin"}, {ORTH: "puh."},
{ORTH: "pvm.", LEMMA: "päivämäärä"}, {ORTH: "pvm."},
{ORTH: "rak.", LEMMA: "rakennettu"}, {ORTH: "rak."},
{ORTH: "ry.", LEMMA: "rekisteröity yhdistys"}, {ORTH: "ry."},
{ORTH: "s.", LEMMA: "sivu"}, {ORTH: "s."},
{ORTH: "siht.", LEMMA: "sihteeri"}, {ORTH: "siht."},
{ORTH: "synt.", LEMMA: "syntynyt"}, {ORTH: "synt."},
{ORTH: "t.", LEMMA: "toivoo"}, {ORTH: "t."},
{ORTH: "tark.", LEMMA: "tarkastanut"}, {ORTH: "tark."},
{ORTH: "til.", LEMMA: "tilattu"}, {ORTH: "til."},
{ORTH: "tms.", LEMMA: "tai muuta sellaista"}, {ORTH: "tms."},
{ORTH: "toim.", LEMMA: "toimittanut"}, {ORTH: "toim."},
{ORTH: "v.", LEMMA: "vuosi"}, {ORTH: "v."},
{ORTH: "vas.", LEMMA: "vasen"}, {ORTH: "vas."},
{ORTH: "vast.", LEMMA: "vastaus"}, {ORTH: "vast."},
{ORTH: "vrt.", LEMMA: "vertaa"}, {ORTH: "vrt."},
{ORTH: "yht.", LEMMA: "yhteensä"}, {ORTH: "yht."},
{ORTH: "yl.", LEMMA: "yleinen"}, {ORTH: "yl."},
{ORTH: "ym.", LEMMA: "ynnä muuta"}, {ORTH: "ym."},
{ORTH: "yms.", LEMMA: "ynnä muuta sellaista"}, {ORTH: "yms."},
{ORTH: "yo.", LEMMA: "ylioppilas"}, {ORTH: "yo."},
{ORTH: "yliopp.", LEMMA: "ylioppilas"}, {ORTH: "yliopp."},
{ORTH: "ao.", LEMMA: "asianomainen"}, {ORTH: "ao."},
{ORTH: "em.", LEMMA: "edellä mainittu"}, {ORTH: "em."},
{ORTH: "ko.", LEMMA: "kyseessä oleva"}, {ORTH: "ko."},
{ORTH: "ml.", LEMMA: "mukaan luettuna"}, {ORTH: "ml."},
{ORTH: "po.", LEMMA: "puheena oleva"}, {ORTH: "po."},
{ORTH: "so.", LEMMA: "se on"}, {ORTH: "so."},
{ORTH: "ts.", LEMMA: "toisin sanoen"}, {ORTH: "ts."},
{ORTH: "vm.", LEMMA: "viimeksi mainittu"}, {ORTH: "vm."},
{ORTH: "srk.", LEMMA: "seurakunta"}, {ORTH: "srk."},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]

View File

@ -3,7 +3,7 @@ import re
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .punctuation import ELISION, HYPHENS from .punctuation import ELISION, HYPHENS
from ..char_classes import ALPHA_LOWER, ALPHA from ..char_classes import ALPHA_LOWER, ALPHA
from ...symbols import ORTH, LEMMA from ...symbols import ORTH
from ...util import update_exc from ...util import update_exc
@ -28,29 +28,29 @@ def lower_first_letter(text):
return text[0].lower() + text[1:] return text[0].lower() + text[1:]
_exc = {"J.-C.": [{LEMMA: "Jésus", ORTH: "J."}, {LEMMA: "Christ", ORTH: "-C."}]} _exc = {"J.-C.": [{ORTH: "J."}, {ORTH: "-C."}]}
for exc_data in [ for exc_data in [
{LEMMA: "avant", ORTH: "av."}, {ORTH: "av."},
{LEMMA: "janvier", ORTH: "janv."}, {ORTH: "janv."},
{LEMMA: "février", ORTH: "févr."}, {ORTH: "févr."},
{LEMMA: "avril", ORTH: "avr."}, {ORTH: "avr."},
{LEMMA: "juillet", ORTH: "juill."}, {ORTH: "juill."},
{LEMMA: "septembre", ORTH: "sept."}, {ORTH: "sept."},
{LEMMA: "octobre", ORTH: "oct."}, {ORTH: "oct."},
{LEMMA: "novembre", ORTH: "nov."}, {ORTH: "nov."},
{LEMMA: "décembre", ORTH: "déc."}, {ORTH: "déc."},
{LEMMA: "après", ORTH: "apr."}, {ORTH: "apr."},
{LEMMA: "docteur", ORTH: "Dr."}, {ORTH: "Dr."},
{LEMMA: "monsieur", ORTH: "M."}, {ORTH: "M."},
{LEMMA: "monsieur", ORTH: "Mr."}, {ORTH: "Mr."},
{LEMMA: "madame", ORTH: "Mme."}, {ORTH: "Mme."},
{LEMMA: "mademoiselle", ORTH: "Mlle."}, {ORTH: "Mlle."},
{LEMMA: "numéro", ORTH: ""}, {ORTH: ""},
{LEMMA: "degrés", ORTH: ""}, {ORTH: ""},
{LEMMA: "saint", ORTH: "St."}, {ORTH: "St."},
{LEMMA: "sainte", ORTH: "Ste."}, {ORTH: "Ste."},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
@ -80,55 +80,37 @@ for orth in [
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
for verb, verb_lemma in [ for verb in [
("a", "avoir"), "a",
("est", "être"), "est" "semble",
("semble", "sembler"), "indique",
("indique", "indiquer"), "moque",
("moque", "moquer"), "passe",
("passe", "passer"),
]: ]:
for orth in [verb, verb.title()]: for orth in [verb, verb.title()]:
for pronoun in ["elle", "il", "on"]: for pronoun in ["elle", "il", "on"]:
token = f"{orth}-t-{pronoun}" token = f"{orth}-t-{pronoun}"
_exc[token] = [ _exc[token] = [{ORTH: orth}, {ORTH: "-t"}, {ORTH: "-" + pronoun}]
{LEMMA: verb_lemma, ORTH: orth}, # , TAG: "VERB"},
{LEMMA: "t", ORTH: "-t"},
{LEMMA: pronoun, ORTH: "-" + pronoun},
]
for verb, verb_lemma in [("est", "être")]: for verb in ["est"]:
for orth in [verb, verb.title()]: for orth in [verb, verb.title()]:
token = f"{orth}-ce" _exc[f"{orth}-ce"] = [{ORTH: orth}, {ORTH: "-ce"}]
_exc[token] = [
{LEMMA: verb_lemma, ORTH: orth}, # , TAG: "VERB"},
{LEMMA: "ce", ORTH: "-ce"},
]
for pre, pre_lemma in [("qu'", "que"), ("n'", "ne")]: for pre in ["qu'", "n'"]:
for orth in [pre, pre.title()]: for orth in [pre, pre.title()]:
_exc[f"{orth}est-ce"] = [ _exc[f"{orth}est-ce"] = [{ORTH: orth}, {ORTH: "est"}, {ORTH: "-ce"}]
{LEMMA: pre_lemma, ORTH: orth},
{LEMMA: "être", ORTH: "est"},
{LEMMA: "ce", ORTH: "-ce"},
]
for verb, pronoun in [("est", "il"), ("EST", "IL")]: for verb, pronoun in [("est", "il"), ("EST", "IL")]:
token = "{}-{}".format(verb, pronoun) _exc[f"{verb}-{pronoun}"] = [{ORTH: verb}, {ORTH: "-" + pronoun}]
_exc[token] = [
{LEMMA: "être", ORTH: verb},
{LEMMA: pronoun, ORTH: "-" + pronoun},
]
for s, verb, pronoun in [("s", "est", "il"), ("S", "EST", "IL")]: for s, verb, pronoun in [("s", "est", "il"), ("S", "EST", "IL")]:
token = "{}'{}-{}".format(s, verb, pronoun) _exc[f"{s}'{verb}-{pronoun}"] = [
_exc[token] = [ {ORTH: s + "'"},
{LEMMA: "se", ORTH: s + "'"}, {ORTH: verb},
{LEMMA: "être", ORTH: verb}, {ORTH: "-" + pronoun},
{LEMMA: pronoun, ORTH: "-" + pronoun},
] ]

View File

@ -1,81 +1,65 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import POS, DET, ADP, CCONJ, ADV, NOUN, X, AUX from ...symbols import ORTH, NORM
from ...symbols import ORTH, LEMMA, NORM
from ...util import update_exc from ...util import update_exc
_exc = { _exc = {
"'acha'n": [ "'acha'n": [{ORTH: "'ach", NORM: "gach"}, {ORTH: "a'n", NORM: "aon"}],
{ORTH: "'ach", LEMMA: "gach", NORM: "gach", POS: DET}, "dem'": [{ORTH: "de", NORM: "de"}, {ORTH: "m'", NORM: "mo"}],
{ORTH: "a'n", LEMMA: "aon", NORM: "aon", POS: DET}, "ded'": [{ORTH: "de", NORM: "de"}, {ORTH: "d'", NORM: "do"}],
], "lem'": [{ORTH: "le", NORM: "le"}, {ORTH: "m'", NORM: "mo"}],
"dem'": [ "led'": [{ORTH: "le", NORM: "le"}, {ORTH: "d'", NORM: "do"}],
{ORTH: "de", LEMMA: "de", NORM: "de", POS: ADP},
{ORTH: "m'", LEMMA: "mo", NORM: "mo", POS: DET},
],
"ded'": [
{ORTH: "de", LEMMA: "de", NORM: "de", POS: ADP},
{ORTH: "d'", LEMMA: "do", NORM: "do", POS: DET},
],
"lem'": [
{ORTH: "le", LEMMA: "le", NORM: "le", POS: ADP},
{ORTH: "m'", LEMMA: "mo", NORM: "mo", POS: DET},
],
"led'": [
{ORTH: "le", LEMMA: "le", NORM: "le", POS: ADP},
{ORTH: "d'", LEMMA: "mo", NORM: "do", POS: DET},
],
} }
for exc_data in [ for exc_data in [
{ORTH: "'gus", LEMMA: "agus", NORM: "agus", POS: CCONJ}, {ORTH: "'gus", NORM: "agus"},
{ORTH: "'ach", LEMMA: "gach", NORM: "gach", POS: DET}, {ORTH: "'ach", NORM: "gach"},
{ORTH: "ao'", LEMMA: "aon", NORM: "aon"}, {ORTH: "ao'", NORM: "aon"},
{ORTH: "'niar", LEMMA: "aniar", NORM: "aniar", POS: ADV}, {ORTH: "'niar", NORM: "aniar"},
{ORTH: "'níos", LEMMA: "aníos", NORM: "aníos", POS: ADV}, {ORTH: "'níos", NORM: "aníos"},
{ORTH: "'ndiu", LEMMA: "inniu", NORM: "inniu", POS: ADV}, {ORTH: "'ndiu", NORM: "inniu"},
{ORTH: "'nocht", LEMMA: "anocht", NORM: "anocht", POS: ADV}, {ORTH: "'nocht", NORM: "anocht"},
{ORTH: "m'", LEMMA: "mo", POS: DET}, {ORTH: "m'"},
{ORTH: "Aib.", LEMMA: "Aibreán", POS: NOUN}, {ORTH: "Aib."},
{ORTH: "Ath.", LEMMA: "athair", POS: NOUN}, {ORTH: "Ath."},
{ORTH: "Beal.", LEMMA: "Bealtaine", POS: NOUN}, {ORTH: "Beal."},
{ORTH: "a.C.n.", LEMMA: "ante Christum natum", POS: X}, {ORTH: "a.C.n."},
{ORTH: "m.sh.", LEMMA: "mar shampla", POS: ADV}, {ORTH: "m.sh."},
{ORTH: "M.F.", LEMMA: "Meán Fómhair", POS: NOUN}, {ORTH: "M.F."},
{ORTH: "M.Fómh.", LEMMA: "Meán Fómhair", POS: NOUN}, {ORTH: "M.Fómh."},
{ORTH: "D.F.", LEMMA: "Deireadh Fómhair", POS: NOUN}, {ORTH: "D.F."},
{ORTH: "D.Fómh.", LEMMA: "Deireadh Fómhair", POS: NOUN}, {ORTH: "D.Fómh."},
{ORTH: "r.C.", LEMMA: "roimh Chríost", POS: ADV}, {ORTH: "r.C."},
{ORTH: "R.C.", LEMMA: "roimh Chríost", POS: ADV}, {ORTH: "R.C."},
{ORTH: "r.Ch.", LEMMA: "roimh Chríost", POS: ADV}, {ORTH: "r.Ch."},
{ORTH: "r.Chr.", LEMMA: "roimh Chríost", POS: ADV}, {ORTH: "r.Chr."},
{ORTH: "R.Ch.", LEMMA: "roimh Chríost", POS: ADV}, {ORTH: "R.Ch."},
{ORTH: "R.Chr.", LEMMA: "roimh Chríost", POS: ADV}, {ORTH: "R.Chr."},
{ORTH: "⁊rl.", LEMMA: "agus araile", POS: ADV}, {ORTH: "⁊rl."},
{ORTH: "srl.", LEMMA: "agus araile", POS: ADV}, {ORTH: "srl."},
{ORTH: "Co.", LEMMA: "contae", POS: NOUN}, {ORTH: "Co."},
{ORTH: "Ean.", LEMMA: "Eanáir", POS: NOUN}, {ORTH: "Ean."},
{ORTH: "Feab.", LEMMA: "Feabhra", POS: NOUN}, {ORTH: "Feab."},
{ORTH: "gCo.", LEMMA: "contae", POS: NOUN}, {ORTH: "gCo."},
{ORTH: ".i.", LEMMA: "eadhon", POS: ADV}, {ORTH: ".i."},
{ORTH: "B'", LEMMA: "ba", POS: AUX}, {ORTH: "B'"},
{ORTH: "b'", LEMMA: "ba", POS: AUX}, {ORTH: "b'"},
{ORTH: "lch.", LEMMA: "leathanach", POS: NOUN}, {ORTH: "lch."},
{ORTH: "Lch.", LEMMA: "leathanach", POS: NOUN}, {ORTH: "Lch."},
{ORTH: "lgh.", LEMMA: "leathanach", POS: NOUN}, {ORTH: "lgh."},
{ORTH: "Lgh.", LEMMA: "leathanach", POS: NOUN}, {ORTH: "Lgh."},
{ORTH: "Lún.", LEMMA: "Lúnasa", POS: NOUN}, {ORTH: "Lún."},
{ORTH: "Már.", LEMMA: "Márta", POS: NOUN}, {ORTH: "Már."},
{ORTH: "Meith.", LEMMA: "Meitheamh", POS: NOUN}, {ORTH: "Meith."},
{ORTH: "Noll.", LEMMA: "Nollaig", POS: NOUN}, {ORTH: "Noll."},
{ORTH: "Samh.", LEMMA: "Samhain", POS: NOUN}, {ORTH: "Samh."},
{ORTH: "tAth.", LEMMA: "athair", POS: NOUN}, {ORTH: "tAth."},
{ORTH: "tUas.", LEMMA: "Uasal", POS: NOUN}, {ORTH: "tUas."},
{ORTH: "teo.", LEMMA: "teoranta", POS: NOUN}, {ORTH: "teo."},
{ORTH: "Teo.", LEMMA: "teoranta", POS: NOUN}, {ORTH: "Teo."},
{ORTH: "Uas.", LEMMA: "Uasal", POS: NOUN}, {ORTH: "Uas."},
{ORTH: "uimh.", LEMMA: "uimhir", POS: NOUN}, {ORTH: "uimh."},
{ORTH: "Uimh.", LEMMA: "uimhir", POS: NOUN}, {ORTH: "Uimh."},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]

View File

@ -1,6 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ._tokenizer_exceptions_list import ID_BASE_EXCEPTIONS from ._tokenizer_exceptions_list import ID_BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, NORM from ...symbols import ORTH, NORM
from ...util import update_exc from ...util import update_exc
@ -11,53 +11,47 @@ _exc = {}
for orth in ID_BASE_EXCEPTIONS: for orth in ID_BASE_EXCEPTIONS:
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
orth_title = orth.title() orth_title = orth.title()
_exc[orth_title] = [{ORTH: orth_title}] _exc[orth_title] = [{ORTH: orth_title}]
orth_caps = orth.upper() orth_caps = orth.upper()
_exc[orth_caps] = [{ORTH: orth_caps}] _exc[orth_caps] = [{ORTH: orth_caps}]
orth_lower = orth.lower() orth_lower = orth.lower()
_exc[orth_lower] = [{ORTH: orth_lower}] _exc[orth_lower] = [{ORTH: orth_lower}]
orth_first_upper = orth[0].upper() + orth[1:] orth_first_upper = orth[0].upper() + orth[1:]
_exc[orth_first_upper] = [{ORTH: orth_first_upper}] _exc[orth_first_upper] = [{ORTH: orth_first_upper}]
if "-" in orth: if "-" in orth:
orth_title = "-".join([part.title() for part in orth.split("-")]) orth_title = "-".join([part.title() for part in orth.split("-")])
_exc[orth_title] = [{ORTH: orth_title}] _exc[orth_title] = [{ORTH: orth_title}]
orth_caps = "-".join([part.upper() for part in orth.split("-")]) orth_caps = "-".join([part.upper() for part in orth.split("-")])
_exc[orth_caps] = [{ORTH: orth_caps}] _exc[orth_caps] = [{ORTH: orth_caps}]
for exc_data in [ for exc_data in [
{ORTH: "Jan.", LEMMA: "Januari", NORM: "Januari"}, {ORTH: "Jan.", NORM: "Januari"},
{ORTH: "Feb.", LEMMA: "Februari", NORM: "Februari"}, {ORTH: "Feb.", NORM: "Februari"},
{ORTH: "Mar.", LEMMA: "Maret", NORM: "Maret"}, {ORTH: "Mar.", NORM: "Maret"},
{ORTH: "Apr.", LEMMA: "April", NORM: "April"}, {ORTH: "Apr.", NORM: "April"},
{ORTH: "Jun.", LEMMA: "Juni", NORM: "Juni"}, {ORTH: "Jun.", NORM: "Juni"},
{ORTH: "Jul.", LEMMA: "Juli", NORM: "Juli"}, {ORTH: "Jul.", NORM: "Juli"},
{ORTH: "Agu.", LEMMA: "Agustus", NORM: "Agustus"}, {ORTH: "Agu.", NORM: "Agustus"},
{ORTH: "Ags.", LEMMA: "Agustus", NORM: "Agustus"}, {ORTH: "Ags.", NORM: "Agustus"},
{ORTH: "Sep.", LEMMA: "September", NORM: "September"}, {ORTH: "Sep.", NORM: "September"},
{ORTH: "Okt.", LEMMA: "Oktober", NORM: "Oktober"}, {ORTH: "Okt.", NORM: "Oktober"},
{ORTH: "Nov.", LEMMA: "November", NORM: "November"}, {ORTH: "Nov.", NORM: "November"},
{ORTH: "Des.", LEMMA: "Desember", NORM: "Desember"}, {ORTH: "Des.", NORM: "Desember"},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
_other_exc = { _other_exc = {
"do'a": [{ORTH: "do'a", LEMMA: "doa", NORM: "doa"}], "do'a": [{ORTH: "do'a", NORM: "doa"}],
"jum'at": [{ORTH: "jum'at", LEMMA: "Jumat", NORM: "Jumat"}], "jum'at": [{ORTH: "jum'at", NORM: "Jumat"}],
"Jum'at": [{ORTH: "Jum'at", LEMMA: "Jumat", NORM: "Jumat"}], "Jum'at": [{ORTH: "Jum'at", NORM: "Jumat"}],
"la'nat": [{ORTH: "la'nat", LEMMA: "laknat", NORM: "laknat"}], "la'nat": [{ORTH: "la'nat", NORM: "laknat"}],
"ma'af": [{ORTH: "ma'af", LEMMA: "maaf", NORM: "maaf"}], "ma'af": [{ORTH: "ma'af", NORM: "maaf"}],
"mu'jizat": [{ORTH: "mu'jizat", LEMMA: "mukjizat", NORM: "mukjizat"}], "mu'jizat": [{ORTH: "mu'jizat", NORM: "mukjizat"}],
"Mu'jizat": [{ORTH: "Mu'jizat", LEMMA: "mukjizat", NORM: "mukjizat"}], "Mu'jizat": [{ORTH: "Mu'jizat", NORM: "mukjizat"}],
"ni'mat": [{ORTH: "ni'mat", LEMMA: "nikmat", NORM: "nikmat"}], "ni'mat": [{ORTH: "ni'mat", NORM: "nikmat"}],
"raka'at": [{ORTH: "raka'at", LEMMA: "rakaat", NORM: "rakaat"}], "raka'at": [{ORTH: "raka'at", NORM: "rakaat"}],
"ta'at": [{ORTH: "ta'at", LEMMA: "taat", NORM: "taat"}], "ta'at": [{ORTH: "ta'at", NORM: "taat"}],
} }
_exc.update(_other_exc) _exc.update(_other_exc)

View File

@ -1,5 +1,5 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA from ...symbols import ORTH
from ...util import update_exc from ...util import update_exc
@ -10,7 +10,7 @@ _exc = {
"L'art.": [{ORTH: "L'"}, {ORTH: "art."}], "L'art.": [{ORTH: "L'"}, {ORTH: "art."}],
"l'art.": [{ORTH: "l'"}, {ORTH: "art."}], "l'art.": [{ORTH: "l'"}, {ORTH: "art."}],
"nell'art.": [{ORTH: "nell'"}, {ORTH: "art."}], "nell'art.": [{ORTH: "nell'"}, {ORTH: "art."}],
"po'": [{ORTH: "po'", LEMMA: "poco"}], "po'": [{ORTH: "po'"}],
"sett..": [{ORTH: "sett."}, {ORTH: "."}], "sett..": [{ORTH: "sett."}, {ORTH: "."}],
} }

View File

@ -1,5 +1,5 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, NORM from ...symbols import ORTH, NORM
from ...util import update_exc from ...util import update_exc
@ -10,19 +10,19 @@ _exc = {}
# translate / delete what is not necessary # translate / delete what is not necessary
for exc_data in [ for exc_data in [
{ORTH: "t", LEMMA: "et", NORM: "et"}, {ORTH: "t", NORM: "et"},
{ORTH: "T", LEMMA: "et", NORM: "et"}, {ORTH: "T", NORM: "et"},
{ORTH: "'t", LEMMA: "et", NORM: "et"}, {ORTH: "'t", NORM: "et"},
{ORTH: "'T", LEMMA: "et", NORM: "et"}, {ORTH: "'T", NORM: "et"},
{ORTH: "wgl.", LEMMA: "wannechgelift", NORM: "wannechgelift"}, {ORTH: "wgl.", NORM: "wannechgelift"},
{ORTH: "M.", LEMMA: "Monsieur", NORM: "Monsieur"}, {ORTH: "M.", NORM: "Monsieur"},
{ORTH: "Mme.", LEMMA: "Madame", NORM: "Madame"}, {ORTH: "Mme.", NORM: "Madame"},
{ORTH: "Dr.", LEMMA: "Dokter", NORM: "Dokter"}, {ORTH: "Dr.", NORM: "Dokter"},
{ORTH: "Tel.", LEMMA: "Telefon", NORM: "Telefon"}, {ORTH: "Tel.", NORM: "Telefon"},
{ORTH: "asw.", LEMMA: "an sou weider", NORM: "an sou weider"}, {ORTH: "asw.", NORM: "an sou weider"},
{ORTH: "etc.", LEMMA: "et cetera", NORM: "et cetera"}, {ORTH: "etc.", NORM: "et cetera"},
{ORTH: "bzw.", LEMMA: "bezéiungsweis", NORM: "bezéiungsweis"}, {ORTH: "bzw.", NORM: "bezéiungsweis"},
{ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"}, {ORTH: "Jan.", NORM: "Januar"},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]

View File

@ -1,53 +1,50 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA from ...symbols import ORTH
from ...util import update_exc from ...util import update_exc
_exc = {} _exc = {}
for raw, lemma in [ for raw in [
("a-a", "a-o"), "a-e",
("a-e", "a-o"), "a-o",
("a-o", "a-o"), "a-i",
("a-i", "a-o"), "a-a",
("co-a", "co-o"), "co-a",
("co-e", "co-o"), "co-e",
("co-i", "co-o"), "co-i",
("co-o", "co-o"), "co-o",
("da-a", "da-o"), "da-a",
("da-e", "da-o"), "da-e",
("da-i", "da-o"), "da-i",
("da-o", "da-o"), "da-o",
("pe-a", "pe-o"), "pe-a",
("pe-e", "pe-o"), "pe-e",
("pe-i", "pe-o"), "pe-i",
("pe-o", "pe-o"), "pe-o",
]: ]:
for orth in [raw, raw.capitalize()]: for orth in [raw, raw.capitalize()]:
_exc[orth] = [{ORTH: orth, LEMMA: lemma}] _exc[orth] = [{ORTH: orth}]
# Prefix + prepositions with à (e.g. "sott'a-o") # Prefix + prepositions with à (e.g. "sott'a-o")
for prep, prep_lemma in [ for prep in [
("a-a", "a-o"), "a-a",
("a-e", "a-o"), "a-e",
("a-o", "a-o"), "a-o",
("a-i", "a-o"), "a-i",
]: ]:
for prefix, prefix_lemma in [ for prefix in [
("sott'", "sotta"), "sott'",
("sott", "sotta"), "sott",
("contr'", "contra"), "contr'",
("contr", "contra"), "contr",
("ch'", "che"), "ch'",
("ch", "che"), "ch",
("s'", "se"), "s'",
("s", "se"), "s",
]: ]:
for prefix_orth in [prefix, prefix.capitalize()]: for prefix_orth in [prefix, prefix.capitalize()]:
_exc[prefix_orth + prep] = [ _exc[prefix_orth + prep] = [{ORTH: prefix_orth}, {ORTH: prep}]
{ORTH: prefix_orth, LEMMA: prefix_lemma},
{ORTH: prep, LEMMA: prep_lemma},
]
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -1,5 +1,5 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA from ...symbols import ORTH, NORM
from ...util import update_exc from ...util import update_exc
@ -7,17 +7,17 @@ _exc = {}
for exc_data in [ for exc_data in [
{ORTH: "jan.", LEMMA: "januar"}, {ORTH: "jan.", NORM: "januar"},
{ORTH: "feb.", LEMMA: "februar"}, {ORTH: "feb.", NORM: "februar"},
{ORTH: "mar.", LEMMA: "mars"}, {ORTH: "mar.", NORM: "mars"},
{ORTH: "apr.", LEMMA: "april"}, {ORTH: "apr.", NORM: "april"},
{ORTH: "jun.", LEMMA: "juni"}, {ORTH: "jun.", NORM: "juni"},
{ORTH: "jul.", LEMMA: "juli"}, {ORTH: "jul.", NORM: "juli"},
{ORTH: "aug.", LEMMA: "august"}, {ORTH: "aug.", NORM: "august"},
{ORTH: "sep.", LEMMA: "september"}, {ORTH: "sep.", NORM: "september"},
{ORTH: "okt.", LEMMA: "oktober"}, {ORTH: "okt.", NORM: "oktober"},
{ORTH: "nov.", LEMMA: "november"}, {ORTH: "nov.", NORM: "november"},
{ORTH: "des.", LEMMA: "desember"}, {ORTH: "des.", NORM: "desember"},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]

View File

@ -1,5 +1,5 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, NORM from ...symbols import ORTH, NORM
from ...util import update_exc from ...util import update_exc
@ -7,58 +7,56 @@ _exc = {}
_abbrev_exc = [ _abbrev_exc = [
# Weekdays abbreviations # Weekdays abbreviations
{ORTH: "пн", LEMMA: "понедельник", NORM: "понедельник"}, {ORTH: "пн", NORM: "понедельник"},
{ORTH: "вт", LEMMA: "вторник", NORM: "вторник"}, {ORTH: "вт", NORM: "вторник"},
{ORTH: "ср", LEMMA: "среда", NORM: "среда"}, {ORTH: "ср", NORM: "среда"},
{ORTH: "чт", LEMMA: "четверг", NORM: "четверг"}, {ORTH: "чт", NORM: "четверг"},
{ORTH: "чтв", LEMMA: "четверг", NORM: "четверг"}, {ORTH: "чтв", NORM: "четверг"},
{ORTH: "пт", LEMMA: "пятница", NORM: "пятница"}, {ORTH: "пт", NORM: "пятница"},
{ORTH: "сб", LEMMA: "суббота", NORM: "суббота"}, {ORTH: "сб", NORM: "суббота"},
{ORTH: "сбт", LEMMA: "суббота", NORM: "суббота"}, {ORTH: "сбт", NORM: "суббота"},
{ORTH: "вс", LEMMA: "воскресенье", NORM: "воскресенье"}, {ORTH: "вс", NORM: "воскресенье"},
{ORTH: "вскр", LEMMA: "воскресенье", NORM: "воскресенье"}, {ORTH: "вскр", NORM: "воскресенье"},
{ORTH: "воскр", LEMMA: "воскресенье", NORM: "воскресенье"}, {ORTH: "воскр", NORM: "воскресенье"},
# Months abbreviations # Months abbreviations
{ORTH: "янв", LEMMA: "январь", NORM: "январь"}, {ORTH: "янв", NORM: "январь"},
{ORTH: "фев", LEMMA: "февраль", NORM: "февраль"}, {ORTH: "фев", NORM: "февраль"},
{ORTH: "февр", LEMMA: "февраль", NORM: "февраль"}, {ORTH: "февр", NORM: "февраль"},
{ORTH: "мар", LEMMA: "март", NORM: "март"}, {ORTH: "мар", NORM: "март"},
# {ORTH: "март", LEMMA: "март", NORM: "март"}, # {ORTH: "март", NORM: "март"},
{ORTH: "мрт", LEMMA: "март", NORM: "март"}, {ORTH: "мрт", NORM: "март"},
{ORTH: "апр", LEMMA: "апрель", NORM: "апрель"}, {ORTH: "апр", NORM: "апрель"},
# {ORTH: "май", LEMMA: "май", NORM: "май"}, # {ORTH: "май", NORM: "май"},
{ORTH: "июн", LEMMA: "июнь", NORM: "июнь"}, {ORTH: "июн", NORM: "июнь"},
# {ORTH: "июнь", LEMMA: "июнь", NORM: "июнь"}, # {ORTH: "июнь", NORM: "июнь"},
{ORTH: "июл", LEMMA: "июль", NORM: "июль"}, {ORTH: "июл", NORM: "июль"},
# {ORTH: "июль", LEMMA: "июль", NORM: "июль"}, # {ORTH: "июль", NORM: "июль"},
{ORTH: "авг", LEMMA: "август", NORM: "август"}, {ORTH: "авг", NORM: "август"},
{ORTH: "сен", LEMMA: "сентябрь", NORM: "сентябрь"}, {ORTH: "сен", NORM: "сентябрь"},
{ORTH: "сент", LEMMA: "сентябрь", NORM: "сентябрь"}, {ORTH: "сент", NORM: "сентябрь"},
{ORTH: "окт", LEMMA: "октябрь", NORM: "октябрь"}, {ORTH: "окт", NORM: "октябрь"},
{ORTH: "октб", LEMMA: "октябрь", NORM: "октябрь"}, {ORTH: "октб", NORM: "октябрь"},
{ORTH: "ноя", LEMMA: "ноябрь", NORM: "ноябрь"}, {ORTH: "ноя", NORM: "ноябрь"},
{ORTH: "нояб", LEMMA: "ноябрь", NORM: "ноябрь"}, {ORTH: "нояб", NORM: "ноябрь"},
{ORTH: "нбр", LEMMA: "ноябрь", NORM: "ноябрь"}, {ORTH: "нбр", NORM: "ноябрь"},
{ORTH: "дек", LEMMA: "декабрь", NORM: "декабрь"}, {ORTH: "дек", NORM: "декабрь"},
] ]
for abbrev_desc in _abbrev_exc: for abbrev_desc in _abbrev_exc:
abbrev = abbrev_desc[ORTH] abbrev = abbrev_desc[ORTH]
for orth in (abbrev, abbrev.capitalize(), abbrev.upper()): for orth in (abbrev, abbrev.capitalize(), abbrev.upper()):
_exc[orth] = [{ORTH: orth, LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}] _exc[orth] = [{ORTH: orth, NORM: abbrev_desc[NORM]}]
_exc[orth + "."] = [ _exc[orth + "."] = [{ORTH: orth + ".", NORM: abbrev_desc[NORM]}]
{ORTH: orth + ".", LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}
]
_slang_exc = [ _slang_exc = [
{ORTH: "2к15", LEMMA: "2015", NORM: "2015"}, {ORTH: "2к15", NORM: "2015"},
{ORTH: "2к16", LEMMA: "2016", NORM: "2016"}, {ORTH: "2к16", NORM: "2016"},
{ORTH: "2к17", LEMMA: "2017", NORM: "2017"}, {ORTH: "2к17", NORM: "2017"},
{ORTH: "2к18", LEMMA: "2018", NORM: "2018"}, {ORTH: "2к18", NORM: "2018"},
{ORTH: "2к19", LEMMA: "2019", NORM: "2019"}, {ORTH: "2к19", NORM: "2019"},
{ORTH: "2к20", LEMMA: "2020", NORM: "2020"}, {ORTH: "2к20", NORM: "2020"},
] ]
for slang_desc in _slang_exc: for slang_desc in _slang_exc:

View File

@ -1,5 +1,5 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, NORM from ...symbols import ORTH, NORM
from ...util import update_exc from ...util import update_exc
@ -7,85 +7,83 @@ _exc = {}
_abbrev_exc = [ _abbrev_exc = [
# Weekdays abbreviations # Weekdays abbreviations
{ORTH: "пoн", LEMMA: "понедељак", NORM: "понедељак"}, {ORTH: "пoн", NORM: "понедељак"},
{ORTH: "уто", LEMMA: "уторак", NORM: "уторак"}, {ORTH: "уто", NORM: "уторак"},
{ORTH: "сре", LEMMA: "среда", NORM: "среда"}, {ORTH: "сре", NORM: "среда"},
{ORTH: "чет", LEMMA: "четвртак", NORM: "четвртак"}, {ORTH: "чет", NORM: "четвртак"},
{ORTH: "пет", LEMMA: "петак", NORM: "петак"}, {ORTH: "пет", NORM: "петак"},
{ORTH: "суб", LEMMA: "субота", NORM: "субота"}, {ORTH: "суб", NORM: "субота"},
{ORTH: "нед", LEMMA: "недеља", NORM: "недеља"}, {ORTH: "нед", NORM: "недеља"},
# Months abbreviations # Months abbreviations
{ORTH: "јан", LEMMA: "јануар", NORM: "јануар"}, {ORTH: "јан", NORM: "јануар"},
{ORTH: "феб", LEMMA: "фебруар", NORM: "фебруар"}, {ORTH: "феб", NORM: "фебруар"},
{ORTH: "мар", LEMMA: "март", NORM: "март"}, {ORTH: "мар", NORM: "март"},
{ORTH: "апр", LEMMA: "април", NORM: "април"}, {ORTH: "апр", NORM: "април"},
{ORTH: "јуни", LEMMA: "јун", NORM: "јун"}, {ORTH: "јуни", NORM: "јун"},
{ORTH: "јули", LEMMA: "јул", NORM: "јул"}, {ORTH: "јули", NORM: "јул"},
{ORTH: "авг", LEMMA: "август", NORM: "август"}, {ORTH: "авг", NORM: "август"},
{ORTH: "сеп", LEMMA: "септембар", NORM: "септембар"}, {ORTH: "сеп", NORM: "септембар"},
{ORTH: "септ", LEMMA: "септембар", NORM: "септембар"}, {ORTH: "септ", NORM: "септембар"},
{ORTH: "окт", LEMMA: "октобар", NORM: "октобар"}, {ORTH: "окт", NORM: "октобар"},
{ORTH: "нов", LEMMA: "новембар", NORM: "новембар"}, {ORTH: "нов", NORM: "новембар"},
{ORTH: "дец", LEMMA: "децембар", NORM: "децембар"}, {ORTH: "дец", NORM: "децембар"},
] ]
for abbrev_desc in _abbrev_exc: for abbrev_desc in _abbrev_exc:
abbrev = abbrev_desc[ORTH] abbrev = abbrev_desc[ORTH]
for orth in (abbrev, abbrev.capitalize(), abbrev.upper()): for orth in (abbrev, abbrev.capitalize(), abbrev.upper()):
_exc[orth] = [{ORTH: orth, LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}] _exc[orth] = [{ORTH: orth, NORM: abbrev_desc[NORM]}]
_exc[orth + "."] = [ _exc[orth + "."] = [{ORTH: orth + ".", NORM: abbrev_desc[NORM]}]
{ORTH: orth + ".", LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}
]
# common abbreviations # common abbreviations
_slang_exc = [ _slang_exc = [
# without dot # without dot
{ORTH: "др", LEMMA: "доктор", NORM: "доктор"}, {ORTH: "др", NORM: "доктор"},
{ORTH: "гдин", LEMMA: "господин", NORM: "господин"}, {ORTH: "гдин", NORM: "господин"},
{ORTH: "гђа", LEMMA: "госпођа", NORM: "госпођа"}, {ORTH: "гђа", NORM: "госпођа"},
{ORTH: "гђица", LEMMA: "госпођица", NORM: "госпођица"}, {ORTH: "гђица", NORM: "госпођица"},
{ORTH: "мр", LEMMA: "магистар", NORM: "магистар"}, {ORTH: "мр", NORM: "магистар"},
{ORTH: "Бгд", LEMMA: "Београд", NORM: "београд"}, {ORTH: "Бгд", NORM: "београд"},
{ORTH: "цм", LEMMA: "центиметар", NORM: "центиметар"}, {ORTH: "цм", NORM: "центиметар"},
{ORTH: "м", LEMMA: "метар", NORM: "метар"}, {ORTH: "м", NORM: "метар"},
{ORTH: "км", LEMMA: "километар", NORM: "километар"}, {ORTH: "км", NORM: "километар"},
{ORTH: "мг", LEMMA: "милиграм", NORM: "милиграм"}, {ORTH: "мг", NORM: "милиграм"},
{ORTH: "кг", LEMMA: "килограм", NORM: "килограм"}, {ORTH: "кг", NORM: "килограм"},
{ORTH: "дл", LEMMA: "децилитар", NORM: "децилитар"}, {ORTH: "дл", NORM: "децилитар"},
{ORTH: "хл", LEMMA: "хектолитар", NORM: "хектолитар"}, {ORTH: "хл", NORM: "хектолитар"},
# with dot # with dot
{ORTH: "ул.", LEMMA: "улица", NORM: "улица"}, {ORTH: "ул.", NORM: "улица"},
{ORTH: "бр.", LEMMA: "број", NORM: "број"}, {ORTH: "бр.", NORM: "број"},
{ORTH: "нпр.", LEMMA: "на пример", NORM: "на пример"}, {ORTH: "нпр.", NORM: "на пример"},
{ORTH: "тзв.", LEMMA: "такозван", NORM: "такозван"}, {ORTH: "тзв.", NORM: "такозван"},
{ORTH: "проф.", LEMMA: "професор", NORM: "професор"}, {ORTH: "проф.", NORM: "професор"},
{ORTH: "стр.", LEMMA: "страна", NORM: "страна"}, {ORTH: "стр.", NORM: "страна"},
{ORTH: "једн.", LEMMA: "једнина", NORM: "једнина"}, {ORTH: "једн.", NORM: "једнина"},
{ORTH: "мн.", LEMMA: "множина", NORM: "множина"}, {ORTH: "мн.", NORM: "множина"},
{ORTH: "уч.", LEMMA: "ученик", NORM: "ученик"}, {ORTH: "уч.", NORM: "ученик"},
{ORTH: "разр.", LEMMA: "разред", NORM: "разред"}, {ORTH: "разр.", NORM: "разред"},
{ORTH: "инж.", LEMMA: "инжењер", NORM: "инжењер"}, {ORTH: "инж.", NORM: "инжењер"},
{ORTH: "гимн.", LEMMA: "гимназија", NORM: "гимназија"}, {ORTH: "гимн.", NORM: "гимназија"},
{ORTH: "год.", LEMMA: "година", NORM: "година"}, {ORTH: "год.", NORM: "година"},
{ORTH: "мед.", LEMMA: "медицина", NORM: "медицина"}, {ORTH: "мед.", NORM: "медицина"},
{ORTH: "гимн.", LEMMA: "гимназија", NORM: "гимназија"}, {ORTH: "гимн.", NORM: "гимназија"},
{ORTH: "акад.", LEMMA: "академик", NORM: "академик"}, {ORTH: "акад.", NORM: "академик"},
{ORTH: "доц.", LEMMA: "доцент", NORM: "доцент"}, {ORTH: "доц.", NORM: "доцент"},
{ORTH: "итд.", LEMMA: "и тако даље", NORM: "и тако даље"}, {ORTH: "итд.", NORM: "и тако даље"},
{ORTH: "и сл.", LEMMA: "и слично", NORM: "и слично"}, {ORTH: "и сл.", NORM: "и слично"},
{ORTH: "н.е.", LEMMA: "нова ера", NORM: "нове ере"}, {ORTH: "н.е.", NORM: "нове ере"},
{ORTH: "о.г.", LEMMA: "ова година", NORM: "ове године"}, {ORTH: "о.г.", NORM: "ове године"},
{ORTH: "л.к.", LEMMA: "лична карта", NORM: "лична карта"}, {ORTH: "л.к.", NORM: "лична карта"},
{ORTH: "в.д.", LEMMA: "вршилац дужности", NORM: "вршилац дужности"}, {ORTH: "в.д.", NORM: "вршилац дужности"},
{ORTH: "стр.", LEMMA: "страна", NORM: "страна"}, {ORTH: "стр.", NORM: "страна"},
# with qoute # with qoute
{ORTH: "ал'", LEMMA: "али", NORM: "али"}, {ORTH: "ал'", NORM: "али"},
{ORTH: "ил'", LEMMA: "или", NORM: "или"}, {ORTH: "ил'", NORM: "или"},
{ORTH: "је л'", LEMMA: "је ли", NORM: "је ли"}, {ORTH: "је л'", NORM: "је ли"},
{ORTH: "да л'", LEMMA: "да ли", NORM: "да ли"}, {ORTH: "да л'", NORM: "да ли"},
{ORTH: "држ'те", LEMMA: "држати", NORM: "држите"}, {ORTH: "држ'те", NORM: "држите"},
] ]
for slang_desc in _slang_exc: for slang_desc in _slang_exc:

View File

@ -1,5 +1,5 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import LEMMA, NORM, ORTH, PRON_LEMMA from ...symbols import NORM, ORTH
from ...util import update_exc from ...util import update_exc
_exc = {} _exc = {}
@ -10,61 +10,58 @@ _exc = {}
for verb_data in [ for verb_data in [
{ORTH: "driver"}, {ORTH: "driver"},
{ORTH: "kör"}, {ORTH: "kör"},
{ORTH: "hörr", LEMMA: "hör"}, {ORTH: "hörr"},
{ORTH: "fattar"}, {ORTH: "fattar"},
{ORTH: "hajar", LEMMA: "förstår"}, {ORTH: "hajar"},
{ORTH: "lever"}, {ORTH: "lever"},
{ORTH: "serr", LEMMA: "ser"}, {ORTH: "serr"},
{ORTH: "fixar"}, {ORTH: "fixar"},
]: ]:
verb_data_tc = dict(verb_data) verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title() verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]: for data in [verb_data, verb_data_tc]:
_exc[data[ORTH] + "u"] = [ _exc[data[ORTH] + "u"] = [data, {ORTH: "u", NORM: "du"}]
dict(data),
{ORTH: "u", LEMMA: PRON_LEMMA, NORM: "du"},
]
# Abbreviations for weekdays "sön." (for "söndag" / "söner") # Abbreviations for weekdays "sön." (for "söndag" / "söner")
# are left out because they are ambiguous. The same is the case # are left out because they are ambiguous. The same is the case
# for abbreviations "jul." and "Jul." ("juli" / "jul"). # for abbreviations "jul." and "Jul." ("juli" / "jul").
for exc_data in [ for exc_data in [
{ORTH: "jan.", LEMMA: "januari"}, {ORTH: "jan.", NORM: "januari"},
{ORTH: "febr.", LEMMA: "februari"}, {ORTH: "febr.", NORM: "februari"},
{ORTH: "feb.", LEMMA: "februari"}, {ORTH: "feb.", NORM: "februari"},
{ORTH: "apr.", LEMMA: "april"}, {ORTH: "apr.", NORM: "april"},
{ORTH: "jun.", LEMMA: "juni"}, {ORTH: "jun.", NORM: "juni"},
{ORTH: "aug.", LEMMA: "augusti"}, {ORTH: "aug.", NORM: "augusti"},
{ORTH: "sept.", LEMMA: "september"}, {ORTH: "sept.", NORM: "september"},
{ORTH: "sep.", LEMMA: "september"}, {ORTH: "sep.", NORM: "september"},
{ORTH: "okt.", LEMMA: "oktober"}, {ORTH: "okt.", NORM: "oktober"},
{ORTH: "nov.", LEMMA: "november"}, {ORTH: "nov.", NORM: "november"},
{ORTH: "dec.", LEMMA: "december"}, {ORTH: "dec.", NORM: "december"},
{ORTH: "mån.", LEMMA: "måndag"}, {ORTH: "mån.", NORM: "måndag"},
{ORTH: "tis.", LEMMA: "tisdag"}, {ORTH: "tis.", NORM: "tisdag"},
{ORTH: "ons.", LEMMA: "onsdag"}, {ORTH: "ons.", NORM: "onsdag"},
{ORTH: "tors.", LEMMA: "torsdag"}, {ORTH: "tors.", NORM: "torsdag"},
{ORTH: "fre.", LEMMA: "fredag"}, {ORTH: "fre.", NORM: "fredag"},
{ORTH: "lör.", LEMMA: "lördag"}, {ORTH: "lör.", NORM: "lördag"},
{ORTH: "Jan.", LEMMA: "Januari"}, {ORTH: "Jan.", NORM: "Januari"},
{ORTH: "Febr.", LEMMA: "Februari"}, {ORTH: "Febr.", NORM: "Februari"},
{ORTH: "Feb.", LEMMA: "Februari"}, {ORTH: "Feb.", NORM: "Februari"},
{ORTH: "Apr.", LEMMA: "April"}, {ORTH: "Apr.", NORM: "April"},
{ORTH: "Jun.", LEMMA: "Juni"}, {ORTH: "Jun.", NORM: "Juni"},
{ORTH: "Aug.", LEMMA: "Augusti"}, {ORTH: "Aug.", NORM: "Augusti"},
{ORTH: "Sept.", LEMMA: "September"}, {ORTH: "Sept.", NORM: "September"},
{ORTH: "Sep.", LEMMA: "September"}, {ORTH: "Sep.", NORM: "September"},
{ORTH: "Okt.", LEMMA: "Oktober"}, {ORTH: "Okt.", NORM: "Oktober"},
{ORTH: "Nov.", LEMMA: "November"}, {ORTH: "Nov.", NORM: "November"},
{ORTH: "Dec.", LEMMA: "December"}, {ORTH: "Dec.", NORM: "December"},
{ORTH: "Mån.", LEMMA: "Måndag"}, {ORTH: "Mån.", NORM: "Måndag"},
{ORTH: "Tis.", LEMMA: "Tisdag"}, {ORTH: "Tis.", NORM: "Tisdag"},
{ORTH: "Ons.", LEMMA: "Onsdag"}, {ORTH: "Ons.", NORM: "Onsdag"},
{ORTH: "Tors.", LEMMA: "Torsdag"}, {ORTH: "Tors.", NORM: "Torsdag"},
{ORTH: "Fre.", LEMMA: "Fredag"}, {ORTH: "Fre.", NORM: "Fredag"},
{ORTH: "Lör.", LEMMA: "Lördag"}, {ORTH: "Lör.", NORM: "Lördag"},
{ORTH: "sthlm", LEMMA: "Stockholm"}, {ORTH: "sthlm", NORM: "Stockholm"},
{ORTH: "gbg", LEMMA: "Göteborg"}, {ORTH: "gbg", NORM: "Göteborg"},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
@ -154,6 +151,6 @@ for orth in ABBREVIATIONS:
# Sentences ending in "i." (as in "... peka i."), "m." (as in "...än 2000 m."), # Sentences ending in "i." (as in "... peka i."), "m." (as in "...än 2000 m."),
# should be tokenized as two separate tokens. # should be tokenized as two separate tokens.
for orth in ["i", "m"]: for orth in ["i", "m"]:
_exc[orth + "."] = [{ORTH: orth, LEMMA: orth, NORM: orth}, {ORTH: "."}] _exc[orth + "."] = [{ORTH: orth, NORM: orth, NORM: orth}, {ORTH: "."}]
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -1,469 +1,438 @@
from ...symbols import ORTH, LEMMA from ...symbols import ORTH
_exc = { _exc = {
# หน่วยงานรัฐ / government agency # หน่วยงานรัฐ / government agency
"กกต.": [{ORTH: "กกต.", LEMMA: "คณะกรรมการการเลือกตั้ง"}], "กกต.": [{ORTH: "กกต."}],
"กทท.": [{ORTH: "กทท.", LEMMA: "การท่าเรือแห่งประเทศไทย"}], "กทท.": [{ORTH: "กทท."}],
"กทพ.": [{ORTH: "กทพ.", LEMMA: "การทางพิเศษแห่งประเทศไทย"}], "กทพ.": [{ORTH: "กทพ."}],
"กบข.": [{ORTH: "กบข.", LEMMA: "กองทุนบำเหน็จบำนาญข้าราชการพลเรือน"}], "กบข.": [{ORTH: "กบข."}],
"กบว.": [{ORTH: "กบว.", LEMMA: "คณะกรรมการบริหารวิทยุกระจายเสียงและวิทยุโทรทัศน์"}], "กบว.": [{ORTH: "กบว."}],
"กปน.": [{ORTH: "กปน.", LEMMA: "การประปานครหลวง"}], "กปน.": [{ORTH: "กปน."}],
"กปภ.": [{ORTH: "กปภ.", LEMMA: "การประปาส่วนภูมิภาค"}], "กปภ.": [{ORTH: "กปภ."}],
"กปส.": [{ORTH: "กปส.", LEMMA: "กรมประชาสัมพันธ์"}], "กปส.": [{ORTH: "กปส."}],
"กผม.": [{ORTH: "กผม.", LEMMA: "กองผังเมือง"}], "กผม.": [{ORTH: "กผม."}],
"กฟน.": [{ORTH: "กฟน.", LEMMA: "การไฟฟ้านครหลวง"}], "กฟน.": [{ORTH: "กฟน."}],
"กฟผ.": [{ORTH: "กฟผ.", LEMMA: "การไฟฟ้าฝ่ายผลิตแห่งประเทศไทย"}], "กฟผ.": [{ORTH: "กฟผ."}],
"กฟภ.": [{ORTH: "กฟภ.", LEMMA: "การไฟฟ้าส่วนภูมิภาค"}], "กฟภ.": [{ORTH: "กฟภ."}],
"ก.ช.น.": [{ORTH: "ก.ช.น.", LEMMA: "คณะกรรมการช่วยเหลือชาวนาชาวไร่"}], "ก.ช.น.": [{ORTH: "ก.ช.น."}],
"กยศ.": [{ORTH: "กยศ.", LEMMA: "กองทุนเงินให้กู้ยืมเพื่อการศึกษา"}], "กยศ.": [{ORTH: "กยศ."}],
"ก.ล.ต.": [{ORTH: "ก.ล.ต.", LEMMA: "คณะกรรมการกำกับหลักทรัพย์และตลาดหลักทรัพย์"}], "ก.ล.ต.": [{ORTH: "ก.ล.ต."}],
"กศ.บ.": [{ORTH: "กศ.บ.", LEMMA: "การศึกษาบัณฑิต"}], "กศ.บ.": [{ORTH: "กศ.บ."}],
"กศน.": [{ORTH: "กศน.", LEMMA: "กรมการศึกษานอกโรงเรียน"}], "กศน.": [{ORTH: "กศน."}],
"กสท.": [{ORTH: "กสท.", LEMMA: "การสื่อสารแห่งประเทศไทย"}], "กสท.": [{ORTH: "กสท."}],
"กอ.รมน.": [{ORTH: "กอ.รมน.", LEMMA: "กองอำนวยการรักษาความมั่นคงภายใน"}], "กอ.รมน.": [{ORTH: "กอ.รมน."}],
"กร.": [{ORTH: "กร.", LEMMA: "กองเรือยุทธการ"}], "กร.": [{ORTH: "กร."}],
"ขสมก.": [{ORTH: "ขสมก.", LEMMA: "องค์การขนส่งมวลชนกรุงเทพ"}], "ขสมก.": [{ORTH: "ขสมก."}],
"คตง.": [{ORTH: "คตง.", LEMMA: "คณะกรรมการตรวจเงินแผ่นดิน"}], "คตง.": [{ORTH: "คตง."}],
"ครม.": [{ORTH: "ครม.", LEMMA: "คณะรัฐมนตรี"}], "ครม.": [{ORTH: "ครม."}],
"คมช.": [{ORTH: "คมช.", LEMMA: "คณะมนตรีความมั่นคงแห่งชาติ"}], "คมช.": [{ORTH: "คมช."}],
"ตชด.": [{ORTH: "ตชด.", LEMMA: "ตำรวจตะเวนชายเดน"}], "ตชด.": [{ORTH: "ตชด."}],
"ตม.": [{ORTH: "ตม.", LEMMA: "กองตรวจคนเข้าเมือง"}], "ตม.": [{ORTH: "ตม."}],
"ตร.": [{ORTH: "ตร.", LEMMA: "ตำรวจ"}], "ตร.": [{ORTH: "ตร."}],
"ททท.": [{ORTH: "ททท.", LEMMA: "การท่องเที่ยวแห่งประเทศไทย"}], "ททท.": [{ORTH: "ททท."}],
"ททบ.": [{ORTH: "ททบ.", LEMMA: "สถานีวิทยุโทรทัศน์กองทัพบก"}], "ททบ.": [{ORTH: "ททบ."}],
"ทบ.": [{ORTH: "ทบ.", LEMMA: "กองทัพบก"}], "ทบ.": [{ORTH: "ทบ."}],
"ทร.": [{ORTH: "ทร.", LEMMA: "กองทัพเรือ"}], "ทร.": [{ORTH: "ทร."}],
"ทอ.": [{ORTH: "ทอ.", LEMMA: "กองทัพอากาศ"}], "ทอ.": [{ORTH: "ทอ."}],
"ทอท.": [{ORTH: "ทอท.", LEMMA: "การท่าอากาศยานแห่งประเทศไทย"}], "ทอท.": [{ORTH: "ทอท."}],
"ธ.ก.ส.": [{ORTH: "ธ.ก.ส.", LEMMA: "ธนาคารเพื่อการเกษตรและสหกรณ์การเกษตร"}], "ธ.ก.ส.": [{ORTH: "ธ.ก.ส."}],
"ธปท.": [{ORTH: "ธปท.", LEMMA: "ธนาคารแห่งประเทศไทย"}], "ธปท.": [{ORTH: "ธปท."}],
"ธอส.": [{ORTH: "ธอส.", LEMMA: "ธนาคารอาคารสงเคราะห์"}], "ธอส.": [{ORTH: "ธอส."}],
"นย.": [{ORTH: "นย.", LEMMA: "นาวิกโยธิน"}], "นย.": [{ORTH: "นย."}],
"ปตท.": [{ORTH: "ปตท.", LEMMA: "การปิโตรเลียมแห่งประเทศไทย"}], "ปตท.": [{ORTH: "ปตท."}],
"ป.ป.ช.": [ "ป.ป.ช.": [{ORTH: "ป.ป.ช."}],
{ "ป.ป.ส.": [{ORTH: "ป.ป.ส."}],
ORTH: "ป.ป.ช.", "บพร.": [{ORTH: "บพร."}],
LEMMA: "คณะกรรมการป้องกันและปราบปรามการทุจริตและประพฤติมิชอบในวงราชการ", "บย.": [{ORTH: "บย."}],
} "พสวท.": [{ORTH: "พสวท."}],
], "มอก.": [{ORTH: "มอก."}],
"ป.ป.ส.": [{ORTH: "ป.ป.ส.", LEMMA: "คณะกรรมการป้องกันและปราบปรามยาเสพติด"}], "ยธ.": [{ORTH: "ยธ."}],
"บพร.": [{ORTH: "บพร.", LEMMA: "กรมการบินพลเรือน"}], "รพช.": [{ORTH: "รพช."}],
"บย.": [{ORTH: "บย.", LEMMA: "กองบินยุทธการ"}], "รฟท.": [{ORTH: "รฟท."}],
"พสวท.": [ "รฟม.": [{ORTH: "รฟม."}],
{ "ศธ.": [{ORTH: "ศธ."}],
ORTH: "พสวท.", "ศนธ.": [{ORTH: "ศนธ."}],
LEMMA: "โครงการพัฒนาและส่งเสริมผู้มีความรู้ความสามารถพิเศษทางวิทยาศาสตร์และเทคโนโลยี", "สกจ.": [{ORTH: "สกจ."}],
} "สกท.": [{ORTH: "สกท."}],
], "สกว.": [{ORTH: "สกว."}],
"มอก.": [{ORTH: "มอก.", LEMMA: "สำนักงานมาตรฐานผลิตภัณฑ์อุตสาหกรรม"}], "สคบ.": [{ORTH: "สคบ."}],
"ยธ.": [{ORTH: "ยธ.", LEMMA: "กรมโยธาธิการ"}], "สจร.": [{ORTH: "สจร."}],
"รพช.": [{ORTH: "รพช.", LEMMA: "สำนักงานเร่งรัดพัฒนาชนบท"}], "สตง.": [{ORTH: "สตง."}],
"รฟท.": [{ORTH: "รฟท.", LEMMA: "การรถไฟแห่งประเทศไทย"}], "สทท.": [{ORTH: "สทท."}],
"รฟม.": [{ORTH: "รฟม.", LEMMA: "การรถไฟฟ้าขนส่งมวลชนแห่งประเทศไทย"}], "สทร.": [{ORTH: "สทร."}],
"ศธ.": [{ORTH: "ศธ.", LEMMA: "กระทรวงศึกษาธิการ"}], "สธ": [{ORTH: "สธ"}],
"ศนธ.": [{ORTH: "ศนธ.", LEMMA: "ศูนย์กลางนิสิตนักศึกษาแห่งประเทศไทย"}], "สนช.": [{ORTH: "สนช."}],
"สกจ.": [{ORTH: "สกจ.", LEMMA: "สหกรณ์จังหวัด"}], "สนนท.": [{ORTH: "สนนท."}],
"สกท.": [{ORTH: "สกท.", LEMMA: "สำนักงานคณะกรรมการส่งเสริมการลงทุน"}], "สปก.": [{ORTH: "สปก."}],
"สกว.": [{ORTH: "สกว.", LEMMA: "สำนักงานกองทุนสนับสนุนการวิจัย"}], "สปช.": [{ORTH: "สปช."}],
"สคบ.": [{ORTH: "สคบ.", LEMMA: "สำนักงานคณะกรรมการคุ้มครองผู้บริโภค"}], "สปอ.": [{ORTH: "สปอ."}],
"สจร.": [{ORTH: "สจร.", LEMMA: "สำนักงานคณะกรรมการจัดระบบการจราจรทางบก"}], "สพช.": [{ORTH: "สพช."}],
"สตง.": [{ORTH: "สตง.", LEMMA: "สำนักงานตรวจเงินแผ่นดิน"}], "สยช.": [{ORTH: "สยช."}],
"สทท.": [{ORTH: "สทท.", LEMMA: "สถานีวิทยุโทรทัศน์แห่งประเทศไทย"}], "สวช.": [{ORTH: "สวช."}],
"สทร.": [{ORTH: "สทร.", LEMMA: "สำนักงานกลางทะเบียนราษฎร์"}], "สวท.": [{ORTH: "สวท."}],
"สธ": [{ORTH: "สธ", LEMMA: "กระทรวงสาธารณสุข"}], "สวทช.": [{ORTH: "สวทช."}],
"สนช.": [{ORTH: "สนช.", LEMMA: "สภานิติบัญญัติแห่งชาติ,สำนักงานนวัตกรรมแห่งชาติ"}], "สคช.": [{ORTH: "สคช."}],
"สนนท.": [{ORTH: "สนนท.", LEMMA: "สหพันธ์นิสิตนักศึกษาแห่งประเทศไทย"}], "สสว.": [{ORTH: "สสว."}],
"สปก.": [{ORTH: "สปก.", LEMMA: "สำนักงานการปฏิรูปที่ดินเพื่อเกษตรกรรม"}], "สสส.": [{ORTH: "สสส."}],
"สปช.": [{ORTH: "สปช.", LEMMA: "สำนักงานคณะกรรมการการประถมศึกษาแห่งชาติ"}], "สสวท.": [{ORTH: "สสวท."}],
"สปอ.": [{ORTH: "สปอ.", LEMMA: "สำนักงานการประถมศึกษาอำเภอ"}], "อตก.": [{ORTH: "อตก."}],
"สพช.": [{ORTH: "สพช.", LEMMA: "สำนักงานคณะกรรมการนโยบายพลังงานแห่งชาติ"}], "อบจ.": [{ORTH: "อบจ."}],
"สยช.": [ "อบต.": [{ORTH: "อบต."}],
{ORTH: "สยช.", LEMMA: "สำนักงานคณะกรรมการส่งเสริมและประสานงานเยาวชนแห่งชาติ"} "อปพร.": [{ORTH: "อปพร."}],
], "อย.": [{ORTH: "อย."}],
"สวช.": [{ORTH: "สวช.", LEMMA: "สำนักงานคณะกรรมการวัฒนธรรมแห่งชาติ"}], "อ.ส.ม.ท.": [{ORTH: "อ.ส.ม.ท."}],
"สวท.": [{ORTH: "สวท.", LEMMA: "สถานีวิทยุกระจายเสียงแห่งประเทศไทย"}],
"สวทช.": [{ORTH: "สวทช.", LEMMA: "สำนักงานพัฒนาวิทยาศาสตร์และเทคโนโลยีแห่งชาติ"}],
"สคช.": [
{ORTH: "สคช.", LEMMA: "สำนักงานคณะกรรมการพัฒนาการเศรษฐกิจและสังคมแห่งชาติ"}
],
"สสว.": [{ORTH: "สสว.", LEMMA: "สำนักงานส่งเสริมวิสาหกิจขนาดกลางและขนาดย่อม"}],
"สสส.": [{ORTH: "สสส.", LEMMA: "สำนักงานกองทุนสนับสนุนการสร้างเสริมสุขภาพ"}],
"สสวท.": [{ORTH: "สสวท.", LEMMA: "สถาบันส่งเสริมการสอนวิทยาศาสตร์และเทคโนโลยี"}],
"อตก.": [{ORTH: "อตก.", LEMMA: "องค์การตลาดเพื่อเกษตรกร"}],
"อบจ.": [{ORTH: "อบจ.", LEMMA: "องค์การบริหารส่วนจังหวัด"}],
"อบต.": [{ORTH: "อบต.", LEMMA: "องค์การบริหารส่วนตำบล"}],
"อปพร.": [{ORTH: "อปพร.", LEMMA: "อาสาสมัครป้องกันภัยฝ่ายพลเรือน"}],
"อย.": [{ORTH: "อย.", LEMMA: "สำนักงานคณะกรรมการอาหารและยา"}],
"อ.ส.ม.ท.": [{ORTH: "อ.ส.ม.ท.", LEMMA: "องค์การสื่อสารมวลชนแห่งประเทศไทย"}],
# มหาวิทยาลัย / สถานศึกษา / university / college # มหาวิทยาลัย / สถานศึกษา / university / college
"มทส.": [{ORTH: "มทส.", LEMMA: "มหาวิทยาลัยเทคโนโลยีสุรนารี"}], "มทส.": [{ORTH: "มทส."}],
"มธ.": [{ORTH: "มธ.", LEMMA: "มหาวิทยาลัยธรรมศาสตร์"}], "มธ.": [{ORTH: "มธ."}],
"ม.อ.": [{ORTH: "ม.อ.", LEMMA: "มหาวิทยาลัยสงขลานครินทร์"}], "ม.อ.": [{ORTH: "ม.อ."}],
"มทร.": [{ORTH: "มทร.", LEMMA: "มหาวิทยาลัยเทคโนโลยีราชมงคล"}], "มทร.": [{ORTH: "มทร."}],
"มมส.": [{ORTH: "มมส.", LEMMA: "มหาวิทยาลัยมหาสารคาม"}], "มมส.": [{ORTH: "มมส."}],
"วท.": [{ORTH: "วท.", LEMMA: "วิทยาลัยเทคนิค"}], "วท.": [{ORTH: "วท."}],
"สตม.": [{ORTH: "สตม.", LEMMA: "สำนักงานตรวจคนเข้าเมือง (ตำรวจ)"}], "สตม.": [{ORTH: "สตม."}],
# ยศ / rank # ยศ / rank
"ดร.": [{ORTH: "ดร.", LEMMA: "ดอกเตอร์"}], "ดร.": [{ORTH: "ดร."}],
"ด.ต.": [{ORTH: "ด.ต.", LEMMA: "ดาบตำรวจ"}], "ด.ต.": [{ORTH: "ด.ต."}],
"จ.ต.": [{ORTH: "จ.ต.", LEMMA: "จ่าตรี"}], "จ.ต.": [{ORTH: "จ.ต."}],
"จ.ท.": [{ORTH: "จ.ท.", LEMMA: "จ่าโท"}], "จ.ท.": [{ORTH: "จ.ท."}],
"จ.ส.ต.": [{ORTH: "จ.ส.ต.", LEMMA: "จ่าสิบตรี (ทหารบก)"}], "จ.ส.ต.": [{ORTH: "จ.ส.ต."}],
"จสต.": [{ORTH: "จสต.", LEMMA: "จ่าสิบตำรวจ"}], "จสต.": [{ORTH: "จสต."}],
"จ.ส.ท.": [{ORTH: "จ.ส.ท.", LEMMA: "จ่าสิบโท"}], "จ.ส.ท.": [{ORTH: "จ.ส.ท."}],
"จ.ส.อ.": [{ORTH: "จ.ส.อ.", LEMMA: "จ่าสิบเอก"}], "จ.ส.อ.": [{ORTH: "จ.ส.อ."}],
"จ.อ.": [{ORTH: "จ.อ.", LEMMA: "จ่าเอก"}], "จ.อ.": [{ORTH: "จ.อ."}],
"ทพญ.": [{ORTH: "ทพญ.", LEMMA: "ทันตแพทย์หญิง"}], "ทพญ.": [{ORTH: "ทพญ."}],
"ทนพ.": [{ORTH: "ทนพ.", LEMMA: "เทคนิคการแพทย์"}], "ทนพ.": [{ORTH: "ทนพ."}],
"นจอ.": [{ORTH: "นจอ.", LEMMA: "นักเรียนจ่าอากาศ"}], "นจอ.": [{ORTH: "นจอ."}],
"น.ช.": [{ORTH: "น.ช.", LEMMA: "นักโทษชาย"}], "น.ช.": [{ORTH: "น.ช."}],
"น.ญ.": [{ORTH: "น.ญ.", LEMMA: "นักโทษหญิง"}], "น.ญ.": [{ORTH: "น.ญ."}],
"น.ต.": [{ORTH: "น.ต.", LEMMA: "นาวาตรี"}], "น.ต.": [{ORTH: "น.ต."}],
"น.ท.": [{ORTH: "น.ท.", LEMMA: "นาวาโท"}], "น.ท.": [{ORTH: "น.ท."}],
"นตท.": [{ORTH: "นตท.", LEMMA: "นักเรียนเตรียมทหาร"}], "นตท.": [{ORTH: "นตท."}],
"นนส.": [{ORTH: "นนส.", LEMMA: "นักเรียนนายสิบทหารบก"}], "นนส.": [{ORTH: "นนส."}],
"นนร.": [{ORTH: "นนร.", LEMMA: "นักเรียนนายร้อย"}], "นนร.": [{ORTH: "นนร."}],
"นนอ.": [{ORTH: "นนอ.", LEMMA: "นักเรียนนายเรืออากาศ"}], "นนอ.": [{ORTH: "นนอ."}],
"นพ.": [{ORTH: "นพ.", LEMMA: "นายแพทย์"}], "นพ.": [{ORTH: "นพ."}],
"นพท.": [{ORTH: "นพท.", LEMMA: "นายแพทย์ทหาร"}], "นพท.": [{ORTH: "นพท."}],
"นรจ.": [{ORTH: "นรจ.", LEMMA: "นักเรียนจ่าทหารเรือ"}], "นรจ.": [{ORTH: "นรจ."}],
"นรต.": [{ORTH: "นรต.", LEMMA: "นักเรียนนายร้อยตำรวจ"}], "นรต.": [{ORTH: "นรต."}],
"นศพ.": [{ORTH: "นศพ.", LEMMA: "นักศึกษาแพทย์"}], "นศพ.": [{ORTH: "นศพ."}],
"นศท.": [{ORTH: "นศท.", LEMMA: "นักศึกษาวิชาทหาร"}], "นศท.": [{ORTH: "นศท."}],
"น.สพ.": [{ORTH: "น.สพ.", LEMMA: "นายสัตวแพทย์ (พ.ร.บ.วิชาชีพการสัตวแพทย์)"}], "น.สพ.": [{ORTH: "น.สพ."}],
"น.อ.": [{ORTH: "น.อ.", LEMMA: "นาวาเอก"}], "น.อ.": [{ORTH: "น.อ."}],
"บช.ก.": [{ORTH: "บช.ก.", LEMMA: "กองบัญชาการตำรวจสอบสวนกลาง"}], "บช.ก.": [{ORTH: "บช.ก."}],
"บช.น.": [{ORTH: "บช.น.", LEMMA: "กองบัญชาการตำรวจนครบาล"}], "บช.น.": [{ORTH: "บช.น."}],
"ผกก.": [{ORTH: "ผกก.", LEMMA: "ผู้กำกับการ"}], "ผกก.": [{ORTH: "ผกก."}],
"ผกก.ภ.": [{ORTH: "ผกก.ภ.", LEMMA: "ผู้กำกับการตำรวจภูธร"}], "ผกก.ภ.": [{ORTH: "ผกก.ภ."}],
"ผจก.": [{ORTH: "ผจก.", LEMMA: "ผู้จัดการ"}], "ผจก.": [{ORTH: "ผจก."}],
"ผช.": [{ORTH: "ผช.", LEMMA: "ผู้ช่วย"}], "ผช.": [{ORTH: "ผช."}],
"ผชก.": [{ORTH: "ผชก.", LEMMA: "ผู้ชำนาญการ"}], "ผชก.": [{ORTH: "ผชก."}],
"ผช.ผอ.": [{ORTH: "ผช.ผอ.", LEMMA: "ผู้ช่วยผู้อำนวยการ"}], "ผช.ผอ.": [{ORTH: "ผช.ผอ."}],
"ผญบ.": [{ORTH: "ผญบ.", LEMMA: "ผู้ใหญ่บ้าน"}], "ผญบ.": [{ORTH: "ผญบ."}],
"ผบ.": [{ORTH: "ผบ.", LEMMA: "ผู้บังคับบัญชา"}], "ผบ.": [{ORTH: "ผบ."}],
"ผบก.": [{ORTH: "ผบก.", LEMMA: "ผู้บังคับบัญชาการ (ตำรวจ)"}], "ผบก.": [{ORTH: "ผบก."}],
"ผบก.น.": [{ORTH: "ผบก.น.", LEMMA: "ผู้บังคับการตำรวจนครบาล"}], "ผบก.น.": [{ORTH: "ผบก.น."}],
"ผบก.ป.": [{ORTH: "ผบก.ป.", LEMMA: "ผู้บังคับการตำรวจกองปราบปราม"}], "ผบก.ป.": [{ORTH: "ผบก.ป."}],
"ผบก.ปค.": [ "ผบก.ปค.": [{ORTH: "ผบก.ปค."}],
{ "ผบก.ปม.": [{ORTH: "ผบก.ปม."}],
ORTH: "ผบก.ปค.", "ผบก.ภ.": [{ORTH: "ผบก.ภ."}],
LEMMA: "ผู้บังคับการ กองบังคับการปกครอง (โรงเรียนนายร้อยตำรวจ)", "ผบช.": [{ORTH: "ผบช."}],
} "ผบช.ก.": [{ORTH: "ผบช.ก."}],
], "ผบช.ตชด.": [{ORTH: "ผบช.ตชด."}],
"ผบก.ปม.": [{ORTH: "ผบก.ปม.", LEMMA: "ผู้บังคับการตำรวจป่าไม้"}], "ผบช.น.": [{ORTH: "ผบช.น."}],
"ผบก.ภ.": [{ORTH: "ผบก.ภ.", LEMMA: "ผู้บังคับการตำรวจภูธร"}], "ผบช.ภ.": [{ORTH: "ผบช.ภ."}],
"ผบช.": [{ORTH: "ผบช.", LEMMA: "ผู้บัญชาการ (ตำรวจ)"}], "ผบ.ทบ.": [{ORTH: "ผบ.ทบ."}],
"ผบช.ก.": [{ORTH: "ผบช.ก.", LEMMA: "ผู้บัญชาการตำรวจสอบสวนกลาง"}], "ผบ.ตร.": [{ORTH: "ผบ.ตร."}],
"ผบช.ตชด.": [{ORTH: "ผบช.ตชด.", LEMMA: "ผู้บัญชาการตำรวจตระเวนชายแดน"}], "ผบ.ทร.": [{ORTH: "ผบ.ทร."}],
"ผบช.น.": [{ORTH: "ผบช.น.", LEMMA: "ผู้บัญชาการตำรวจนครบาล"}], "ผบ.ทอ.": [{ORTH: "ผบ.ทอ."}],
"ผบช.ภ.": [{ORTH: "ผบช.ภ.", LEMMA: "ผู้บัญชาการตำรวจภูธร"}], "ผบ.ทสส.": [{ORTH: "ผบ.ทสส."}],
"ผบ.ทบ.": [{ORTH: "ผบ.ทบ.", LEMMA: "ผู้บัญชาการทหารบก"}], "ผวจ.": [{ORTH: "ผวจ."}],
"ผบ.ตร.": [{ORTH: "ผบ.ตร.", LEMMA: "ผู้บัญชาการตำรวจแห่งชาติ"}], "ผู้ว่าฯ": [{ORTH: "ผู้ว่าฯ"}],
"ผบ.ทร.": [{ORTH: "ผบ.ทร.", LEMMA: "ผู้บัญชาการทหารเรือ"}], "พ.จ.ต.": [{ORTH: "พ.จ.ต."}],
"ผบ.ทอ.": [{ORTH: "ผบ.ทอ.", LEMMA: "ผู้บัญชาการทหารอากาศ"}], "พ.จ.ท.": [{ORTH: "พ.จ.ท."}],
"ผบ.ทสส.": [{ORTH: "ผบ.ทสส.", LEMMA: "ผู้บัญชาการทหารสูงสุด"}], "พ.จ.อ.": [{ORTH: "พ.จ.อ."}],
"ผวจ.": [{ORTH: "ผวจ.", LEMMA: "ผู้ว่าราชการจังหวัด"}], "พญ.": [{ORTH: "พญ."}],
"ผู้ว่าฯ": [{ORTH: "ผู้ว่าฯ", LEMMA: "ผู้ว่าราชการจังหวัด"}], "ฯพณฯ": [{ORTH: "ฯพณฯ"}],
"พ.จ.ต.": [{ORTH: "พ.จ.ต.", LEMMA: "พันจ่าตรี"}], "พ.ต.": [{ORTH: "พ.ต."}],
"พ.จ.ท.": [{ORTH: "พ.จ.ท.", LEMMA: "พันจ่าโท"}], "พ.ท.": [{ORTH: "พ.ท."}],
"พ.จ.อ.": [{ORTH: "พ.จ.อ.", LEMMA: "พันจ่าเอก"}], "พ.อ.": [{ORTH: "พ.อ."}],
"พญ.": [{ORTH: "พญ.", LEMMA: "แพทย์หญิง"}], "พ.ต.อ.พิเศษ": [{ORTH: "พ.ต.อ.พิเศษ"}],
"ฯพณฯ": [{ORTH: "ฯพณฯ", LEMMA: "พณท่าน"}], "พลฯ": [{ORTH: "พลฯ"}],
"พ.ต.": [{ORTH: "พ.ต.", LEMMA: "พันตรี"}], "พล.๑ รอ.": [{ORTH: "พล.๑ รอ."}],
"พ.ท.": [{ORTH: "พ.ท.", LEMMA: "พันโท"}], "พล.ต.": [{ORTH: "พล.ต."}],
"พ.อ.": [{ORTH: "พ.อ.", LEMMA: "พันเอก"}], "พล.ต.ต.": [{ORTH: "พล.ต.ต."}],
"พ.ต.อ.พิเศษ": [{ORTH: "พ.ต.อ.พิเศษ", LEMMA: "พันตำรวจเอกพิเศษ"}], "พล.ต.ท.": [{ORTH: "พล.ต.ท."}],
"พลฯ": [{ORTH: "พลฯ", LEMMA: "พลทหาร"}], "พล.ต.อ.": [{ORTH: "พล.ต.อ."}],
"พล.๑ รอ.": [{ORTH: "พล.๑ รอ.", LEMMA: "กองพลที่ ๑ รักษาพระองค์ กองทัพบก"}], "พล.ท.": [{ORTH: "พล.ท."}],
"พล.ต.": [{ORTH: "พล.ต.", LEMMA: "พลตรี"}], "พล.ปตอ.": [{ORTH: "พล.ปตอ."}],
"พล.ต.ต.": [{ORTH: "พล.ต.ต.", LEMMA: "พลตำรวจตรี"}], "พล.ม.": [{ORTH: "พล.ม."}],
"พล.ต.ท.": [{ORTH: "พล.ต.ท.", LEMMA: "พลตำรวจโท"}], "พล.ม.๒": [{ORTH: "พล.ม.๒"}],
"พล.ต.อ.": [{ORTH: "พล.ต.อ.", LEMMA: "พลตำรวจเอก"}], "พล.ร.ต.": [{ORTH: "พล.ร.ต."}],
"พล.ท.": [{ORTH: "พล.ท.", LEMMA: "พลโท"}], "พล.ร.ท.": [{ORTH: "พล.ร.ท."}],
"พล.ปตอ.": [{ORTH: "พล.ปตอ.", LEMMA: "กองพลทหารปืนใหญ่ต่อสู่อากาศยาน"}], "พล.ร.อ.": [{ORTH: "พล.ร.อ."}],
"พล.ม.": [{ORTH: "พล.ม.", LEMMA: "กองพลทหารม้า"}], "พล.อ.": [{ORTH: "พล.อ."}],
"พล.ม.๒": [{ORTH: "พล.ม.๒", LEMMA: "กองพลทหารม้าที่ ๒"}], "พล.อ.ต.": [{ORTH: "พล.อ.ต."}],
"พล.ร.ต.": [{ORTH: "พล.ร.ต.", LEMMA: "พลเรือตรี"}], "พล.อ.ท.": [{ORTH: "พล.อ.ท."}],
"พล.ร.ท.": [{ORTH: "พล.ร.ท.", LEMMA: "พลเรือโท"}], "พล.อ.อ.": [{ORTH: "พล.อ.อ."}],
"พล.ร.อ.": [{ORTH: "พล.ร.อ.", LEMMA: "พลเรือเอก"}], "พ.อ.พิเศษ": [{ORTH: "พ.อ.พิเศษ"}],
"พล.อ.": [{ORTH: "พล.อ.", LEMMA: "พลเอก"}], "พ.อ.ต.": [{ORTH: "พ.อ.ต."}],
"พล.อ.ต.": [{ORTH: "พล.อ.ต.", LEMMA: "พลอากาศตรี"}], "พ.อ.ท.": [{ORTH: "พ.อ.ท."}],
"พล.อ.ท.": [{ORTH: "พล.อ.ท.", LEMMA: "พลอากาศโท"}], "พ.อ.อ.": [{ORTH: "พ.อ.อ."}],
"พล.อ.อ.": [{ORTH: "พล.อ.อ.", LEMMA: "พลอากาศเอก"}], "ภกญ.": [{ORTH: "ภกญ."}],
"พ.อ.พิเศษ": [{ORTH: "พ.อ.พิเศษ", LEMMA: "พันเอกพิเศษ"}], "ม.จ.": [{ORTH: "ม.จ."}],
"พ.อ.ต.": [{ORTH: "พ.อ.ต.", LEMMA: "พันจ่าอากาศตรี"}], "มท1": [{ORTH: "มท1"}],
"พ.อ.ท.": [{ORTH: "พ.อ.ท.", LEMMA: "พันจ่าอากาศโท"}], "ม.ร.ว.": [{ORTH: "ม.ร.ว."}],
"พ.อ.อ.": [{ORTH: "พ.อ.อ.", LEMMA: "พันจ่าอากาศเอก"}], "มล.": [{ORTH: "มล."}],
"ภกญ.": [{ORTH: "ภกญ.", LEMMA: "เภสัชกรหญิง"}], "ร.ต.": [{ORTH: "ร.ต."}],
"ม.จ.": [{ORTH: "ม.จ.", LEMMA: "หม่อมเจ้า"}], "ร.ต.ต.": [{ORTH: "ร.ต.ต."}],
"มท1": [{ORTH: "มท1", LEMMA: "รัฐมนตรีว่าการกระทรวงมหาดไทย"}], "ร.ต.ท.": [{ORTH: "ร.ต.ท."}],
"ม.ร.ว.": [{ORTH: "ม.ร.ว.", LEMMA: "หม่อมราชวงศ์"}], "ร.ต.อ.": [{ORTH: "ร.ต.อ."}],
"มล.": [{ORTH: "มล.", LEMMA: "หม่อมหลวง"}], "ร.ท.": [{ORTH: "ร.ท."}],
"ร.ต.": [{ORTH: "ร.ต.", LEMMA: "ร้อยตรี,เรือตรี,เรืออากาศตรี"}], "รมช.": [{ORTH: "รมช."}],
"ร.ต.ต.": [{ORTH: "ร.ต.ต.", LEMMA: "ร้อยตำรวจตรี"}], "รมต.": [{ORTH: "รมต."}],
"ร.ต.ท.": [{ORTH: "ร.ต.ท.", LEMMA: "ร้อยตำรวจโท"}], "รมว.": [{ORTH: "รมว."}],
"ร.ต.อ.": [{ORTH: "ร.ต.อ.", LEMMA: "ร้อยตำรวจเอก"}], "รศ.": [{ORTH: "รศ."}],
"ร.ท.": [{ORTH: "ร.ท.", LEMMA: "ร้อยโท,เรือโท,เรืออากาศโท"}], "ร.อ.": [{ORTH: "ร.อ."}],
"รมช.": [{ORTH: "รมช.", LEMMA: "รัฐมนตรีช่วยว่าการกระทรวง"}], "ศ.": [{ORTH: "ศ."}],
"รมต.": [{ORTH: "รมต.", LEMMA: "รัฐมนตรี"}], "ส.ต.": [{ORTH: "ส.ต."}],
"รมว.": [{ORTH: "รมว.", LEMMA: "รัฐมนตรีว่าการกระทรวง"}], "ส.ต.ต.": [{ORTH: "ส.ต.ต."}],
"รศ.": [{ORTH: "รศ.", LEMMA: "รองศาสตราจารย์"}], "ส.ต.ท.": [{ORTH: "ส.ต.ท."}],
"ร.อ.": [{ORTH: "ร.อ.", LEMMA: "ร้อยเอก,เรือเอก,เรืออากาศเอก"}], "ส.ต.อ.": [{ORTH: "ส.ต.อ."}],
"ศ.": [{ORTH: "ศ.", LEMMA: "ศาสตราจารย์"}], "ส.ท.": [{ORTH: "ส.ท."}],
"ส.ต.": [{ORTH: "ส.ต.", LEMMA: "สิบตรี"}], "สพ.": [{ORTH: "สพ."}],
"ส.ต.ต.": [{ORTH: "ส.ต.ต.", LEMMA: "สิบตำรวจตรี"}], "สพ.ญ.": [{ORTH: "สพ.ญ."}],
"ส.ต.ท.": [{ORTH: "ส.ต.ท.", LEMMA: "สิบตำรวจโท"}], "สพ.ช.": [{ORTH: "สพ.ช."}],
"ส.ต.อ.": [{ORTH: "ส.ต.อ.", LEMMA: "สิบตำรวจเอก"}], "ส.อ.": [{ORTH: "ส.อ."}],
"ส.ท.": [{ORTH: "ส.ท.", LEMMA: "สิบโท"}], "อจ.": [{ORTH: "อจ."}],
"สพ.": [{ORTH: "สพ.", LEMMA: "สัตวแพทย์"}], "อจญ.": [{ORTH: "อจญ."}],
"สพ.ญ.": [{ORTH: "สพ.ญ.", LEMMA: "สัตวแพทย์หญิง"}],
"สพ.ช.": [{ORTH: "สพ.ช.", LEMMA: "สัตวแพทย์ชาย"}],
"ส.อ.": [{ORTH: "ส.อ.", LEMMA: "สิบเอก"}],
"อจ.": [{ORTH: "อจ.", LEMMA: "อาจารย์"}],
"อจญ.": [{ORTH: "อจญ.", LEMMA: "อาจารย์ใหญ่"}],
# วุฒิ / bachelor degree # วุฒิ / bachelor degree
"ป.": [{ORTH: "ป.", LEMMA: "ประถมศึกษา"}], "ป.": [{ORTH: "ป."}],
"ป.กศ.": [{ORTH: "ป.กศ.", LEMMA: "ประกาศนียบัตรวิชาการศึกษา"}], "ป.กศ.": [{ORTH: "ป.กศ."}],
"ป.กศ.สูง": [{ORTH: "ป.กศ.สูง", LEMMA: "ประกาศนียบัตรวิชาการศึกษาชั้นสูง"}], "ป.กศ.สูง": [{ORTH: "ป.กศ.สูง"}],
"ปวช.": [{ORTH: "ปวช.", LEMMA: "ประกาศนียบัตรวิชาชีพ"}], "ปวช.": [{ORTH: "ปวช."}],
"ปวท.": [{ORTH: "ปวท.", LEMMA: "ประกาศนียบัตรวิชาชีพเทคนิค"}], "ปวท.": [{ORTH: "ปวท."}],
"ปวส.": [{ORTH: "ปวส.", LEMMA: "ประกาศนียบัตรวิชาชีพชั้นสูง"}], "ปวส.": [{ORTH: "ปวส."}],
"ปทส.": [{ORTH: "ปทส.", LEMMA: "ประกาศนียบัตรครูเทคนิคชั้นสูง"}], "ปทส.": [{ORTH: "ปทส."}],
"กษ.บ.": [{ORTH: "กษ.บ.", LEMMA: "เกษตรศาสตรบัณฑิต"}], "กษ.บ.": [{ORTH: "กษ.บ."}],
"กษ.ม.": [{ORTH: "กษ.ม.", LEMMA: "เกษตรศาสตรมหาบัณฑิต"}], "กษ.ม.": [{ORTH: "กษ.ม."}],
"กษ.ด.": [{ORTH: "กษ.ด.", LEMMA: "เกษตรศาสตรดุษฎีบัณฑิต"}], "กษ.ด.": [{ORTH: "กษ.ด."}],
"ค.บ.": [{ORTH: "ค.บ.", LEMMA: "ครุศาสตรบัณฑิต"}], "ค.บ.": [{ORTH: "ค.บ."}],
"คศ.บ.": [{ORTH: "คศ.บ.", LEMMA: "คหกรรมศาสตรบัณฑิต"}], "คศ.บ.": [{ORTH: "คศ.บ."}],
"คศ.ม.": [{ORTH: "คศ.ม.", LEMMA: "คหกรรมศาสตรมหาบัณฑิต"}], "คศ.ม.": [{ORTH: "คศ.ม."}],
"คศ.ด.": [{ORTH: "คศ.ด.", LEMMA: "คหกรรมศาสตรดุษฎีบัณฑิต"}], "คศ.ด.": [{ORTH: "คศ.ด."}],
"ค.อ.บ.": [{ORTH: "ค.อ.บ.", LEMMA: "ครุศาสตรอุตสาหกรรมบัณฑิต"}], "ค.อ.บ.": [{ORTH: "ค.อ.บ."}],
"ค.อ.ม.": [{ORTH: "ค.อ.ม.", LEMMA: "ครุศาสตรอุตสาหกรรมมหาบัณฑิต"}], "ค.อ.ม.": [{ORTH: "ค.อ.ม."}],
"ค.อ.ด.": [{ORTH: "ค.อ.ด.", LEMMA: "ครุศาสตรอุตสาหกรรมดุษฎีบัณฑิต"}], "ค.อ.ด.": [{ORTH: "ค.อ.ด."}],
"ทก.บ.": [{ORTH: "ทก.บ.", LEMMA: "เทคโนโลยีการเกษตรบัณฑิต"}], "ทก.บ.": [{ORTH: "ทก.บ."}],
"ทก.ม.": [{ORTH: "ทก.ม.", LEMMA: "เทคโนโลยีการเกษตรมหาบัณฑิต"}], "ทก.ม.": [{ORTH: "ทก.ม."}],
"ทก.ด.": [{ORTH: "ทก.ด.", LEMMA: "เทคโนโลยีการเกษตรดุษฎีบัณฑิต"}], "ทก.ด.": [{ORTH: "ทก.ด."}],
"ท.บ.": [{ORTH: "ท.บ.", LEMMA: "ทันตแพทยศาสตรบัณฑิต"}], "ท.บ.": [{ORTH: "ท.บ."}],
"ท.ม.": [{ORTH: "ท.ม.", LEMMA: "ทันตแพทยศาสตรมหาบัณฑิต"}], "ท.ม.": [{ORTH: "ท.ม."}],
"ท.ด.": [{ORTH: "ท.ด.", LEMMA: "ทันตแพทยศาสตรดุษฎีบัณฑิต"}], "ท.ด.": [{ORTH: "ท.ด."}],
"น.บ.": [{ORTH: "น.บ.", LEMMA: "นิติศาสตรบัณฑิต"}], "น.บ.": [{ORTH: "น.บ."}],
"น.ม.": [{ORTH: "น.ม.", LEMMA: "นิติศาสตรมหาบัณฑิต"}], "น.ม.": [{ORTH: "น.ม."}],
"น.ด.": [{ORTH: "น.ด.", LEMMA: "นิติศาสตรดุษฎีบัณฑิต"}], "น.ด.": [{ORTH: "น.ด."}],
"นศ.บ.": [{ORTH: "นศ.บ.", LEMMA: "นิเทศศาสตรบัณฑิต"}], "นศ.บ.": [{ORTH: "นศ.บ."}],
"นศ.ม.": [{ORTH: "นศ.ม.", LEMMA: "นิเทศศาสตรมหาบัณฑิต"}], "นศ.ม.": [{ORTH: "นศ.ม."}],
"นศ.ด.": [{ORTH: "นศ.ด.", LEMMA: "นิเทศศาสตรดุษฎีบัณฑิต"}], "นศ.ด.": [{ORTH: "นศ.ด."}],
"บช.บ.": [{ORTH: "บช.บ.", LEMMA: "บัญชีบัณฑิต"}], "บช.บ.": [{ORTH: "บช.บ."}],
"บช.ม.": [{ORTH: "บช.ม.", LEMMA: "บัญชีมหาบัณฑิต"}], "บช.ม.": [{ORTH: "บช.ม."}],
"บช.ด.": [{ORTH: "บช.ด.", LEMMA: "บัญชีดุษฎีบัณฑิต"}], "บช.ด.": [{ORTH: "บช.ด."}],
"บธ.บ.": [{ORTH: "บธ.บ.", LEMMA: "บริหารธุรกิจบัณฑิต"}], "บธ.บ.": [{ORTH: "บธ.บ."}],
"บธ.ม.": [{ORTH: "บธ.ม.", LEMMA: "บริหารธุรกิจมหาบัณฑิต"}], "บธ.ม.": [{ORTH: "บธ.ม."}],
"บธ.ด.": [{ORTH: "บธ.ด.", LEMMA: "บริหารธุรกิจดุษฎีบัณฑิต"}], "บธ.ด.": [{ORTH: "บธ.ด."}],
"พณ.บ.": [{ORTH: "พณ.บ.", LEMMA: "พาณิชยศาสตรบัณฑิต"}], "พณ.บ.": [{ORTH: "พณ.บ."}],
"พณ.ม.": [{ORTH: "พณ.ม.", LEMMA: "พาณิชยศาสตรมหาบัณฑิต"}], "พณ.ม.": [{ORTH: "พณ.ม."}],
"พณ.ด.": [{ORTH: "พณ.ด.", LEMMA: "พาณิชยศาสตรดุษฎีบัณฑิต"}], "พณ.ด.": [{ORTH: "พณ.ด."}],
"พ.บ.": [{ORTH: "พ.บ.", LEMMA: "แพทยศาสตรบัณฑิต"}], "พ.บ.": [{ORTH: "พ.บ."}],
"พ.ม.": [{ORTH: "พ.ม.", LEMMA: "แพทยศาสตรมหาบัณฑิต"}], "พ.ม.": [{ORTH: "พ.ม."}],
"พ.ด.": [{ORTH: "พ.ด.", LEMMA: "แพทยศาสตรดุษฎีบัณฑิต"}], "พ.ด.": [{ORTH: "พ.ด."}],
"พธ.บ.": [{ORTH: "พธ.บ.", LEMMA: "พุทธศาสตรบัณฑิต"}], "พธ.บ.": [{ORTH: "พธ.บ."}],
"พธ.ม.": [{ORTH: "พธ.ม.", LEMMA: "พุทธศาสตรมหาบัณฑิต"}], "พธ.ม.": [{ORTH: "พธ.ม."}],
"พธ.ด.": [{ORTH: "พธ.ด.", LEMMA: "พุทธศาสตรดุษฎีบัณฑิต"}], "พธ.ด.": [{ORTH: "พธ.ด."}],
"พบ.บ.": [{ORTH: "พบ.บ.", LEMMA: "พัฒนบริหารศาสตรบัณฑิต"}], "พบ.บ.": [{ORTH: "พบ.บ."}],
"พบ.ม.": [{ORTH: "พบ.ม.", LEMMA: "พัฒนบริหารศาสตรมหาบัณฑิต"}], "พบ.ม.": [{ORTH: "พบ.ม."}],
"พบ.ด.": [{ORTH: "พบ.ด.", LEMMA: "พัฒนบริหารศาสตรดุษฎีบัณฑิต"}], "พบ.ด.": [{ORTH: "พบ.ด."}],
"พย.บ.": [{ORTH: "พย.บ.", LEMMA: "พยาบาลศาสตรดุษฎีบัณฑิต"}], "พย.บ.": [{ORTH: "พย.บ."}],
"พย.ม.": [{ORTH: "พย.ม.", LEMMA: "พยาบาลศาสตรมหาบัณฑิต"}], "พย.ม.": [{ORTH: "พย.ม."}],
"พย.ด.": [{ORTH: "พย.ด.", LEMMA: "พยาบาลศาสตรดุษฎีบัณฑิต"}], "พย.ด.": [{ORTH: "พย.ด."}],
"พศ.บ.": [{ORTH: "พศ.บ.", LEMMA: "พาณิชยศาสตรบัณฑิต"}], "พศ.บ.": [{ORTH: "พศ.บ."}],
"พศ.ม.": [{ORTH: "พศ.ม.", LEMMA: "พาณิชยศาสตรมหาบัณฑิต"}], "พศ.ม.": [{ORTH: "พศ.ม."}],
"พศ.ด.": [{ORTH: "พศ.ด.", LEMMA: "พาณิชยศาสตรดุษฎีบัณฑิต"}], "พศ.ด.": [{ORTH: "พศ.ด."}],
"ภ.บ.": [{ORTH: "ภ.บ.", LEMMA: "เภสัชศาสตรบัณฑิต"}], "ภ.บ.": [{ORTH: "ภ.บ."}],
"ภ.ม.": [{ORTH: "ภ.ม.", LEMMA: "เภสัชศาสตรมหาบัณฑิต"}], "ภ.ม.": [{ORTH: "ภ.ม."}],
"ภ.ด.": [{ORTH: "ภ.ด.", LEMMA: "เภสัชศาสตรดุษฎีบัณฑิต"}], "ภ.ด.": [{ORTH: "ภ.ด."}],
"ภ.สถ.บ.": [{ORTH: "ภ.สถ.บ.", LEMMA: "ภูมิสถาปัตยกรรมศาสตรบัณฑิต"}], "ภ.สถ.บ.": [{ORTH: "ภ.สถ.บ."}],
"รป.บ.": [{ORTH: "รป.บ.", LEMMA: "รัฐประศาสนศาสตร์บัณฑิต"}], "รป.บ.": [{ORTH: "รป.บ."}],
"รป.ม.": [{ORTH: "รป.ม.", LEMMA: "รัฐประศาสนศาสตร์มหาบัณฑิต"}], "รป.ม.": [{ORTH: "รป.ม."}],
"วท.บ.": [{ORTH: "วท.บ.", LEMMA: "วิทยาศาสตรบัณฑิต"}], "วท.บ.": [{ORTH: "วท.บ."}],
"วท.ม.": [{ORTH: "วท.ม.", LEMMA: "วิทยาศาสตรมหาบัณฑิต"}], "วท.ม.": [{ORTH: "วท.ม."}],
"วท.ด.": [{ORTH: "วท.ด.", LEMMA: "วิทยาศาสตรดุษฎีบัณฑิต"}], "วท.ด.": [{ORTH: "วท.ด."}],
"ศ.บ.": [{ORTH: "ศ.บ.", LEMMA: "ศิลปบัณฑิต"}], "ศ.บ.": [{ORTH: "ศ.บ."}],
"ศศ.บ.": [{ORTH: "ศศ.บ.", LEMMA: "ศิลปศาสตรบัณฑิต"}], "ศศ.บ.": [{ORTH: "ศศ.บ."}],
"ศษ.บ.": [{ORTH: "ศษ.บ.", LEMMA: "ศึกษาศาสตรบัณฑิต"}], "ศษ.บ.": [{ORTH: "ศษ.บ."}],
"ศส.บ.": [{ORTH: "ศส.บ.", LEMMA: "เศรษฐศาสตรบัณฑิต"}], "ศส.บ.": [{ORTH: "ศส.บ."}],
"สถ.บ.": [{ORTH: "สถ.บ.", LEMMA: "สถาปัตยกรรมศาสตรบัณฑิต"}], "สถ.บ.": [{ORTH: "สถ.บ."}],
"สถ.ม.": [{ORTH: "สถ.ม.", LEMMA: "สถาปัตยกรรมศาสตรมหาบัณฑิต"}], "สถ.ม.": [{ORTH: "สถ.ม."}],
"สถ.ด.": [{ORTH: "สถ.ด.", LEMMA: "สถาปัตยกรรมศาสตรดุษฎีบัณฑิต"}], "สถ.ด.": [{ORTH: "สถ.ด."}],
"สพ.บ.": [{ORTH: "สพ.บ.", LEMMA: "สัตวแพทยศาสตรบัณฑิต"}], "สพ.บ.": [{ORTH: "สพ.บ."}],
"อ.บ.": [{ORTH: "อ.บ.", LEMMA: "อักษรศาสตรบัณฑิต"}], "อ.บ.": [{ORTH: "อ.บ."}],
"อ.ม.": [{ORTH: "อ.ม.", LEMMA: "อักษรศาสตรมหาบัณฑิต"}], "อ.ม.": [{ORTH: "อ.ม."}],
"อ.ด.": [{ORTH: "อ.ด.", LEMMA: "อักษรศาสตรดุษฎีบัณฑิต"}], "อ.ด.": [{ORTH: "อ.ด."}],
# ปี / เวลา / year / time # ปี / เวลา / year / time
"ชม.": [{ORTH: "ชม.", LEMMA: "ชั่วโมง"}], "ชม.": [{ORTH: "ชม."}],
"จ.ศ.": [{ORTH: "จ.ศ.", LEMMA: "จุลศักราช"}], "จ.ศ.": [{ORTH: "จ.ศ."}],
"ค.ศ.": [{ORTH: "ค.ศ.", LEMMA: "คริสต์ศักราช"}], "ค.ศ.": [{ORTH: "ค.ศ."}],
"ฮ.ศ.": [{ORTH: "ฮ.ศ.", LEMMA: "ฮิจเราะห์ศักราช"}], "ฮ.ศ.": [{ORTH: "ฮ.ศ."}],
"ว.ด.ป.": [{ORTH: "ว.ด.ป.", LEMMA: "วัน เดือน ปี"}], "ว.ด.ป.": [{ORTH: "ว.ด.ป."}],
# ระยะทาง / distance # ระยะทาง / distance
"ฮม.": [{ORTH: "ฮม.", LEMMA: "เฮกโตเมตร"}], "ฮม.": [{ORTH: "ฮม."}],
"ดคม.": [{ORTH: "ดคม.", LEMMA: "เดคาเมตร"}], "ดคม.": [{ORTH: "ดคม."}],
"ดม.": [{ORTH: "ดม.", LEMMA: "เดซิเมตร"}], "ดม.": [{ORTH: "ดม."}],
"มม.": [{ORTH: "มม.", LEMMA: "มิลลิเมตร"}], "มม.": [{ORTH: "มม."}],
"ซม.": [{ORTH: "ซม.", LEMMA: "เซนติเมตร"}], "ซม.": [{ORTH: "ซม."}],
"กม.": [{ORTH: "กม.", LEMMA: "กิโลเมตร"}], "กม.": [{ORTH: "กม."}],
# น้ำหนัก / weight # น้ำหนัก / weight
"น.น.": [{ORTH: "น.น.", LEMMA: "น้ำหนัก"}], "น.น.": [{ORTH: "น.น."}],
"ฮก.": [{ORTH: "ฮก.", LEMMA: "เฮกโตกรัม"}], "ฮก.": [{ORTH: "ฮก."}],
"ดคก.": [{ORTH: "ดคก.", LEMMA: "เดคากรัม"}], "ดคก.": [{ORTH: "ดคก."}],
"ดก.": [{ORTH: "ดก.", LEMMA: "เดซิกรัม"}], "ดก.": [{ORTH: "ดก."}],
"ซก.": [{ORTH: "ซก.", LEMMA: "เซนติกรัม"}], "ซก.": [{ORTH: "ซก."}],
"มก.": [{ORTH: "มก.", LEMMA: "มิลลิกรัม"}], "มก.": [{ORTH: "มก."}],
"ก.": [{ORTH: "ก.", LEMMA: "กรัม"}], "ก.": [{ORTH: "ก."}],
"กก.": [{ORTH: "กก.", LEMMA: "กิโลกรัม"}], "กก.": [{ORTH: "กก."}],
# ปริมาตร / volume # ปริมาตร / volume
"ฮล.": [{ORTH: "ฮล.", LEMMA: "เฮกโตลิตร"}], "ฮล.": [{ORTH: "ฮล."}],
"ดคล.": [{ORTH: "ดคล.", LEMMA: "เดคาลิตร"}], "ดคล.": [{ORTH: "ดคล."}],
"ดล.": [{ORTH: "ดล.", LEMMA: "เดซิลิตร"}], "ดล.": [{ORTH: "ดล."}],
"ซล.": [{ORTH: "ซล.", LEMMA: "เซนติลิตร"}], "ซล.": [{ORTH: "ซล."}],
"ล.": [{ORTH: "ล.", LEMMA: "ลิตร"}], "ล.": [{ORTH: "ล."}],
"กล.": [{ORTH: "กล.", LEMMA: "กิโลลิตร"}], "กล.": [{ORTH: "กล."}],
"ลบ.": [{ORTH: "ลบ.", LEMMA: "ลูกบาศก์"}], "ลบ.": [{ORTH: "ลบ."}],
# พื้นที่ / area # พื้นที่ / area
"ตร.ซม.": [{ORTH: "ตร.ซม.", LEMMA: "ตารางเซนติเมตร"}], "ตร.ซม.": [{ORTH: "ตร.ซม."}],
"ตร.ม.": [{ORTH: "ตร.ม.", LEMMA: "ตารางเมตร"}], "ตร.ม.": [{ORTH: "ตร.ม."}],
"ตร.ว.": [{ORTH: "ตร.ว.", LEMMA: "ตารางวา"}], "ตร.ว.": [{ORTH: "ตร.ว."}],
"ตร.กม.": [{ORTH: "ตร.กม.", LEMMA: "ตารางกิโลเมตร"}], "ตร.กม.": [{ORTH: "ตร.กม."}],
# เดือน / month # เดือน / month
"ม.ค.": [{ORTH: "ม.ค.", LEMMA: "มกราคม"}], "ม.ค.": [{ORTH: "ม.ค."}],
"ก.พ.": [{ORTH: "ก.พ.", LEMMA: "กุมภาพันธ์"}], "ก.พ.": [{ORTH: "ก.พ."}],
"มี.ค.": [{ORTH: "มี.ค.", LEMMA: "มีนาคม"}], "มี.ค.": [{ORTH: "มี.ค."}],
"เม.ย.": [{ORTH: "เม.ย.", LEMMA: "เมษายน"}], "เม.ย.": [{ORTH: "เม.ย."}],
"พ.ค.": [{ORTH: "พ.ค.", LEMMA: "พฤษภาคม"}], "พ.ค.": [{ORTH: "พ.ค."}],
"มิ.ย.": [{ORTH: "มิ.ย.", LEMMA: "มิถุนายน"}], "มิ.ย.": [{ORTH: "มิ.ย."}],
"ก.ค.": [{ORTH: "ก.ค.", LEMMA: "กรกฎาคม"}], "ก.ค.": [{ORTH: "ก.ค."}],
"ส.ค.": [{ORTH: "ส.ค.", LEMMA: "สิงหาคม"}], "ส.ค.": [{ORTH: "ส.ค."}],
"ก.ย.": [{ORTH: "ก.ย.", LEMMA: "กันยายน"}], "ก.ย.": [{ORTH: "ก.ย."}],
"ต.ค.": [{ORTH: "ต.ค.", LEMMA: "ตุลาคม"}], "ต.ค.": [{ORTH: "ต.ค."}],
"พ.ย.": [{ORTH: "พ.ย.", LEMMA: "พฤศจิกายน"}], "พ.ย.": [{ORTH: "พ.ย."}],
"ธ.ค.": [{ORTH: "ธ.ค.", LEMMA: "ธันวาคม"}], "ธ.ค.": [{ORTH: "ธ.ค."}],
# เพศ / gender # เพศ / gender
"ช.": [{ORTH: "ช.", LEMMA: "ชาย"}], "ช.": [{ORTH: "ช."}],
"ญ.": [{ORTH: "ญ.", LEMMA: "หญิง"}], "ญ.": [{ORTH: "ญ."}],
"ด.ช.": [{ORTH: "ด.ช.", LEMMA: "เด็กชาย"}], "ด.ช.": [{ORTH: "ด.ช."}],
"ด.ญ.": [{ORTH: "ด.ญ.", LEMMA: "เด็กหญิง"}], "ด.ญ.": [{ORTH: "ด.ญ."}],
# ที่อยู่ / address # ที่อยู่ / address
"ถ.": [{ORTH: "ถ.", LEMMA: "ถนน"}], "ถ.": [{ORTH: "ถ."}],
"ต.": [{ORTH: "ต.", LEMMA: "ตำบล"}], "ต.": [{ORTH: "ต."}],
"อ.": [{ORTH: "อ.", LEMMA: "อำเภอ"}], "อ.": [{ORTH: "อ."}],
"จ.": [{ORTH: "จ.", LEMMA: "จังหวัด"}], "จ.": [{ORTH: "จ."}],
# สรรพนาม / pronoun # สรรพนาม / pronoun
"ข้าฯ": [{ORTH: "ข้าฯ", LEMMA: "ข้าพระพุทธเจ้า"}], "ข้าฯ": [{ORTH: "ข้าฯ"}],
"ทูลเกล้าฯ": [{ORTH: "ทูลเกล้าฯ", LEMMA: "ทูลเกล้าทูลกระหม่อม"}], "ทูลเกล้าฯ": [{ORTH: "ทูลเกล้าฯ"}],
"น้อมเกล้าฯ": [{ORTH: "น้อมเกล้าฯ", LEMMA: "น้อมเกล้าน้อมกระหม่อม"}], "น้อมเกล้าฯ": [{ORTH: "น้อมเกล้าฯ"}],
"โปรดเกล้าฯ": [{ORTH: "โปรดเกล้าฯ", LEMMA: "โปรดเกล้าโปรดกระหม่อม"}], "โปรดเกล้าฯ": [{ORTH: "โปรดเกล้าฯ"}],
# การเมือง / politic # การเมือง / politic
"ขจก.": [{ORTH: "ขจก.", LEMMA: "ขบวนการโจรก่อการร้าย"}], "ขจก.": [{ORTH: "ขจก."}],
"ขบด.": [{ORTH: "ขบด.", LEMMA: "ขบวนการแบ่งแยกดินแดน"}], "ขบด.": [{ORTH: "ขบด."}],
"นปช.": [{ORTH: "นปช.", LEMMA: "แนวร่วมประชาธิปไตยขับไล่เผด็จการ"}], "นปช.": [{ORTH: "นปช."}],
"ปชป.": [{ORTH: "ปชป.", LEMMA: "พรรคประชาธิปัตย์"}], "ปชป.": [{ORTH: "ปชป."}],
"ผกค.": [{ORTH: "ผกค.", LEMMA: "ผู้ก่อการร้ายคอมมิวนิสต์"}], "ผกค.": [{ORTH: "ผกค."}],
"พท.": [{ORTH: "พท.", LEMMA: "พรรคเพื่อไทย"}], "พท.": [{ORTH: "พท."}],
"พ.ร.ก.": [{ORTH: "พ.ร.ก.", LEMMA: "พระราชกำหนด"}], "พ.ร.ก.": [{ORTH: "พ.ร.ก."}],
"พ.ร.ฎ.": [{ORTH: "พ.ร.ฎ.", LEMMA: "พระราชกฤษฎีกา"}], "พ.ร.ฎ.": [{ORTH: "พ.ร.ฎ."}],
"พ.ร.บ.": [{ORTH: "พ.ร.บ.", LEMMA: "พระราชบัญญัติ"}], "พ.ร.บ.": [{ORTH: "พ.ร.บ."}],
"รธน.": [{ORTH: "รธน.", LEMMA: "รัฐธรรมนูญ"}], "รธน.": [{ORTH: "รธน."}],
"รบ.": [{ORTH: "รบ.", LEMMA: "รัฐบาล"}], "รบ.": [{ORTH: "รบ."}],
"รสช.": [{ORTH: "รสช.", LEMMA: "คณะรักษาความสงบเรียบร้อยแห่งชาติ"}], "รสช.": [{ORTH: "รสช."}],
"ส.ก.": [{ORTH: "ส.ก.", LEMMA: "สมาชิกสภากรุงเทพมหานคร"}], "ส.ก.": [{ORTH: "ส.ก."}],
"สจ.": [{ORTH: "สจ.", LEMMA: "สมาชิกสภาจังหวัด"}], "สจ.": [{ORTH: "สจ."}],
"สว.": [{ORTH: "สว.", LEMMA: "สมาชิกวุฒิสภา"}], "สว.": [{ORTH: "สว."}],
"ส.ส.": [{ORTH: "ส.ส.", LEMMA: "สมาชิกสภาผู้แทนราษฎร"}], "ส.ส.": [{ORTH: "ส.ส."}],
# ทั่วไป / general # ทั่วไป / general
"ก.ข.ค.": [{ORTH: "ก.ข.ค.", LEMMA: "ก้างขวางคอ"}], "ก.ข.ค.": [{ORTH: "ก.ข.ค."}],
"กทม.": [{ORTH: "กทม.", LEMMA: "กรุงเทพมหานคร"}], "กทม.": [{ORTH: "กทม."}],
"กรุงเทพฯ": [{ORTH: "กรุงเทพฯ", LEMMA: "กรุงเทพมหานคร"}], "กรุงเทพฯ": [{ORTH: "กรุงเทพฯ"}],
"ขรก.": [{ORTH: "ขรก.", LEMMA: "ข้าราชการ"}], "ขรก.": [{ORTH: "ขรก."}],
    "ขส.": [{ORTH: "ขส.", LEMMA: "ขนส่ง"}],     "ขส.": [{ORTH: "ขส."}],
"ค.ร.น.": [{ORTH: "ค.ร.น.", LEMMA: "คูณร่วมน้อย"}], "ค.ร.น.": [{ORTH: "ค.ร.น."}],
"ค.ร.ม.": [{ORTH: "ค.ร.ม.", LEMMA: "คูณร่วมมาก"}], "ค.ร.ม.": [{ORTH: "ค.ร.ม."}],
"ง.ด.": [{ORTH: "ง.ด.", LEMMA: "เงินเดือน"}], "ง.ด.": [{ORTH: "ง.ด."}],
"งป.": [{ORTH: "งป.", LEMMA: "งบประมาณ"}], "งป.": [{ORTH: "งป."}],
"จก.": [{ORTH: "จก.", LEMMA: "จำกัด"}], "จก.": [{ORTH: "จก."}],
"จขกท.": [{ORTH: "จขกท.", LEMMA: "เจ้าของกระทู้"}], "จขกท.": [{ORTH: "จขกท."}],
"จนท.": [{ORTH: "จนท.", LEMMA: "เจ้าหน้าที่"}], "จนท.": [{ORTH: "จนท."}],
"จ.ป.ร.": [ "จ.ป.ร.": [{ORTH: "จ.ป.ร."}],
{ "จ.ม.": [{ORTH: "จ.ม."}],
ORTH: "จ.ป.ร.", "จย.": [{ORTH: "จย."}],
LEMMA: "มหาจุฬาลงกรณ ปรมราชาธิราช (พระปรมาภิไธยในพระบาทสมเด็จพระจุลจอมเกล้าเจ้าอยู่หัว)", "จยย.": [{ORTH: "จยย."}],
} "ตจว.": [{ORTH: "ตจว."}],
], "โทร.": [{ORTH: "โทร."}],
"จ.ม.": [{ORTH: "จ.ม.", LEMMA: "จดหมาย"}], "ธ.": [{ORTH: "ธ."}],
"จย.": [{ORTH: "จย.", LEMMA: "จักรยาน"}], "น.ร.": [{ORTH: "น.ร."}],
"จยย.": [{ORTH: "จยย.", LEMMA: "จักรยานยนต์"}], "น.ศ.": [{ORTH: "น.ศ."}],
"ตจว.": [{ORTH: "ตจว.", LEMMA: "ต่างจังหวัด"}], "น.ส.": [{ORTH: "น.ส."}],
"โทร.": [{ORTH: "โทร.", LEMMA: "โทรศัพท์"}], "น.ส.๓": [{ORTH: "น.ส.๓"}],
    "ธ.": [{ORTH: "ธ.", LEMMA: "ธนาคาร"}],     "น.ส.๓ ก.": [{ORTH: "น.ส.๓ ก."}],
"น.ร.": [{ORTH: "น.ร.", LEMMA: "นักเรียน"}], "นสพ.": [{ORTH: "นสพ."}],
"น.ศ.": [{ORTH: "น.ศ.", LEMMA: "นักศึกษา"}], "บ.ก.": [{ORTH: "บ.ก."}],
"น.ส.": [{ORTH: "น.ส.", LEMMA: "นางสาว"}], "บจก.": [{ORTH: "บจก."}],
"น.ส.๓": [{ORTH: "น.ส.๓", LEMMA: "หนังสือรับรองการทำประโยชน์ในที่ดิน"}], "บงล.": [{ORTH: "บงล."}],
"น.ส.๓ ก.": [ "บบส.": [{ORTH: "บบส."}],
{ORTH: "น.ส.๓ ก", LEMMA: "หนังสือแสดงกรรมสิทธิ์ในที่ดิน (มีระวางกำหนด)"} "บมจ.": [{ORTH: "บมจ."}],
], "บลจ.": [{ORTH: "บลจ."}],
"นสพ.": [{ORTH: "นสพ.", LEMMA: "หนังสือพิมพ์"}], "บ/ช": [{ORTH: "บ/ช"}],
"บ.ก.": [{ORTH: "บ.ก.", LEMMA: "บรรณาธิการ"}], "บร.": [{ORTH: "บร."}],
"บจก.": [{ORTH: "บจก.", LEMMA: "บริษัทจำกัด"}], "ปชช.": [{ORTH: "ปชช."}],
"บงล.": [{ORTH: "บงล.", LEMMA: "บริษัทเงินทุนและหลักทรัพย์จำกัด"}], "ปณ.": [{ORTH: "ปณ."}],
"บบส.": [{ORTH: "บบส.", LEMMA: "บรรษัทบริหารสินทรัพย์สถาบันการเงิน"}], "ปณก.": [{ORTH: "ปณก."}],
"บมจ.": [{ORTH: "บมจ.", LEMMA: "บริษัทมหาชนจำกัด"}], "ปณส.": [{ORTH: "ปณส."}],
"บลจ.": [{ORTH: "บลจ.", LEMMA: "บริษัทหลักทรัพย์จัดการกองทุนรวมจำกัด"}], "ปธ.": [{ORTH: "ปธ."}],
"บ/ช": [{ORTH: "บ/ช", LEMMA: "บัญชี"}], "ปธน.": [{ORTH: "ปธน."}],
"บร.": [{ORTH: "บร.", LEMMA: "บรรณารักษ์"}], "ปอ.": [{ORTH: "ปอ."}],
"ปชช.": [{ORTH: "ปชช.", LEMMA: "ประชาชน"}], "ปอ.พ.": [{ORTH: "ปอ.พ."}],
"ปณ.": [{ORTH: "ปณ.", LEMMA: "ที่ทำการไปรษณีย์"}], "พ.ก.ง.": [{ORTH: "พ.ก.ง."}],
"ปณก.": [{ORTH: "ปณก.", LEMMA: "ที่ทำการไปรษณีย์กลาง"}], "พ.ก.ส.": [{ORTH: "พ.ก.ส."}],
"ปณส.": [{ORTH: "ปณส.", LEMMA: "ที่ทำการไปรษณีย์สาขา"}], "พขร.": [{ORTH: "พขร."}],
"ปธ.": [{ORTH: "ปธ.", LEMMA: "ประธาน"}], "ภ.ง.ด.": [{ORTH: "ภ.ง.ด."}],
"ปธน.": [{ORTH: "ปธน.", LEMMA: "ประธานาธิบดี"}], "ภ.ง.ด.๙": [{ORTH: "ภ.ง.ด.๙"}],
"ปอ.": [{ORTH: "ปอ.", LEMMA: "รถยนต์โดยสารประจำทางปรับอากาศ"}], "ภ.ป.ร.": [{ORTH: "ภ.ป.ร."}],
"ปอ.พ.": [{ORTH: "ปอ.พ.", LEMMA: "รถยนต์โดยสารประจำทางปรับอากาศพิเศษ"}], "ภ.พ.": [{ORTH: "ภ.พ."}],
"พ.ก.ง.": [{ORTH: "พ.ก.ง.", LEMMA: "พัสดุเก็บเงินปลายทาง"}], "ร.": [{ORTH: "ร."}],
"พ.ก.ส.": [{ORTH: "พ.ก.ส.", LEMMA: "พนักงานเก็บค่าโดยสาร"}], "ร.ง.": [{ORTH: "ร.ง."}],
"พขร.": [{ORTH: "พขร.", LEMMA: "พนักงานขับรถ"}], "ร.ด.": [{ORTH: "ร.ด."}],
"ภ.ง.ด.": [{ORTH: "ภ.ง.ด.", LEMMA: "ภาษีเงินได้"}], "รปภ.": [{ORTH: "รปภ."}],
"ภ.ง.ด.๙": [{ORTH: "ภ.ง.ด.๙", LEMMA: "แบบแสดงรายการเสียภาษีเงินได้ของกรมสรรพากร"}], "รพ.": [{ORTH: "รพ."}],
"ภ.ป.ร.": [ "ร.พ.": [{ORTH: "ร.พ."}],
{ "รร.": [{ORTH: "รร."}],
ORTH: "ภ.ป.ร.", "รสก.": [{ORTH: "รสก."}],
LEMMA: "ภูมิพลอดุยเดช ปรมราชาธิราช (พระปรมาภิไธยในพระบาทสมเด็จพระปรมินทรมหาภูมิพลอดุลยเดช)", "ส.ค.ส.": [{ORTH: "ส.ค.ส."}],
} "สต.": [{ORTH: "สต."}],
], "สน.": [{ORTH: "สน."}],
"ภ.พ.": [{ORTH: "ภ.พ.", LEMMA: "ภาษีมูลค่าเพิ่ม"}], "สนข.": [{ORTH: "สนข."}],
"ร.": [{ORTH: "ร.", LEMMA: "รัชกาล"}], "สนง.": [{ORTH: "สนง."}],
"ร.ง.": [{ORTH: "ร.ง.", LEMMA: "โรงงาน"}], "สนญ.": [{ORTH: "สนญ."}],
"ร.ด.": [{ORTH: "ร.ด.", LEMMA: "รักษาดินแดน"}], "ส.ป.ช.": [{ORTH: "ส.ป.ช."}],
"รปภ.": [{ORTH: "รปภ.", LEMMA: "รักษาความปลอดภัย"}], "สภ.": [{ORTH: "สภ."}],
"รพ.": [{ORTH: "รพ.", LEMMA: "โรงพยาบาล"}], "ส.ล.น.": [{ORTH: "ส.ล.น."}],
"ร.พ.": [{ORTH: "ร.พ.", LEMMA: "โรงพิมพ์"}], "สวญ.": [{ORTH: "สวญ."}],
"รร.": [{ORTH: "รร.", LEMMA: "โรงเรียน,โรงแรม"}], "สวป.": [{ORTH: "สวป."}],
"รสก.": [{ORTH: "รสก.", LEMMA: "รัฐวิสาหกิจ"}], "สว.สส.": [{ORTH: "สว.สส."}],
"ส.ค.ส.": [{ORTH: "ส.ค.ส.", LEMMA: "ส่งความสุขปีใหม่"}], "ส.ห.": [{ORTH: "ส.ห."}],
"สต.": [{ORTH: "สต.", LEMMA: "สตางค์"}], "สอ.": [{ORTH: "สอ."}],
"สน.": [{ORTH: "สน.", LEMMA: "สถานีตำรวจ"}], "สอท.": [{ORTH: "สอท."}],
"สนข.": [{ORTH: "สนข.", LEMMA: "สำนักงานเขต"}], "เสธ.": [{ORTH: "เสธ."}],
"สนง.": [{ORTH: "สนง.", LEMMA: "สำนักงาน"}], "หจก.": [{ORTH: "หจก."}],
"สนญ.": [{ORTH: "สนญ.", LEMMA: "สำนักงานใหญ่"}], "ห.ร.ม.": [{ORTH: "ห.ร.ม."}],
"ส.ป.ช.": [{ORTH: "ส.ป.ช.", LEMMA: "สร้างเสริมประสบการณ์ชีวิต"}],
"สภ.": [{ORTH: "สภ.", LEMMA: "สถานีตำรวจภูธร"}],
"ส.ล.น.": [{ORTH: "ส.ล.น.", LEMMA: "สร้างเสริมลักษณะนิสัย"}],
"สวญ.": [{ORTH: "สวญ.", LEMMA: "สารวัตรใหญ่"}],
"สวป.": [{ORTH: "สวป.", LEMMA: "สารวัตรป้องกันปราบปราม"}],
"สว.สส.": [{ORTH: "สว.สส.", LEMMA: "สารวัตรสืบสวน"}],
"ส.ห.": [{ORTH: "ส.ห.", LEMMA: "สารวัตรทหาร"}],
"สอ.": [{ORTH: "สอ.", LEMMA: "สถานีอนามัย"}],
"สอท.": [{ORTH: "สอท.", LEMMA: "สถานเอกอัครราชทูต"}],
"เสธ.": [{ORTH: "เสธ.", LEMMA: "เสนาธิการ"}],
"หจก.": [{ORTH: "หจก.", LEMMA: "ห้างหุ้นส่วนจำกัด"}],
"ห.ร.ม.": [{ORTH: "ห.ร.ม.", LEMMA: "ตัวหารร่วมมาก"}],
} }

View File

@ -1,18 +1,18 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA from ...symbols import ORTH, NORM
from ...util import update_exc from ...util import update_exc
_exc = { _exc = {
"tayo'y": [{ORTH: "tayo", LEMMA: "tayo"}, {ORTH: "'y", LEMMA: "ay"}], "tayo'y": [{ORTH: "tayo"}, {ORTH: "'y", NORM: "ay"}],
"isa'y": [{ORTH: "isa", LEMMA: "isa"}, {ORTH: "'y", LEMMA: "ay"}], "isa'y": [{ORTH: "isa"}, {ORTH: "'y", NORM: "ay"}],
"baya'y": [{ORTH: "baya", LEMMA: "bayan"}, {ORTH: "'y", LEMMA: "ay"}], "baya'y": [{ORTH: "baya"}, {ORTH: "'y", NORM: "ay"}],
"sa'yo": [{ORTH: "sa", LEMMA: "sa"}, {ORTH: "'yo", LEMMA: "iyo"}], "sa'yo": [{ORTH: "sa"}, {ORTH: "'yo", NORM: "iyo"}],
"ano'ng": [{ORTH: "ano", LEMMA: "ano"}, {ORTH: "'ng", LEMMA: "ang"}], "ano'ng": [{ORTH: "ano"}, {ORTH: "'ng", NORM: "ang"}],
"siya'y": [{ORTH: "siya", LEMMA: "siya"}, {ORTH: "'y", LEMMA: "ay"}], "siya'y": [{ORTH: "siya"}, {ORTH: "'y", NORM: "ay"}],
"nawa'y": [{ORTH: "nawa", LEMMA: "nawa"}, {ORTH: "'y", LEMMA: "ay"}], "nawa'y": [{ORTH: "nawa"}, {ORTH: "'y", NORM: "ay"}],
"papa'no": [{ORTH: "papa'no", LEMMA: "papaano"}], "papa'no": [{ORTH: "papa'no", NORM: "papaano"}],
"'di": [{ORTH: "'di", LEMMA: "hindi"}], "'di": [{ORTH: "'di", NORM: "hindi"}],
} }

View File

@ -1,7 +1,7 @@
import re import re
from .char_classes import ALPHA_LOWER from .char_classes import ALPHA_LOWER
from ..symbols import ORTH, POS, TAG, LEMMA, SPACE from ..symbols import ORTH, NORM
# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex # URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
@ -62,13 +62,13 @@ BASE_EXCEPTIONS = {}
for exc_data in [ for exc_data in [
{ORTH: " ", POS: SPACE, TAG: "_SP"}, {ORTH: " "},
{ORTH: "\t", POS: SPACE, TAG: "_SP"}, {ORTH: "\t"},
{ORTH: "\\t", POS: SPACE, TAG: "_SP"}, {ORTH: "\\t"},
{ORTH: "\n", POS: SPACE, TAG: "_SP"}, {ORTH: "\n"},
{ORTH: "\\n", POS: SPACE, TAG: "_SP"}, {ORTH: "\\n"},
{ORTH: "\u2014"}, {ORTH: "\u2014"},
{ORTH: "\u00a0", POS: SPACE, LEMMA: " ", TAG: "_SP"}, {ORTH: "\u00a0", NORM: " "},
]: ]:
BASE_EXCEPTIONS[exc_data[ORTH]] = [exc_data] BASE_EXCEPTIONS[exc_data[ORTH]] = [exc_data]

View File

@ -1,5 +1,5 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, NORM from ...symbols import ORTH, NORM
from ...util import update_exc from ...util import update_exc
@ -7,35 +7,35 @@ _exc = {}
_abbrev_exc = [ _abbrev_exc = [
# Weekdays abbreviations # Weekdays abbreviations
{ORTH: "дш", LEMMA: "дүшәмбе"}, {ORTH: "дш", NORM: "дүшәмбе"},
{ORTH: "сш", LEMMA: "сишәмбе"}, {ORTH: "сш", NORM: "сишәмбе"},
{ORTH: "чш", LEMMA: "чәршәмбе"}, {ORTH: "чш", NORM: "чәршәмбе"},
{ORTH: "пш", LEMMA: "пәнҗешәмбе"}, {ORTH: "пш", NORM: "пәнҗешәмбе"},
{ORTH: "җм", LEMMA: "җомга"}, {ORTH: "җм", NORM: "җомга"},
{ORTH: "шб", LEMMA: "шимбә"}, {ORTH: "шб", NORM: "шимбә"},
{ORTH: "яш", LEMMA: "якшәмбе"}, {ORTH: "яш", NORM: "якшәмбе"},
# Months abbreviations # Months abbreviations
{ORTH: "гый", LEMMA: "гыйнвар"}, {ORTH: "гый", NORM: "гыйнвар"},
{ORTH: "фев", LEMMA: "февраль"}, {ORTH: "фев", NORM: "февраль"},
{ORTH: "мар", LEMMA: "март"}, {ORTH: "мар", NORM: "март"},
{ORTH: "мар", LEMMA: "март"}, {ORTH: "мар", NORM: "март"},
{ORTH: "апр", LEMMA: "апрель"}, {ORTH: "апр", NORM: "апрель"},
{ORTH: "июн", LEMMA: "июнь"}, {ORTH: "июн", NORM: "июнь"},
{ORTH: "июл", LEMMA: "июль"}, {ORTH: "июл", NORM: "июль"},
{ORTH: "авг", LEMMA: "август"}, {ORTH: "авг", NORM: "август"},
{ORTH: "сен", LEMMA: "сентябрь"}, {ORTH: "сен", NORM: "сентябрь"},
{ORTH: "окт", LEMMA: "октябрь"}, {ORTH: "окт", NORM: "октябрь"},
{ORTH: "ноя", LEMMA: "ноябрь"}, {ORTH: "ноя", NORM: "ноябрь"},
{ORTH: "дек", LEMMA: "декабрь"}, {ORTH: "дек", NORM: "декабрь"},
# Number abbreviations # Number abbreviations
{ORTH: "млрд", LEMMA: "миллиард"}, {ORTH: "млрд", NORM: "миллиард"},
{ORTH: "млн", LEMMA: "миллион"}, {ORTH: "млн", NORM: "миллион"},
] ]
for abbr in _abbrev_exc: for abbr in _abbrev_exc:
for orth in (abbr[ORTH], abbr[ORTH].capitalize(), abbr[ORTH].upper()): for orth in (abbr[ORTH], abbr[ORTH].capitalize(), abbr[ORTH].upper()):
_exc[orth] = [{ORTH: orth, LEMMA: abbr[LEMMA], NORM: abbr[LEMMA]}] _exc[orth] = [{ORTH: orth, NORM: abbr[NORM]}]
_exc[orth + "."] = [{ORTH: orth + ".", LEMMA: abbr[LEMMA], NORM: abbr[LEMMA]}] _exc[orth + "."] = [{ORTH: orth + ".", NORM: abbr[NORM]}]
for exc_data in [ # "etc." abbreviations for exc_data in [ # "etc." abbreviations
{ORTH: "һ.б.ш.", NORM: "һәм башка шундыйлар"}, {ORTH: "һ.б.ш.", NORM: "һәм башка шундыйлар"},
@ -43,7 +43,6 @@ for exc_data in [ # "etc." abbreviations
{ORTH: "б.э.к.", NORM: "безнең эрага кадәр"}, {ORTH: "б.э.к.", NORM: "безнең эрага кадәр"},
{ORTH: "б.э.", NORM: "безнең эра"}, {ORTH: "б.э.", NORM: "безнең эра"},
]: ]:
exc_data[LEMMA] = exc_data[NORM]
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -1,24 +1,24 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, POS, NORM, NOUN from ...symbols import ORTH, NORM
from ...util import update_exc from ...util import update_exc
_exc = {} _exc = {}
for exc_data in [ for exc_data in [
{ORTH: "вул.", LEMMA: "вулиця", NORM: "вулиця", POS: NOUN}, {ORTH: "вул.", NORM: "вулиця"},
{ORTH: "ім.", LEMMA: "ім'я", NORM: "імені", POS: NOUN}, {ORTH: "ім.", NORM: "імені"},
{ORTH: "просп.", LEMMA: "проспект", NORM: "проспект", POS: NOUN}, {ORTH: "просп.", NORM: "проспект"},
{ORTH: "бул.", LEMMA: "бульвар", NORM: "бульвар", POS: NOUN}, {ORTH: "бул.", NORM: "бульвар"},
{ORTH: "пров.", LEMMA: "провулок", NORM: "провулок", POS: NOUN}, {ORTH: "пров.", NORM: "провулок"},
{ORTH: "пл.", LEMMA: "площа", NORM: "площа", POS: NOUN}, {ORTH: "пл.", NORM: "площа"},
{ORTH: "г.", LEMMA: "гора", NORM: "гора", POS: NOUN}, {ORTH: "г.", NORM: "гора"},
{ORTH: "п.", LEMMA: "пан", NORM: "пан", POS: NOUN}, {ORTH: "п.", NORM: "пан"},
{ORTH: "м.", LEMMA: "місто", NORM: "місто", POS: NOUN}, {ORTH: "м.", NORM: "місто"},
{ORTH: "проф.", LEMMA: "професор", NORM: "професор", POS: NOUN}, {ORTH: "проф.", NORM: "професор"},
{ORTH: "акад.", LEMMA: "академік", NORM: "академік", POS: NOUN}, {ORTH: "акад.", NORM: "академік"},
{ORTH: "доц.", LEMMA: "доцент", NORM: "доцент", POS: NOUN}, {ORTH: "доц.", NORM: "доцент"},
{ORTH: "оз.", LEMMA: "озеро", NORM: "озеро", POS: NOUN}, {ORTH: "оз.", NORM: "озеро"},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]

View File

@ -12,7 +12,6 @@ def test_ar_tokenizer_handles_exc_in_text(ar_tokenizer):
tokens = ar_tokenizer(text) tokens = ar_tokenizer(text)
assert len(tokens) == 7 assert len(tokens) == 7
assert tokens[6].text == "ق.م" assert tokens[6].text == "ق.م"
assert tokens[6].lemma_ == "قبل الميلاد"
def test_ar_tokenizer_handles_exc_in_text_2(ar_tokenizer): def test_ar_tokenizer_handles_exc_in_text_2(ar_tokenizer):

View File

@ -8,7 +8,6 @@ import pytest
def test_ca_tokenizer_handles_abbr(ca_tokenizer, text, lemma): def test_ca_tokenizer_handles_abbr(ca_tokenizer, text, lemma):
tokens = ca_tokenizer(text) tokens = ca_tokenizer(text)
assert len(tokens) == 1 assert len(tokens) == 1
assert tokens[0].lemma_ == lemma
def test_ca_tokenizer_handles_exc_in_text(ca_tokenizer): def test_ca_tokenizer_handles_exc_in_text(ca_tokenizer):
@ -16,4 +15,3 @@ def test_ca_tokenizer_handles_exc_in_text(ca_tokenizer):
tokens = ca_tokenizer(text) tokens = ca_tokenizer(text)
assert len(tokens) == 15 assert len(tokens) == 15
assert tokens[7].text == "aprox." assert tokens[7].text == "aprox."
assert tokens[7].lemma_ == "aproximadament"

View File

@ -18,4 +18,3 @@ def test_de_tokenizer_handles_exc_in_text(de_tokenizer):
tokens = de_tokenizer(text) tokens = de_tokenizer(text)
assert len(tokens) == 6 assert len(tokens) == 6
assert tokens[2].text == "z.Zt." assert tokens[2].text == "z.Zt."
assert tokens[2].lemma_ == "zur Zeit"

View File

@ -49,7 +49,6 @@ def test_en_tokenizer_handles_ll_contraction(en_tokenizer, text):
assert len(tokens) == 2 assert len(tokens) == 2
assert tokens[0].text == text.split("'")[0] assert tokens[0].text == text.split("'")[0]
assert tokens[1].text == "'ll" assert tokens[1].text == "'ll"
assert tokens[1].lemma_ == "will"
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -104,7 +103,6 @@ def test_en_tokenizer_handles_exc_in_text(en_tokenizer):
def test_en_tokenizer_handles_times(en_tokenizer, text): def test_en_tokenizer_handles_times(en_tokenizer, text):
tokens = en_tokenizer(text) tokens = en_tokenizer(text)
assert len(tokens) == 2 assert len(tokens) == 2
assert tokens[1].lemma_ in ["a.m.", "p.m."]
@pytest.mark.parametrize( @pytest.mark.parametrize(

View File

@ -13,7 +13,6 @@ import pytest
def test_es_tokenizer_handles_abbr(es_tokenizer, text, lemma): def test_es_tokenizer_handles_abbr(es_tokenizer, text, lemma):
tokens = es_tokenizer(text) tokens = es_tokenizer(text)
assert len(tokens) == 1 assert len(tokens) == 1
assert tokens[0].lemma_ == lemma
def test_es_tokenizer_handles_exc_in_text(es_tokenizer): def test_es_tokenizer_handles_exc_in_text(es_tokenizer):
@ -21,4 +20,3 @@ def test_es_tokenizer_handles_exc_in_text(es_tokenizer):
tokens = es_tokenizer(text) tokens = es_tokenizer(text)
assert len(tokens) == 7 assert len(tokens) == 7
assert tokens[4].text == "aprox." assert tokens[4].text == "aprox."
assert tokens[4].lemma_ == "aproximadamente"

View File

@ -37,19 +37,11 @@ def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text):
@pytest.mark.parametrize( @pytest.mark.parametrize(
"text,lemma", "text", ["janv.", "juill.", "Dr.", "av.", "sept."],
[
("janv.", "janvier"),
("juill.", "juillet"),
("Dr.", "docteur"),
("av.", "avant"),
("sept.", "septembre"),
],
) )
def test_fr_tokenizer_handles_abbr(fr_tokenizer, text, lemma): def test_fr_tokenizer_handles_abbr(fr_tokenizer, text):
tokens = fr_tokenizer(text) tokens = fr_tokenizer(text)
assert len(tokens) == 1 assert len(tokens) == 1
assert tokens[0].lemma_ == lemma
def test_fr_tokenizer_handles_exc_in_text(fr_tokenizer): def test_fr_tokenizer_handles_exc_in_text(fr_tokenizer):
@ -57,7 +49,6 @@ def test_fr_tokenizer_handles_exc_in_text(fr_tokenizer):
tokens = fr_tokenizer(text) tokens = fr_tokenizer(text)
assert len(tokens) == 10 assert len(tokens) == 10
assert tokens[6].text == "janv." assert tokens[6].text == "janv."
assert tokens[6].lemma_ == "janvier"
assert tokens[8].text == "prudhommes" assert tokens[8].text == "prudhommes"
@ -74,11 +65,8 @@ def test_fr_tokenizer_handles_title(fr_tokenizer):
tokens = fr_tokenizer(text) tokens = fr_tokenizer(text)
assert len(tokens) == 6 assert len(tokens) == 6
assert tokens[0].text == "N'" assert tokens[0].text == "N'"
assert tokens[0].lemma_ == "ne"
assert tokens[1].text == "est" assert tokens[1].text == "est"
assert tokens[1].lemma_ == "être"
assert tokens[2].text == "-ce" assert tokens[2].text == "-ce"
assert tokens[2].lemma_ == "ce"
def test_fr_tokenizer_handles_title_2(fr_tokenizer): def test_fr_tokenizer_handles_title_2(fr_tokenizer):
@ -86,9 +74,7 @@ def test_fr_tokenizer_handles_title_2(fr_tokenizer):
tokens = fr_tokenizer(text) tokens = fr_tokenizer(text)
assert len(tokens) == 5 assert len(tokens) == 5
assert tokens[0].text == "Est" assert tokens[0].text == "Est"
assert tokens[0].lemma_ == "être"
assert tokens[1].text == "-ce" assert tokens[1].text == "-ce"
assert tokens[1].lemma_ == "ce"
def test_fr_tokenizer_handles_title_3(fr_tokenizer): def test_fr_tokenizer_handles_title_3(fr_tokenizer):
@ -96,4 +82,3 @@ def test_fr_tokenizer_handles_title_3(fr_tokenizer):
tokens = fr_tokenizer(text) tokens = fr_tokenizer(text)
assert len(tokens) == 7 assert len(tokens) == 7
assert tokens[0].text == "Qu'" assert tokens[0].text == "Qu'"
assert tokens[0].lemma_ == "que"

View File

@ -18,4 +18,3 @@ def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer):
tokens = lb_tokenizer(text) tokens = lb_tokenizer(text)
assert len(tokens) == 9 assert len(tokens) == 9
assert tokens[1].text == "'t" assert tokens[1].text == "'t"
assert tokens[1].lemma_ == "et"

View File

@ -157,8 +157,6 @@ def test_issue1758(en_tokenizer):
"""Test that "would've" is handled by the English tokenizer exceptions.""" """Test that "would've" is handled by the English tokenizer exceptions."""
tokens = en_tokenizer("would've") tokens = en_tokenizer("would've")
assert len(tokens) == 2 assert len(tokens) == 2
assert tokens[0].tag_ == "MD"
assert tokens[1].lemma_ == "have"
def test_issue1773(en_tokenizer): def test_issue1773(en_tokenizer):

View File

@ -166,7 +166,6 @@ def test_issue2822(it_tokenizer):
assert doc[0].text == "Vuoi" assert doc[0].text == "Vuoi"
assert doc[1].text == "un" assert doc[1].text == "un"
assert doc[2].text == "po'" assert doc[2].text == "po'"
assert doc[2].lemma_ == "poco"
assert doc[3].text == "di" assert doc[3].text == "di"
assert doc[4].text == "zucchero" assert doc[4].text == "zucchero"
assert doc[5].text == "?" assert doc[5].text == "?"