Tidy up and auto-format

Ines Montani 2019-08-20 17:36:34 +02:00
parent 364aaf5bc2
commit f580302673
69 changed files with 83201 additions and 82191 deletions


@@ -430,8 +430,7 @@ class Errors(object):
     E150 = ("The language of the `nlp` object and the `vocab` should be the "
             "same, but found '{nlp}' and '{vocab}' respectively.")
-    E151 = ("Trying to call nlp.update without required annotation types. "
-            "Expected top-level keys: {expected_keys}."
-            " Got: {unexpected_keys}.")
+    E151 = ("Trying to call nlp.update without required annotation types. "
+            "Expected top-level keys: {exp}. Got: {unexp}.")
     E152 = ("The `nlp` object should have a pre-trained `ner` component.")
     E153 = ("Either provide a path to a preprocessed training directory, "
             "or to the original Wikipedia XML dump.")


@@ -10,8 +10,4 @@ Example sentences to test spaCy and its language models.
 """
-sentences = [
-    'তুই খুব ভালো',
-    'আজ আমরা ডাক্তার দেখতে যাবো',
-    'আমি জানি না '
-]
+sentences = ["তুই খুব ভালো", "আজ আমরা ডাক্তার দেখতে যাবো", "আমি জানি না "]


@@ -22,7 +22,9 @@ _suffixes = (
         r"(?<=°[FfCcKk])\.",
         r"(?<=[0-9])(?:[{c}])".format(c=_currency),
         r"(?<=[0-9])(?:{u})".format(u=UNITS),
-        r"(?<=[{al}{e}{q}(?:{c})])\.".format(al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency),
+        r"(?<=[{al}{e}{q}(?:{c})])\.".format(
+            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
+        ),
     ]
 )
@@ -35,8 +37,8 @@ _infixes = (
         ),
         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}])({h})(?=[{ae}])".format(a=ALPHA, h=HYPHENS, ae=""),
-        r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
-        r'(?<=[{a}])[:<>=/](?=[{a}])'.format(a=ALPHA),
+        r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
+        r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
     ]
 )
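These files only hold the raw patterns; a rough sketch of how spaCy's documented v2.x API compiles such suffix/infix lists into a tokenizer (this wiring is not part of the commit, it is shown for context):

    import spacy
    from spacy.util import compile_infix_regex, compile_suffix_regex

    nlp = spacy.blank("bn")  # the Bengali defaults include lists like the ones above
    suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
    infix_re = compile_infix_regex(nlp.Defaults.infixes)
    nlp.tokenizer.suffix_search = suffix_re.search
    nlp.tokenizer.infix_finditer = infix_re.finditer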


@@ -13,7 +13,7 @@ _infixes = (
     + [
         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
-        r'(?<=[{a}])[:<>=](?=[{a}])'.format(a=ALPHA),
+        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
         r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),


@@ -59,7 +59,9 @@ _suffixes = (
         r"([0-9])+\&",  # 12&
         r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
         r"(?<=[0-9])(?:{u})".format(u=UNITS),
-        r"(?<=[0-9{al}{e}(?:{q})])\.".format(al=ALPHA_LOWER, e=r"²\-\+", q=CONCAT_QUOTES),
+        r"(?<=[0-9{al}{e}(?:{q})])\.".format(
+            al=ALPHA_LOWER, e=r"²\-\+", q=CONCAT_QUOTES
+        ),
         r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
         r"(?<=[Α-Ωα-ωίϊΐόάέύϋΰήώ])\-",  # όνομα-
         r"(?<=[Α-Ωα-ωίϊΐόάέύϋΰήώ])\.",
@@ -87,8 +89,8 @@ _infixes = (
         r"([a-zA-Z]+)(\-([a-zA-Z]+))+",  # abc-abc
         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
-        r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
-        r'(?<=[{a}])[:<>=/](?=[{a}])'.format(a=ALPHA),
+        r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
+        r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
     ]
 )


@@ -27,5 +27,5 @@ ADVERBS_IRREG = {
     "slower": ("slow",),
     "slowest": ("slowest",),
     "sooner": ("soon",),
-    "soonest": ("soon",)
+    "soonest": ("soon",),
 }
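Much of this commit is the same trailing-comma churn across the lemmatizer tables below. It is behaviour-neutral; a quick sanity check in plain Python (not taken from the diff):

    # A one-element tuple needs its comma; ("soon") would just be the string "soon".
    assert ("soon",) != "soon"
    # With two or more elements the trailing comma is redundant, so black drops it.
    assert ("slow", "slowly",) == ("slow", "slowly")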


@ -44,7 +44,7 @@ NOUNS_IRREG = {
"allodia": ("allodium",), "allodia": ("allodium",),
"alluvia": ("alluvium",), "alluvia": ("alluvium",),
"alodia": ("alodium",), "alodia": ("alodium",),
"alto-relievos": ("alto-relievo", "alto-rilievo",), "alto-relievos": ("alto-relievo", "alto-rilievo"),
"altocumuli": ("altocumulus",), "altocumuli": ("altocumulus",),
"altostrati": ("altostratus",), "altostrati": ("altostratus",),
"alulae": ("alula",), "alulae": ("alula",),
@ -81,7 +81,7 @@ NOUNS_IRREG = {
"anamorphoses": ("anamorphosis",), "anamorphoses": ("anamorphosis",),
"anastomoses": ("anastomosis",), "anastomoses": ("anastomosis",),
"anatyxes": ("anaptyxis",), "anatyxes": ("anaptyxis",),
"ancones": ("ancon", "ancone",), "ancones": ("ancon", "ancone"),
"androclinia": ("androclinium",), "androclinia": ("androclinium",),
"androecia": ("androecium",), "androecia": ("androecium",),
"androsphinges": ("androsphinx",), "androsphinges": ("androsphinx",),
@ -90,7 +90,7 @@ NOUNS_IRREG = {
"angiomata": ("angioma",), "angiomata": ("angioma",),
"animalcula": ("animalculum",), "animalcula": ("animalculum",),
"anlagen": ("anlage",), "anlagen": ("anlage",),
"annattos": ("anatto", "annatto",), "annattos": ("anatto", "annatto"),
"annuli": ("annulus",), "annuli": ("annulus",),
"antae": ("anta",), "antae": ("anta",),
"antalkalies": ("antalkali",), "antalkalies": ("antalkali",),
@ -158,7 +158,7 @@ NOUNS_IRREG = {
"aspergilli": ("aspergillus",), "aspergilli": ("aspergillus",),
"aspergilloses": ("aspergillosis",), "aspergilloses": ("aspergillosis",),
"aspersoria": ("aspersorium",), "aspersoria": ("aspersorium",),
"assegais": ("assagai", "assegai",), "assegais": ("assagai", "assegai"),
"astragali": ("astragalus",), "astragali": ("astragalus",),
"asyndeta": ("asyndeton",), "asyndeta": ("asyndeton",),
"atheromata": ("atheroma",), "atheromata": ("atheroma",),
@ -172,15 +172,15 @@ NOUNS_IRREG = {
"aurei": ("aureus",), "aurei": ("aureus",),
"auriculae": ("auricula",), "auriculae": ("auricula",),
"aurorae": ("aurora",), "aurorae": ("aurora",),
"auspices": ("auspex", "auspice",), "auspices": ("auspex", "auspice"),
"autocatalyses": ("autocatalysis",), "autocatalyses": ("autocatalysis",),
"autochthones": ("autochthon",), "autochthones": ("autochthon",),
"automata": ("automaton",), "automata": ("automaton",),
"autos-da-fe": ("auto-da-fe",), "autos-da-fe": ("auto-da-fe",),
"avitaminoses": ("avitaminosis",), "avitaminoses": ("avitaminosis",),
"axes": ("ax", "axis",), "axes": ("ax", "axis"),
"axillae": ("axilla",), "axillae": ("axilla",),
"bacchantes": ("bacchant", "bacchante",), "bacchantes": ("bacchant", "bacchante"),
"bacchii": ("bacchius",), "bacchii": ("bacchius",),
"bacilli": ("bacillus",), "bacilli": ("bacillus",),
"bacteriostases": ("bacteriostasis",), "bacteriostases": ("bacteriostasis",),
@ -195,7 +195,7 @@ NOUNS_IRREG = {
"banjoes": ("banjo",), "banjoes": ("banjo",),
"barklice": ("barklouse",), "barklice": ("barklouse",),
"barramundies": ("barramundi",), "barramundies": ("barramundi",),
"bases": ("base", "basis",), "bases": ("base", "basis"),
"bases-on-balls": ("base_on_balls",), "bases-on-balls": ("base_on_balls",),
"bases_on_balls": ("base_on_balls",), "bases_on_balls": ("base_on_balls",),
"basidia": ("basidium",), "basidia": ("basidium",),
@ -204,15 +204,15 @@ NOUNS_IRREG = {
"bastinadoes": ("bastinado",), "bastinadoes": ("bastinado",),
"bateaux": ("bateau",), "bateaux": ("bateau",),
"batfishes": ("batfish",), "batfishes": ("batfish",),
"beadsmen": ("beadsman", "bedesman",), "beadsmen": ("beadsman", "bedesman"),
"beaux": ("beau",), "beaux": ("beau",),
"beches-de-mer": ("beche-de-mer",), "beches-de-mer": ("beche-de-mer",),
"beeves": ("beef",), "beeves": ("beef",),
"behooves": ("behoof",), "behooves": ("behoof",),
"bersaglieri": ("bersagliere",), "bersaglieri": ("bersagliere",),
"bhishties": ("bheesty", "bhishti",), "bhishties": ("bheesty", "bhishti"),
"bibliothecae": ("bibliotheca",), "bibliothecae": ("bibliotheca",),
"bicennaries": ("bicentenary", "bicentennial",), "bicennaries": ("bicentenary", "bicentennial"),
"bijoux": ("bijou",), "bijoux": ("bijou",),
"bilboes": ("bilbo",), "bilboes": ("bilbo",),
"billets-doux": ("billet-doux",), "billets-doux": ("billet-doux",),
@ -245,7 +245,7 @@ NOUNS_IRREG = {
"brachia": ("brachium",), "brachia": ("brachium",),
"brainchildren": ("brainchild",), "brainchildren": ("brainchild",),
"branchiae": ("branchia",), "branchiae": ("branchia",),
"brants": ("brant", "brent",), "brants": ("brant", "brent"),
"bravadoes": ("bravado",), "bravadoes": ("bravado",),
"bravoes": ("bravo",), "bravoes": ("bravo",),
"bregmata": ("bregma",), "bregmata": ("bregma",),
@ -275,7 +275,7 @@ NOUNS_IRREG = {
"caesurae": ("caesura",), "caesurae": ("caesura",),
"calami": ("calamus",), "calami": ("calamus",),
"calathi": ("calathus",), "calathi": ("calathus",),
"calcanei": ("calcaneum", "calcaneus",), "calcanei": ("calcaneum", "calcaneus"),
"calces": ("calx",), "calces": ("calx",),
"calculi": ("calculus",), "calculi": ("calculus",),
"caldaria": ("caldarium",), "caldaria": ("caldarium",),
@ -421,7 +421,7 @@ NOUNS_IRREG = {
"comae": ("coma",), "comae": ("coma",),
"comatulae": ("comatula",), "comatulae": ("comatula",),
"comedones": ("comedo",), "comedones": ("comedo",),
"comics": ("comic_strip", "comic",), "comics": ("comic_strip", "comic"),
"commandoes": ("commando",), "commandoes": ("commando",),
"concertanti": ("concertante",), "concertanti": ("concertante",),
"concerti": ("concerto",), "concerti": ("concerto",),
@ -549,11 +549,11 @@ NOUNS_IRREG = {
"diplococci": ("diplococcus",), "diplococci": ("diplococcus",),
"directors-general": ("director-general",), "directors-general": ("director-general",),
"disci": ("discus",), "disci": ("discus",),
"discoboli": ("discobolos", "discobolus",), "discoboli": ("discobolos", "discobolus"),
"dive": ("diva",), "dive": ("diva",),
"diverticula": ("diverticulum",), "diverticula": ("diverticulum",),
"divertimenti": ("divertimento",), "divertimenti": ("divertimento",),
"djinn": ("djinni", "djinny",), "djinn": ("djinni", "djinny"),
"dodoes": ("dodo",), "dodoes": ("dodo",),
"dogfishes": ("dogfish",), "dogfishes": ("dogfish",),
"dogmata": ("dogma",), "dogmata": ("dogma",),
@ -593,7 +593,7 @@ NOUNS_IRREG = {
"ellipses": ("ellipsis",), "ellipses": ("ellipsis",),
"eluvia": ("eluvium",), "eluvia": ("eluvium",),
"elves": ("elf",), "elves": ("elf",),
"elytra": ("elytron", "elytrum",), "elytra": ("elytron", "elytrum"),
"embargoes": ("embargo",), "embargoes": ("embargo",),
"emboli": ("embolus",), "emboli": ("embolus",),
"emphases": ("emphasis",), "emphases": ("emphasis",),
@ -623,7 +623,7 @@ NOUNS_IRREG = {
"entases": ("entasis",), "entases": ("entasis",),
"entera": ("enteron",), "entera": ("enteron",),
"entia": ("ens",), "entia": ("ens",),
"entozoa": ("entozoan", "entozoon",), "entozoa": ("entozoan", "entozoon"),
"epencephala": ("epencephalon",), "epencephala": ("epencephalon",),
"epentheses": ("epenthesis",), "epentheses": ("epenthesis",),
"epexegeses": ("epexegesis",), "epexegeses": ("epexegesis",),
@ -643,10 +643,10 @@ NOUNS_IRREG = {
"epiphenomena": ("epiphenomenon",), "epiphenomena": ("epiphenomenon",),
"epiphyses": ("epiphysis",), "epiphyses": ("epiphysis",),
"episterna": ("episternum",), "episterna": ("episternum",),
"epithalamia": ("epithalamion", "epithalamium",), "epithalamia": ("epithalamion", "epithalamium"),
"epithelia": ("epithelium",), "epithelia": ("epithelium",),
"epitheliomata": ("epithelioma",), "epitheliomata": ("epithelioma",),
"epizoa": ("epizoan", "epizoon",), "epizoa": ("epizoan", "epizoon"),
"epyllia": ("epyllion",), "epyllia": ("epyllion",),
"equilibria": ("equilibrium",), "equilibria": ("equilibrium",),
"equiseta": ("equisetum",), "equiseta": ("equisetum",),
@ -845,11 +845,11 @@ NOUNS_IRREG = {
"groszy": ("grosz",), "groszy": ("grosz",),
"grottoes": ("grotto",), "grottoes": ("grotto",),
"guilder": ("guilde",), "guilder": ("guilde",),
"guilders": ("guilde", "guilder",), "guilders": ("guilde", "guilder"),
"guitarfishes": ("guitarfish",), "guitarfishes": ("guitarfish",),
"gummata": ("gumma",), "gummata": ("gumma",),
"gurnard": ("gurnar",), "gurnard": ("gurnar",),
"gurnards": ("gurnar", "gurnard",), "gurnards": ("gurnar", "gurnard"),
"guttae": ("gutta",), "guttae": ("gutta",),
"gymnasia": ("gymnasium",), "gymnasia": ("gymnasium",),
"gynaecea": ("gynaeceum",), "gynaecea": ("gynaeceum",),
@ -870,7 +870,7 @@ NOUNS_IRREG = {
"haeredes": ("haeres",), "haeredes": ("haeres",),
"haftaroth": ("haftarah",), "haftaroth": ("haftarah",),
"hagfishes": ("hagfish",), "hagfishes": ("hagfish",),
"haggadas": ("haggada", "haggadah",), "haggadas": ("haggada", "haggadah"),
"haggadoth": ("haggada",), "haggadoth": ("haggada",),
"hajjes": ("hajj",), "hajjes": ("hajj",),
"haleru": ("haler",), "haleru": ("haler",),
@ -879,7 +879,7 @@ NOUNS_IRREG = {
"halloth": ("hallah",), "halloth": ("hallah",),
"halluces": ("hallux",), "halluces": ("hallux",),
"haloes": ("halo",), "haloes": ("halo",),
"halteres": ("halter", "haltere",), "halteres": ("halter", "haltere"),
"halves": ("half",), "halves": ("half",),
"hamuli": ("hamulus",), "hamuli": ("hamulus",),
"hangers-on": ("hanger-on",), "hangers-on": ("hanger-on",),
@ -909,7 +909,7 @@ NOUNS_IRREG = {
"heraclidae": ("heraclid",), "heraclidae": ("heraclid",),
"heraklidae": ("heraklid",), "heraklidae": ("heraklid",),
"herbaria": ("herbarium",), "herbaria": ("herbarium",),
"hermae": ("herm", "herma",), "hermae": ("herm", "herma"),
"hermai": ("herma",), "hermai": ("herma",),
"herniae": ("hernia",), "herniae": ("hernia",),
"heroes": ("hero",), "heroes": ("hero",),
@ -955,8 +955,8 @@ NOUNS_IRREG = {
"ibices": ("ibex",), "ibices": ("ibex",),
"ibo": ("igbo",), "ibo": ("igbo",),
"ichthyosauri": ("ichthyosaurus",), "ichthyosauri": ("ichthyosaurus",),
"ichthyosauruses": ("ichthyosaur", "ichthyosaurus",), "ichthyosauruses": ("ichthyosaur", "ichthyosaurus"),
"iconostases": ("iconostas", "iconostasis",), "iconostases": ("iconostas", "iconostasis"),
"icosahedra": ("icosahedron",), "icosahedra": ("icosahedron",),
"ideata": ("ideatum",), "ideata": ("ideatum",),
"igorrorote": ("igorrote",), "igorrorote": ("igorrote",),
@ -991,7 +991,7 @@ NOUNS_IRREG = {
"is": ("is",), "is": ("is",),
"ischia": ("ischium",), "ischia": ("ischium",),
"isthmi": ("isthmus",), "isthmi": ("isthmus",),
"jackeroos": ("jackaroo", "jackeroo",), "jackeroos": ("jackaroo", "jackeroo"),
"jackfishes": ("jackfish",), "jackfishes": ("jackfish",),
"jackknives": ("jackknife",), "jackknives": ("jackknife",),
"jacks-in-the-box": ("jack-in-the-box",), "jacks-in-the-box": ("jack-in-the-box",),
@ -1001,12 +1001,12 @@ NOUNS_IRREG = {
"jewfishes": ("jewfish",), "jewfishes": ("jewfish",),
"jingoes": ("jingo",), "jingoes": ("jingo",),
"jinn": ("jinni",), "jinn": ("jinni",),
"joes": ("jo", "joe",), "joes": ("jo", "joe"),
"judge_advocates_general": ("judge_advocate_general",), "judge_advocates_general": ("judge_advocate_general",),
"jura": ("jus",), "jura": ("jus",),
"kaddishim": ("kaddish",), "kaddishim": ("kaddish",),
"kalmuck": ("kalmuc",), "kalmuck": ("kalmuc",),
"kalmucks": ("kalmuc", "kalmuck",), "kalmucks": ("kalmuc", "kalmuck"),
"katabases": ("katabasis",), "katabases": ("katabasis",),
"keeshonden": ("keeshond",), "keeshonden": ("keeshond",),
"kibbutzim": ("kibbutz",), "kibbutzim": ("kibbutz",),
@ -1045,7 +1045,7 @@ NOUNS_IRREG = {
"latifundia": ("latifundium",), "latifundia": ("latifundium",),
"latu": ("lat",), "latu": ("lat",),
"lavaboes": ("lavabo",), "lavaboes": ("lavabo",),
"leaves": ("leaf", "leave",), "leaves": ("leaf", "leave"),
"lecythi": ("lecythus",), "lecythi": ("lecythus",),
"leges": ("lex",), "leges": ("lex",),
"lei": ("leu",), "lei": ("leu",),
@ -1078,7 +1078,7 @@ NOUNS_IRREG = {
"liriodendra": ("liriodendron",), "liriodendra": ("liriodendron",),
"lisente": ("sente",), "lisente": ("sente",),
"listente": ("sente",), "listente": ("sente",),
"litai": ("lit", "litas",), "litai": ("lit", "litas"),
"litu": ("litas",), "litu": ("litas",),
"lives": ("life",), "lives": ("life",),
"lixivia": ("lixivium",), "lixivia": ("lixivium",),
@ -1098,7 +1098,7 @@ NOUNS_IRREG = {
"lumpfishes": ("lumpfish",), "lumpfishes": ("lumpfish",),
"lungfishes": ("lungfish",), "lungfishes": ("lungfish",),
"lunulae": ("lunula",), "lunulae": ("lunula",),
"lures": ("lur", "lure",), "lures": ("lur", "lure"),
"lustra": ("lustre",), "lustra": ("lustre",),
"lyings-in": ("lying-in",), "lyings-in": ("lying-in",),
"lymphangitides": ("lymphangitis",), "lymphangitides": ("lymphangitis",),
@ -1142,7 +1142,7 @@ NOUNS_IRREG = {
"marsupia": ("marsupium",), "marsupia": ("marsupium",),
"marvels-of-peru": ("marvel-of-peru",), "marvels-of-peru": ("marvel-of-peru",),
"mass_media": ("mass_medium",), "mass_media": ("mass_medium",),
"masses": ("mass", "masse",), "masses": ("mass", "masse"),
"masters-at-arms": ("master-at-arms",), "masters-at-arms": ("master-at-arms",),
"matrices": ("matrix",), "matrices": ("matrix",),
"matzoth": ("matzo",), "matzoth": ("matzo",),
@ -1210,7 +1210,7 @@ NOUNS_IRREG = {
"mioses": ("miosis",), "mioses": ("miosis",),
"miracidia": ("miracidium",), "miracidia": ("miracidium",),
"miri": ("mir",), "miri": ("mir",),
"mishnayoth": ("mishna", "mishnah",), "mishnayoth": ("mishna", "mishnah"),
"mitochondria": ("mitochondrion",), "mitochondria": ("mitochondrion",),
"mitzvoth": ("mitzvah",), "mitzvoth": ("mitzvah",),
"modioli": ("modiolus",), "modioli": ("modiolus",),
@ -1218,7 +1218,7 @@ NOUNS_IRREG = {
"momenta": ("momentum",), "momenta": ("momentum",),
"moments_of_truth": ("moment_of_truth",), "moments_of_truth": ("moment_of_truth",),
"momi": ("momus",), "momi": ("momus",),
"monades": ("monad", "monas",), "monades": ("monad", "monas"),
"monkfishes": ("monkfish",), "monkfishes": ("monkfish",),
"monochasia": ("monochasium",), "monochasia": ("monochasium",),
"monopodia": ("monopodium",), "monopodia": ("monopodium",),
@ -1235,7 +1235,7 @@ NOUNS_IRREG = {
"moriscoes": ("morisco",), "moriscoes": ("morisco",),
"morphallaxes": ("morphallaxis",), "morphallaxes": ("morphallaxis",),
"morphoses": ("morphosis",), "morphoses": ("morphosis",),
"morses": ("morse", "mors",), "morses": ("morse", "mors"),
"morulae": ("morula",), "morulae": ("morula",),
"mosasauri": ("mosasaurus",), "mosasauri": ("mosasaurus",),
"moshavim": ("moshav",), "moshavim": ("moshav",),
@ -1328,13 +1328,13 @@ NOUNS_IRREG = {
"oceanides": ("oceanid",), "oceanides": ("oceanid",),
"ocelli": ("ocellus",), "ocelli": ("ocellus",),
"ochreae": ("ochrea",), "ochreae": ("ochrea",),
"ocreae": ("ochrea", "ocrea",), "ocreae": ("ochrea", "ocrea"),
"octahedra": ("octahedron",), "octahedra": ("octahedron",),
"octopi": ("octopus",), "octopi": ("octopus",),
"oculi": ("oculus",), "oculi": ("oculus",),
"odea": ("odeum",), "odea": ("odeum",),
"oedemata": ("edema", "oedema",), "oedemata": ("edema", "oedema"),
"oesophagi": ("esophagus", "oesophagus",), "oesophagi": ("esophagus", "oesophagus"),
"oldwives": ("oldwife",), "oldwives": ("oldwife",),
"olea": ("oleum",), "olea": ("oleum",),
"omasa": ("omasum",), "omasa": ("omasum",),
@ -1350,15 +1350,15 @@ NOUNS_IRREG = {
"optic_axes": ("optic_axis",), "optic_axes": ("optic_axis",),
"optima": ("optimum",), "optima": ("optimum",),
"ora": ("os",), "ora": ("os",),
"organa": ("organon", "organum",), "organa": ("organon", "organum"),
"organums": ("organa", "organum",), "organums": ("organa", "organum"),
"orthoptera": ("orthopteron",), "orthoptera": ("orthopteron",),
"osar": ("os",), "osar": ("os",),
"oscula": ("osculum",), "oscula": ("osculum",),
"ossa": ("os",), "ossa": ("os",),
"osteomata": ("osteoma",), "osteomata": ("osteoma",),
"ostia": ("ostium",), "ostia": ("ostium",),
"ottomans": ("othman", "ottoman",), "ottomans": ("othman", "ottoman"),
"ova": ("ovum",), "ova": ("ovum",),
"ovoli": ("ovolo",), "ovoli": ("ovolo",),
"ovotestes": ("ovotestis",), "ovotestes": ("ovotestis",),
@ -1382,7 +1382,7 @@ NOUNS_IRREG = {
"papulae": ("papula",), "papulae": ("papula",),
"papyri": ("papyrus",), "papyri": ("papyrus",),
"parabases": ("parabasis",), "parabases": ("parabasis",),
"paraleipses": ("paraleipsis", "paralipsis",), "paraleipses": ("paraleipsis", "paralipsis"),
"paralyses": ("paralysis",), "paralyses": ("paralysis",),
"paramecia": ("paramecium",), "paramecia": ("paramecium",),
"paramenta": ("parament",), "paramenta": ("parament",),
@ -1442,13 +1442,13 @@ NOUNS_IRREG = {
"personae": ("persona",), "personae": ("persona",),
"petechiae": ("petechia",), "petechiae": ("petechia",),
"pfennige": ("pfennig",), "pfennige": ("pfennig",),
"phalanges": ("phalange", "phalanx",), "phalanges": ("phalange", "phalanx"),
"phalli": ("phallus",), "phalli": ("phallus",),
"pharynges": ("pharynx",), "pharynges": ("pharynx",),
"phenomena": ("phenomenon",), "phenomena": ("phenomenon",),
"phi-phenomena": ("phi-phenomenon",), "phi-phenomena": ("phi-phenomenon",),
"philodendra": ("philodendron",), "philodendra": ("philodendron",),
"phlyctenae": ("phlyctaena", "phlyctena",), "phlyctenae": ("phlyctaena", "phlyctena"),
"phyla": ("phylum",), "phyla": ("phylum",),
"phylae": ("phyle",), "phylae": ("phyle",),
"phyllotaxes": ("phyllotaxis",), "phyllotaxes": ("phyllotaxis",),
@ -1475,12 +1475,12 @@ NOUNS_IRREG = {
"plasmodesmata": ("plasmodesma",), "plasmodesmata": ("plasmodesma",),
"plasmodia": ("plasmodium",), "plasmodia": ("plasmodium",),
"plateaux": ("plateau",), "plateaux": ("plateau",),
"plectra": ("plectron", "plectrum",), "plectra": ("plectron", "plectrum"),
"plena": ("plenum",), "plena": ("plenum",),
"pleura": ("pleuron",), "pleura": ("pleuron",),
"pleurae": ("pleura",), "pleurae": ("pleura",),
"plicae": ("plica",), "plicae": ("plica",),
"ploughmen": ("ploughman", "plowman",), "ploughmen": ("ploughman", "plowman"),
"pneumobacilli": ("pneumobacillus",), "pneumobacilli": ("pneumobacillus",),
"pneumococci": ("pneumococcus",), "pneumococci": ("pneumococcus",),
"pocketknives": ("pocketknife",), "pocketknives": ("pocketknife",),
@ -1515,7 +1515,7 @@ NOUNS_IRREG = {
"principia": ("principium",), "principia": ("principium",),
"proboscides": ("proboscis",), "proboscides": ("proboscis",),
"proces-verbaux": ("proces-verbal",), "proces-verbaux": ("proces-verbal",),
"proglottides": ("proglottid", "proglottis",), "proglottides": ("proglottid", "proglottis"),
"prognoses": ("prognosis",), "prognoses": ("prognosis",),
"prolegomena": ("prolegomenon",), "prolegomena": ("prolegomenon",),
"prolepses": ("prolepsis",), "prolepses": ("prolepsis",),
@ -1532,7 +1532,7 @@ NOUNS_IRREG = {
"prostheses": ("prosthesis",), "prostheses": ("prosthesis",),
"prostomia": ("prostomium",), "prostomia": ("prostomium",),
"protases": ("protasis",), "protases": ("protasis",),
"prothalamia": ("prothalamion", "prothalamium",), "prothalamia": ("prothalamion", "prothalamium"),
"prothalli": ("prothallus",), "prothalli": ("prothallus",),
"prothallia": ("prothallium",), "prothallia": ("prothallium",),
"prothoraces": ("prothorax",), "prothoraces": ("prothorax",),
@ -1572,7 +1572,7 @@ NOUNS_IRREG = {
"quezales": ("quezal",), "quezales": ("quezal",),
"quinquennia": ("quinquennium",), "quinquennia": ("quinquennium",),
"quizzes": ("quiz",), "quizzes": ("quiz",),
"rabatos": ("rabato", "rebato",), "rabatos": ("rabato", "rebato"),
"rabbitfishes": ("rabbitfish",), "rabbitfishes": ("rabbitfish",),
"rachides": ("rhachis",), "rachides": ("rhachis",),
"radices": ("radix",), "radices": ("radix",),
@ -1583,7 +1583,7 @@ NOUNS_IRREG = {
"ranulae": ("ranula",), "ranulae": ("ranula",),
"ranunculi": ("ranunculus",), "ranunculi": ("ranunculus",),
"raphae": ("raphe",), "raphae": ("raphe",),
"raphides": ("raphide", "raphis",), "raphides": ("raphide", "raphis"),
"ratfishes": ("ratfish",), "ratfishes": ("ratfish",),
"reales": ("real",), "reales": ("real",),
"rearmice": ("rearmouse",), "rearmice": ("rearmouse",),
@ -1598,7 +1598,7 @@ NOUNS_IRREG = {
"reis": ("real",), "reis": ("real",),
"relata": ("relatum",), "relata": ("relatum",),
"remiges": ("remex",), "remiges": ("remex",),
"reremice": ("rearmouse", "reremouse",), "reremice": ("rearmouse", "reremouse"),
"reseaux": ("reseau",), "reseaux": ("reseau",),
"residua": ("residuum",), "residua": ("residuum",),
"responsa": ("responsum",), "responsa": ("responsum",),
@ -1609,7 +1609,7 @@ NOUNS_IRREG = {
"retinae": ("retina",), "retinae": ("retina",),
"rhabdomyomata": ("rhabdomyoma",), "rhabdomyomata": ("rhabdomyoma",),
"rhachides": ("rhachis",), "rhachides": ("rhachis",),
"rhachises": ("rachis", "rhachis",), "rhachises": ("rachis", "rhachis"),
"rhinencephala": ("rhinencephalon",), "rhinencephala": ("rhinencephalon",),
"rhizobia": ("rhizobium",), "rhizobia": ("rhizobium",),
"rhombi": ("rhombus",), "rhombi": ("rhombus",),
@ -1636,7 +1636,7 @@ NOUNS_IRREG = {
"runners-up": ("runner-up",), "runners-up": ("runner-up",),
"sacra": ("sacrum",), "sacra": ("sacrum",),
"sacraria": ("sacrarium",), "sacraria": ("sacrarium",),
"saguaros": ("saguaro", "sahuaro",), "saguaros": ("saguaro", "sahuaro"),
"sailfishes": ("sailfish",), "sailfishes": ("sailfish",),
"salespeople": ("salesperson",), "salespeople": ("salesperson",),
"salmonellae": ("salmonella",), "salmonellae": ("salmonella",),
@ -1657,7 +1657,7 @@ NOUNS_IRREG = {
"scapulae": ("scapula",), "scapulae": ("scapula",),
"scarabaei": ("scarabaeus",), "scarabaei": ("scarabaeus",),
"scarves": ("scarf",), "scarves": ("scarf",),
"schatchonim": ("schatchen", "shadchan",), "schatchonim": ("schatchen", "shadchan"),
"schemata": ("schema",), "schemata": ("schema",),
"scherzandi": ("scherzando",), "scherzandi": ("scherzando",),
"scherzi": ("scherzo",), "scherzi": ("scherzo",),
@ -1690,7 +1690,7 @@ NOUNS_IRREG = {
"senores": ("senor",), "senores": ("senor",),
"sensilla": ("sensillum",), "sensilla": ("sensillum",),
"senti": ("sent",), "senti": ("sent",),
"senussis": ("senusi", "senussi",), "senussis": ("senusi", "senussi"),
"separatrices": ("separatrix",), "separatrices": ("separatrix",),
"sephardim": ("sephardi",), "sephardim": ("sephardi",),
"septa": ("septum",), "septa": ("septum",),
@ -1707,9 +1707,9 @@ NOUNS_IRREG = {
"shabbatim": ("shabbat",), "shabbatim": ("shabbat",),
"shackoes": ("shacko",), "shackoes": ("shacko",),
"shadchanim": ("shadchan",), "shadchanim": ("shadchan",),
"shadchans": ("schatchen", "shadchan",), "shadchans": ("schatchen", "shadchan"),
"shakoes": ("shako",), "shakoes": ("shako",),
"shammosim": ("shammas", "shammes",), "shammosim": ("shammas", "shammes"),
"sheatfishes": ("sheatfish",), "sheatfishes": ("sheatfish",),
"sheaves": ("sheaf",), "sheaves": ("sheaf",),
"shellfishes": ("shellfish",), "shellfishes": ("shellfish",),
@ -1717,14 +1717,14 @@ NOUNS_IRREG = {
"shinleaves": ("shinleaf",), "shinleaves": ("shinleaf",),
"shittim": ("shittah",), "shittim": ("shittah",),
"shmoes": ("shmo",), "shmoes": ("shmo",),
"shofroth": ("shofar", "shophar",), "shofroth": ("shofar", "shophar"),
"shophroth": ("shophar",), "shophroth": ("shophar",),
"shrewmice": ("shrewmouse",), "shrewmice": ("shrewmouse",),
"shuln": ("shul",), "shuln": ("shul",),
"siddurim": ("siddur",), "siddurim": ("siddur",),
"sigloi": ("siglos",), "sigloi": ("siglos",),
"signore": ("signora",), "signore": ("signora",),
"signori": ("signior", "signore",), "signori": ("signior", "signore"),
"signorine": ("signorina",), "signorine": ("signorina",),
"siliquae": ("siliqua",), "siliquae": ("siliqua",),
"silvae": ("silva",), "silvae": ("silva",),
@ -1739,12 +1739,12 @@ NOUNS_IRREG = {
"snaggleteeth": ("snaggletooth",), "snaggleteeth": ("snaggletooth",),
"snailfishes": ("snailfish",), "snailfishes": ("snailfish",),
"snipefishes": ("snipefish",), "snipefishes": ("snipefish",),
"socmen": ("socman", "sokeman",), "socmen": ("socman", "sokeman"),
"sola": ("solum",), "sola": ("solum",),
"solaria": ("solarium",), "solaria": ("solarium",),
"solatia": ("solatium",), "solatia": ("solatium",),
"soldi": ("soldo",), "soldi": ("soldo",),
"soles": ("sol", "sole",), "soles": ("sol", "sole"),
"solfeggi": ("solfeggio",), "solfeggi": ("solfeggio",),
"soli": ("solo",), "soli": ("solo",),
"solidi": ("solidus",), "solidi": ("solidus",),
@ -1864,7 +1864,7 @@ NOUNS_IRREG = {
"syringes": ("syrinx",), "syringes": ("syrinx",),
"syssarcoses": ("syssarcosis",), "syssarcoses": ("syssarcosis",),
"tableaux": ("tableau",), "tableaux": ("tableau",),
"taeniae": ("taenia", "tenia",), "taeniae": ("taenia", "tenia"),
"tali": ("talus",), "tali": ("talus",),
"tallaisim": ("tallith",), "tallaisim": ("tallith",),
"tallithes": ("tallith",), "tallithes": ("tallith",),
@ -1874,14 +1874,14 @@ NOUNS_IRREG = {
"tarsi": ("tarsus",), "tarsi": ("tarsus",),
"tarsometatarsi": ("tarsometatarsus",), "tarsometatarsi": ("tarsometatarsus",),
"taxa": ("taxon",), "taxa": ("taxon",),
"taxes": ("tax", "taxis",), "taxes": ("tax", "taxis"),
"taxies": ("taxi",), "taxies": ("taxi",),
"tectrices": ("tectrix",), "tectrices": ("tectrix",),
"teeth": ("tooth",), "teeth": ("tooth",),
"tegmina": ("tegmen",), "tegmina": ("tegmen",),
"telae": ("tela",), "telae": ("tela",),
"telamones": ("telamon",), "telamones": ("telamon",),
"telangiectases": ("telangiectasia", "telangiectasis",), "telangiectases": ("telangiectasia", "telangiectasis"),
"telia": ("telium",), "telia": ("telium",),
"tempi": ("tempo",), "tempi": ("tempo",),
"tenacula": ("tenaculum",), "tenacula": ("tenaculum",),
@ -1932,7 +1932,7 @@ NOUNS_IRREG = {
"tornadoes": ("tornado",), "tornadoes": ("tornado",),
"torpedoes": ("torpedo",), "torpedoes": ("torpedo",),
"torsi": ("torso",), "torsi": ("torso",),
"touracos": ("touraco", "turaco",), "touracos": ("touraco", "turaco"),
"trabeculae": ("trabecula",), "trabeculae": ("trabecula",),
"tracheae": ("trachea",), "tracheae": ("trachea",),
"traditores": ("traditor",), "traditores": ("traditor",),
@ -1960,7 +1960,7 @@ NOUNS_IRREG = {
"tubae": ("tuba",), "tubae": ("tuba",),
"turves": ("turf",), "turves": ("turf",),
"tympana": ("tympanum",), "tympana": ("tympanum",),
"tyros": ("tiro", "tyro",), "tyros": ("tiro", "tyro"),
"ubermenschen": ("ubermensch",), "ubermenschen": ("ubermensch",),
"uglies": ("ugli",), "uglies": ("ugli",),
"uigurs": ("uighur",), "uigurs": ("uighur",),
@ -1980,7 +1980,7 @@ NOUNS_IRREG = {
"utriculi": ("utriculus",), "utriculi": ("utriculus",),
"uvulae": ("uvula",), "uvulae": ("uvula",),
"vacua": ("vacuum",), "vacua": ("vacuum",),
"vagi": ("vagus", "vagus",), "vagi": ("vagus", "vagus"),
"vaginae": ("vagina",), "vaginae": ("vagina",),
"valleculae": ("vallecula",), "valleculae": ("vallecula",),
"vaporetti": ("vaporetto",), "vaporetti": ("vaporetto",),
@ -2026,7 +2026,7 @@ NOUNS_IRREG = {
"vortices": ("vortex",), "vortices": ("vortex",),
"vulvae": ("vulva",), "vulvae": ("vulva",),
"wagons-lits": ("wagon-lit",), "wagons-lits": ("wagon-lit",),
"wahhabis": ("wahabi", "wahhabi",), "wahhabis": ("wahabi", "wahhabi"),
"wanderjahre": ("wanderjahr",), "wanderjahre": ("wanderjahr",),
"weakfishes": ("weakfish",), "weakfishes": ("weakfish",),
"werewolves": ("werewolf",), "werewolves": ("werewolf",),
@ -2044,13 +2044,13 @@ NOUNS_IRREG = {
"yeshivoth": ("yeshiva",), "yeshivoth": ("yeshiva",),
"yogin": ("yogi",), "yogin": ("yogi",),
"yourselves": ("yourself",), "yourselves": ("yourself",),
"zamindaris": ("zamindari", "zemindari",), "zamindaris": ("zamindari", "zemindari"),
"zecchini": ("zecchino",), "zecchini": ("zecchino",),
"zeroes": ("zero",), "zeroes": ("zero",),
"zoa": ("zoon",), "zoa": ("zoon",),
"zoaeae": ("zoaea", "zoea",), "zoaeae": ("zoaea", "zoea"),
"zoeae": ("zoea",), "zoeae": ("zoea",),
"zoeas": ("zoaea",), "zoeas": ("zoaea",),
"zoonoses": ("zoonosis",), "zoonoses": ("zoonosis",),
"zoosporangia": ("zoosporangium",) "zoosporangia": ("zoosporangium",),
} }


@ -42,8 +42,8 @@ VERBS_IRREG = {
"anglified": ("anglify",), "anglified": ("anglify",),
"annulled": ("annul",), "annulled": ("annul",),
"annulling": ("annul",), "annulling": ("annul",),
"appalled": ("appal", "appall",), "appalled": ("appal", "appall"),
"appalling": ("appal", "appall",), "appalling": ("appal", "appall"),
"applied": ("apply",), "applied": ("apply",),
"arcked": ("arc",), "arcked": ("arc",),
"arcking": ("arc",), "arcking": ("arc",),
@ -244,9 +244,9 @@ VERBS_IRREG = {
"bypast": ("bypass",), "bypast": ("bypass",),
"caballed": ("cabal",), "caballed": ("cabal",),
"caballing": ("cabal",), "caballing": ("cabal",),
"caddied": ("caddie", "caddy",), "caddied": ("caddie", "caddy"),
"caddies": ("caddie", "caddy",), "caddies": ("caddie", "caddy"),
"caddying": ("caddie", "caddy",), "caddying": ("caddie", "caddy"),
"calcified": ("calcify",), "calcified": ("calcify",),
"came": ("come",), "came": ("come",),
"canalled": ("canal",), "canalled": ("canal",),
@ -506,8 +506,8 @@ VERBS_IRREG = {
"disembodied": ("disembody",), "disembodied": ("disembody",),
"disembowelled": ("disembowel",), "disembowelled": ("disembowel",),
"disembowelling": ("disembowel",), "disembowelling": ("disembowel",),
"disenthralled": ("disenthral", "disenthrall",), "disenthralled": ("disenthral", "disenthrall"),
"disenthralling": ("disenthral", "disenthrall",), "disenthralling": ("disenthral", "disenthrall"),
"disenthralls": ("disenthral",), "disenthralls": ("disenthral",),
"disenthrals": ("disenthrall",), "disenthrals": ("disenthrall",),
"dishevelled": ("dishevel",), "dishevelled": ("dishevel",),
@ -518,8 +518,8 @@ VERBS_IRREG = {
"dispelling": ("dispel",), "dispelling": ("dispel",),
"disqualified": ("disqualify",), "disqualified": ("disqualify",),
"dissatisfied": ("dissatisfy",), "dissatisfied": ("dissatisfy",),
"distilled": ("distil", "distill",), "distilled": ("distil", "distill"),
"distilling": ("distil", "distill",), "distilling": ("distil", "distill"),
"diversified": ("diversify",), "diversified": ("diversify",),
"divvied": ("divvy",), "divvied": ("divvy",),
"dizzied": ("dizzy",), "dizzied": ("dizzy",),
@ -595,10 +595,10 @@ VERBS_IRREG = {
"enamelling": ("enamel",), "enamelling": ("enamel",),
"englutted": ("englut",), "englutted": ("englut",),
"englutting": ("englut",), "englutting": ("englut",),
"enrolled": ("enrol", "enroll",), "enrolled": ("enrol", "enroll"),
"enrolling": ("enrol", "enroll",), "enrolling": ("enrol", "enroll"),
"enthralled": ("enthral", "enthrall",), "enthralled": ("enthral", "enthrall"),
"enthralling": ("enthral", "enthrall",), "enthralling": ("enthral", "enthrall"),
"entrammelled": ("entrammel",), "entrammelled": ("entrammel",),
"entrammelling": ("entrammel",), "entrammelling": ("entrammel",),
"entrapped": ("entrap",), "entrapped": ("entrap",),
@ -621,8 +621,8 @@ VERBS_IRREG = {
"exemplified": ("exemplify",), "exemplified": ("exemplify",),
"expelled": ("expel",), "expelled": ("expel",),
"expelling": ("expel",), "expelling": ("expel",),
"extolled": ("extol", "extoll",), "extolled": ("extol", "extoll"),
"extolling": ("extol", "extoll",), "extolling": ("extol", "extoll"),
"facetted": ("facet",), "facetted": ("facet",),
"facetting": ("facet",), "facetting": ("facet",),
"fagged": ("fag",), "fagged": ("fag",),
@ -638,7 +638,7 @@ VERBS_IRREG = {
"featherbedded": ("featherbed",), "featherbedded": ("featherbed",),
"featherbedding": ("featherbed",), "featherbedding": ("featherbed",),
"fed": ("feed",), "fed": ("feed",),
"feed": ("feed", "fee",), "feed": ("feed", "fee"),
"fell": ("fall",), "fell": ("fall",),
"felt": ("feel",), "felt": ("feel",),
"ferried": ("ferry",), "ferried": ("ferry",),
@ -744,8 +744,8 @@ VERBS_IRREG = {
"fried": ("fry",), "fried": ("fry",),
"frigged": ("frig",), "frigged": ("frig",),
"frigging": ("frig",), "frigging": ("frig",),
"fritted": ("frit", "fritt",), "fritted": ("frit", "fritt"),
"fritting": ("frit", "fritt",), "fritting": ("frit", "fritt"),
"frivolled": ("frivol",), "frivolled": ("frivol",),
"frivolling": ("frivol",), "frivolling": ("frivol",),
"frogged": ("frog",), "frogged": ("frog",),
@ -757,8 +757,8 @@ VERBS_IRREG = {
"fructified": ("fructify",), "fructified": ("fructify",),
"fuelled": ("fuel",), "fuelled": ("fuel",),
"fuelling": ("fuel",), "fuelling": ("fuel",),
"fulfilled": ("fulfil", "fulfill",), "fulfilled": ("fulfil", "fulfill"),
"fulfilling": ("fulfil", "fulfill",), "fulfilling": ("fulfil", "fulfill"),
"funned": ("fun",), "funned": ("fun",),
"funnelled": ("funnel",), "funnelled": ("funnel",),
"funnelling": ("funnel",), "funnelling": ("funnel",),
@ -955,8 +955,8 @@ VERBS_IRREG = {
"insetting": ("inset",), "insetting": ("inset",),
"inspanned": ("inspan",), "inspanned": ("inspan",),
"inspanning": ("inspan",), "inspanning": ("inspan",),
"installed": ("instal", "install",), "installed": ("instal", "install"),
"installing": ("instal", "install",), "installing": ("instal", "install"),
"intensified": ("intensify",), "intensified": ("intensify",),
"interbred": ("interbreed",), "interbred": ("interbreed",),
"intercropped": ("intercrop",), "intercropped": ("intercrop",),
@ -1303,7 +1303,7 @@ VERBS_IRREG = {
"overdriven": ("overdrive",), "overdriven": ("overdrive",),
"overdrove": ("overdrive",), "overdrove": ("overdrive",),
"overflew": ("overfly",), "overflew": ("overfly",),
"overflown": ("overflow", "overfly",), "overflown": ("overflow", "overfly"),
"overgrew": ("overgrow",), "overgrew": ("overgrow",),
"overgrown": ("overgrow",), "overgrown": ("overgrow",),
"overheard": ("overhear",), "overheard": ("overhear",),
@ -1547,8 +1547,8 @@ VERBS_IRREG = {
"red": ("red",), "red": ("red",),
"red-pencilled": ("red-pencil",), "red-pencilled": ("red-pencil",),
"red-pencilling": ("red-pencil",), "red-pencilling": ("red-pencil",),
"redded": ("red", "redd",), "redded": ("red", "redd"),
"redding": ("red", "redd",), "redding": ("red", "redd"),
"redid": ("redo",), "redid": ("redo",),
"redone": ("redo",), "redone": ("redo",),
"referred": ("refer",), "referred": ("refer",),
@ -1763,7 +1763,7 @@ VERBS_IRREG = {
"signified": ("signify",), "signified": ("signify",),
"silicified": ("silicify",), "silicified": ("silicify",),
"simplified": ("simplify",), "simplified": ("simplify",),
"singing": ("sing", "singe",), "singing": ("sing", "singe"),
"single-stepped": ("single-step",), "single-stepped": ("single-step",),
"single-stepping": ("single-step",), "single-stepping": ("single-step",),
"sinned": ("sin",), "sinned": ("sin",),
@ -2404,5 +2404,5 @@ VERBS_IRREG = {
"zigzagged": ("zigzag",), "zigzagged": ("zigzag",),
"zigzagging": ("zigzag",), "zigzagging": ("zigzag",),
"zipped": ("zip",), "zipped": ("zip",),
"zipping": ("zip",) "zipping": ("zip",),
} }


@@ -538,7 +538,7 @@ for orth in [
     "Sen.",
     "St.",
     "vs.",
-    "v.s."
+    "v.s.",
 ]:
     _exc[orth] = [{ORTH: orth}]

@@ -20,14 +20,22 @@ from ....util import load_language_data
 BASE_PATH = Path(__file__).parent
-LOOKUP = load_language_data(BASE_PATH / 'lookup.json')
-VERBS_IRREG = load_language_data(BASE_PATH / '_verbs_irreg.json')
-ADJECTIVES_IRREG = load_language_data(BASE_PATH / '_adjectives_irreg.json')
-LEMMA_INDEX = {'adj': ADJECTIVES, 'adv': ADVERBS, 'noun': NOUNS, 'verb': VERBS}
-LEMMA_EXC = {'adj': ADJECTIVES_IRREG, 'adp': ADP_IRREG, 'aux': AUXILIARY_VERBS_IRREG,
-             'cconj': CCONJ_IRREG, 'det': DETS_IRREG, 'noun': NOUNS_IRREG, 'verb': VERBS_IRREG,
-             'pron': PRONOUNS_IRREG, 'sconj': SCONJ_IRREG}
-LEMMA_RULES = {'adj': ADJECTIVE_RULES, 'noun': NOUN_RULES, 'verb': VERB_RULES}
+LOOKUP = load_language_data(BASE_PATH / "lookup.json")
+VERBS_IRREG = load_language_data(BASE_PATH / "_verbs_irreg.json")
+ADJECTIVES_IRREG = load_language_data(BASE_PATH / "_adjectives_irreg.json")
+LEMMA_INDEX = {"adj": ADJECTIVES, "adv": ADVERBS, "noun": NOUNS, "verb": VERBS}
+LEMMA_EXC = {
+    "adj": ADJECTIVES_IRREG,
+    "adp": ADP_IRREG,
+    "aux": AUXILIARY_VERBS_IRREG,
+    "cconj": CCONJ_IRREG,
+    "det": DETS_IRREG,
+    "noun": NOUNS_IRREG,
+    "verb": VERBS_IRREG,
+    "pron": PRONOUNS_IRREG,
+    "sconj": SCONJ_IRREG,
+}
+LEMMA_RULES = {"adj": ADJECTIVE_RULES, "noun": NOUN_RULES, "verb": VERB_RULES}


@@ -20,5 +20,5 @@ ADP_IRREG = {
     "pr": ("pour",),
     "/": ("sur",),
     "versus": ("vs",),
-    "vs.": ("vs",)
+    "vs.": ("vs",),
 }


@@ -365,5 +365,5 @@ AUXILIARY_VERBS_IRREG = {
     "va": ("aller",),
     "vais": ("aller",),
     "vas": ("aller",),
-    "vont": ("aller",)
+    "vont": ("aller",),
 }


@@ -13,5 +13,5 @@ CCONJ_IRREG = {
     "i.e.": ("c'est-à-dire",),
     "ie": ("c'est-à-dire",),
     "ou/et": ("et-ou",),
-    "+": ("plus",)
+    "+": ("plus",),
 }


@@ -9963,5 +9963,5 @@ NOUNS_IRREG = {
     "zurichoises": ("zurichois",),
     "zurichois": ("zurichois",),
     "zyras": ("zyras",),
-    "zyzomys": ("zyzomys",)
+    "zyzomys": ("zyzomys",),
 }


@@ -15,5 +15,5 @@ SCONJ_IRREG = {
     "puisqu'": ("puisque",),
     "qd": ("quand",),
     "quoiqu'": ("quoique",),
-    "qu'": ("que",)
+    "qu'": ("que",),
 }


@@ -3,20 +3,22 @@ from __future__ import unicode_literals
 from pathlib import Path
-from ....symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP, SCONJ, CCONJ
+from ....symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
+from ....symbols import SCONJ, CCONJ
 from ....symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
 from ....util import load_language_data
-LOOKUP = load_language_data(Path(__file__).parent / 'lookup.json')
+LOOKUP = load_language_data(Path(__file__).parent / "lookup.json")
-'''
+"""
 French language lemmatizer applies the default rule based lemmatization
 procedure with some modifications for better French language support.
 The parts of speech 'ADV', 'PRON', 'DET', 'ADP' and 'AUX' are added to use the
 rule-based lemmatization. As a last resort, the lemmatizer checks in
 the lookup table.
-'''
+"""
 class FrenchLemmatizer(object):
     @classmethod
@@ -32,36 +34,39 @@ class FrenchLemmatizer(object):
     def __call__(self, string, univ_pos, morphology=None):
         if not self.rules:
             return [self.lookup_table.get(string, string)]
-        if univ_pos in (NOUN, 'NOUN', 'noun'):
-            univ_pos = 'noun'
-        elif univ_pos in (VERB, 'VERB', 'verb'):
-            univ_pos = 'verb'
-        elif univ_pos in (ADJ, 'ADJ', 'adj'):
-            univ_pos = 'adj'
-        elif univ_pos in (ADP, 'ADP', 'adp'):
-            univ_pos = 'adp'
-        elif univ_pos in (ADV, 'ADV', 'adv'):
-            univ_pos = 'adv'
-        elif univ_pos in (AUX, 'AUX', 'aux'):
-            univ_pos = 'aux'
-        elif univ_pos in (CCONJ, 'CCONJ', 'cconj'):
-            univ_pos = 'cconj'
-        elif univ_pos in (DET, 'DET', 'det'):
-            univ_pos = 'det'
-        elif univ_pos in (PRON, 'PRON', 'pron'):
-            univ_pos = 'pron'
-        elif univ_pos in (PUNCT, 'PUNCT', 'punct'):
-            univ_pos = 'punct'
-        elif univ_pos in (SCONJ, 'SCONJ', 'sconj'):
-            univ_pos = 'sconj'
+        if univ_pos in (NOUN, "NOUN", "noun"):
+            univ_pos = "noun"
+        elif univ_pos in (VERB, "VERB", "verb"):
+            univ_pos = "verb"
+        elif univ_pos in (ADJ, "ADJ", "adj"):
+            univ_pos = "adj"
+        elif univ_pos in (ADP, "ADP", "adp"):
+            univ_pos = "adp"
+        elif univ_pos in (ADV, "ADV", "adv"):
+            univ_pos = "adv"
+        elif univ_pos in (AUX, "AUX", "aux"):
+            univ_pos = "aux"
+        elif univ_pos in (CCONJ, "CCONJ", "cconj"):
+            univ_pos = "cconj"
+        elif univ_pos in (DET, "DET", "det"):
+            univ_pos = "det"
+        elif univ_pos in (PRON, "PRON", "pron"):
+            univ_pos = "pron"
+        elif univ_pos in (PUNCT, "PUNCT", "punct"):
+            univ_pos = "punct"
+        elif univ_pos in (SCONJ, "SCONJ", "sconj"):
+            univ_pos = "sconj"
         else:
             return [self.lookup(string)]
         # See Issue #435 for example of where this logic is requied.
         if self.is_base_form(univ_pos, morphology):
             return list(set([string.lower()]))
-        lemmas = lemmatize(string, self.index.get(univ_pos, {}),
-                           self.exc.get(univ_pos, {}),
-                           self.rules.get(univ_pos, []))
+        lemmas = lemmatize(
+            string,
+            self.index.get(univ_pos, {}),
+            self.exc.get(univ_pos, {}),
+            self.rules.get(univ_pos, []),
+        )
         return lemmas
     def is_base_form(self, univ_pos, morphology=None):
@@ -70,20 +75,25 @@ class FrenchLemmatizer(object):
         avoid lemmatization entirely.
         """
         morphology = {} if morphology is None else morphology
-        others = [key for key in morphology
-                  if key not in (POS, 'Number', 'POS', 'VerbForm', 'Tense')]
-        if univ_pos == 'noun' and morphology.get('Number') == 'sing':
+        others = [
+            key
+            for key in morphology
+            if key not in (POS, "Number", "POS", "VerbForm", "Tense")
+        ]
+        if univ_pos == "noun" and morphology.get("Number") == "sing":
             return True
-        elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf':
+        elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
             return True
         # This maps 'VBP' to base form -- probably just need 'IS_BASE'
         # morphology
-        elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and
-                                     morphology.get('Tense') == 'pres' and
-                                     morphology.get('Number') is None and
-                                     not others):
+        elif univ_pos == "verb" and (
+            morphology.get("VerbForm") == "fin"
+            and morphology.get("Tense") == "pres"
+            and morphology.get("Number") is None
+            and not others
+        ):
             return True
-        elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
+        elif univ_pos == "adj" and morphology.get("Degree") == "pos":
             return True
         elif VerbForm_inf in morphology:
             return True
@@ -97,16 +107,16 @@ class FrenchLemmatizer(object):
         return False
     def noun(self, string, morphology=None):
-        return self(string, 'noun', morphology)
+        return self(string, "noun", morphology)
     def verb(self, string, morphology=None):
-        return self(string, 'verb', morphology)
+        return self(string, "verb", morphology)
     def adj(self, string, morphology=None):
-        return self(string, 'adj', morphology)
+        return self(string, "adj", morphology)
     def punct(self, string, morphology=None):
-        return self(string, 'punct', morphology)
+        return self(string, "punct", morphology)
     def lookup(self, string):
         if string in self.lookup_table:
@@ -117,7 +127,7 @@ class FrenchLemmatizer(object):
 def lemmatize(string, index, exceptions, rules):
     string = string.lower()
     forms = []
-    if (string in index):
+    if string in index:
         forms.append(string)
         return forms
     forms.extend(exceptions.get(string, []))
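A toy illustration of the module-level lemmatize() helper whose first lines appear above (the data values are invented; the real index, exceptions and rules come from the French lookup tables loaded in this package):

    index = {"chanter", "petit"}        # known base forms
    exceptions = {"yeux": ["oeil"]}     # irregular form -> lemma(s)
    rules = [["s", ""]]                 # e.g. strip a plural -s

    lemmatize("Chanter", index, exceptions, rules)  # -> ["chanter"], already a base form
    lemmatize("yeux", index, exceptions, rules)     # the exception table contributes "oeil"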


@@ -2,8 +2,6 @@
 from __future__ import unicode_literals
 from ...symbols import ORTH, LEMMA
-_exc = {
-    "po'": [{ORTH: "po'", LEMMA: 'poco'}]
-}
+_exc = {"po'": [{ORTH: "po'", LEMMA: "poco"}]}
 TOKENIZER_EXCEPTIONS = _exc


@@ -1,5 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals
 """
 Example sentences to test spaCy and its language models.
@@ -11,5 +12,5 @@ sentences = [
     "애플이 영국의 신생 기업을 10억 달러에 구매를 고려중이다.",
     "자동 운전 자동차의 손해 배상 책임에 자동차 메이커에 일정한 부담을 요구하겠다.",
     "자동 배달 로봇이 보도를 주행하는 것을 샌프란시스코시가 금지를 검토중이라고 합니다.",
-    "런던은 영국의 수도이자 가장 큰 도시입니다."
+    "런던은 영국의 수도이자 가장 큰 도시입니다.",
 ]


@@ -1,7 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals
-STOP_WORDS = set("""
+STOP_WORDS = set(
+    """
@@ -65,4 +66,5 @@ STOP_WORDS = set("""
-""".split())
+""".split()
+)


@@ -20,10 +20,10 @@ LEMMA_INDEX = {"adj": ADJECTIVES, "adv": ADVERBS, "noun": NOUNS, "verb": VERBS}
 BASE_PATH = Path(__file__).parent
 LEMMA_EXC = {
-    "adj": load_language_data(BASE_PATH / '_adjectives_wordforms.json'),
+    "adj": load_language_data(BASE_PATH / "_adjectives_wordforms.json"),
     "adv": ADVERBS_WORDFORMS,
-    "noun": load_language_data(BASE_PATH / '_nouns_wordforms.json'),
-    "verb": load_language_data(BASE_PATH / '_verbs_wordforms.json'),
+    "noun": load_language_data(BASE_PATH / "_nouns_wordforms.json"),
+    "verb": load_language_data(BASE_PATH / "_verbs_wordforms.json"),
 }
 LEMMA_RULES = {
@@ -39,5 +39,3 @@ LEMMA_RULES = {
 # https://www.nb.no/sprakbanken/show?serial=oai%3Anb.no%3Asbr-5&lang=en
 # License:
 # Creative_Commons-BY (CC-BY) (https://creativecommons.org/licenses/by/4.0/)


@@ -14,7 +14,7 @@ _infixes = (
     + [
         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
-        r'(?<=[{a}])[:<>=](?=[{a}])'.format(a=ALPHA),
+        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
         r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),


@@ -118,7 +118,7 @@ for orth in [
     "o.l.",
     "on.",
     "op.",
-    "org."
+    "org.",
     "osv.",
     "ovf.",
     "p.",


@@ -14,5 +14,5 @@ sentences = [
     "Apple overweegt om voor 1 miljard een U.K. startup te kopen",
     "Autonome auto's verschuiven de verzekeringverantwoordelijkheid naar producenten",
     "San Francisco overweegt robots op voetpaden te verbieden",
-    "Londen is een grote stad in het Verenigd Koninkrijk"
+    "Londen is een grote stad in het Verenigd Koninkrijk",
 ]


@ -3,22 +3,25 @@ from __future__ import unicode_literals
ADPOSITIONS = set( ADPOSITIONS = set(
('aan aangaande aanwezig achter af afgezien al als an annex anno anti ' (
'behalve behoudens beneden benevens benoorden beoosten betreffende bewesten ' "aan aangaande aanwezig achter af afgezien al als an annex anno anti "
'bezijden bezuiden bij binnen binnenuit binst bladzij blijkens boven bovenop ' "behalve behoudens beneden benevens benoorden beoosten betreffende bewesten "
'buiten conform contra cq daaraan daarbij daarbuiten daarin daarnaar ' "bezijden bezuiden bij binnen binnenuit binst bladzij blijkens boven bovenop "
'daaronder daartegenover daarvan dankzij deure dichtbij door doordat doorheen ' "buiten conform contra cq daaraan daarbij daarbuiten daarin daarnaar "
'echter eraf erop erover errond eruit ervoor evenals exclusief gedaan ' "daaronder daartegenover daarvan dankzij deure dichtbij door doordat doorheen "
'gedurende gegeven getuige gezien halfweg halverwege heen hierdoorheen hierop ' "echter eraf erop erover errond eruit ervoor evenals exclusief gedaan "
'houdende in inclusief indien ingaande ingevolge inzake jegens kortweg ' "gedurende gegeven getuige gezien halfweg halverwege heen hierdoorheen hierop "
'krachtens kralj langs langsheen langst lastens linksom lopende luidens mede ' "houdende in inclusief indien ingaande ingevolge inzake jegens kortweg "
'mee met middels midden middenop mits na naan naar naartoe naast naat nabij ' "krachtens kralj langs langsheen langst lastens linksom lopende luidens mede "
'nadat namens neer neffe neffen neven nevenst niettegenstaande nopens ' "mee met middels midden middenop mits na naan naar naartoe naast naat nabij "
'officieel om omheen omstreeks omtrent onafgezien ondanks onder onderaan ' "nadat namens neer neffe neffen neven nevenst niettegenstaande nopens "
'ondere ongeacht ooit op open over per plus pro qua rechtover rond rondom ' "officieel om omheen omstreeks omtrent onafgezien ondanks onder onderaan "
"ondere ongeacht ooit op open over per plus pro qua rechtover rond rondom "
"sedert sinds spijts strekkende te tegen tegenaan tegenop tegenover telde " "sedert sinds spijts strekkende te tegen tegenaan tegenop tegenover telde "
'teneinde terug tijdens toe tot totdat trots tussen tégen uit uitgenomen ' "teneinde terug tijdens toe tot totdat trots tussen tégen uit uitgenomen "
'ultimo van vanaf vandaan vandoor vanop vanuit vanwege versus via vinnen ' "ultimo van vanaf vandaan vandoor vanop vanuit vanwege versus via vinnen "
'vlakbij volgens voor voor- voorbij voordat voort voren vòòr vóór waaraan ' "vlakbij volgens voor voor- voorbij voordat voort voren vòòr vóór waaraan "
'waarbij waardoor waaronder weg wegens weleens zijdens zoals zodat zonder ' "waarbij waardoor waaronder weg wegens weleens zijdens zoals zodat zonder "
'zónder à').split()) "zónder à"
).split()
)


@@ -3,10 +3,10 @@ from __future__ import unicode_literals
 ADPOSITIONS_IRREG = {
-    "'t": ('te',),
-    'me': ('mee',),
-    'meer': ('mee',),
-    'on': ('om',),
-    'ten': ('te',),
-    'ter': ('te',)
+    "'t": ("te",),
+    "me": ("mee",),
+    "meer": ("mee",),
+    "on": ("om",),
+    "ten": ("te",),
+    "ter": ("te",),
 }


@@ -3,17 +3,17 @@ from __future__ import unicode_literals
 ADVERBS_IRREG = {
-    "'ns": ('eens',),
-    "'s": ('eens',),
-    "'t": ('het',),
-    "d'r": ('er',),
-    "d'raf": ('eraf',),
-    "d'rbij": ('erbij',),
-    "d'rheen": ('erheen',),
-    "d'rin": ('erin',),
-    "d'rna": ('erna',),
-    "d'rnaar": ('ernaar',),
-    'hele': ('heel',),
-    'nevenst': ('nevens',),
-    'overend': ('overeind',)
+    "'ns": ("eens",),
+    "'s": ("eens",),
+    "'t": ("het",),
+    "d'r": ("er",),
+    "d'raf": ("eraf",),
+    "d'rbij": ("erbij",),
+    "d'rheen": ("erheen",),
+    "d'rin": ("erin",),
+    "d'rna": ("erna",),
+    "d'rnaar": ("ernaar",),
+    "hele": ("heel",),
+    "nevenst": ("nevens",),
+    "overend": ("overeind",),
 }


@ -3,15 +3,18 @@ from __future__ import unicode_literals
DETERMINERS = set( DETERMINERS = set(
("al allebei allerhande allerminst alletwee" (
"al allebei allerhande allerminst alletwee"
"beide clip-on d'n d'r dat datgeen datgene de dees degeen degene den dewelke " "beide clip-on d'n d'r dat datgeen datgene de dees degeen degene den dewelke "
'deze dezelfde die diegeen diegene diehien dien diene diens diezelfde dit ' "deze dezelfde die diegeen diegene diehien dien diene diens diezelfde dit "
'ditgene e een eene eigen elk elkens elkes enig enkel enne ettelijke eure ' "ditgene e een eene eigen elk elkens elkes enig enkel enne ettelijke eure "
'euren evenveel ewe ge geen ginds géén haar haaren halfelf het hetgeen ' "euren evenveel ewe ge geen ginds géén haar haaren halfelf het hetgeen "
'hetwelk hetzelfde heur heure hulder hulle hullen hullie hun hunder hunderen ' "hetwelk hetzelfde heur heure hulder hulle hullen hullie hun hunder hunderen "
'ieder iederes ja je jen jouw jouwen jouwes jullie junder keiveel keiweinig ' "ieder iederes ja je jen jouw jouwen jouwes jullie junder keiveel keiweinig "
"m'ne me meer meerder meerdere menen menig mijn mijnes minst méér niemendal " "m'ne me meer meerder meerdere menen menig mijn mijnes minst méér niemendal "
'oe ons onse se sommig sommigeder superveel telken teveel titulair ulder ' "oe ons onse se sommig sommigeder superveel telken teveel titulair ulder "
'uldere ulderen ulle under une uw vaak veel veels véél wat weinig welk welken ' "uldere ulderen ulle under une uw vaak veel veels véél wat weinig welk welken "
"welkene welksten z'nen ze zenen zijn zo'n zo'ne zoiet zoveel zovele zovelen " "welkene welksten z'nen ze zenen zijn zo'n zo'ne zoiet zoveel zovele zovelen "
'zuk zulk zulkdanig zulken zulks zullie zíjn àlle álle').split()) "zuk zulk zulkdanig zulken zulks zullie zíjn àlle álle"
).split()
)

View File

@@ -3,67 +3,67 @@ from __future__ import unicode_literals
DETERMINERS_IRREG = { DETERMINERS_IRREG = {
"'r": ('haar',), "'r": ("haar",),
"'s": ('de',), "'s": ("de",),
"'t": ('het',), "'t": ("het",),
"'tgene": ('hetgeen',), "'tgene": ("hetgeen",),
'alle': ('al',), "alle": ("al",),
'allen': ('al',), "allen": ("al",),
'aller': ('al',), "aller": ("al",),
'beiden': ('beide',), "beiden": ("beide",),
'beider': ('beide',), "beider": ("beide",),
"d'": ('het',), "d'": ("het",),
"d'r": ('haar',), "d'r": ("haar",),
'der': ('de',), "der": ("de",),
'des': ('de',), "des": ("de",),
'dezer': ('deze',), "dezer": ("deze",),
'dienen': ('die',), "dienen": ("die",),
'dier': ('die',), "dier": ("die",),
'elke': ('elk',), "elke": ("elk",),
'ene': ('een',), "ene": ("een",),
'enen': ('een',), "enen": ("een",),
'ener': ('een',), "ener": ("een",),
'enige': ('enig',), "enige": ("enig",),
'enigen': ('enig',), "enigen": ("enig",),
'er': ('haar',), "er": ("haar",),
'gene': ('geen',), "gene": ("geen",),
'genen': ('geen',), "genen": ("geen",),
'hare': ('haar',), "hare": ("haar",),
'haren': ('haar',), "haren": ("haar",),
'harer': ('haar',), "harer": ("haar",),
'hunne': ('hun',), "hunne": ("hun",),
'hunnen': ('hun',), "hunnen": ("hun",),
'jou': ('jouw',), "jou": ("jouw",),
'jouwe': ('jouw',), "jouwe": ("jouw",),
'julliejen': ('jullie',), "julliejen": ("jullie",),
"m'n": ('mijn',), "m'n": ("mijn",),
'mee': ('meer',), "mee": ("meer",),
'meer': ('veel',), "meer": ("veel",),
'meerderen': ('meerdere',), "meerderen": ("meerdere",),
'meest': ('veel',), "meest": ("veel",),
'meesten': ('veel',), "meesten": ("veel",),
'meet': ('veel',), "meet": ("veel",),
'menige': ('menig',), "menige": ("menig",),
'mij': ('mijn',), "mij": ("mijn",),
'mijnen': ('mijn',), "mijnen": ("mijn",),
'minder': ('weinig',), "minder": ("weinig",),
'mindere': ('weinig',), "mindere": ("weinig",),
'minst': ('weinig',), "minst": ("weinig",),
'minste': ('minst',), "minste": ("minst",),
'ne': ('een',), "ne": ("een",),
'onze': ('ons',), "onze": ("ons",),
'onzent': ('ons',), "onzent": ("ons",),
'onzer': ('ons',), "onzer": ("ons",),
'ouw': ('uw',), "ouw": ("uw",),
'sommige': ('sommig',), "sommige": ("sommig",),
'sommigen': ('sommig',), "sommigen": ("sommig",),
'u': ('uw',), "u": ("uw",),
'vaker': ('vaak',), "vaker": ("vaak",),
'vele': ('veel',), "vele": ("veel",),
'velen': ('veel',), "velen": ("veel",),
'welke': ('welk',), "welke": ("welk",),
'zijne': ('zijn',), "zijne": ("zijn",),
'zijnen': ('zijn',), "zijnen": ("zijn",),
'zijns': ('zijn',), "zijns": ("zijn",),
'één': ('een',) "één": ("een",),
} }

View File

@@ -9,7 +9,7 @@ ADJECTIVE_SUFFIX_RULES = [
["er", ""], ["er", ""],
["en", ""], ["en", ""],
["e", ""], ["e", ""],
["ende", "end"] ["ende", "end"],
] ]
VERB_SUFFIX_RULES = [ VERB_SUFFIX_RULES = [
@@ -39,7 +39,7 @@ NOUN_SUFFIX_RULES = [
["ssen", "s"], ["ssen", "s"],
["rren", "r"], ["rren", "r"],
["kken", "k"], ["kken", "k"],
["bben", "b"] ["bben", "b"],
] ]
NUM_SUFFIX_RULES = [ NUM_SUFFIX_RULES = [
@@ -50,23 +50,20 @@ NUM_SUFFIX_RULES = [
["de", ""], ["de", ""],
["er", ""], ["er", ""],
["ër", ""], ["ër", ""],
["tjes", ""] ["tjes", ""],
] ]
PUNCT_SUFFIX_RULES = [ PUNCT_SUFFIX_RULES = [["", '"'], ["", '"'], ["\u2018", "'"], ["\u2019", "'"]]
["", "\""],
["", "\""],
["\u2018", "'"],
["\u2019", "'"]
]
# In-place sort guaranteeing that longer -- more specific -- rules are # In-place sort guaranteeing that longer -- more specific -- rules are
# applied first. # applied first.
for rule_set in (ADJECTIVE_SUFFIX_RULES, for rule_set in (
ADJECTIVE_SUFFIX_RULES,
NOUN_SUFFIX_RULES, NOUN_SUFFIX_RULES,
NUM_SUFFIX_RULES, NUM_SUFFIX_RULES,
VERB_SUFFIX_RULES): VERB_SUFFIX_RULES,
):
rule_set.sort(key=lambda r: len(r[0]), reverse=True) rule_set.sort(key=lambda r: len(r[0]), reverse=True)
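
A minimal sketch, outside this commit, of why the length-descending sort above matters. It assumes a first-match-wins rule applier (the apply_first_matching helper below is hypothetical, not spaCy's lemmatize function): without the sort, the short ["en", ""] rule can shadow the more specific ["ssen", "s"] rule from NOUN_SUFFIX_RULES.

def apply_first_matching(string, rules):
    # Rules are [old_suffix, new_suffix] pairs; the first matching suffix wins.
    for old, new in rules:
        if string.endswith(old):
            return string[: len(string) - len(old)] + new
    return string

rules = [["en", ""], ["ssen", "s"]]
print(apply_first_matching("klassen", rules))  # "klass" -- wrong, the short rule fired
rules.sort(key=lambda r: len(r[0]), reverse=True)
print(apply_first_matching("klassen", rules))  # "klas" -- longer, more specific rule applied first
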
@@ -75,5 +72,5 @@ RULES = {
"noun": NOUN_SUFFIX_RULES, "noun": NOUN_SUFFIX_RULES,
"verb": VERB_SUFFIX_RULES, "verb": VERB_SUFFIX_RULES,
"num": NUM_SUFFIX_RULES, "num": NUM_SUFFIX_RULES,
"punct": PUNCT_SUFFIX_RULES "punct": PUNCT_SUFFIX_RULES,
} }

File diff suppressed because it is too large

View File

@@ -3,29 +3,29 @@ from __future__ import unicode_literals
NUMBERS_IRREG = { NUMBERS_IRREG = {
'achten': ('acht',), "achten": ("acht",),
'biljoenen': ('biljoen',), "biljoenen": ("biljoen",),
'drieën': ('drie',), "drieën": ("drie",),
'duizenden': ('duizend',), "duizenden": ("duizend",),
'eentjes': ('één',), "eentjes": ("één",),
'elven': ('elf',), "elven": ("elf",),
'miljoenen': ('miljoen',), "miljoenen": ("miljoen",),
'negenen': ('negen',), "negenen": ("negen",),
'negentiger': ('negentig',), "negentiger": ("negentig",),
'tienduizenden': ('tienduizend',), "tienduizenden": ("tienduizend",),
'tienen': ('tien',), "tienen": ("tien",),
'tientjes': ('tien',), "tientjes": ("tien",),
'twaalven': ('twaalf',), "twaalven": ("twaalf",),
'tweeën': ('twee',), "tweeën": ("twee",),
'twintiger': ('twintig',), "twintiger": ("twintig",),
'twintigsten': ('twintig',), "twintigsten": ("twintig",),
'vieren': ('vier',), "vieren": ("vier",),
'vijftiger': ('vijftig',), "vijftiger": ("vijftig",),
'vijven': ('vijf',), "vijven": ("vijf",),
'zessen': ('zes',), "zessen": ("zes",),
'zestiger': ('zestig',), "zestiger": ("zestig",),
'zevenen': ('zeven',), "zevenen": ("zeven",),
'zeventiger': ('zeventig',), "zeventiger": ("zeventig",),
'zovele': ('zoveel',), "zovele": ("zoveel",),
'zovelen': ('zoveel',) "zovelen": ("zoveel",),
} }

View File

@@ -3,33 +3,33 @@ from __future__ import unicode_literals
PRONOUNS_IRREG = { PRONOUNS_IRREG = {
"'r": ('haar',), "'r": ("haar",),
"'rzelf": ('haarzelf',), "'rzelf": ("haarzelf",),
"'t": ('het',), "'t": ("het",),
"d'r": ('haar',), "d'r": ("haar",),
'da': ('dat',), "da": ("dat",),
'dienen': ('die',), "dienen": ("die",),
'diens': ('die',), "diens": ("die",),
'dies': ('die',), "dies": ("die",),
'elkaars': ('elkaar',), "elkaars": ("elkaar",),
'elkanders': ('elkander',), "elkanders": ("elkander",),
'ene': ('een',), "ene": ("een",),
'enen': ('een',), "enen": ("een",),
'fik': ('ik',), "fik": ("ik",),
'gaat': ('gaan',), "gaat": ("gaan",),
'gene': ('geen',), "gene": ("geen",),
'harer': ('haar',), "harer": ("haar",),
'ieders': ('ieder',), "ieders": ("ieder",),
'iemands': ('iemand',), "iemands": ("iemand",),
'ikke': ('ik',), "ikke": ("ik",),
'mijnen': ('mijn',), "mijnen": ("mijn",),
'oe': ('je',), "oe": ("je",),
'onzer': ('ons',), "onzer": ("ons",),
'wa': ('wat',), "wa": ("wat",),
'watte': ('wat',), "watte": ("wat",),
'wier': ('wie',), "wier": ("wie",),
'zijns': ('zijn',), "zijns": ("zijn",),
'zoietsken': ('zoietske',), "zoietsken": ("zoietske",),
'zulks': ('zulk',), "zulks": ("zulk",),
'één': ('een',) "één": ("een",),
} }

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -7,15 +7,33 @@ from ....symbols import POS, NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV
class DutchLemmatizer(object): class DutchLemmatizer(object):
# Note: CGN does not distinguish AUX verbs, so we treat AUX as VERB. # Note: CGN does not distinguish AUX verbs, so we treat AUX as VERB.
univ_pos_name_variants = { univ_pos_name_variants = {
NOUN: "noun", "NOUN": "noun", "noun": "noun", NOUN: "noun",
VERB: "verb", "VERB": "verb", "verb": "verb", "NOUN": "noun",
AUX: "verb", "AUX": "verb", "aux": "verb", "noun": "noun",
ADJ: "adj", "ADJ": "adj", "adj": "adj", VERB: "verb",
ADV: "adv", "ADV": "adv", "adv": "adv", "VERB": "verb",
PRON: "pron", "PRON": "pron", "pron": "pron", "verb": "verb",
DET: "det", "DET": "det", "det": "det", AUX: "verb",
ADP: "adp", "ADP": "adp", "adp": "adp", "AUX": "verb",
NUM: "num", "NUM": "num", "num": "num" "aux": "verb",
ADJ: "adj",
"ADJ": "adj",
"adj": "adj",
ADV: "adv",
"ADV": "adv",
"adv": "adv",
PRON: "pron",
"PRON": "pron",
"pron": "pron",
DET: "det",
"DET": "det",
"det": "det",
ADP: "adp",
"ADP": "adp",
"adp": "adp",
NUM: "num",
"NUM": "num",
"num": "num",
} }
@classmethod @classmethod
@@ -62,10 +80,8 @@ class DutchLemmatizer(object):
return [looked_up_lemma] return [looked_up_lemma]
forms, is_known = lemmatize( forms, is_known = lemmatize(
string, string, lemma_index, exceptions, self.rules.get(univ_pos, [])
lemma_index, )
exceptions,
self.rules.get(univ_pos, []))
# Back-off through remaining return value candidates. # Back-off through remaining return value candidates.
if forms: if forms:
@@ -92,25 +108,25 @@ class DutchLemmatizer(object):
return self.lookup_table.get(string, string) return self.lookup_table.get(string, string)
def noun(self, string, morphology=None): def noun(self, string, morphology=None):
return self(string, 'noun', morphology) return self(string, "noun", morphology)
def verb(self, string, morphology=None): def verb(self, string, morphology=None):
return self(string, 'verb', morphology) return self(string, "verb", morphology)
def adj(self, string, morphology=None): def adj(self, string, morphology=None):
return self(string, 'adj', morphology) return self(string, "adj", morphology)
def det(self, string, morphology=None): def det(self, string, morphology=None):
return self(string, 'det', morphology) return self(string, "det", morphology)
def pron(self, string, morphology=None): def pron(self, string, morphology=None):
return self(string, 'pron', morphology) return self(string, "pron", morphology)
def adp(self, string, morphology=None): def adp(self, string, morphology=None):
return self(string, 'adp', morphology) return self(string, "adp", morphology)
def punct(self, string, morphology=None): def punct(self, string, morphology=None):
return self(string, 'punct', morphology) return self(string, "punct", morphology)
# Reimplemented to focus more on application of suffix rules and to return # Reimplemented to focus more on application of suffix rules and to return
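
The univ_pos_name_variants table above folds spaCy's POS symbol constants, upper-case tag names and lower-case names onto a single rule-group key, with AUX mapped to "verb" per the CGN note. A toy illustration of that normalisation (assuming a spaCy install for the symbol constants; the dict below is a trimmed stand-in, not the full table):

from spacy.symbols import AUX, NOUN, VERB

variants = {
    NOUN: "noun", "NOUN": "noun", "noun": "noun",
    VERB: "verb", "VERB": "verb", "verb": "verb",
    # CGN does not distinguish auxiliaries, so AUX folds into "verb" as well.
    AUX: "verb", "AUX": "verb", "aux": "verb",
}

for pos in (NOUN, "VERB", "aux"):
    print(variants[pos])  # noun, verb, verb
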

View File

@@ -4,18 +4,22 @@ from __future__ import unicode_literals
from ...attrs import LIKE_NUM from ...attrs import LIKE_NUM
_num_words = set(""" _num_words = set(
"""
nul een één twee drie vier vijf zes zeven acht negen tien elf twaalf dertien nul een één twee drie vier vijf zes zeven acht negen tien elf twaalf dertien
veertien twintig dertig veertig vijftig zestig zeventig tachtig negentig honderd veertien twintig dertig veertig vijftig zestig zeventig tachtig negentig honderd
duizend miljoen miljard biljoen biljard triljoen triljard duizend miljoen miljard biljoen biljard triljoen triljard
""".split()) """.split()
)
_ordinal_words = set(""" _ordinal_words = set(
"""
eerste tweede derde vierde vijfde zesde zevende achtste negende tiende elfde eerste tweede derde vierde vijfde zesde zevende achtste negende tiende elfde
twaalfde dertiende veertiende twintigste dertigste veertigste vijftigste twaalfde dertiende veertiende twintigste dertigste veertigste vijftigste
zestigste zeventigste tachtigste negentigste honderdste duizendste miljoenste zestigste zeventigste tachtigste negentigste honderdste duizendste miljoenste
miljardste biljoenste biljardste triljoenste triljardste miljardste biljoenste biljardste triljoenste triljardste
""".split()) """.split()
)
def like_num(text): def like_num(text):
@@ -23,11 +27,11 @@ def like_num(text):
# or matches one of the number words. In order to handle numbers like # or matches one of the number words. In order to handle numbers like
# "drieëntwintig", more work is required. # "drieëntwintig", more work is required.
# See this discussion: https://github.com/explosion/spaCy/pull/1177 # See this discussion: https://github.com/explosion/spaCy/pull/1177
text = text.replace(',', '').replace('.', '') text = text.replace(",", "").replace(".", "")
if text.isdigit(): if text.isdigit():
return True return True
if text.count('/') == 1: if text.count("/") == 1:
num, denom = text.split('/') num, denom = text.split("/")
if num.isdigit() and denom.isdigit(): if num.isdigit() and denom.isdigit():
return True return True
if text.lower() in _num_words: if text.lower() in _num_words:
@@ -37,6 +41,4 @@ def like_num(text):
return False return False
LEX_ATTRS = { LEX_ATTRS = {LIKE_NUM: like_num}
LIKE_NUM: like_num
}
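
As a quick check of the behaviour above, a simplified re-statement of like_num with a trimmed word set (the real module also consults the full _num_words and _ordinal_words sets):

def like_num_demo(text):
    # Thousands separators are stripped before the digit check.
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    # Simple fractions such as "1/2".
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    # Trimmed stand-in for the _num_words set above.
    return text.lower() in {"twaalf", "honderd", "miljoen"}

for token in ("11", "12.000", "1/2", "twaalf", "drieëntwintig"):
    print(token, like_num_demo(token))
# "drieëntwintig" is not matched -- compound number words still need the extra
# work referenced in the comment above.
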

View File

@@ -10,24 +10,32 @@ from ..punctuation import TOKENIZER_SUFFIXES as DEFAULT_TOKENIZER_SUFFIXES
# Copied from `de` package. Main purpose is to ensure that hyphens are not # Copied from `de` package. Main purpose is to ensure that hyphens are not
# split on. # split on.
_quotes = CONCAT_QUOTES.replace("'", '') _quotes = CONCAT_QUOTES.replace("'", "")
_infixes = (LIST_ELLIPSES + LIST_ICONS + _infixes = (
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER), LIST_ELLIPSES
r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA), + LIST_ICONS
+ [
r"(?<=[{}])\.(?=[{}])".format(ALPHA_LOWER, ALPHA_UPPER),
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA), r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r'(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])'.format(a=ALPHA, q=_quotes), r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA), r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
r'(?<=[0-9])-(?=[0-9])']) r"(?<=[0-9])-(?=[0-9])",
]
)
# Remove "'s" suffix from suffix list. In Dutch, "'s" is a plural ending when # Remove "'s" suffix from suffix list. In Dutch, "'s" is a plural ending when
# it occurs as a suffix and a clitic for "eens" in standalone use. To avoid # it occurs as a suffix and a clitic for "eens" in standalone use. To avoid
# ambiguity it's better to just leave it attached when it occurs as a suffix. # ambiguity it's better to just leave it attached when it occurs as a suffix.
default_suffix_blacklist = ("'s", "'S", 's', 'S') default_suffix_blacklist = ("'s", "'S", "s", "S")
_suffixes = [suffix for suffix in DEFAULT_TOKENIZER_SUFFIXES _suffixes = [
if suffix not in default_suffix_blacklist] suffix
for suffix in DEFAULT_TOKENIZER_SUFFIXES
if suffix not in default_suffix_blacklist
]
TOKENIZER_INFIXES = _infixes TOKENIZER_INFIXES = _infixes
TOKENIZER_SUFFIXES = _suffixes TOKENIZER_SUFFIXES = _suffixes
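
A small usage sketch of the intent described above; it assumes a spaCy installation that ships this nl data, and the tokenisation shown is the expected outcome rather than a recorded output:

import spacy

nlp = spacy.blank("nl")
doc = nlp("foto's uit Noord-Holland, 1999-2000")
print([t.text for t in doc])
# Expected: "foto's" and "Noord-Holland" stay single tokens (the "'s" suffix is
# blacklisted and letter-hyphen-letter is not an infix), while "1999-2000" is
# split on the hyphen between digits.
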

View File

@@ -16,7 +16,8 @@ from __future__ import unicode_literals
# should have a Dutch counterpart here. # should have a Dutch counterpart here.
STOP_WORDS = set(""" STOP_WORDS = set(
"""
aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaangde aangezien achter achterna aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaangde aangezien achter achterna
afgelopen aldus alhoewel anderzijds afgelopen aldus alhoewel anderzijds
@@ -70,4 +71,5 @@ welk welke welken werd werden wiens wier wilde wordt
zal ze zei zelf zich zij zijn zo zonder zou zeer zeker zekere zelfde zelfs zichzelf zijnde zijne zon zoals zodra zouden zal ze zei zelf zich zij zijn zo zonder zou zeer zeker zekere zelfde zelfs zichzelf zijnde zijne zon zoals zodra zouden
zoveel zowat zulk zulke zulks zullen zult zoveel zowat zulk zulke zulks zullen zult
""".split()) """.split()
)

View File

@@ -47,8 +47,12 @@ TAG_MAP = {
"Adj_Prep|adv|vergr|onverv_voor__Degree=Cmp|Variant=Short": {POS: ADJ}, "Adj_Prep|adv|vergr|onverv_voor__Degree=Cmp|Variant=Short": {POS: ADJ},
"Adj_V_Conj_V__Degree=Pos|VerbForm=Inf": {POS: ADJ}, "Adj_V_Conj_V__Degree=Pos|VerbForm=Inf": {POS: ADJ},
"Adj_V_N__Degree=Pos|Number=Sing|Tense=Past|VerbForm=Part": {POS: ADJ}, "Adj_V_N__Degree=Pos|Number=Sing|Tense=Past|VerbForm=Part": {POS: ADJ},
"Adj_V|adv|stell|onverv_intrans|inf__Degree=Pos|Variant=Short|VerbForm=Inf": {POS: ADJ}, "Adj_V|adv|stell|onverv_intrans|inf__Degree=Pos|Variant=Short|VerbForm=Inf": {
"Adj_V|adv|stell|onverv_trans|imp__Degree=Pos|Mood=Imp|Variant=Short|VerbForm=Fin": {POS: ADJ}, POS: ADJ
},
"Adj_V|adv|stell|onverv_trans|imp__Degree=Pos|Mood=Imp|Variant=Short|VerbForm=Fin": {
POS: ADJ
},
"Adj|adv|stell|onverv__Degree=Pos|Variant=Short": {POS: ADJ}, "Adj|adv|stell|onverv__Degree=Pos|Variant=Short": {POS: ADJ},
"Adj|adv|stell|vervneut__Case=Nom|Degree=Pos|Variant=Short": {POS: ADJ}, "Adj|adv|stell|vervneut__Case=Nom|Degree=Pos|Variant=Short": {POS: ADJ},
"Adj|adv|vergr|onverv__Degree=Cmp|Variant=Short": {POS: ADJ}, "Adj|adv|vergr|onverv__Degree=Cmp|Variant=Short": {POS: ADJ},
@@ -133,15 +137,21 @@ TAG_MAP = {
"Art_Num__Definite=Def|Degree=Sup|Gender=Neut|PronType=Ind": {POS: DET}, "Art_Num__Definite=Def|Degree=Sup|Gender=Neut|PronType=Ind": {POS: DET},
"Art_Num__Definite=Def|Gender=Neut": {POS: DET}, "Art_Num__Definite=Def|Gender=Neut": {POS: DET},
"Art_Num__Degree=Pos|Number=Sing|PronType=Ind": {POS: DET}, "Art_Num__Degree=Pos|Number=Sing|PronType=Ind": {POS: DET},
"Art_N|bep|onzijd|neut_eigen|ev|neut__Definite=Def|Gender=Neut|Number=Sing": {POS: DET}, "Art_N|bep|onzijd|neut_eigen|ev|neut__Definite=Def|Gender=Neut|Number=Sing": {
"Art_N|bep|onzijd|neut_soort|ev|neut__Definite=Def|Gender=Neut|Number=Sing": {POS: DET}, POS: DET
},
"Art_N|bep|onzijd|neut_soort|ev|neut__Definite=Def|Gender=Neut|Number=Sing": {
POS: DET
},
"Art_Pron_N__Case=Gen|Number=Plur|PronType=Ind": {POS: DET}, "Art_Pron_N__Case=Gen|Number=Plur|PronType=Ind": {POS: DET},
"Art_Pron__Number=Sing|PronType=Ind": {POS: DET}, "Art_Pron__Number=Sing|PronType=Ind": {POS: DET},
"Art_V_N__AdpType=Prep": {POS: DET}, "Art_V_N__AdpType=Prep": {POS: DET},
"Art|bep|onzijd|neut__Definite=Def|Gender=Neut|PronType=Art": {POS: DET}, "Art|bep|onzijd|neut__Definite=Def|Gender=Neut|PronType=Art": {POS: DET},
"Art|bep|zijdofmv|gen__Case=Gen|Definite=Def|PronType=Art": {POS: DET}, "Art|bep|zijdofmv|gen__Case=Gen|Definite=Def|PronType=Art": {POS: DET},
"Art|bep|zijdofmv|neut__Definite=Def|PronType=Art": {POS: DET}, "Art|bep|zijdofmv|neut__Definite=Def|PronType=Art": {POS: DET},
"Art|bep|zijdofonzijd|gen__Case=Gen|Definite=Def|Number=Sing|PronType=Art": {POS: DET}, "Art|bep|zijdofonzijd|gen__Case=Gen|Definite=Def|Number=Sing|PronType=Art": {
POS: DET
},
"Art|bep|zijd|dat__Case=Dat|Definite=Def|Gender=Com|PronType=Art": {POS: DET}, "Art|bep|zijd|dat__Case=Dat|Definite=Def|Gender=Com|PronType=Art": {POS: DET},
"Art|onbep|zijdofonzijd|neut__Definite=Ind|Number=Sing|PronType=Art": {POS: DET}, "Art|onbep|zijdofonzijd|neut__Definite=Ind|Number=Sing|PronType=Art": {POS: DET},
"CCONJ___": {POS: CONJ}, "CCONJ___": {POS: CONJ},
@@ -159,17 +169,23 @@ TAG_MAP = {
"Conj_Int|onder|metfin___": {POS: CONJ}, "Conj_Int|onder|metfin___": {POS: CONJ},
"Conj_N_Adv__AdpType=Preppron|Gender=Masc|Number=Plur": {POS: CONJ}, "Conj_N_Adv__AdpType=Preppron|Gender=Masc|Number=Plur": {POS: CONJ},
"Conj_N_Prep__AdpType=Preppron|Gender=Masc|Number=Plur": {POS: CONJ}, "Conj_N_Prep__AdpType=Preppron|Gender=Masc|Number=Plur": {POS: CONJ},
"Conj_N|onder|metfin_soort|ev|neut__AdpType=Preppron|Gender=Masc|Number=Plur": {POS: CONJ}, "Conj_N|onder|metfin_soort|ev|neut__AdpType=Preppron|Gender=Masc|Number=Plur": {
POS: CONJ
},
"Conj_Pron_Adv__Degree=Pos|Number=Sing|Person=3": {POS: CONJ}, "Conj_Pron_Adv__Degree=Pos|Number=Sing|Person=3": {POS: CONJ},
"Conj_Pron_V__AdpType=Preppron|Gender=Masc|Number=Plur": {POS: CONJ}, "Conj_Pron_V__AdpType=Preppron|Gender=Masc|Number=Plur": {POS: CONJ},
"Conj_Pron|neven_aanw|neut|zelfst__AdpType=Prep": {POS: CONJ}, "Conj_Pron|neven_aanw|neut|zelfst__AdpType=Prep": {POS: CONJ},
"Conj_Punc_Conj|neven_schuinstreep_neven__AdpType=Prep": {POS: CONJ}, "Conj_Punc_Conj|neven_schuinstreep_neven__AdpType=Prep": {POS: CONJ},
"Conj_V|onder|metfin_intrans|ott|3|ev__AdpType=Preppron|Gender=Masc|Number=Plur": {POS: CONJ}, "Conj_V|onder|metfin_intrans|ott|3|ev__AdpType=Preppron|Gender=Masc|Number=Plur": {
POS: CONJ
},
"Conj|neven___": {POS: CONJ}, "Conj|neven___": {POS: CONJ},
"Conj|onder|metfin___": {POS: CONJ}, "Conj|onder|metfin___": {POS: CONJ},
"Conj|onder|metinf___": {POS: CONJ}, "Conj|onder|metinf___": {POS: CONJ},
"DET__Degree=Cmp|NumType=Card|PronType=Ind": {POS: DET}, "DET__Degree=Cmp|NumType=Card|PronType=Ind": {POS: DET},
"DET__Gender=Fem|Number=Sing|Number[psor]=Plur|Person=1|Poss=Yes|PronType=Prs": {POS: DET}, "DET__Gender=Fem|Number=Sing|Number[psor]=Plur|Person=1|Poss=Yes|PronType=Prs": {
POS: DET
},
"DET__Gender=Fem|Number=Sing|PronType=Art": {POS: DET}, "DET__Gender=Fem|Number=Sing|PronType=Art": {POS: DET},
"DET__Gender=Masc|Number=Plur|PronType=Art": {POS: DET}, "DET__Gender=Masc|Number=Plur|PronType=Art": {POS: DET},
"DET__Gender=Masc|Number=Sing|PronType=Tot": {POS: DET}, "DET__Gender=Masc|Number=Sing|PronType=Tot": {POS: DET},
@@ -185,7 +201,9 @@ TAG_MAP = {
"Misc_Misc_Misc_Misc_Misc_Misc_Punc_Misc_Misc_Misc___": {POS: X}, "Misc_Misc_Misc_Misc_Misc_Misc_Punc_Misc_Misc_Misc___": {POS: X},
"Misc_Misc_Misc_Misc_Misc_Misc___": {POS: X}, "Misc_Misc_Misc_Misc_Misc_Misc___": {POS: X},
"Misc_Misc_Misc_Misc_Misc_N_Misc_Misc_Misc_Misc_Misc_Misc___": {POS: X}, "Misc_Misc_Misc_Misc_Misc_N_Misc_Misc_Misc_Misc_Misc_Misc___": {POS: X},
"Misc_Misc_Misc_Misc|vreemd_vreemd_vreemd_vreemd__AdpType=Preppron|Gender=Masc|Number=Sing": {POS: X}, "Misc_Misc_Misc_Misc|vreemd_vreemd_vreemd_vreemd__AdpType=Preppron|Gender=Masc|Number=Sing": {
POS: X
},
"Misc_Misc_Misc_Misc|vreemd_vreemd_vreemd_vreemd___": {POS: X}, "Misc_Misc_Misc_Misc|vreemd_vreemd_vreemd_vreemd___": {POS: X},
"Misc_Misc_Misc_N__Number=Sing": {POS: X}, "Misc_Misc_Misc_N__Number=Sing": {POS: X},
"Misc_Misc_Misc|vreemd_vreemd_vreemd___": {POS: X}, "Misc_Misc_Misc|vreemd_vreemd_vreemd___": {POS: X},
@@ -217,7 +235,9 @@ TAG_MAP = {
"N_Adj__Degree=Pos|Number=Plur": {POS: NOUN}, "N_Adj__Degree=Pos|Number=Plur": {POS: NOUN},
"N_Adj__Degree=Pos|Number=Sing": {POS: NOUN}, "N_Adj__Degree=Pos|Number=Sing": {POS: NOUN},
"N_Adj___": {POS: NOUN}, "N_Adj___": {POS: NOUN},
"N_Adv_Punc_V_Pron_V__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Inf": {POS: NOUN}, "N_Adv_Punc_V_Pron_V__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Inf": {
POS: NOUN
},
"N_Adv__Degree=Pos|Number=Sing": {POS: NOUN}, "N_Adv__Degree=Pos|Number=Sing": {POS: NOUN},
"N_Adv___": {POS: NOUN}, "N_Adv___": {POS: NOUN},
"N_Adv|soort|ev|neut_deelv__Number=Sing": {POS: NOUN}, "N_Adv|soort|ev|neut_deelv__Number=Sing": {POS: NOUN},
@@ -320,12 +340,20 @@ TAG_MAP = {
"N_N|eigen|ev|gen_soort|mv|neut___": {POS: NOUN}, "N_N|eigen|ev|gen_soort|mv|neut___": {POS: NOUN},
"N_N|eigen|ev|neut_eigen|ev|gen___": {POS: NOUN}, "N_N|eigen|ev|neut_eigen|ev|gen___": {POS: NOUN},
"N_N|eigen|ev|neut_eigen|ev|neut__AdpType=Prep": {POS: NOUN}, "N_N|eigen|ev|neut_eigen|ev|neut__AdpType=Prep": {POS: NOUN},
"N_N|eigen|ev|neut_eigen|ev|neut__AdpType=Preppron|Gender=Fem|Number=Plur": {POS: NOUN}, "N_N|eigen|ev|neut_eigen|ev|neut__AdpType=Preppron|Gender=Fem|Number=Plur": {
"N_N|eigen|ev|neut_eigen|ev|neut__AdpType=Preppron|Gender=Masc|Number=Sing": {POS: NOUN}, POS: NOUN
},
"N_N|eigen|ev|neut_eigen|ev|neut__AdpType=Preppron|Gender=Masc|Number=Sing": {
POS: NOUN
},
"N_N|eigen|ev|neut_eigen|ev|neut__Gender=Fem|Number=Plur|PronType=Art": {POS: NOUN}, "N_N|eigen|ev|neut_eigen|ev|neut__Gender=Fem|Number=Plur|PronType=Art": {POS: NOUN},
"N_N|eigen|ev|neut_eigen|ev|neut__Gender=Fem|Number=Sing|PronType=Art": {POS: NOUN}, "N_N|eigen|ev|neut_eigen|ev|neut__Gender=Fem|Number=Sing|PronType=Art": {POS: NOUN},
"N_N|eigen|ev|neut_eigen|ev|neut__Gender=Masc|Number=Plur|PronType=Art": {POS: NOUN}, "N_N|eigen|ev|neut_eigen|ev|neut__Gender=Masc|Number=Plur|PronType=Art": {
"N_N|eigen|ev|neut_eigen|ev|neut__Gender=Masc|Number=Sing|PronType=Art": {POS: NOUN}, POS: NOUN
},
"N_N|eigen|ev|neut_eigen|ev|neut__Gender=Masc|Number=Sing|PronType=Art": {
POS: NOUN
},
"N_N|eigen|ev|neut_eigen|ev|neut__NumType=Card": {POS: NOUN}, "N_N|eigen|ev|neut_eigen|ev|neut__NumType=Card": {POS: NOUN},
"N_N|eigen|ev|neut_eigen|ev|neut__Number=Sing": {POS: NOUN}, "N_N|eigen|ev|neut_eigen|ev|neut__Number=Sing": {POS: NOUN},
"N_N|eigen|ev|neut_eigen|ev|neut___": {POS: NOUN}, "N_N|eigen|ev|neut_eigen|ev|neut___": {POS: NOUN},
@@ -335,7 +363,9 @@ TAG_MAP = {
"N_N|eigen|ev|neut_soort|mv|neut___": {POS: NOUN}, "N_N|eigen|ev|neut_soort|mv|neut___": {POS: NOUN},
"N_N|eigen|mv|neut_eigen|mv|neut___": {POS: NOUN}, "N_N|eigen|mv|neut_eigen|mv|neut___": {POS: NOUN},
"N_N|soort|ev|neut_eigen|ev|neut__Number=Sing": {POS: NOUN}, "N_N|soort|ev|neut_eigen|ev|neut__Number=Sing": {POS: NOUN},
"N_N|soort|ev|neut_soort|ev|neut__Gender=Masc|Number=Plur|PronType=Art": {POS: NOUN}, "N_N|soort|ev|neut_soort|ev|neut__Gender=Masc|Number=Plur|PronType=Art": {
POS: NOUN
},
"N_N|soort|ev|neut_soort|ev|neut__NumForm=Digit|NumType=Card": {POS: NOUN}, "N_N|soort|ev|neut_soort|ev|neut__NumForm=Digit|NumType=Card": {POS: NOUN},
"N_N|soort|ev|neut_soort|ev|neut__Number=Sing": {POS: NOUN}, "N_N|soort|ev|neut_soort|ev|neut__Number=Sing": {POS: NOUN},
"N_N|soort|ev|neut_soort|mv|neut__Number=Plur": {POS: NOUN}, "N_N|soort|ev|neut_soort|mv|neut__Number=Plur": {POS: NOUN},
@@ -365,7 +395,9 @@ TAG_MAP = {
"N_Pron___": {POS: NOUN}, "N_Pron___": {POS: NOUN},
"N_Punc_Adj_N___": {POS: NOUN}, "N_Punc_Adj_N___": {POS: NOUN},
"N_Punc_Adj_Pron_Punc__Degree=Pos|Number=Sing|Person=2": {POS: NOUN}, "N_Punc_Adj_Pron_Punc__Degree=Pos|Number=Sing|Person=2": {POS: NOUN},
"N_Punc_Adv_V_Pron_N__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {POS: NOUN}, "N_Punc_Adv_V_Pron_N__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {
POS: NOUN
},
"N_Punc_Misc_Punc_N___": {POS: NOUN}, "N_Punc_Misc_Punc_N___": {POS: NOUN},
"N_Punc_N_N_N_N__Number=Sing": {POS: NOUN}, "N_Punc_N_N_N_N__Number=Sing": {POS: NOUN},
"N_Punc_N_Punc_N__Number=Sing": {POS: NOUN}, "N_Punc_N_Punc_N__Number=Sing": {POS: NOUN},
@@ -415,8 +447,12 @@ TAG_MAP = {
"Num|hoofd|bep|attr|onverv__Definite=Def|NumType=Card": {POS: NUM}, "Num|hoofd|bep|attr|onverv__Definite=Def|NumType=Card": {POS: NUM},
"Num|hoofd|bep|zelfst|onverv__Definite=Def|NumType=Card": {POS: NUM}, "Num|hoofd|bep|zelfst|onverv__Definite=Def|NumType=Card": {POS: NUM},
"Num|hoofd|bep|zelfst|vervmv__Definite=Def|Number=Plur|NumType=Card": {POS: NUM}, "Num|hoofd|bep|zelfst|vervmv__Definite=Def|Number=Plur|NumType=Card": {POS: NUM},
"Num|hoofd|onbep|attr|stell|onverv__Degree=Pos|NumType=Card|PronType=Ind": {POS: NUM}, "Num|hoofd|onbep|attr|stell|onverv__Degree=Pos|NumType=Card|PronType=Ind": {
"Num|hoofd|onbep|attr|vergr|onverv__Degree=Cmp|NumType=Card|PronType=Ind": {POS: NUM}, POS: NUM
},
"Num|hoofd|onbep|attr|vergr|onverv__Degree=Cmp|NumType=Card|PronType=Ind": {
POS: NUM
},
"Num|rang|bep|attr|onverv__Definite=Def|NumType=Ord": {POS: NUM}, "Num|rang|bep|attr|onverv__Definite=Def|NumType=Ord": {POS: NUM},
"Num|rang|bep|zelfst|onverv__Definite=Def|NumType=Ord": {POS: NUM}, "Num|rang|bep|zelfst|onverv__Definite=Def|NumType=Ord": {POS: NUM},
"N|eigen|ev|gen__Case=Gen|Number=Sing": {POS: NOUN}, "N|eigen|ev|gen__Case=Gen|Number=Sing": {POS: NOUN},
@@ -469,7 +505,9 @@ TAG_MAP = {
"Prep_N_Adv|voor_soort|ev|neut_pron|aanw__AdpType=Prep": {POS: ADP}, "Prep_N_Adv|voor_soort|ev|neut_pron|aanw__AdpType=Prep": {POS: ADP},
"Prep_N_Adv|voor_soort|ev|neut_pron|aanw__Number=Sing|PronType=Dem": {POS: ADP}, "Prep_N_Adv|voor_soort|ev|neut_pron|aanw__Number=Sing|PronType=Dem": {POS: ADP},
"Prep_N_Adv|voor_soort|ev|neut_pron|vrag__Number=Sing|PronType=Int": {POS: ADP}, "Prep_N_Adv|voor_soort|ev|neut_pron|vrag__Number=Sing|PronType=Int": {POS: ADP},
"Prep_N_Adv|voor_soort|mv|neut_deelv__Gender=Masc|Number=Sing|PronType=Tot": {POS: ADP}, "Prep_N_Adv|voor_soort|mv|neut_deelv__Gender=Masc|Number=Sing|PronType=Tot": {
POS: ADP
},
"Prep_N_Conj_N__Number=Sing": {POS: ADP}, "Prep_N_Conj_N__Number=Sing": {POS: ADP},
"Prep_N_Conj__AdpType=Prep": {POS: ADP}, "Prep_N_Conj__AdpType=Prep": {POS: ADP},
"Prep_N_Prep_N__Number=Sing": {POS: ADP}, "Prep_N_Prep_N__Number=Sing": {POS: ADP},
@@ -489,7 +527,9 @@ TAG_MAP = {
"Prep_N|voor_soort|ev|neut__Number=Sing": {POS: ADP}, "Prep_N|voor_soort|ev|neut__Number=Sing": {POS: ADP},
"Prep_N|voor_soort|mv|neut__AdpType=Prep": {POS: ADP}, "Prep_N|voor_soort|mv|neut__AdpType=Prep": {POS: ADP},
"Prep_N|voor_soort|mv|neut__Number=Plur": {POS: ADP}, "Prep_N|voor_soort|mv|neut__Number=Plur": {POS: ADP},
"Prep_Prep_Adj|voor_voor_adv|stell|onverv__Gender=Masc|Number=Sing|PronType=Tot": {POS: ADP}, "Prep_Prep_Adj|voor_voor_adv|stell|onverv__Gender=Masc|Number=Sing|PronType=Tot": {
POS: ADP
},
"Prep_Prep_Adv__Degree=Pos": {POS: ADP}, "Prep_Prep_Adv__Degree=Pos": {POS: ADP},
"Prep_Pron_Adj__Degree=Cmp|Number=Sing|Person=3": {POS: ADP}, "Prep_Pron_Adj__Degree=Cmp|Number=Sing|Person=3": {POS: ADP},
"Prep_Pron_N_Adv__Number=Plur": {POS: ADP}, "Prep_Pron_N_Adv__Number=Plur": {POS: ADP},
@@ -503,7 +543,9 @@ TAG_MAP = {
"Prep_Pron|voor_ref|3|evofmv__Number=Plur,Sing|Person=3": {POS: ADP}, "Prep_Pron|voor_ref|3|evofmv__Number=Plur,Sing|Person=3": {POS: ADP},
"Prep_Punc_N_Conj_N__AdpType=Prep": {POS: ADP}, "Prep_Punc_N_Conj_N__AdpType=Prep": {POS: ADP},
"Prep_V_N__Number=Sing|Tense=Pres|VerbForm=Part": {POS: ADP}, "Prep_V_N__Number=Sing|Tense=Pres|VerbForm=Part": {POS: ADP},
"Prep_V_Pron_Pron_Adv__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|PronType=Dem|Tense=Pres|VerbForm=Fin": {POS: ADP}, "Prep_V_Pron_Pron_Adv__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|PronType=Dem|Tense=Pres|VerbForm=Fin": {
POS: ADP
},
"Prep_V|voor_intrans|inf__VerbForm=Inf": {POS: ADP}, "Prep_V|voor_intrans|inf__VerbForm=Inf": {POS: ADP},
"Prep_V|voorinf_trans|inf__VerbForm=Inf": {POS: ADP}, "Prep_V|voorinf_trans|inf__VerbForm=Inf": {POS: ADP},
"Prep|achter__AdpType=Post": {POS: ADP}, "Prep|achter__AdpType=Post": {POS: ADP},
@@ -511,17 +553,25 @@ TAG_MAP = {
"Prep|voor__AdpType=Prep": {POS: ADP}, "Prep|voor__AdpType=Prep": {POS: ADP},
"Prep|voorinf__AdpType=Prep|PartType=Inf": {POS: ADP}, "Prep|voorinf__AdpType=Prep|PartType=Inf": {POS: ADP},
"Pron_Adj_N_Punc_Art_Adj_N_Prep_Art_Adj_N__NumType=Card": {POS: PRON}, "Pron_Adj_N_Punc_Art_Adj_N_Prep_Art_Adj_N__NumType=Card": {POS: PRON},
"Pron_Adj__Case=Nom|Degree=Sup|Number=Sing|Person=2|Poss=Yes|PronType=Prs": {POS: PRON}, "Pron_Adj__Case=Nom|Degree=Sup|Number=Sing|Person=2|Poss=Yes|PronType=Prs": {
POS: PRON
},
"Pron_Adj__Degree=Cmp|PronType=Ind": {POS: PRON}, "Pron_Adj__Degree=Cmp|PronType=Ind": {POS: PRON},
"Pron_Adv|vrag|neut|attr_deelv__PronType=Int": {POS: PRON}, "Pron_Adv|vrag|neut|attr_deelv__PronType=Int": {POS: PRON},
"Pron_Art_N_N__Number=Plur|PronType=Ind": {POS: PRON}, "Pron_Art_N_N__Number=Plur|PronType=Ind": {POS: PRON},
"Pron_Art__Number=Sing|PronType=Int": {POS: PRON}, "Pron_Art__Number=Sing|PronType=Int": {POS: PRON},
"Pron_N_Adv__Number=Sing|PronType=Ind": {POS: PRON}, "Pron_N_Adv__Number=Sing|PronType=Ind": {POS: PRON},
"Pron_N_V_Adv_Num_Punc__Aspect=Imp|Definite=Def|Mood=Ind|Number=Sing|Person=3|PronType=Ind|Tense=Pres|VerbForm=Fin": {POS: PRON}, "Pron_N_V_Adv_Num_Punc__Aspect=Imp|Definite=Def|Mood=Ind|Number=Sing|Person=3|PronType=Ind|Tense=Pres|VerbForm=Fin": {
"Pron_N_V_Conj_N__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|PronType=Ind|Tense=Pres|VerbForm=Fin": {POS: PRON}, POS: PRON
},
"Pron_N_V_Conj_N__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|PronType=Ind|Tense=Pres|VerbForm=Fin": {
POS: PRON
},
"Pron_N__Case=Gen|Number=Sing|PronType=Ind": {POS: PRON}, "Pron_N__Case=Gen|Number=Sing|PronType=Ind": {POS: PRON},
"Pron_N__Number=Sing|PronType=Ind": {POS: PRON}, "Pron_N__Number=Sing|PronType=Ind": {POS: PRON},
"Pron_N|aanw|gen|attr_soort|mv|neut__Case=Gen|Number=Plur|PronType=Dem": {POS: PRON}, "Pron_N|aanw|gen|attr_soort|mv|neut__Case=Gen|Number=Plur|PronType=Dem": {
POS: PRON
},
"Pron_N|onbep|neut|attr_soort|ev|neut__Number=Sing|PronType=Ind": {POS: PRON}, "Pron_N|onbep|neut|attr_soort|ev|neut__Number=Sing|PronType=Ind": {POS: PRON},
"Pron_Prep_Art__Number=Sing|PronType=Int": {POS: PRON}, "Pron_Prep_Art__Number=Sing|PronType=Int": {POS: PRON},
"Pron_Prep_Art__Number=Sing|PronType=Rel": {POS: PRON}, "Pron_Prep_Art__Number=Sing|PronType=Rel": {POS: PRON},
@@ -529,10 +579,16 @@ TAG_MAP = {
"Pron_Prep|betr|neut|zelfst_voor__PronType=Rel": {POS: PRON}, "Pron_Prep|betr|neut|zelfst_voor__PronType=Rel": {POS: PRON},
"Pron_Prep|onbep|neut|zelfst_voor__PronType=Ind": {POS: PRON}, "Pron_Prep|onbep|neut|zelfst_voor__PronType=Ind": {POS: PRON},
"Pron_Prep|vrag|neut|attr_voor__PronType=Int": {POS: PRON}, "Pron_Prep|vrag|neut|attr_voor__PronType=Int": {POS: PRON},
"Pron_Pron_V__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|PronType=Rel|Tense=Pres|VerbForm=Fin": {POS: PRON}, "Pron_Pron_V__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|PronType=Rel|Tense=Pres|VerbForm=Fin": {
POS: PRON
},
"Pron_Pron__Person=3|PronType=Prs|Reflex=Yes": {POS: PRON}, "Pron_Pron__Person=3|PronType=Prs|Reflex=Yes": {POS: PRON},
"Pron_V_V__Aspect=Imp|Mood=Ind|Person=3|PronType=Dem|Tense=Pres|VerbForm=Inf": {POS: PRON}, "Pron_V_V__Aspect=Imp|Mood=Ind|Person=3|PronType=Dem|Tense=Pres|VerbForm=Inf": {
"Pron_V__Case=Gen|Number=Sing|Person=3|Poss=Yes|PronType=Prs|VerbForm=Inf": {POS: PRON}, POS: PRON
},
"Pron_V__Case=Gen|Number=Sing|Person=3|Poss=Yes|PronType=Prs|VerbForm=Inf": {
POS: PRON
},
"Pron_V__Number=Plur|Person=1|Poss=Yes|PronType=Prs|VerbForm=Inf": {POS: PRON}, "Pron_V__Number=Plur|Person=1|Poss=Yes|PronType=Prs|VerbForm=Inf": {POS: PRON},
"Pron|aanw|dat|attr__Case=Dat|PronType=Dem": {POS: PRON}, "Pron|aanw|dat|attr__Case=Dat|PronType=Dem": {POS: PRON},
"Pron|aanw|gen|attr__Case=Gen|PronType=Dem": {POS: PRON}, "Pron|aanw|gen|attr__Case=Gen|PronType=Dem": {POS: PRON},
@@ -547,27 +603,47 @@ TAG_MAP = {
"Pron|bez|1|mv|neut|attr__Number=Plur|Person=1|Poss=Yes|PronType=Prs": {POS: PRON}, "Pron|bez|1|mv|neut|attr__Number=Plur|Person=1|Poss=Yes|PronType=Prs": {POS: PRON},
"Pron|bez|2|ev|neut|attr__Number=Sing|Person=2|Poss=Yes|PronType=Prs": {POS: PRON}, "Pron|bez|2|ev|neut|attr__Number=Sing|Person=2|Poss=Yes|PronType=Prs": {POS: PRON},
"Pron|bez|2|mv|neut|attr__Number=Plur|Person=2|Poss=Yes|PronType=Prs": {POS: PRON}, "Pron|bez|2|mv|neut|attr__Number=Plur|Person=2|Poss=Yes|PronType=Prs": {POS: PRON},
"Pron|bez|3|ev|gen|attr__Case=Gen|Number=Sing|Person=3|Poss=Yes|PronType=Prs": {POS: PRON}, "Pron|bez|3|ev|gen|attr__Case=Gen|Number=Sing|Person=3|Poss=Yes|PronType=Prs": {
POS: PRON
},
"Pron|bez|3|ev|neut|attr__Number=Sing|Person=3|Poss=Yes|PronType=Prs": {POS: PRON}, "Pron|bez|3|ev|neut|attr__Number=Sing|Person=3|Poss=Yes|PronType=Prs": {POS: PRON},
"Pron|bez|3|ev|neut|zelfst__Number=Sing|Person=3|Poss=Yes|PronType=Prs": {POS: PRON}, "Pron|bez|3|ev|neut|zelfst__Number=Sing|Person=3|Poss=Yes|PronType=Prs": {
POS: PRON
},
"Pron|bez|3|mv|neut|attr__Number=Plur|Person=3|Poss=Yes|PronType=Prs": {POS: PRON}, "Pron|bez|3|mv|neut|attr__Number=Plur|Person=3|Poss=Yes|PronType=Prs": {POS: PRON},
"Pron|onbep|gen|attr__Case=Gen|PronType=Ind": {POS: PRON}, "Pron|onbep|gen|attr__Case=Gen|PronType=Ind": {POS: PRON},
"Pron|onbep|gen|zelfst__Case=Gen|PronType=Ind": {POS: PRON}, "Pron|onbep|gen|zelfst__Case=Gen|PronType=Ind": {POS: PRON},
"Pron|onbep|neut|attr__PronType=Ind": {POS: PRON}, "Pron|onbep|neut|attr__PronType=Ind": {POS: PRON},
"Pron|onbep|neut|zelfst__PronType=Ind": {POS: PRON}, "Pron|onbep|neut|zelfst__PronType=Ind": {POS: PRON},
"Pron|per|1|ev|datofacc__Case=Acc,Dat|Number=Sing|Person=1|PronType=Prs": {POS: PRON}, "Pron|per|1|ev|datofacc__Case=Acc,Dat|Number=Sing|Person=1|PronType=Prs": {
POS: PRON
},
"Pron|per|1|ev|nom__Case=Nom|Number=Sing|Person=1|PronType=Prs": {POS: PRON}, "Pron|per|1|ev|nom__Case=Nom|Number=Sing|Person=1|PronType=Prs": {POS: PRON},
"Pron|per|1|mv|datofacc__Case=Acc,Dat|Number=Plur|Person=1|PronType=Prs": {POS: PRON}, "Pron|per|1|mv|datofacc__Case=Acc,Dat|Number=Plur|Person=1|PronType=Prs": {
POS: PRON
},
"Pron|per|1|mv|nom__Case=Nom|Number=Plur|Person=1|PronType=Prs": {POS: PRON}, "Pron|per|1|mv|nom__Case=Nom|Number=Plur|Person=1|PronType=Prs": {POS: PRON},
"Pron|per|2|ev|datofacc__Case=Acc,Dat|Number=Sing|Person=2|PronType=Prs": {POS: PRON}, "Pron|per|2|ev|datofacc__Case=Acc,Dat|Number=Sing|Person=2|PronType=Prs": {
POS: PRON
},
"Pron|per|2|ev|nom__Case=Nom|Number=Sing|Person=2|PronType=Prs": {POS: PRON}, "Pron|per|2|ev|nom__Case=Nom|Number=Sing|Person=2|PronType=Prs": {POS: PRON},
"Pron|per|2|mv|datofacc__Case=Acc,Dat|Number=Plur|Person=2|PronType=Prs": {POS: PRON}, "Pron|per|2|mv|datofacc__Case=Acc,Dat|Number=Plur|Person=2|PronType=Prs": {
POS: PRON
},
"Pron|per|2|mv|nom__Case=Nom|Number=Plur|Person=2|PronType=Prs": {POS: PRON}, "Pron|per|2|mv|nom__Case=Nom|Number=Plur|Person=2|PronType=Prs": {POS: PRON},
"Pron|per|3|evofmv|datofacc__Case=Acc,Dat|Number=Plur,Sing|Person=3|PronType=Prs": {POS: PRON}, "Pron|per|3|evofmv|datofacc__Case=Acc,Dat|Number=Plur,Sing|Person=3|PronType=Prs": {
"Pron|per|3|evofmv|nom__Case=Nom|Number=Plur,Sing|Person=3|PronType=Prs": {POS: PRON}, POS: PRON
"Pron|per|3|ev|datofacc__Case=Acc,Dat|Number=Sing|Person=3|PronType=Prs": {POS: PRON}, },
"Pron|per|3|evofmv|nom__Case=Nom|Number=Plur,Sing|Person=3|PronType=Prs": {
POS: PRON
},
"Pron|per|3|ev|datofacc__Case=Acc,Dat|Number=Sing|Person=3|PronType=Prs": {
POS: PRON
},
"Pron|per|3|ev|nom__Case=Nom|Number=Sing|Person=3|PronType=Prs": {POS: PRON}, "Pron|per|3|ev|nom__Case=Nom|Number=Sing|Person=3|PronType=Prs": {POS: PRON},
"Pron|per|3|mv|datofacc__Case=Acc,Dat|Number=Plur|Person=3|PronType=Prs": {POS: PRON}, "Pron|per|3|mv|datofacc__Case=Acc,Dat|Number=Plur|Person=3|PronType=Prs": {
POS: PRON
},
"Pron|rec|gen__Case=Gen|PronType=Rcp": {POS: PRON}, "Pron|rec|gen__Case=Gen|PronType=Rcp": {POS: PRON},
"Pron|rec|neut__PronType=Rcp": {POS: PRON}, "Pron|rec|neut__PronType=Rcp": {POS: PRON},
"Pron|ref|1|ev__Number=Sing|Person=1|PronType=Prs|Reflex=Yes": {POS: PRON}, "Pron|ref|1|ev__Number=Sing|Person=1|PronType=Prs|Reflex=Yes": {POS: PRON},
@@ -597,20 +673,34 @@ TAG_MAP = {
"Punc|vraag__PunctType=Qest": {POS: PUNCT}, "Punc|vraag__PunctType=Qest": {POS: PUNCT},
"V_Adv_Art_N_Prep_Pron_N__Degree=Pos|Number=Plur|Person=2|Subcat=Tran": {POS: VERB}, "V_Adv_Art_N_Prep_Pron_N__Degree=Pos|Number=Plur|Person=2|Subcat=Tran": {POS: VERB},
"V_Adv__Degree=Pos|Subcat=Tran": {POS: VERB}, "V_Adv__Degree=Pos|Subcat=Tran": {POS: VERB},
"V_Art_N_Num_N__Aspect=Imp|Definite=Def|Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|VerbType=Mod": {POS: VERB}, "V_Art_N_Num_N__Aspect=Imp|Definite=Def|Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|VerbType=Mod": {
POS: VERB
},
"V_Art_N__Number=Sing|Subcat=Tran": {POS: VERB}, "V_Art_N__Number=Sing|Subcat=Tran": {POS: VERB},
"V_Conj_N_N__Number=Sing|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB}, "V_Conj_N_N__Number=Sing|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
"V_Conj_Pron__Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB}, "V_Conj_Pron__Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
"V_N_Conj_Adj_N_Prep_Art_N__Degree=Pos|Number=Sing|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB}, "V_N_Conj_Adj_N_Prep_Art_N__Degree=Pos|Number=Sing|Subcat=Tran|Tense=Past|VerbForm=Part": {
POS: VERB
},
"V_N_N__Number=Sing|Subcat=Intr|Tense=Pres|VerbForm=Part": {POS: VERB}, "V_N_N__Number=Sing|Subcat=Intr|Tense=Pres|VerbForm=Part": {POS: VERB},
"V_N_N__Number=Sing|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB}, "V_N_N__Number=Sing|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
"V_N_V__Aspect=Imp|Mood=Ind|Number=Sing|Subcat=Intr|Tense=Pres|VerbForm=Inf": {POS: VERB}, "V_N_V__Aspect=Imp|Mood=Ind|Number=Sing|Subcat=Intr|Tense=Pres|VerbForm=Inf": {
POS: VERB
},
"V_N__Number=Plur|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB}, "V_N__Number=Plur|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
"V_N|trans|imp_eigen|ev|neut__Number=Sing|Subcat=Tran": {POS: VERB}, "V_N|trans|imp_eigen|ev|neut__Number=Sing|Subcat=Tran": {POS: VERB},
"V_Prep|intrans|verldw|onverv_voor__Subcat=Intr|Tense=Past|VerbForm=Part": {POS: VERB}, "V_Prep|intrans|verldw|onverv_voor__Subcat=Intr|Tense=Past|VerbForm=Part": {
"V_Pron_Adv_Adv_Pron_V__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: VERB}, POS: VERB
"V_Pron_Adv__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: VERB}, },
"V_Pron_V__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: VERB}, "V_Pron_Adv_Adv_Pron_V__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V_Pron_Adv__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V_Pron_V__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V_Pron__VerbType=Aux,Cop": {POS: VERB}, "V_Pron__VerbType=Aux,Cop": {POS: VERB},
"V_V|hulp|imp_intrans|inf__VerbForm=Inf|VerbType=Mod": {POS: VERB}, "V_V|hulp|imp_intrans|inf__VerbForm=Inf|VerbType=Mod": {POS: VERB},
"V|hulpofkopp|conj__Mood=Sub|VerbForm=Fin": {POS: VERB}, "V|hulpofkopp|conj__Mood=Sub|VerbForm=Fin": {POS: VERB},
@@ -620,94 +710,220 @@ TAG_MAP = {
"V|hulpofkopp|inf__VerbForm=Inf": {POS: VERB}, "V|hulpofkopp|inf__VerbForm=Inf": {POS: VERB},
"V|hulpofkopp|inf__VerbForm=Inf|VerbType=Aux,Cop": {POS: VERB}, "V|hulpofkopp|inf__VerbForm=Inf|VerbType=Aux,Cop": {POS: VERB},
"V|hulpofkopp|inf|subst__VerbForm=Inf": {POS: VERB}, "V|hulpofkopp|inf|subst__VerbForm=Inf": {POS: VERB},
"V|hulpofkopp|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin": {POS: VERB}, "V|hulpofkopp|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin": {
"V|hulpofkopp|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin|VerbType=Aux,Cop": {POS: VERB}, POS: VERB
"V|hulpofkopp|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {POS: VERB}, },
"V|hulpofkopp|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin|VerbType=Aux,Cop": {POS: VERB}, "V|hulpofkopp|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin|VerbType=Aux,Cop": {
"V|hulpofkopp|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin": {POS: VERB}, POS: VERB
"V|hulpofkopp|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin|VerbType=Aux,Cop": {POS: VERB}, },
"V|hulpofkopp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {POS: VERB}, "V|hulpofkopp|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {
"V|hulpofkopp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|VerbType=Aux,Cop": {POS: VERB}, POS: VERB
"V|hulpofkopp|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin": {POS: VERB}, },
"V|hulpofkopp|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|VerbType=Aux,Cop": {POS: VERB}, "V|hulpofkopp|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin|VerbType=Aux,Cop": {
"V|hulpofkopp|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin": {POS: VERB}, POS: VERB
"V|hulpofkopp|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin|VerbType=Aux,Cop": {POS: VERB}, },
"V|hulpofkopp|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V|hulpofkopp|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin|VerbType=Aux,Cop": {
POS: VERB
},
"V|hulpofkopp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V|hulpofkopp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|VerbType=Aux,Cop": {
POS: VERB
},
"V|hulpofkopp|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin": {
POS: VERB
},
"V|hulpofkopp|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|VerbType=Aux,Cop": {
POS: VERB
},
"V|hulpofkopp|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin": {
POS: VERB
},
"V|hulpofkopp|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin|VerbType=Aux,Cop": {
POS: VERB
},
"V|hulpofkopp|tegdw|vervneut__Case=Nom|Tense=Pres|VerbForm=Part": {POS: VERB}, "V|hulpofkopp|tegdw|vervneut__Case=Nom|Tense=Pres|VerbForm=Part": {POS: VERB},
"V|hulpofkopp|tegdw|vervneut__Case=Nom|Tense=Pres|VerbForm=Part|VerbType=Aux,Cop": {POS: VERB}, "V|hulpofkopp|tegdw|vervneut__Case=Nom|Tense=Pres|VerbForm=Part|VerbType=Aux,Cop": {
POS: VERB
},
"V|hulpofkopp|verldw|onverv__Tense=Past|VerbForm=Part": {POS: VERB}, "V|hulpofkopp|verldw|onverv__Tense=Past|VerbForm=Part": {POS: VERB},
"V|hulpofkopp|verldw|onverv__Tense=Past|VerbForm=Part|VerbType=Aux,Cop": {POS: VERB}, "V|hulpofkopp|verldw|onverv__Tense=Past|VerbForm=Part|VerbType=Aux,Cop": {
POS: VERB
},
"V|hulp|conj__Mood=Sub|VerbForm=Fin|VerbType=Mod": {POS: VERB}, "V|hulp|conj__Mood=Sub|VerbForm=Fin|VerbType=Mod": {POS: VERB},
"V|hulp|inf__VerbForm=Inf": {POS: VERB}, "V|hulp|inf__VerbForm=Inf": {POS: VERB},
"V|hulp|inf__VerbForm=Inf|VerbType=Mod": {POS: VERB}, "V|hulp|inf__VerbForm=Inf|VerbType=Mod": {POS: VERB},
"V|hulp|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin": {POS: VERB}, "V|hulp|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin": {
"V|hulp|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin|VerbType=Mod": {POS: VERB}, POS: VERB
"V|hulp|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {POS: VERB}, },
"V|hulp|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin|VerbType=Mod": {POS: VERB}, "V|hulp|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin|VerbType=Mod": {
"V|hulp|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin": {POS: VERB}, POS: VERB
"V|hulp|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin|VerbType=Mod": {POS: VERB}, },
"V|hulp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {POS: VERB}, "V|hulp|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {
"V|hulp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|VerbType=Mod": {POS: VERB}, POS: VERB
"V|hulp|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin": {POS: VERB}, },
"V|hulp|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|VerbType=Mod": {POS: VERB}, "V|hulp|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin|VerbType=Mod": {
"V|hulp|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin": {POS: VERB}, POS: VERB
"V|hulp|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin|VerbType=Mod": {POS: VERB}, },
"V|hulp|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V|hulp|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin|VerbType=Mod": {
POS: VERB
},
"V|hulp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V|hulp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|VerbType=Mod": {
POS: VERB
},
"V|hulp|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin": {
POS: VERB
},
"V|hulp|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|VerbType=Mod": {
POS: VERB
},
"V|hulp|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin": {
POS: VERB
},
"V|hulp|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin|VerbType=Mod": {
POS: VERB
},
"V|hulp|verldw|onverv__Tense=Past|VerbForm=Part": {POS: VERB}, "V|hulp|verldw|onverv__Tense=Past|VerbForm=Part": {POS: VERB},
"V|hulp|verldw|onverv__Tense=Past|VerbForm=Part|VerbType=Mod": {POS: VERB}, "V|hulp|verldw|onverv__Tense=Past|VerbForm=Part|VerbType=Mod": {POS: VERB},
"V|intrans|conj__Mood=Sub|Subcat=Intr|VerbForm=Fin": {POS: VERB}, "V|intrans|conj__Mood=Sub|Subcat=Intr|VerbForm=Fin": {POS: VERB},
"V|intrans|imp__Mood=Imp|Subcat=Intr|VerbForm=Fin": {POS: VERB}, "V|intrans|imp__Mood=Imp|Subcat=Intr|VerbForm=Fin": {POS: VERB},
"V|intrans|inf__Subcat=Intr|VerbForm=Inf": {POS: VERB}, "V|intrans|inf__Subcat=Intr|VerbForm=Inf": {POS: VERB},
"V|intrans|inf|subst__Subcat=Intr|VerbForm=Inf": {POS: VERB}, "V|intrans|inf|subst__Subcat=Intr|VerbForm=Inf": {POS: VERB},
"V|intrans|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Subcat=Intr|Tense=Pres|VerbForm=Fin": {POS: VERB}, "V|intrans|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Subcat=Intr|Tense=Pres|VerbForm=Fin": {
"V|intrans|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Subcat=Intr|Tense=Pres|VerbForm=Fin": {POS: VERB}, POS: VERB
"V|intrans|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Subcat=Intr|Tense=Pres|VerbForm=Fin": {POS: VERB}, },
"V|intrans|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Intr|Tense=Pres|VerbForm=Fin": {POS: VERB}, "V|intrans|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Subcat=Intr|Tense=Pres|VerbForm=Fin": {
"V|intrans|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Subcat=Intr|Tense=Past|VerbForm=Fin": {POS: VERB}, POS: VERB
"V|intrans|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Subcat=Intr|Tense=Past|VerbForm=Fin": {POS: VERB}, },
"V|intrans|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Subcat=Intr|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V|intrans|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Intr|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V|intrans|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Subcat=Intr|Tense=Past|VerbForm=Fin": {
POS: VERB
},
"V|intrans|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Subcat=Intr|Tense=Past|VerbForm=Fin": {
POS: VERB
},
"V|intrans|tegdw|onverv__Subcat=Intr|Tense=Pres|VerbForm=Part": {POS: VERB}, "V|intrans|tegdw|onverv__Subcat=Intr|Tense=Pres|VerbForm=Part": {POS: VERB},
"V|intrans|tegdw|vervmv__Number=Plur|Subcat=Intr|Tense=Pres|VerbForm=Part": {POS: VERB}, "V|intrans|tegdw|vervmv__Number=Plur|Subcat=Intr|Tense=Pres|VerbForm=Part": {
"V|intrans|tegdw|vervneut__Case=Nom|Subcat=Intr|Tense=Pres|VerbForm=Part": {POS: VERB}, POS: VERB
"V|intrans|tegdw|vervvergr__Degree=Cmp|Subcat=Intr|Tense=Pres|VerbForm=Part": {POS: VERB}, },
"V|intrans|tegdw|vervneut__Case=Nom|Subcat=Intr|Tense=Pres|VerbForm=Part": {
POS: VERB
},
"V|intrans|tegdw|vervvergr__Degree=Cmp|Subcat=Intr|Tense=Pres|VerbForm=Part": {
POS: VERB
},
"V|intrans|verldw|onverv__Subcat=Intr|Tense=Past|VerbForm=Part": {POS: VERB}, "V|intrans|verldw|onverv__Subcat=Intr|Tense=Past|VerbForm=Part": {POS: VERB},
"V|intrans|verldw|vervmv__Number=Plur|Subcat=Intr|Tense=Past|VerbForm=Part": {POS: VERB}, "V|intrans|verldw|vervmv__Number=Plur|Subcat=Intr|Tense=Past|VerbForm=Part": {
"V|intrans|verldw|vervneut__Case=Nom|Subcat=Intr|Tense=Past|VerbForm=Part": {POS: VERB}, POS: VERB
},
"V|intrans|verldw|vervneut__Case=Nom|Subcat=Intr|Tense=Past|VerbForm=Part": {
POS: VERB
},
"V|refl|imp__Mood=Imp|Reflex=Yes|VerbForm=Fin": {POS: VERB}, "V|refl|imp__Mood=Imp|Reflex=Yes|VerbForm=Fin": {POS: VERB},
"V|refl|inf__Reflex=Yes|VerbForm=Inf": {POS: VERB}, "V|refl|inf__Reflex=Yes|VerbForm=Inf": {POS: VERB},
"V|refl|inf|subst__Reflex=Yes|VerbForm=Inf": {POS: VERB}, "V|refl|inf|subst__Reflex=Yes|VerbForm=Inf": {POS: VERB},
"V|refl|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Reflex=Yes|Tense=Pres|VerbForm=Fin": {POS: VERB}, "V|refl|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Reflex=Yes|Tense=Pres|VerbForm=Fin": {
"V|refl|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Reflex=Yes|Tense=Pres|VerbForm=Fin": {POS: VERB}, POS: VERB
"V|refl|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Reflex=Yes|Tense=Pres|VerbForm=Fin": {POS: VERB}, },
"V|refl|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Reflex=Yes|Tense=Pres|VerbForm=Fin": {POS: VERB}, "V|refl|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Reflex=Yes|Tense=Pres|VerbForm=Fin": {
"V|refl|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Reflex=Yes|Tense=Past|VerbForm=Fin": {POS: VERB}, POS: VERB
"V|refl|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Reflex=Yes|Tense=Past|VerbForm=Fin": {POS: VERB}, },
"V|refl|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Reflex=Yes|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V|refl|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Reflex=Yes|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V|refl|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Reflex=Yes|Tense=Past|VerbForm=Fin": {
POS: VERB
},
"V|refl|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Reflex=Yes|Tense=Past|VerbForm=Fin": {
POS: VERB
},
"V|refl|tegdw|vervneut__Case=Nom|Reflex=Yes|Tense=Pres|VerbForm=Part": {POS: VERB}, "V|refl|tegdw|vervneut__Case=Nom|Reflex=Yes|Tense=Pres|VerbForm=Part": {POS: VERB},
"V|refl|verldw|onverv__Reflex=Yes|Tense=Past|VerbForm=Part": {POS: VERB}, "V|refl|verldw|onverv__Reflex=Yes|Tense=Past|VerbForm=Part": {POS: VERB},
"V|trans|conj__Mood=Sub|Subcat=Tran|VerbForm=Fin": {POS: VERB}, "V|trans|conj__Mood=Sub|Subcat=Tran|VerbForm=Fin": {POS: VERB},
"V|trans|imp__Mood=Imp|Subcat=Tran|VerbForm=Fin": {POS: VERB}, "V|trans|imp__Mood=Imp|Subcat=Tran|VerbForm=Fin": {POS: VERB},
"V|trans|inf__Subcat=Tran|VerbForm=Inf": {POS: VERB}, "V|trans|inf__Subcat=Tran|VerbForm=Inf": {POS: VERB},
"V|trans|inf|subst__Subcat=Tran|VerbForm=Inf": {POS: VERB}, "V|trans|inf|subst__Subcat=Tran|VerbForm=Inf": {POS: VERB},
"V|trans|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: VERB}, "V|trans|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
"V|trans|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: VERB}, POS: VERB
"V|trans|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: VERB}, },
"V|trans|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: VERB}, "V|trans|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
"V|trans|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Subcat=Tran|Tense=Past|VerbForm=Fin": {POS: VERB}, POS: VERB
"V|trans|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Subcat=Tran|Tense=Past|VerbForm=Fin": {POS: VERB}, },
"V|trans|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V|trans|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V|trans|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Subcat=Tran|Tense=Past|VerbForm=Fin": {
POS: VERB
},
"V|trans|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Subcat=Tran|Tense=Past|VerbForm=Fin": {
POS: VERB
},
"V|trans|tegdw|onverv__Subcat=Tran|Tense=Pres|VerbForm=Part": {POS: VERB}, "V|trans|tegdw|onverv__Subcat=Tran|Tense=Pres|VerbForm=Part": {POS: VERB},
"V|trans|tegdw|vervneut__Case=Nom|Subcat=Tran|Tense=Pres|VerbForm=Part": {POS: VERB}, "V|trans|tegdw|vervneut__Case=Nom|Subcat=Tran|Tense=Pres|VerbForm=Part": {
POS: VERB
},
"V|trans|verldw|onverv__Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB}, "V|trans|verldw|onverv__Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
"V|trans|verldw|vervmv__Number=Plur|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB}, "V|trans|verldw|vervmv__Number=Plur|Subcat=Tran|Tense=Past|VerbForm=Part": {
"V|trans|verldw|vervneut__Case=Nom|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB}, POS: VERB
"V|trans|verldw|vervvergr__Degree=Cmp|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB}, },
"X__Aspect=Imp|Definite=Def|Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|VerbType=Mod": {POS: X}, "V|trans|verldw|vervneut__Case=Nom|Subcat=Tran|Tense=Past|VerbForm=Part": {
"X__Aspect=Imp|Definite=Def|Mood=Ind|Number=Sing|Person=3|PronType=Ind|Tense=Pres|VerbForm=Fin": {POS: X}, POS: VERB
"X__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: X}, },
"X__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Tense=Past|VerbForm=Part": {POS: X}, "V|trans|verldw|vervvergr__Degree=Cmp|Subcat=Tran|Tense=Past|VerbForm=Part": {
"X__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Inf": {POS: X}, POS: VERB
},
"X__Aspect=Imp|Definite=Def|Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|VerbType=Mod": {
POS: X
},
"X__Aspect=Imp|Definite=Def|Mood=Ind|Number=Sing|Person=3|PronType=Ind|Tense=Pres|VerbForm=Fin": {
POS: X
},
"X__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
POS: X
},
"X__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Tense=Past|VerbForm=Part": {
POS: X
},
"X__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Inf": {
POS: X
},
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {POS: X}, "X__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {POS: X},
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|PronType=Dem|Tense=Pres|VerbForm=Fin": {POS: X}, "X__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|PronType=Dem|Tense=Pres|VerbForm=Fin": {
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|PronType=Rel|Tense=Pres|VerbForm=Fin": {POS: X}, POS: X
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: X}, },
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|PronType=Ind|Tense=Pres|VerbForm=Fin": {POS: X}, "X__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|PronType=Rel|Tense=Pres|VerbForm=Fin": {
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: X}, POS: X
},
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
POS: X
},
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|PronType=Ind|Tense=Pres|VerbForm=Fin": {
POS: X
},
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
POS: X
},
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {POS: X}, "X__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {POS: X},
"X__Aspect=Imp|Mood=Ind|Number=Sing|Subcat=Intr|Tense=Pres|VerbForm=Inf": {POS: X}, "X__Aspect=Imp|Mood=Ind|Number=Sing|Subcat=Intr|Tense=Pres|VerbForm=Inf": {POS: X},
"X__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin": {POS: X}, "X__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin": {POS: X},
@ -808,5 +1024,5 @@ TAG_MAP = {
"X__VerbForm=Inf|VerbType=Mod": {POS: X}, "X__VerbForm=Inf|VerbType=Mod": {POS: X},
"X__VerbType=Aux,Cop": {POS: X}, "X__VerbType=Aux,Cop": {POS: X},
"X___": {POS: X}, "X___": {POS: X},
"_SP": {POS: SPACE} "_SP": {POS: SPACE},
} }

File diff suppressed because it is too large

@ -5039,5 +5039,5 @@ TAG_MAP = {
"punc": {POS: PUNCT}, "punc": {POS: PUNCT},
"v-pcp|M|P": {POS: VERB}, "v-pcp|M|P": {POS: VERB},
"v-pcp|M|S": {POS: VERB}, "v-pcp|M|S": {POS: VERB},
"_SP": {POS: SPACE} "_SP": {POS: SPACE},
} }
@ -39,7 +39,9 @@ _infixes = (
+ LIST_ICONS + LIST_ICONS
+ [ + [
r"(?<=[0-9])[+\-\*^](?=[0-9-])", r"(?<=[0-9])[+\-\*^](?=[0-9-])",
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES), r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS), r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
@ -19,7 +19,6 @@ _abbrev_exc = [
{ORTH: "вс", LEMMA: "воскресенье", NORM: "воскресенье"}, {ORTH: "вс", LEMMA: "воскресенье", NORM: "воскресенье"},
{ORTH: "вскр", LEMMA: "воскресенье", NORM: "воскресенье"}, {ORTH: "вскр", LEMMA: "воскресенье", NORM: "воскресенье"},
{ORTH: "воскр", LEMMA: "воскресенье", NORM: "воскресенье"}, {ORTH: "воскр", LEMMA: "воскресенье", NORM: "воскресенье"},
# Months abbreviations # Months abbreviations
{ORTH: "янв", LEMMA: "январь", NORM: "январь"}, {ORTH: "янв", LEMMA: "январь", NORM: "январь"},
{ORTH: "фев", LEMMA: "февраль", NORM: "февраль"}, {ORTH: "фев", LEMMA: "февраль", NORM: "февраль"},
@ -49,16 +48,18 @@ for abbrev_desc in _abbrev_exc:
abbrev = abbrev_desc[ORTH] abbrev = abbrev_desc[ORTH]
for orth in (abbrev, abbrev.capitalize(), abbrev.upper()): for orth in (abbrev, abbrev.capitalize(), abbrev.upper()):
_exc[orth] = [{ORTH: orth, LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}] _exc[orth] = [{ORTH: orth, LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}]
_exc[orth + '.'] = [{ORTH: orth + '.', LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}] _exc[orth + "."] = [
{ORTH: orth + ".", LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}
]
_slang_exc = [ _slang_exc = [
{ORTH: '2к15', LEMMA: '2015', NORM: '2015'}, {ORTH: "2к15", LEMMA: "2015", NORM: "2015"},
{ORTH: '2к16', LEMMA: '2016', NORM: '2016'}, {ORTH: "2к16", LEMMA: "2016", NORM: "2016"},
{ORTH: '2к17', LEMMA: '2017', NORM: '2017'}, {ORTH: "2к17", LEMMA: "2017", NORM: "2017"},
{ORTH: '2к18', LEMMA: '2018', NORM: '2018'}, {ORTH: "2к18", LEMMA: "2018", NORM: "2018"},
{ORTH: '2к19', LEMMA: '2019', NORM: '2019'}, {ORTH: "2к19", LEMMA: "2019", NORM: "2019"},
{ORTH: '2к20', LEMMA: '2020', NORM: '2020'}, {ORTH: "2к20", LEMMA: "2020", NORM: "2020"},
] ]
for slang_desc in _slang_exc: for slang_desc in _slang_exc:
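These exception tables pair an ORTH with its LEMMA/NORM so the tokenizer keeps the string as a single token while normalising it. A minimal sketch of the same mechanism at runtime, assuming a spaCy 2.x English pipeline and a made-up abbreviation (not an entry from this file):

    from spacy.lang.en import English
    from spacy.symbols import ORTH, NORM

    nlp = English()
    # Hypothetical abbreviation, same ORTH/NORM shape as the entries above
    nlp.tokenizer.add_special_case("approx.", [{ORTH: "approx.", NORM: "approximately"}])
    doc = nlp("It takes approx. two hours.")
    print([t.text for t in doc])  # "approx." stays a single token
    print(doc[2].norm_)           # "approximately"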
@ -15,7 +15,7 @@ _infixes = (
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA), r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=CONCAT_QUOTES), r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=CONCAT_QUOTES),
r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=_hyphens_no_dash), r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=_hyphens_no_dash),
r"(?<=[0-9])-(?=[0-9])", r"(?<=[0-9])-(?=[0-9])",
] ]
) )
@ -7,7 +7,6 @@ from __future__ import unicode_literals
# Entries should be added in the following format: # Entries should be added in the following format:
LOOKUP = { LOOKUP = {
"آ": "آنا", "آ": "آنا",
"آْباد": "آْباد", "آْباد": "آْباد",
@ -29109,5 +29108,5 @@ LOOKUP = {
"ظالموں": "ظالم", "ظالموں": "ظالم",
"ظلم": "ظلم", "ظلم": "ظلم",
"ظلمو": "ظلم", "ظلمو": "ظلم",
"ظلموں": "ظلم" "ظلموں": "ظلم",
} }
@ -16,5 +16,5 @@ sentences = [
"此外,中文还是联合国正式语文,并被上海合作组织等国际组织采用为官方语言。", "此外,中文还是联合国正式语文,并被上海合作组织等国际组织采用为官方语言。",
"在中国大陆,汉语通称为“汉语”。", "在中国大陆,汉语通称为“汉语”。",
"在联合国、台湾、香港及澳门,通称为“中文”。", "在联合国、台湾、香港及澳门,通称为“中文”。",
"在新加坡及马来西亚,通称为“华语”。" "在新加坡及马来西亚,通称为“华语”。",
] ]
@ -47,7 +47,7 @@ _single_num_words = [
"拾陆", "拾陆",
"拾柒", "拾柒",
"拾捌", "拾捌",
"拾玖" "拾玖",
] ]
_count_num_words = [ _count_num_words = [
@ -68,27 +68,16 @@ _count_num_words = [
"", "",
"", "",
"", "",
"" "",
] ]
_base_num_words = [ _base_num_words = ["", "", "", "", "亿", "", "", "", ""]
"",
"",
"",
"",
"亿",
"",
"",
"",
""
]
def like_num(text): def like_num(text):
if text.startswith(("+", "-", "±", "~")): if text.startswith(("+", "-", "±", "~")):
text = text[1:] text = text[1:]
text = text.replace(",", "").replace( text = text.replace(",", "").replace(".", "").replace("", "").replace("", "")
".", "").replace("", "").replace("", "")
if text.isdigit(): if text.isdigit():
return True return True
if text.count("/") == 1: if text.count("/") == 1:
@ -97,10 +86,12 @@ def like_num(text):
return True return True
if text in _single_num_words: if text in _single_num_words:
return True return True
# fmt: off
if re.match('^((' + '|'.join(_count_num_words) + '){1}' if re.match('^((' + '|'.join(_count_num_words) + '){1}'
+ '(' + '|'.join(_base_num_words) + '){1})+' + '(' + '|'.join(_base_num_words) + '){1})+'
+ '(' + '|'.join(_count_num_words) + ')?$', text): + '(' + '|'.join(_count_num_words) + ')?$', text):
return True return True
# fmt: on
return False return False
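The `# fmt: off` / `# fmt: on` markers added here are autoformatter directives (Black, judging by the double quotes and trailing commas throughout the commit): everything between them keeps its manual layout. A small illustrative sketch, unrelated to the regex above:

    # fmt: off
    table = [
        1,   2,   3,
        10,  20,  30,
    ]
    # fmt: on
    # Without the markers, Black would normalise the aligned rows (e.g. collapse the extra spaces).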
@ -430,6 +430,7 @@ class Language(object):
DOCS: https://spacy.io/api/language#update DOCS: https://spacy.io/api/language#update
""" """
expected_keys = ("words", "tags", "heads", "deps", "entities", "cats", "links")
if len(docs) != len(golds): if len(docs) != len(golds):
raise IndexError(Errors.E009.format(n_docs=len(docs), n_golds=len(golds))) raise IndexError(Errors.E009.format(n_docs=len(docs), n_golds=len(golds)))
if len(docs) == 0: if len(docs) == 0:
@ -445,10 +446,10 @@ class Language(object):
if isinstance(doc, basestring_): if isinstance(doc, basestring_):
doc = self.make_doc(doc) doc = self.make_doc(doc)
if not isinstance(gold, GoldParse): if not isinstance(gold, GoldParse):
expected_keys = ("words", "tags", "heads", "deps", "entities", "cats", "links") unexpected = [k for k in gold if k not in expected_keys]
unexpected_keys = [k for k in gold if k not in expected_keys] if unexpected:
if unexpected_keys: err = Errors.E151.format(unexp=unexpected, exp=expected_keys)
raise ValueError(Errors.E151.format(unexpected_keys=unexpected_keys, expected_keys=expected_keys)) raise ValueError(err)
gold = GoldParse(doc, **gold) gold = GoldParse(doc, **gold)
doc_objs.append(doc) doc_objs.append(doc)
gold_objs.append(gold) gold_objs.append(gold)
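The reshuffled check above validates the keys of a gold-standard dict before it is turned into a GoldParse. A minimal, assumption-laden sketch of the guarded call (spaCy 2.x API, not taken from the commit):

    import spacy

    nlp = spacy.blank("en")
    ner = nlp.create_pipe("ner")
    ner.add_label("ORG")
    nlp.add_pipe(ner)
    optimizer = nlp.begin_training()

    # Allowed top-level keys: words, tags, heads, deps, entities, cats, links
    nlp.update(["Apple is looking at buying a startup."],
               [{"entities": [(0, 5, "ORG")]}], sgd=optimizer)

    # A misspelled key such as "ents" would now raise ValueError with E151,
    # listing the unexpected key alongside the expected ones.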
@ -5,10 +5,10 @@ from __future__ import unicode_literals
import pytest import pytest
@pytest.mark.parametrize('text,lemma', [("aprox.", "aproximadament"), @pytest.mark.parametrize(
("pàg.", "pàgina"), "text,lemma",
("p.ex.", "per exemple") [("aprox.", "aproximadament"), ("pàg.", "pàgina"), ("p.ex.", "per exemple")],
]) )
def test_ca_tokenizer_handles_abbr(ca_tokenizer, text, lemma): def test_ca_tokenizer_handles_abbr(ca_tokenizer, text, lemma):
tokens = ca_tokenizer(text) tokens = ca_tokenizer(text)
assert len(tokens) == 1 assert len(tokens) == 1
@ -21,21 +21,37 @@ def test_ca_tokenizer_handles_long_text(ca_tokenizer):
assert len(tokens) == 138 assert len(tokens) == 138
@pytest.mark.parametrize('text,length', [ @pytest.mark.parametrize(
"text,length",
[
("Perquè va anar-hi?", 6), ("Perquè va anar-hi?", 6),
("“Ah no?”", 5), ("“Ah no?”", 5),
("""Sí! "Anem", va contestar el Joan Carles""", 11), ("""Sí! "Anem", va contestar el Joan Carles""", 11),
("Van córrer aprox. 10km", 5), ("Van córrer aprox. 10km", 5),
("Llavors perqué...", 3)]) ("Llavors perqué...", 3),
],
)
def test_ca_tokenizer_handles_cnts(ca_tokenizer, text, length): def test_ca_tokenizer_handles_cnts(ca_tokenizer, text, length):
tokens = ca_tokenizer(text) tokens = ca_tokenizer(text)
assert len(tokens) == length assert len(tokens) == length
@pytest.mark.parametrize('text,match', [ @pytest.mark.parametrize(
('10', True), ('1', True), ('10,000', True), ('10,00', True), "text,match",
('999.0', True), ('un', True), ('dos', True), ('bilió', True), [
('gos', False), (',', False), ('1/2', True)]) ("10", True),
("1", True),
("10,000", True),
("10,00", True),
("999.0", True),
("un", True),
("dos", True),
("bilió", True),
("gos", False),
(",", False),
("1/2", True),
],
)
def test_ca_lex_attrs_like_number(ca_tokenizer, text, match): def test_ca_lex_attrs_like_number(ca_tokenizer, text, match):
tokens = ca_tokenizer(text) tokens = ca_tokenizer(text)
assert len(tokens) == 1 assert len(tokens) == 1
@ -32,7 +32,7 @@ def test_de_tokenizer_norm_exceptions(de_tokenizer, text, norms):
assert [token.norm_ for token in tokens] == norms assert [token.norm_ for token in tokens] == norms
@pytest.mark.parametrize('text,norm', [("daß", "dass")]) @pytest.mark.parametrize("text,norm", [("daß", "dass")])
def test_de_lex_attrs_norm_exceptions(de_tokenizer, text, norm): def test_de_lex_attrs_norm_exceptions(de_tokenizer, text, norm):
tokens = de_tokenizer(text) tokens = de_tokenizer(text)
assert tokens[0].norm_ == norm assert tokens[0].norm_ == norm
@ -7,33 +7,33 @@ import pytest
@pytest.mark.parametrize( @pytest.mark.parametrize(
"text", "text",
[ [
u"aujourd'hui", "aujourd'hui",
u"Aujourd'hui", "Aujourd'hui",
u"prud'hommes", "prud'hommes",
u"prudhommal", "prudhommal",
u"audio-numérique", "audio-numérique",
u"Audio-numérique", "Audio-numérique",
u"entr'amis", "entr'amis",
u"entr'abat", "entr'abat",
u"rentr'ouvertes", "rentr'ouvertes",
u"grand'hamien", "grand'hamien",
u"Châteauneuf-la-Forêt", "Châteauneuf-la-Forêt",
u"Château-Guibert", "Château-Guibert",
u"11-septembre", "11-septembre",
u"11-Septembre", "11-Septembre",
u"refox-trottâmes", "refox-trottâmes",
# u"K-POP", # u"K-POP",
# u"K-Pop", # u"K-Pop",
# u"K-pop", # u"K-pop",
u"z'yeutes", "z'yeutes",
u"black-outeront", "black-outeront",
u"états-unienne", "états-unienne",
u"courtes-pattes", "courtes-pattes",
u"court-pattes", "court-pattes",
u"saut-de-ski", "saut-de-ski",
u"Écourt-Saint-Quentin", "Écourt-Saint-Quentin",
u"Bout-de-l'Îlien", "Bout-de-l'Îlien",
u"pet-en-l'air", "pet-en-l'air",
], ],
) )
def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text): def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text):
@ -3,13 +3,18 @@ from __future__ import unicode_literals
import pytest import pytest
# fmt: off
@pytest.mark.parametrize("tokens,lemmas", [ TEST_CASES = [
(["Galime", "vadinti", "gerovės", "valstybe", ",", "turime", "išvystytą", "socialinę", "apsaugą", ",", (["Galime", "vadinti", "gerovės", "valstybe", ",", "turime", "išvystytą", "socialinę", "apsaugą", ",",
"sveikatos", "apsaugą", "ir", "prieinamą", "švietimą", "."], "sveikatos", "apsaugą", "ir", "prieinamą", "švietimą", "."],
["galėti", "vadintas", "gerovė", "valstybė", ",", "turėti", "išvystytas", "socialinis", ["galėti", "vadintas", "gerovė", "valstybė", ",", "turėti", "išvystytas", "socialinis",
"apsauga", ",", "sveikata", "apsauga", "ir", "prieinamas", "švietimas", "."]), "apsauga", ",", "sveikata", "apsauga", "ir", "prieinamas", "švietimas", "."]),
(["taip", ",", "uoliai", "tyrinėjau", "ir", "pasirinkau", "geriausią", "variantą", "."], (["taip", ",", "uoliai", "tyrinėjau", "ir", "pasirinkau", "geriausią", "variantą", "."],
["taip", ",", "uolus", "tyrinėti", "ir", "pasirinkti", "geras", "variantas", "."])]) ["taip", ",", "uolus", "tyrinėti", "ir", "pasirinkti", "geras", "variantas", "."])
]
# fmt: on
@pytest.mark.parametrize("tokens,lemmas", TEST_CASES)
def test_lt_lemmatizer(lt_lemmatizer, tokens, lemmas): def test_lt_lemmatizer(lt_lemmatizer, tokens, lemmas):
assert lemmas == [lt_lemmatizer.lookup(token) for token in tokens] assert lemmas == [lt_lemmatizer.lookup(token) for token in tokens]
@ -7,10 +7,21 @@ from __future__ import unicode_literals
import pytest import pytest
@pytest.mark.parametrize('text,match', [ @pytest.mark.parametrize(
('10', True), ('1', True), ('10,000', True), ('10,00', True), "text,match",
('jeden', True), ('dwa', True), ('milion', True), [
('pies', False), (',', False), ('1/2', True)]) ("10", True),
("1", True),
("10,000", True),
("10,00", True),
("jeden", True),
("dwa", True),
("milion", True),
("pies", False),
(",", False),
("1/2", True),
],
)
def test_lex_attrs_like_number(pl_tokenizer, text, match): def test_lex_attrs_like_number(pl_tokenizer, text, match):
tokens = pl_tokenizer(text) tokens = pl_tokenizer(text)
assert len(tokens) == 1 assert len(tokens) == 1
@ -4,9 +4,7 @@ from __future__ import unicode_literals
import pytest import pytest
@pytest.mark.parametrize( @pytest.mark.parametrize("text", ["ہےں۔", "کیا۔"])
"text", ['ہےں۔', 'کیا۔']
)
def test_contractions(ur_tokenizer, text): def test_contractions(ur_tokenizer, text):
"""Test specific Urdu punctuation character""" """Test specific Urdu punctuation character"""
tokens = ur_tokenizer(text) tokens = ur_tokenizer(text)
@ -134,12 +134,12 @@ def test_matcher_end_zero_plus(en_vocab):
def test_matcher_sets_return_correct_tokens(en_vocab): def test_matcher_sets_return_correct_tokens(en_vocab):
matcher = Matcher(en_vocab) matcher = Matcher(en_vocab)
patterns = [ patterns = [
[{'LOWER': {'IN': ["zero"]}}], [{"LOWER": {"IN": ["zero"]}}],
[{'LOWER': {'IN': ["one"]}}], [{"LOWER": {"IN": ["one"]}}],
[{'LOWER': {'IN': ["two"]}}], [{"LOWER": {"IN": ["two"]}}],
] ]
matcher.add('TEST', None, *patterns) matcher.add("TEST", None, *patterns)
doc = Doc(en_vocab, words="zero one two three".split()) doc = Doc(en_vocab, words="zero one two three".split())
matches = matcher(doc) matches = matcher(doc)
texts = [Span(doc, s, e, label=L).text for L, s, e in matches] texts = [Span(doc, s, e, label=L).text for L, s, e in matches]
assert texts == ['zero', 'one', 'two'] assert texts == ["zero", "one", "two"]
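A usage sketch of the `IN` set operator exercised by this test, outside the fixture; assumes a blank English pipeline, and the pattern key "NUMWORD" is illustrative:

    from spacy.lang.en import English
    from spacy.matcher import Matcher

    nlp = English()
    matcher = Matcher(nlp.vocab)
    matcher.add("NUMWORD", None, [{"LOWER": {"IN": ["zero", "one", "two"]}}])
    doc = nlp("Zero one two three")
    for match_id, start, end in matcher(doc):
        print(doc[start:end].text)  # Zero / one / two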
@ -52,7 +52,9 @@ def test_get_pipe(nlp, name):
assert nlp.get_pipe(name) == new_pipe assert nlp.get_pipe(name) == new_pipe
@pytest.mark.parametrize("name,replacement,not_callable", [("my_component", lambda doc: doc, {})]) @pytest.mark.parametrize(
"name,replacement,not_callable", [("my_component", lambda doc: doc, {})]
)
def test_replace_pipe(nlp, name, replacement, not_callable): def test_replace_pipe(nlp, name, replacement, not_callable):
with pytest.raises(ValueError): with pytest.raises(ValueError):
nlp.replace_pipe(name, new_pipe) nlp.replace_pipe(name, new_pipe)
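A sketch of the API under test, with a trivial pipeline component instead of the fixture (spaCy 2.x component functions assumed):

    from spacy.lang.en import English

    nlp = English()
    nlp.add_pipe(lambda doc: doc, name="my_component")
    nlp.replace_pipe("my_component", lambda doc: doc)  # fine: the replacement is callable
    # nlp.replace_pipe("my_component", {})             # raises ValueError, as asserted above
    print(nlp.pipe_names)  # ['my_component']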
@ -358,7 +358,9 @@ def test_issue850_basic():
assert end == 4 assert end == 4
@pytest.mark.skip(reason="French exception list is not enabled in the default tokenizer anymore") @pytest.mark.skip(
reason="French exception list is not enabled in the default tokenizer anymore"
)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"text", ["au-delàs", "pair-programmâmes", "terra-formées", "σ-compacts"] "text", ["au-delàs", "pair-programmâmes", "terra-formées", "σ-compacts"]
) )
@ -19,7 +19,7 @@ from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part
def test_issue1235(): def test_issue1235():
"""Test that g is not split of if preceded by a number and a letter""" """Test that g is not split of if preceded by a number and a letter"""
nlp = English() nlp = English()
testwords = u'e2g 2g 52g' testwords = "e2g 2g 52g"
doc = nlp(testwords) doc = nlp(testwords)
assert len(doc) == 5 assert len(doc) == 5
assert doc[0].text == "e2g" assert doc[0].text == "e2g"
@ -4,15 +4,7 @@ from __future__ import unicode_literals
import pytest import pytest
@pytest.mark.parametrize( @pytest.mark.parametrize("word", ["don't", "dont", "I'd", "Id"])
"word",
[
"don't",
"dont",
"I'd",
"Id",
],
)
def test_issue3521(en_tokenizer, word): def test_issue3521(en_tokenizer, word):
tok = en_tokenizer(word)[1] tok = en_tokenizer(word)[1]
# 'not' and 'would' should be stopwords, also in their abbreviated forms # 'not' and 'would' should be stopwords, also in their abbreviated forms
@ -9,7 +9,10 @@ import numpy as np
def test_issue3540(en_vocab): def test_issue3540(en_vocab):
words = ["I", "live", "in", "NewYork", "right", "now"] words = ["I", "live", "in", "NewYork", "right", "now"]
tensor = np.asarray([[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]], dtype="f") tensor = np.asarray(
[[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
dtype="f",
)
doc = Doc(en_vocab, words=words) doc = Doc(en_vocab, words=words)
doc.tensor = tensor doc.tensor = tensor
@ -25,7 +28,7 @@ def test_issue3540(en_vocab):
with doc.retokenize() as retokenizer: with doc.retokenize() as retokenizer:
heads = [(doc[3], 1), doc[2]] heads = [(doc[3], 1), doc[2]]
attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]} attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
retokenizer.split(doc[3], [u"New", u"York"], heads=heads, attrs=attrs) retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
gold_text = ["I", "live", "in", "New", "York", "right", "now"] gold_text = ["I", "live", "in", "New", "York", "right", "now"]
assert [token.text for token in doc] == gold_text assert [token.text for token in doc] == gold_text
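A sketch of the retokenizer split being tested, on a freshly tokenized doc rather than the fixture; heads and attrs are copied from the test above (spaCy 2.1+):

    from spacy.lang.en import English

    nlp = English()
    doc = nlp("I live in NewYork right now")
    with doc.retokenize() as retokenizer:
        heads = [(doc[3], 1), doc[2]]  # "New" attaches to "York", "York" to "in"
        attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
        retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
    print([t.text for t in doc])  # ['I', 'live', 'in', 'New', 'York', 'right', 'now']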
@ -35,7 +35,9 @@ def test_issue3962(doc):
doc2_json = doc2.to_json() doc2_json = doc2.to_json()
assert doc2_json assert doc2_json
assert doc2[0].head.text == "jests" # head set to itself, being the new artificial root assert (
doc2[0].head.text == "jests"
) # head set to itself, being the new artificial root
assert doc2[0].dep_ == "dep" assert doc2[0].dep_ == "dep"
assert doc2[1].head.text == "jests" assert doc2[1].head.text == "jests"
assert doc2[1].dep_ == "prep" assert doc2[1].dep_ == "prep"
@ -92,7 +94,9 @@ def test_issue3962_long(two_sent_doc):
doc2_json = doc2.to_json() doc2_json = doc2.to_json()
assert doc2_json assert doc2_json
assert doc2[0].head.text == "jests" # head set to itself, being the new artificial root (in sentence 1) assert (
doc2[0].head.text == "jests"
) # head set to itself, being the new artificial root (in sentence 1)
assert doc2[0].dep_ == "ROOT" assert doc2[0].dep_ == "ROOT"
assert doc2[1].head.text == "jests" assert doc2[1].head.text == "jests"
assert doc2[1].dep_ == "prep" assert doc2[1].dep_ == "prep"
@ -100,9 +104,13 @@ def test_issue3962_long(two_sent_doc):
assert doc2[2].dep_ == "pobj" assert doc2[2].dep_ == "pobj"
assert doc2[3].head.text == "jests" assert doc2[3].head.text == "jests"
assert doc2[3].dep_ == "punct" assert doc2[3].dep_ == "punct"
assert doc2[4].head.text == "They" # head set to itself, being the new artificial root (in sentence 2) assert (
doc2[4].head.text == "They"
) # head set to itself, being the new artificial root (in sentence 2)
assert doc2[4].dep_ == "dep" assert doc2[4].dep_ == "dep"
assert doc2[4].head.text == "They" # head set to the new artificial head (in sentence 2) assert (
doc2[4].head.text == "They"
) # head set to the new artificial head (in sentence 2)
assert doc2[4].dep_ == "dep" assert doc2[4].dep_ == "dep"
# We should still have 2 sentences # We should still have 2 sentences
@ -30,14 +30,18 @@ def test_serialize_kb_disk(en_vocab):
def _get_dummy_kb(vocab): def _get_dummy_kb(vocab):
kb = KnowledgeBase(vocab=vocab, entity_vector_length=3) kb = KnowledgeBase(vocab=vocab, entity_vector_length=3)
kb.add_entity(entity='Q53', freq=33, entity_vector=[0, 5, 3]) kb.add_entity(entity="Q53", freq=33, entity_vector=[0, 5, 3])
kb.add_entity(entity='Q17', freq=2, entity_vector=[7, 1, 0]) kb.add_entity(entity="Q17", freq=2, entity_vector=[7, 1, 0])
kb.add_entity(entity='Q007', freq=7, entity_vector=[0, 0, 7]) kb.add_entity(entity="Q007", freq=7, entity_vector=[0, 0, 7])
kb.add_entity(entity='Q44', freq=342, entity_vector=[4, 4, 4]) kb.add_entity(entity="Q44", freq=342, entity_vector=[4, 4, 4])
kb.add_alias(alias='double07', entities=['Q17', 'Q007'], probabilities=[0.1, 0.9]) kb.add_alias(alias="double07", entities=["Q17", "Q007"], probabilities=[0.1, 0.9])
kb.add_alias(alias='guy', entities=['Q53', 'Q007', 'Q17', 'Q44'], probabilities=[0.3, 0.3, 0.2, 0.1]) kb.add_alias(
kb.add_alias(alias='random', entities=['Q007'], probabilities=[1.0]) alias="guy",
entities=["Q53", "Q007", "Q17", "Q44"],
probabilities=[0.3, 0.3, 0.2, 0.1],
)
kb.add_alias(alias="random", entities=["Q007"], probabilities=[1.0])
return kb return kb
@ -45,30 +49,30 @@ def _get_dummy_kb(vocab):
def _check_kb(kb): def _check_kb(kb):
# check entities # check entities
assert kb.get_size_entities() == 4 assert kb.get_size_entities() == 4
for entity_string in ['Q53', 'Q17', 'Q007', 'Q44']: for entity_string in ["Q53", "Q17", "Q007", "Q44"]:
assert entity_string in kb.get_entity_strings() assert entity_string in kb.get_entity_strings()
for entity_string in ['', 'Q0']: for entity_string in ["", "Q0"]:
assert entity_string not in kb.get_entity_strings() assert entity_string not in kb.get_entity_strings()
# check aliases # check aliases
assert kb.get_size_aliases() == 3 assert kb.get_size_aliases() == 3
for alias_string in ['double07', 'guy', 'random']: for alias_string in ["double07", "guy", "random"]:
assert alias_string in kb.get_alias_strings() assert alias_string in kb.get_alias_strings()
for alias_string in ['nothingness', '', 'randomnoise']: for alias_string in ["nothingness", "", "randomnoise"]:
assert alias_string not in kb.get_alias_strings() assert alias_string not in kb.get_alias_strings()
# check candidates & probabilities # check candidates & probabilities
candidates = sorted(kb.get_candidates('double07'), key=lambda x: x.entity_) candidates = sorted(kb.get_candidates("double07"), key=lambda x: x.entity_)
assert len(candidates) == 2 assert len(candidates) == 2
assert candidates[0].entity_ == 'Q007' assert candidates[0].entity_ == "Q007"
assert 6.999 < candidates[0].entity_freq < 7.01 assert 6.999 < candidates[0].entity_freq < 7.01
assert candidates[0].entity_vector == [0, 0, 7] assert candidates[0].entity_vector == [0, 0, 7]
assert candidates[0].alias_ == 'double07' assert candidates[0].alias_ == "double07"
assert 0.899 < candidates[0].prior_prob < 0.901 assert 0.899 < candidates[0].prior_prob < 0.901
assert candidates[1].entity_ == 'Q17' assert candidates[1].entity_ == "Q17"
assert 1.99 < candidates[1].entity_freq < 2.01 assert 1.99 < candidates[1].entity_freq < 2.01
assert candidates[1].entity_vector == [7, 1, 0] assert candidates[1].entity_vector == [7, 1, 0]
assert candidates[1].alias_ == 'double07' assert candidates[1].alias_ == "double07"
assert 0.099 < candidates[1].prior_prob < 0.101 assert 0.099 < candidates[1].prior_prob < 0.101
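The helper above exercises the experimental KnowledgeBase API; a condensed standalone sketch using the same calls (spaCy 2.2-era API, empty Vocab assumed; entity IDs are illustrative):

    from spacy.kb import KnowledgeBase
    from spacy.vocab import Vocab

    kb = KnowledgeBase(vocab=Vocab(), entity_vector_length=3)
    kb.add_entity(entity="Q17", freq=2, entity_vector=[7, 1, 0])
    kb.add_entity(entity="Q007", freq=7, entity_vector=[0, 0, 7])
    kb.add_alias(alias="double07", entities=["Q17", "Q007"], probabilities=[0.1, 0.9])
    for cand in kb.get_candidates("double07"):
        print(cand.entity_, cand.prior_prob)  # Q17 0.1 and Q007 0.9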