Tidy up and auto-format

This commit is contained in:
Ines Montani 2019-08-20 17:36:34 +02:00
parent 364aaf5bc2
commit f580302673
69 changed files with 83201 additions and 82191 deletions

View File

@ -430,8 +430,7 @@ class Errors(object):
E150 = ("The language of the `nlp` object and the `vocab` should be the "
"same, but found '{nlp}' and '{vocab}' respectively.")
E151 = ("Trying to call nlp.update without required annotation types. "
"Expected top-level keys: {expected_keys}."
" Got: {unexpected_keys}.")
"Expected top-level keys: {exp}. Got: {unexp}.")
E152 = ("The `nlp` object should have a pre-trained `ner` component.")
E153 = ("Either provide a path to a preprocessed training directory, "
"or to the original Wikipedia XML dump.")

View File

@ -10,8 +10,4 @@ Example sentences to test spaCy and its language models.
"""
sentences = [
'তুই খুব ভালো',
'আজ আমরা ডাক্তার দেখতে যাবো',
'আমি জানি না '
]
sentences = ["তুই খুব ভালো", "আজ আমরা ডাক্তার দেখতে যাবো", "আমি জানি না "]

View File

@ -22,7 +22,9 @@ _suffixes = (
r"(?<=°[FfCcKk])\.",
r"(?<=[0-9])(?:[{c}])".format(c=_currency),
r"(?<=[0-9])(?:{u})".format(u=UNITS),
r"(?<=[{al}{e}{q}(?:{c})])\.".format(al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency),
r"(?<=[{al}{e}{q}(?:{c})])\.".format(
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
),
]
)
@ -35,8 +37,8 @@ _infixes = (
),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])({h})(?=[{ae}])".format(a=ALPHA, h=HYPHENS, ae=""),
r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
r'(?<=[{a}])[:<>=/](?=[{a}])'.format(a=ALPHA),
r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
]
)

View File

@ -13,7 +13,7 @@ _infixes = (
+ [
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
r'(?<=[{a}])[:<>=](?=[{a}])'.format(a=ALPHA),
r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),

View File

@ -59,7 +59,9 @@ _suffixes = (
r"([0-9])+\&", # 12&
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
r"(?<=[0-9])(?:{u})".format(u=UNITS),
r"(?<=[0-9{al}{e}(?:{q})])\.".format(al=ALPHA_LOWER, e=r"²\-\+", q=CONCAT_QUOTES),
r"(?<=[0-9{al}{e}(?:{q})])\.".format(
al=ALPHA_LOWER, e=r"²\-\+", q=CONCAT_QUOTES
),
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
r"(?<=[Α-Ωα-ωίϊΐόάέύϋΰήώ])\-", # όνομα-
r"(?<=[Α-Ωα-ωίϊΐόάέύϋΰήώ])\.",
@ -87,8 +89,8 @@ _infixes = (
r"([a-zA-Z]+)(\-([a-zA-Z]+))+", # abc-abc
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
r'(?<=[{a}])[:<>=/](?=[{a}])'.format(a=ALPHA),
r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
]
)

View File

@ -27,5 +27,5 @@ ADVERBS_IRREG = {
"slower": ("slow",),
"slowest": ("slowest",),
"sooner": ("soon",),
"soonest": ("soon",)
"soonest": ("soon",),
}

View File

@ -44,7 +44,7 @@ NOUNS_IRREG = {
"allodia": ("allodium",),
"alluvia": ("alluvium",),
"alodia": ("alodium",),
"alto-relievos": ("alto-relievo", "alto-rilievo",),
"alto-relievos": ("alto-relievo", "alto-rilievo"),
"altocumuli": ("altocumulus",),
"altostrati": ("altostratus",),
"alulae": ("alula",),
@ -81,7 +81,7 @@ NOUNS_IRREG = {
"anamorphoses": ("anamorphosis",),
"anastomoses": ("anastomosis",),
"anatyxes": ("anaptyxis",),
"ancones": ("ancon", "ancone",),
"ancones": ("ancon", "ancone"),
"androclinia": ("androclinium",),
"androecia": ("androecium",),
"androsphinges": ("androsphinx",),
@ -90,7 +90,7 @@ NOUNS_IRREG = {
"angiomata": ("angioma",),
"animalcula": ("animalculum",),
"anlagen": ("anlage",),
"annattos": ("anatto", "annatto",),
"annattos": ("anatto", "annatto"),
"annuli": ("annulus",),
"antae": ("anta",),
"antalkalies": ("antalkali",),
@ -158,7 +158,7 @@ NOUNS_IRREG = {
"aspergilli": ("aspergillus",),
"aspergilloses": ("aspergillosis",),
"aspersoria": ("aspersorium",),
"assegais": ("assagai", "assegai",),
"assegais": ("assagai", "assegai"),
"astragali": ("astragalus",),
"asyndeta": ("asyndeton",),
"atheromata": ("atheroma",),
@ -172,15 +172,15 @@ NOUNS_IRREG = {
"aurei": ("aureus",),
"auriculae": ("auricula",),
"aurorae": ("aurora",),
"auspices": ("auspex", "auspice",),
"auspices": ("auspex", "auspice"),
"autocatalyses": ("autocatalysis",),
"autochthones": ("autochthon",),
"automata": ("automaton",),
"autos-da-fe": ("auto-da-fe",),
"avitaminoses": ("avitaminosis",),
"axes": ("ax", "axis",),
"axes": ("ax", "axis"),
"axillae": ("axilla",),
"bacchantes": ("bacchant", "bacchante",),
"bacchantes": ("bacchant", "bacchante"),
"bacchii": ("bacchius",),
"bacilli": ("bacillus",),
"bacteriostases": ("bacteriostasis",),
@ -195,7 +195,7 @@ NOUNS_IRREG = {
"banjoes": ("banjo",),
"barklice": ("barklouse",),
"barramundies": ("barramundi",),
"bases": ("base", "basis",),
"bases": ("base", "basis"),
"bases-on-balls": ("base_on_balls",),
"bases_on_balls": ("base_on_balls",),
"basidia": ("basidium",),
@ -204,15 +204,15 @@ NOUNS_IRREG = {
"bastinadoes": ("bastinado",),
"bateaux": ("bateau",),
"batfishes": ("batfish",),
"beadsmen": ("beadsman", "bedesman",),
"beadsmen": ("beadsman", "bedesman"),
"beaux": ("beau",),
"beches-de-mer": ("beche-de-mer",),
"beeves": ("beef",),
"behooves": ("behoof",),
"bersaglieri": ("bersagliere",),
"bhishties": ("bheesty", "bhishti",),
"bhishties": ("bheesty", "bhishti"),
"bibliothecae": ("bibliotheca",),
"bicennaries": ("bicentenary", "bicentennial",),
"bicennaries": ("bicentenary", "bicentennial"),
"bijoux": ("bijou",),
"bilboes": ("bilbo",),
"billets-doux": ("billet-doux",),
@ -245,7 +245,7 @@ NOUNS_IRREG = {
"brachia": ("brachium",),
"brainchildren": ("brainchild",),
"branchiae": ("branchia",),
"brants": ("brant", "brent",),
"brants": ("brant", "brent"),
"bravadoes": ("bravado",),
"bravoes": ("bravo",),
"bregmata": ("bregma",),
@ -275,7 +275,7 @@ NOUNS_IRREG = {
"caesurae": ("caesura",),
"calami": ("calamus",),
"calathi": ("calathus",),
"calcanei": ("calcaneum", "calcaneus",),
"calcanei": ("calcaneum", "calcaneus"),
"calces": ("calx",),
"calculi": ("calculus",),
"caldaria": ("caldarium",),
@ -421,7 +421,7 @@ NOUNS_IRREG = {
"comae": ("coma",),
"comatulae": ("comatula",),
"comedones": ("comedo",),
"comics": ("comic_strip", "comic",),
"comics": ("comic_strip", "comic"),
"commandoes": ("commando",),
"concertanti": ("concertante",),
"concerti": ("concerto",),
@ -549,11 +549,11 @@ NOUNS_IRREG = {
"diplococci": ("diplococcus",),
"directors-general": ("director-general",),
"disci": ("discus",),
"discoboli": ("discobolos", "discobolus",),
"discoboli": ("discobolos", "discobolus"),
"dive": ("diva",),
"diverticula": ("diverticulum",),
"divertimenti": ("divertimento",),
"djinn": ("djinni", "djinny",),
"djinn": ("djinni", "djinny"),
"dodoes": ("dodo",),
"dogfishes": ("dogfish",),
"dogmata": ("dogma",),
@ -593,7 +593,7 @@ NOUNS_IRREG = {
"ellipses": ("ellipsis",),
"eluvia": ("eluvium",),
"elves": ("elf",),
"elytra": ("elytron", "elytrum",),
"elytra": ("elytron", "elytrum"),
"embargoes": ("embargo",),
"emboli": ("embolus",),
"emphases": ("emphasis",),
@ -623,7 +623,7 @@ NOUNS_IRREG = {
"entases": ("entasis",),
"entera": ("enteron",),
"entia": ("ens",),
"entozoa": ("entozoan", "entozoon",),
"entozoa": ("entozoan", "entozoon"),
"epencephala": ("epencephalon",),
"epentheses": ("epenthesis",),
"epexegeses": ("epexegesis",),
@ -643,10 +643,10 @@ NOUNS_IRREG = {
"epiphenomena": ("epiphenomenon",),
"epiphyses": ("epiphysis",),
"episterna": ("episternum",),
"epithalamia": ("epithalamion", "epithalamium",),
"epithalamia": ("epithalamion", "epithalamium"),
"epithelia": ("epithelium",),
"epitheliomata": ("epithelioma",),
"epizoa": ("epizoan", "epizoon",),
"epizoa": ("epizoan", "epizoon"),
"epyllia": ("epyllion",),
"equilibria": ("equilibrium",),
"equiseta": ("equisetum",),
@ -845,11 +845,11 @@ NOUNS_IRREG = {
"groszy": ("grosz",),
"grottoes": ("grotto",),
"guilder": ("guilde",),
"guilders": ("guilde", "guilder",),
"guilders": ("guilde", "guilder"),
"guitarfishes": ("guitarfish",),
"gummata": ("gumma",),
"gurnard": ("gurnar",),
"gurnards": ("gurnar", "gurnard",),
"gurnards": ("gurnar", "gurnard"),
"guttae": ("gutta",),
"gymnasia": ("gymnasium",),
"gynaecea": ("gynaeceum",),
@ -870,7 +870,7 @@ NOUNS_IRREG = {
"haeredes": ("haeres",),
"haftaroth": ("haftarah",),
"hagfishes": ("hagfish",),
"haggadas": ("haggada", "haggadah",),
"haggadas": ("haggada", "haggadah"),
"haggadoth": ("haggada",),
"hajjes": ("hajj",),
"haleru": ("haler",),
@ -879,7 +879,7 @@ NOUNS_IRREG = {
"halloth": ("hallah",),
"halluces": ("hallux",),
"haloes": ("halo",),
"halteres": ("halter", "haltere",),
"halteres": ("halter", "haltere"),
"halves": ("half",),
"hamuli": ("hamulus",),
"hangers-on": ("hanger-on",),
@ -909,7 +909,7 @@ NOUNS_IRREG = {
"heraclidae": ("heraclid",),
"heraklidae": ("heraklid",),
"herbaria": ("herbarium",),
"hermae": ("herm", "herma",),
"hermae": ("herm", "herma"),
"hermai": ("herma",),
"herniae": ("hernia",),
"heroes": ("hero",),
@ -955,8 +955,8 @@ NOUNS_IRREG = {
"ibices": ("ibex",),
"ibo": ("igbo",),
"ichthyosauri": ("ichthyosaurus",),
"ichthyosauruses": ("ichthyosaur", "ichthyosaurus",),
"iconostases": ("iconostas", "iconostasis",),
"ichthyosauruses": ("ichthyosaur", "ichthyosaurus"),
"iconostases": ("iconostas", "iconostasis"),
"icosahedra": ("icosahedron",),
"ideata": ("ideatum",),
"igorrorote": ("igorrote",),
@ -991,7 +991,7 @@ NOUNS_IRREG = {
"is": ("is",),
"ischia": ("ischium",),
"isthmi": ("isthmus",),
"jackeroos": ("jackaroo", "jackeroo",),
"jackeroos": ("jackaroo", "jackeroo"),
"jackfishes": ("jackfish",),
"jackknives": ("jackknife",),
"jacks-in-the-box": ("jack-in-the-box",),
@ -1001,12 +1001,12 @@ NOUNS_IRREG = {
"jewfishes": ("jewfish",),
"jingoes": ("jingo",),
"jinn": ("jinni",),
"joes": ("jo", "joe",),
"joes": ("jo", "joe"),
"judge_advocates_general": ("judge_advocate_general",),
"jura": ("jus",),
"kaddishim": ("kaddish",),
"kalmuck": ("kalmuc",),
"kalmucks": ("kalmuc", "kalmuck",),
"kalmucks": ("kalmuc", "kalmuck"),
"katabases": ("katabasis",),
"keeshonden": ("keeshond",),
"kibbutzim": ("kibbutz",),
@ -1045,7 +1045,7 @@ NOUNS_IRREG = {
"latifundia": ("latifundium",),
"latu": ("lat",),
"lavaboes": ("lavabo",),
"leaves": ("leaf", "leave",),
"leaves": ("leaf", "leave"),
"lecythi": ("lecythus",),
"leges": ("lex",),
"lei": ("leu",),
@ -1078,7 +1078,7 @@ NOUNS_IRREG = {
"liriodendra": ("liriodendron",),
"lisente": ("sente",),
"listente": ("sente",),
"litai": ("lit", "litas",),
"litai": ("lit", "litas"),
"litu": ("litas",),
"lives": ("life",),
"lixivia": ("lixivium",),
@ -1098,7 +1098,7 @@ NOUNS_IRREG = {
"lumpfishes": ("lumpfish",),
"lungfishes": ("lungfish",),
"lunulae": ("lunula",),
"lures": ("lur", "lure",),
"lures": ("lur", "lure"),
"lustra": ("lustre",),
"lyings-in": ("lying-in",),
"lymphangitides": ("lymphangitis",),
@ -1142,7 +1142,7 @@ NOUNS_IRREG = {
"marsupia": ("marsupium",),
"marvels-of-peru": ("marvel-of-peru",),
"mass_media": ("mass_medium",),
"masses": ("mass", "masse",),
"masses": ("mass", "masse"),
"masters-at-arms": ("master-at-arms",),
"matrices": ("matrix",),
"matzoth": ("matzo",),
@ -1210,7 +1210,7 @@ NOUNS_IRREG = {
"mioses": ("miosis",),
"miracidia": ("miracidium",),
"miri": ("mir",),
"mishnayoth": ("mishna", "mishnah",),
"mishnayoth": ("mishna", "mishnah"),
"mitochondria": ("mitochondrion",),
"mitzvoth": ("mitzvah",),
"modioli": ("modiolus",),
@ -1218,7 +1218,7 @@ NOUNS_IRREG = {
"momenta": ("momentum",),
"moments_of_truth": ("moment_of_truth",),
"momi": ("momus",),
"monades": ("monad", "monas",),
"monades": ("monad", "monas"),
"monkfishes": ("monkfish",),
"monochasia": ("monochasium",),
"monopodia": ("monopodium",),
@ -1235,7 +1235,7 @@ NOUNS_IRREG = {
"moriscoes": ("morisco",),
"morphallaxes": ("morphallaxis",),
"morphoses": ("morphosis",),
"morses": ("morse", "mors",),
"morses": ("morse", "mors"),
"morulae": ("morula",),
"mosasauri": ("mosasaurus",),
"moshavim": ("moshav",),
@ -1328,13 +1328,13 @@ NOUNS_IRREG = {
"oceanides": ("oceanid",),
"ocelli": ("ocellus",),
"ochreae": ("ochrea",),
"ocreae": ("ochrea", "ocrea",),
"ocreae": ("ochrea", "ocrea"),
"octahedra": ("octahedron",),
"octopi": ("octopus",),
"oculi": ("oculus",),
"odea": ("odeum",),
"oedemata": ("edema", "oedema",),
"oesophagi": ("esophagus", "oesophagus",),
"oedemata": ("edema", "oedema"),
"oesophagi": ("esophagus", "oesophagus"),
"oldwives": ("oldwife",),
"olea": ("oleum",),
"omasa": ("omasum",),
@ -1350,15 +1350,15 @@ NOUNS_IRREG = {
"optic_axes": ("optic_axis",),
"optima": ("optimum",),
"ora": ("os",),
"organa": ("organon", "organum",),
"organums": ("organa", "organum",),
"organa": ("organon", "organum"),
"organums": ("organa", "organum"),
"orthoptera": ("orthopteron",),
"osar": ("os",),
"oscula": ("osculum",),
"ossa": ("os",),
"osteomata": ("osteoma",),
"ostia": ("ostium",),
"ottomans": ("othman", "ottoman",),
"ottomans": ("othman", "ottoman"),
"ova": ("ovum",),
"ovoli": ("ovolo",),
"ovotestes": ("ovotestis",),
@ -1382,7 +1382,7 @@ NOUNS_IRREG = {
"papulae": ("papula",),
"papyri": ("papyrus",),
"parabases": ("parabasis",),
"paraleipses": ("paraleipsis", "paralipsis",),
"paraleipses": ("paraleipsis", "paralipsis"),
"paralyses": ("paralysis",),
"paramecia": ("paramecium",),
"paramenta": ("parament",),
@ -1442,13 +1442,13 @@ NOUNS_IRREG = {
"personae": ("persona",),
"petechiae": ("petechia",),
"pfennige": ("pfennig",),
"phalanges": ("phalange", "phalanx",),
"phalanges": ("phalange", "phalanx"),
"phalli": ("phallus",),
"pharynges": ("pharynx",),
"phenomena": ("phenomenon",),
"phi-phenomena": ("phi-phenomenon",),
"philodendra": ("philodendron",),
"phlyctenae": ("phlyctaena", "phlyctena",),
"phlyctenae": ("phlyctaena", "phlyctena"),
"phyla": ("phylum",),
"phylae": ("phyle",),
"phyllotaxes": ("phyllotaxis",),
@ -1475,12 +1475,12 @@ NOUNS_IRREG = {
"plasmodesmata": ("plasmodesma",),
"plasmodia": ("plasmodium",),
"plateaux": ("plateau",),
"plectra": ("plectron", "plectrum",),
"plectra": ("plectron", "plectrum"),
"plena": ("plenum",),
"pleura": ("pleuron",),
"pleurae": ("pleura",),
"plicae": ("plica",),
"ploughmen": ("ploughman", "plowman",),
"ploughmen": ("ploughman", "plowman"),
"pneumobacilli": ("pneumobacillus",),
"pneumococci": ("pneumococcus",),
"pocketknives": ("pocketknife",),
@ -1515,7 +1515,7 @@ NOUNS_IRREG = {
"principia": ("principium",),
"proboscides": ("proboscis",),
"proces-verbaux": ("proces-verbal",),
"proglottides": ("proglottid", "proglottis",),
"proglottides": ("proglottid", "proglottis"),
"prognoses": ("prognosis",),
"prolegomena": ("prolegomenon",),
"prolepses": ("prolepsis",),
@ -1532,7 +1532,7 @@ NOUNS_IRREG = {
"prostheses": ("prosthesis",),
"prostomia": ("prostomium",),
"protases": ("protasis",),
"prothalamia": ("prothalamion", "prothalamium",),
"prothalamia": ("prothalamion", "prothalamium"),
"prothalli": ("prothallus",),
"prothallia": ("prothallium",),
"prothoraces": ("prothorax",),
@ -1572,7 +1572,7 @@ NOUNS_IRREG = {
"quezales": ("quezal",),
"quinquennia": ("quinquennium",),
"quizzes": ("quiz",),
"rabatos": ("rabato", "rebato",),
"rabatos": ("rabato", "rebato"),
"rabbitfishes": ("rabbitfish",),
"rachides": ("rhachis",),
"radices": ("radix",),
@ -1583,7 +1583,7 @@ NOUNS_IRREG = {
"ranulae": ("ranula",),
"ranunculi": ("ranunculus",),
"raphae": ("raphe",),
"raphides": ("raphide", "raphis",),
"raphides": ("raphide", "raphis"),
"ratfishes": ("ratfish",),
"reales": ("real",),
"rearmice": ("rearmouse",),
@ -1598,7 +1598,7 @@ NOUNS_IRREG = {
"reis": ("real",),
"relata": ("relatum",),
"remiges": ("remex",),
"reremice": ("rearmouse", "reremouse",),
"reremice": ("rearmouse", "reremouse"),
"reseaux": ("reseau",),
"residua": ("residuum",),
"responsa": ("responsum",),
@ -1609,7 +1609,7 @@ NOUNS_IRREG = {
"retinae": ("retina",),
"rhabdomyomata": ("rhabdomyoma",),
"rhachides": ("rhachis",),
"rhachises": ("rachis", "rhachis",),
"rhachises": ("rachis", "rhachis"),
"rhinencephala": ("rhinencephalon",),
"rhizobia": ("rhizobium",),
"rhombi": ("rhombus",),
@ -1636,7 +1636,7 @@ NOUNS_IRREG = {
"runners-up": ("runner-up",),
"sacra": ("sacrum",),
"sacraria": ("sacrarium",),
"saguaros": ("saguaro", "sahuaro",),
"saguaros": ("saguaro", "sahuaro"),
"sailfishes": ("sailfish",),
"salespeople": ("salesperson",),
"salmonellae": ("salmonella",),
@ -1657,7 +1657,7 @@ NOUNS_IRREG = {
"scapulae": ("scapula",),
"scarabaei": ("scarabaeus",),
"scarves": ("scarf",),
"schatchonim": ("schatchen", "shadchan",),
"schatchonim": ("schatchen", "shadchan"),
"schemata": ("schema",),
"scherzandi": ("scherzando",),
"scherzi": ("scherzo",),
@ -1690,7 +1690,7 @@ NOUNS_IRREG = {
"senores": ("senor",),
"sensilla": ("sensillum",),
"senti": ("sent",),
"senussis": ("senusi", "senussi",),
"senussis": ("senusi", "senussi"),
"separatrices": ("separatrix",),
"sephardim": ("sephardi",),
"septa": ("septum",),
@ -1707,9 +1707,9 @@ NOUNS_IRREG = {
"shabbatim": ("shabbat",),
"shackoes": ("shacko",),
"shadchanim": ("shadchan",),
"shadchans": ("schatchen", "shadchan",),
"shadchans": ("schatchen", "shadchan"),
"shakoes": ("shako",),
"shammosim": ("shammas", "shammes",),
"shammosim": ("shammas", "shammes"),
"sheatfishes": ("sheatfish",),
"sheaves": ("sheaf",),
"shellfishes": ("shellfish",),
@ -1717,14 +1717,14 @@ NOUNS_IRREG = {
"shinleaves": ("shinleaf",),
"shittim": ("shittah",),
"shmoes": ("shmo",),
"shofroth": ("shofar", "shophar",),
"shofroth": ("shofar", "shophar"),
"shophroth": ("shophar",),
"shrewmice": ("shrewmouse",),
"shuln": ("shul",),
"siddurim": ("siddur",),
"sigloi": ("siglos",),
"signore": ("signora",),
"signori": ("signior", "signore",),
"signori": ("signior", "signore"),
"signorine": ("signorina",),
"siliquae": ("siliqua",),
"silvae": ("silva",),
@ -1739,12 +1739,12 @@ NOUNS_IRREG = {
"snaggleteeth": ("snaggletooth",),
"snailfishes": ("snailfish",),
"snipefishes": ("snipefish",),
"socmen": ("socman", "sokeman",),
"socmen": ("socman", "sokeman"),
"sola": ("solum",),
"solaria": ("solarium",),
"solatia": ("solatium",),
"soldi": ("soldo",),
"soles": ("sol", "sole",),
"soles": ("sol", "sole"),
"solfeggi": ("solfeggio",),
"soli": ("solo",),
"solidi": ("solidus",),
@ -1864,7 +1864,7 @@ NOUNS_IRREG = {
"syringes": ("syrinx",),
"syssarcoses": ("syssarcosis",),
"tableaux": ("tableau",),
"taeniae": ("taenia", "tenia",),
"taeniae": ("taenia", "tenia"),
"tali": ("talus",),
"tallaisim": ("tallith",),
"tallithes": ("tallith",),
@ -1874,14 +1874,14 @@ NOUNS_IRREG = {
"tarsi": ("tarsus",),
"tarsometatarsi": ("tarsometatarsus",),
"taxa": ("taxon",),
"taxes": ("tax", "taxis",),
"taxes": ("tax", "taxis"),
"taxies": ("taxi",),
"tectrices": ("tectrix",),
"teeth": ("tooth",),
"tegmina": ("tegmen",),
"telae": ("tela",),
"telamones": ("telamon",),
"telangiectases": ("telangiectasia", "telangiectasis",),
"telangiectases": ("telangiectasia", "telangiectasis"),
"telia": ("telium",),
"tempi": ("tempo",),
"tenacula": ("tenaculum",),
@ -1932,7 +1932,7 @@ NOUNS_IRREG = {
"tornadoes": ("tornado",),
"torpedoes": ("torpedo",),
"torsi": ("torso",),
"touracos": ("touraco", "turaco",),
"touracos": ("touraco", "turaco"),
"trabeculae": ("trabecula",),
"tracheae": ("trachea",),
"traditores": ("traditor",),
@ -1960,7 +1960,7 @@ NOUNS_IRREG = {
"tubae": ("tuba",),
"turves": ("turf",),
"tympana": ("tympanum",),
"tyros": ("tiro", "tyro",),
"tyros": ("tiro", "tyro"),
"ubermenschen": ("ubermensch",),
"uglies": ("ugli",),
"uigurs": ("uighur",),
@ -1980,7 +1980,7 @@ NOUNS_IRREG = {
"utriculi": ("utriculus",),
"uvulae": ("uvula",),
"vacua": ("vacuum",),
"vagi": ("vagus", "vagus",),
"vagi": ("vagus", "vagus"),
"vaginae": ("vagina",),
"valleculae": ("vallecula",),
"vaporetti": ("vaporetto",),
@ -2026,7 +2026,7 @@ NOUNS_IRREG = {
"vortices": ("vortex",),
"vulvae": ("vulva",),
"wagons-lits": ("wagon-lit",),
"wahhabis": ("wahabi", "wahhabi",),
"wahhabis": ("wahabi", "wahhabi"),
"wanderjahre": ("wanderjahr",),
"weakfishes": ("weakfish",),
"werewolves": ("werewolf",),
@ -2044,13 +2044,13 @@ NOUNS_IRREG = {
"yeshivoth": ("yeshiva",),
"yogin": ("yogi",),
"yourselves": ("yourself",),
"zamindaris": ("zamindari", "zemindari",),
"zamindaris": ("zamindari", "zemindari"),
"zecchini": ("zecchino",),
"zeroes": ("zero",),
"zoa": ("zoon",),
"zoaeae": ("zoaea", "zoea",),
"zoaeae": ("zoaea", "zoea"),
"zoeae": ("zoea",),
"zoeas": ("zoaea",),
"zoonoses": ("zoonosis",),
"zoosporangia": ("zoosporangium",)
"zoosporangia": ("zoosporangium",),
}

View File

@ -42,8 +42,8 @@ VERBS_IRREG = {
"anglified": ("anglify",),
"annulled": ("annul",),
"annulling": ("annul",),
"appalled": ("appal", "appall",),
"appalling": ("appal", "appall",),
"appalled": ("appal", "appall"),
"appalling": ("appal", "appall"),
"applied": ("apply",),
"arcked": ("arc",),
"arcking": ("arc",),
@ -244,9 +244,9 @@ VERBS_IRREG = {
"bypast": ("bypass",),
"caballed": ("cabal",),
"caballing": ("cabal",),
"caddied": ("caddie", "caddy",),
"caddies": ("caddie", "caddy",),
"caddying": ("caddie", "caddy",),
"caddied": ("caddie", "caddy"),
"caddies": ("caddie", "caddy"),
"caddying": ("caddie", "caddy"),
"calcified": ("calcify",),
"came": ("come",),
"canalled": ("canal",),
@ -506,8 +506,8 @@ VERBS_IRREG = {
"disembodied": ("disembody",),
"disembowelled": ("disembowel",),
"disembowelling": ("disembowel",),
"disenthralled": ("disenthral", "disenthrall",),
"disenthralling": ("disenthral", "disenthrall",),
"disenthralled": ("disenthral", "disenthrall"),
"disenthralling": ("disenthral", "disenthrall"),
"disenthralls": ("disenthral",),
"disenthrals": ("disenthrall",),
"dishevelled": ("dishevel",),
@ -518,8 +518,8 @@ VERBS_IRREG = {
"dispelling": ("dispel",),
"disqualified": ("disqualify",),
"dissatisfied": ("dissatisfy",),
"distilled": ("distil", "distill",),
"distilling": ("distil", "distill",),
"distilled": ("distil", "distill"),
"distilling": ("distil", "distill"),
"diversified": ("diversify",),
"divvied": ("divvy",),
"dizzied": ("dizzy",),
@ -595,10 +595,10 @@ VERBS_IRREG = {
"enamelling": ("enamel",),
"englutted": ("englut",),
"englutting": ("englut",),
"enrolled": ("enrol", "enroll",),
"enrolling": ("enrol", "enroll",),
"enthralled": ("enthral", "enthrall",),
"enthralling": ("enthral", "enthrall",),
"enrolled": ("enrol", "enroll"),
"enrolling": ("enrol", "enroll"),
"enthralled": ("enthral", "enthrall"),
"enthralling": ("enthral", "enthrall"),
"entrammelled": ("entrammel",),
"entrammelling": ("entrammel",),
"entrapped": ("entrap",),
@ -621,8 +621,8 @@ VERBS_IRREG = {
"exemplified": ("exemplify",),
"expelled": ("expel",),
"expelling": ("expel",),
"extolled": ("extol", "extoll",),
"extolling": ("extol", "extoll",),
"extolled": ("extol", "extoll"),
"extolling": ("extol", "extoll"),
"facetted": ("facet",),
"facetting": ("facet",),
"fagged": ("fag",),
@ -638,7 +638,7 @@ VERBS_IRREG = {
"featherbedded": ("featherbed",),
"featherbedding": ("featherbed",),
"fed": ("feed",),
"feed": ("feed", "fee",),
"feed": ("feed", "fee"),
"fell": ("fall",),
"felt": ("feel",),
"ferried": ("ferry",),
@ -744,8 +744,8 @@ VERBS_IRREG = {
"fried": ("fry",),
"frigged": ("frig",),
"frigging": ("frig",),
"fritted": ("frit", "fritt",),
"fritting": ("frit", "fritt",),
"fritted": ("frit", "fritt"),
"fritting": ("frit", "fritt"),
"frivolled": ("frivol",),
"frivolling": ("frivol",),
"frogged": ("frog",),
@ -757,8 +757,8 @@ VERBS_IRREG = {
"fructified": ("fructify",),
"fuelled": ("fuel",),
"fuelling": ("fuel",),
"fulfilled": ("fulfil", "fulfill",),
"fulfilling": ("fulfil", "fulfill",),
"fulfilled": ("fulfil", "fulfill"),
"fulfilling": ("fulfil", "fulfill"),
"funned": ("fun",),
"funnelled": ("funnel",),
"funnelling": ("funnel",),
@ -955,8 +955,8 @@ VERBS_IRREG = {
"insetting": ("inset",),
"inspanned": ("inspan",),
"inspanning": ("inspan",),
"installed": ("instal", "install",),
"installing": ("instal", "install",),
"installed": ("instal", "install"),
"installing": ("instal", "install"),
"intensified": ("intensify",),
"interbred": ("interbreed",),
"intercropped": ("intercrop",),
@ -1303,7 +1303,7 @@ VERBS_IRREG = {
"overdriven": ("overdrive",),
"overdrove": ("overdrive",),
"overflew": ("overfly",),
"overflown": ("overflow", "overfly",),
"overflown": ("overflow", "overfly"),
"overgrew": ("overgrow",),
"overgrown": ("overgrow",),
"overheard": ("overhear",),
@ -1547,8 +1547,8 @@ VERBS_IRREG = {
"red": ("red",),
"red-pencilled": ("red-pencil",),
"red-pencilling": ("red-pencil",),
"redded": ("red", "redd",),
"redding": ("red", "redd",),
"redded": ("red", "redd"),
"redding": ("red", "redd"),
"redid": ("redo",),
"redone": ("redo",),
"referred": ("refer",),
@ -1763,7 +1763,7 @@ VERBS_IRREG = {
"signified": ("signify",),
"silicified": ("silicify",),
"simplified": ("simplify",),
"singing": ("sing", "singe",),
"singing": ("sing", "singe"),
"single-stepped": ("single-step",),
"single-stepping": ("single-step",),
"sinned": ("sin",),
@ -2404,5 +2404,5 @@ VERBS_IRREG = {
"zigzagged": ("zigzag",),
"zigzagging": ("zigzag",),
"zipped": ("zip",),
"zipping": ("zip",)
"zipping": ("zip",),
}

View File

@ -538,7 +538,7 @@ for orth in [
"Sen.",
"St.",
"vs.",
"v.s."
"v.s.",
]:
_exc[orth] = [{ORTH: orth}]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -18,16 +18,24 @@ from ._verbs import VERBS
from ....util import load_language_data
BASE_PATH = Path(__file__).parent
BASE_PATH = Path(__file__).parent
LOOKUP = load_language_data(BASE_PATH / 'lookup.json')
VERBS_IRREG = load_language_data(BASE_PATH / '_verbs_irreg.json')
ADJECTIVES_IRREG = load_language_data(BASE_PATH / '_adjectives_irreg.json')
LOOKUP = load_language_data(BASE_PATH / "lookup.json")
VERBS_IRREG = load_language_data(BASE_PATH / "_verbs_irreg.json")
ADJECTIVES_IRREG = load_language_data(BASE_PATH / "_adjectives_irreg.json")
LEMMA_INDEX = {'adj': ADJECTIVES, 'adv': ADVERBS, 'noun': NOUNS, 'verb': VERBS}
LEMMA_INDEX = {"adj": ADJECTIVES, "adv": ADVERBS, "noun": NOUNS, "verb": VERBS}
LEMMA_EXC = {'adj': ADJECTIVES_IRREG, 'adp': ADP_IRREG, 'aux': AUXILIARY_VERBS_IRREG,
'cconj': CCONJ_IRREG, 'det': DETS_IRREG, 'noun': NOUNS_IRREG, 'verb': VERBS_IRREG,
'pron': PRONOUNS_IRREG, 'sconj': SCONJ_IRREG}
LEMMA_EXC = {
"adj": ADJECTIVES_IRREG,
"adp": ADP_IRREG,
"aux": AUXILIARY_VERBS_IRREG,
"cconj": CCONJ_IRREG,
"det": DETS_IRREG,
"noun": NOUNS_IRREG,
"verb": VERBS_IRREG,
"pron": PRONOUNS_IRREG,
"sconj": SCONJ_IRREG,
}
LEMMA_RULES = {'adj': ADJECTIVE_RULES, 'noun': NOUN_RULES, 'verb': VERB_RULES}
LEMMA_RULES = {"adj": ADJECTIVE_RULES, "noun": NOUN_RULES, "verb": VERB_RULES}

View File

@ -3,22 +3,22 @@ from __future__ import unicode_literals
ADP_IRREG = {
"a": ("à",),
"apr.": ("après",),
"aux": ("à",),
"av.": ("avant",),
"avt": ("avant",),
"cf.": ("cf",),
"conf.": ("cf",),
"confer": ("cf",),
"d'": ("de",),
"des": ("de",),
"du": ("de",),
"jusqu'": ("jusque",),
"pdt": ("pendant",),
"+": ("plus",),
"pr": ("pour",),
"/": ("sur",),
"versus": ("vs",),
"vs.": ("vs",)
"a": ("à",),
"apr.": ("après",),
"aux": ("à",),
"av.": ("avant",),
"avt": ("avant",),
"cf.": ("cf",),
"conf.": ("cf",),
"confer": ("cf",),
"d'": ("de",),
"des": ("de",),
"du": ("de",),
"jusqu'": ("jusque",),
"pdt": ("pendant",),
"+": ("plus",),
"pr": ("pour",),
"/": ("sur",),
"versus": ("vs",),
"vs.": ("vs",),
}

View File

@ -365,5 +365,5 @@ AUXILIARY_VERBS_IRREG = {
"va": ("aller",),
"vais": ("aller",),
"vas": ("aller",),
"vont": ("aller",)
"vont": ("aller",),
}

View File

@ -3,15 +3,15 @@ from __future__ import unicode_literals
CCONJ_IRREG = {
"&amp;": ("et",),
"c-à-d": ("c'est-à-dire",),
"c.-à.-d.": ("c'est-à-dire",),
"càd": ("c'est-à-dire",),
"&": ("et",),
"et|ou": ("et-ou",),
"et/ou": ("et-ou",),
"i.e.": ("c'est-à-dire",),
"ie": ("c'est-à-dire",),
"ou/et": ("et-ou",),
"+": ("plus",)
"&amp;": ("et",),
"c-à-d": ("c'est-à-dire",),
"c.-à.-d.": ("c'est-à-dire",),
"càd": ("c'est-à-dire",),
"&": ("et",),
"et|ou": ("et-ou",),
"et/ou": ("et-ou",),
"i.e.": ("c'est-à-dire",),
"ie": ("c'est-à-dire",),
"ou/et": ("et-ou",),
"+": ("plus",),
}

File diff suppressed because it is too large Load Diff

View File

@ -3,17 +3,17 @@ from __future__ import unicode_literals
SCONJ_IRREG = {
"lorsqu'": ("lorsque",),
"pac'que": ("parce que",),
"pac'qu'": ("parce que",),
"parc'que": ("parce que",),
"parc'qu'": ("parce que",),
"paske": ("parce que",),
"pask'": ("parce que",),
"pcq": ("parce que",),
"+": ("plus",),
"puisqu'": ("puisque",),
"qd": ("quand",),
"quoiqu'": ("quoique",),
"qu'": ("que",)
"lorsqu'": ("lorsque",),
"pac'que": ("parce que",),
"pac'qu'": ("parce que",),
"parc'que": ("parce que",),
"parc'qu'": ("parce que",),
"paske": ("parce que",),
"pask'": ("parce que",),
"pcq": ("parce que",),
"+": ("plus",),
"puisqu'": ("puisque",),
"qd": ("quand",),
"quoiqu'": ("quoique",),
"qu'": ("que",),
}

View File

@ -3,20 +3,22 @@ from __future__ import unicode_literals
from pathlib import Path
from ....symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP, SCONJ, CCONJ
from ....symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
from ....symbols import SCONJ, CCONJ
from ....symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
from ....util import load_language_data
LOOKUP = load_language_data(Path(__file__).parent / 'lookup.json')
LOOKUP = load_language_data(Path(__file__).parent / "lookup.json")
'''
"""
French language lemmatizer applies the default rule based lemmatization
procedure with some modifications for better French language support.
The parts of speech 'ADV', 'PRON', 'DET', 'ADP' and 'AUX' are added to use the
rule-based lemmatization. As a last resort, the lemmatizer checks in
the lookup table.
'''
"""
class FrenchLemmatizer(object):
@classmethod
@ -32,36 +34,39 @@ class FrenchLemmatizer(object):
def __call__(self, string, univ_pos, morphology=None):
if not self.rules:
return [self.lookup_table.get(string, string)]
if univ_pos in (NOUN, 'NOUN', 'noun'):
univ_pos = 'noun'
elif univ_pos in (VERB, 'VERB', 'verb'):
univ_pos = 'verb'
elif univ_pos in (ADJ, 'ADJ', 'adj'):
univ_pos = 'adj'
elif univ_pos in (ADP, 'ADP', 'adp'):
univ_pos = 'adp'
elif univ_pos in (ADV, 'ADV', 'adv'):
univ_pos = 'adv'
elif univ_pos in (AUX, 'AUX', 'aux'):
univ_pos = 'aux'
elif univ_pos in (CCONJ, 'CCONJ', 'cconj'):
univ_pos = 'cconj'
elif univ_pos in (DET, 'DET', 'det'):
univ_pos = 'det'
elif univ_pos in (PRON, 'PRON', 'pron'):
univ_pos = 'pron'
elif univ_pos in (PUNCT, 'PUNCT', 'punct'):
univ_pos = 'punct'
elif univ_pos in (SCONJ, 'SCONJ', 'sconj'):
univ_pos = 'sconj'
if univ_pos in (NOUN, "NOUN", "noun"):
univ_pos = "noun"
elif univ_pos in (VERB, "VERB", "verb"):
univ_pos = "verb"
elif univ_pos in (ADJ, "ADJ", "adj"):
univ_pos = "adj"
elif univ_pos in (ADP, "ADP", "adp"):
univ_pos = "adp"
elif univ_pos in (ADV, "ADV", "adv"):
univ_pos = "adv"
elif univ_pos in (AUX, "AUX", "aux"):
univ_pos = "aux"
elif univ_pos in (CCONJ, "CCONJ", "cconj"):
univ_pos = "cconj"
elif univ_pos in (DET, "DET", "det"):
univ_pos = "det"
elif univ_pos in (PRON, "PRON", "pron"):
univ_pos = "pron"
elif univ_pos in (PUNCT, "PUNCT", "punct"):
univ_pos = "punct"
elif univ_pos in (SCONJ, "SCONJ", "sconj"):
univ_pos = "sconj"
else:
return [self.lookup(string)]
# See Issue #435 for example of where this logic is required.
if self.is_base_form(univ_pos, morphology):
return list(set([string.lower()]))
lemmas = lemmatize(string, self.index.get(univ_pos, {}),
self.exc.get(univ_pos, {}),
self.rules.get(univ_pos, []))
lemmas = lemmatize(
string,
self.index.get(univ_pos, {}),
self.exc.get(univ_pos, {}),
self.rules.get(univ_pos, []),
)
return lemmas
def is_base_form(self, univ_pos, morphology=None):
@ -70,20 +75,25 @@ class FrenchLemmatizer(object):
avoid lemmatization entirely.
"""
morphology = {} if morphology is None else morphology
others = [key for key in morphology
if key not in (POS, 'Number', 'POS', 'VerbForm', 'Tense')]
if univ_pos == 'noun' and morphology.get('Number') == 'sing':
others = [
key
for key in morphology
if key not in (POS, "Number", "POS", "VerbForm", "Tense")
]
if univ_pos == "noun" and morphology.get("Number") == "sing":
return True
elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf':
elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
return True
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
# morphology
elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and
morphology.get('Tense') == 'pres' and
morphology.get('Number') is None and
not others):
elif univ_pos == "verb" and (
morphology.get("VerbForm") == "fin"
and morphology.get("Tense") == "pres"
and morphology.get("Number") is None
and not others
):
return True
elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
elif univ_pos == "adj" and morphology.get("Degree") == "pos":
return True
elif VerbForm_inf in morphology:
return True
@ -97,16 +107,16 @@ class FrenchLemmatizer(object):
return False
def noun(self, string, morphology=None):
return self(string, 'noun', morphology)
return self(string, "noun", morphology)
def verb(self, string, morphology=None):
return self(string, 'verb', morphology)
return self(string, "verb", morphology)
def adj(self, string, morphology=None):
return self(string, 'adj', morphology)
return self(string, "adj", morphology)
def punct(self, string, morphology=None):
return self(string, 'punct', morphology)
return self(string, "punct", morphology)
def lookup(self, string):
if string in self.lookup_table:
@ -117,7 +127,7 @@ class FrenchLemmatizer(object):
def lemmatize(string, index, exceptions, rules):
string = string.lower()
forms = []
if (string in index):
if string in index:
forms.append(string)
return forms
forms.extend(exceptions.get(string, []))
@ -125,7 +135,7 @@ def lemmatize(string, index, exceptions, rules):
if not forms:
for old, new in rules:
if string.endswith(old):
form = string[:len(string) - len(old)] + new
form = string[: len(string) - len(old)] + new
if not form:
pass
elif form in index or not form.isalpha():

View File

@ -2,8 +2,6 @@
from __future__ import unicode_literals
from ...symbols import ORTH, LEMMA
_exc = {
"po'": [{ORTH: "po'", LEMMA: 'poco'}]
}
_exc = {"po'": [{ORTH: "po'", LEMMA: "poco"}]}
TOKENIZER_EXCEPTIONS = _exc

View File

@ -1,5 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
@ -11,5 +12,5 @@ sentences = [
"애플이 영국의 신생 기업을 10억 달러에 구매를 고려중이다.",
"자동 운전 자동차의 손해 배상 책임에 자동차 메이커에 일정한 부담을 요구하겠다.",
"자동 배달 로봇이 보도를 주행하는 것을 샌프란시스코시가 금지를 검토중이라고 합니다.",
"런던은 영국의 수도이자 가장 큰 도시입니다."
"런던은 영국의 수도이자 가장 큰 도시입니다.",
]

View File

@ -1,7 +1,8 @@
# coding: utf8
from __future__ import unicode_literals
STOP_WORDS = set("""
STOP_WORDS = set(
"""
@ -65,4 +66,5 @@ STOP_WORDS = set("""
""".split())
""".split()
)

View File

@ -20,10 +20,10 @@ LEMMA_INDEX = {"adj": ADJECTIVES, "adv": ADVERBS, "noun": NOUNS, "verb": VERBS}
BASE_PATH = Path(__file__).parent
LEMMA_EXC = {
"adj": load_language_data(BASE_PATH / '_adjectives_wordforms.json'),
"adj": load_language_data(BASE_PATH / "_adjectives_wordforms.json"),
"adv": ADVERBS_WORDFORMS,
"noun": load_language_data(BASE_PATH / '_nouns_wordforms.json'),
"verb": load_language_data(BASE_PATH / '_verbs_wordforms.json'),
"noun": load_language_data(BASE_PATH / "_nouns_wordforms.json"),
"verb": load_language_data(BASE_PATH / "_verbs_wordforms.json"),
}
LEMMA_RULES = {
@ -39,5 +39,3 @@ LEMMA_RULES = {
# https://www.nb.no/sprakbanken/show?serial=oai%3Anb.no%3Asbr-5&lang=en
# License:
# Creative_Commons-BY (CC-BY) (https://creativecommons.org/licenses/by/4.0/)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -14,7 +14,7 @@ _infixes = (
+ [
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
r'(?<=[{a}])[:<>=](?=[{a}])'.format(a=ALPHA),
r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),

View File

@ -118,7 +118,7 @@ for orth in [
"o.l.",
"on.",
"op.",
"org."
"org.",
"osv.",
"ovf.",
"p.",

View File

@ -14,5 +14,5 @@ sentences = [
"Apple overweegt om voor 1 miljard een U.K. startup te kopen",
"Autonome auto's verschuiven de verzekeringverantwoordelijkheid naar producenten",
"San Francisco overweegt robots op voetpaden te verbieden",
"Londen is een grote stad in het Verenigd Koninkrijk"
"Londen is een grote stad in het Verenigd Koninkrijk",
]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -3,22 +3,25 @@ from __future__ import unicode_literals
ADPOSITIONS = set(
('aan aangaande aanwezig achter af afgezien al als an annex anno anti '
'behalve behoudens beneden benevens benoorden beoosten betreffende bewesten '
'bezijden bezuiden bij binnen binnenuit binst bladzij blijkens boven bovenop '
'buiten conform contra cq daaraan daarbij daarbuiten daarin daarnaar '
'daaronder daartegenover daarvan dankzij deure dichtbij door doordat doorheen '
'echter eraf erop erover errond eruit ervoor evenals exclusief gedaan '
'gedurende gegeven getuige gezien halfweg halverwege heen hierdoorheen hierop '
'houdende in inclusief indien ingaande ingevolge inzake jegens kortweg '
'krachtens kralj langs langsheen langst lastens linksom lopende luidens mede '
'mee met middels midden middenop mits na naan naar naartoe naast naat nabij '
'nadat namens neer neffe neffen neven nevenst niettegenstaande nopens '
'officieel om omheen omstreeks omtrent onafgezien ondanks onder onderaan '
'ondere ongeacht ooit op open over per plus pro qua rechtover rond rondom '
"sedert sinds spijts strekkende te tegen tegenaan tegenop tegenover telde "
'teneinde terug tijdens toe tot totdat trots tussen tégen uit uitgenomen '
'ultimo van vanaf vandaan vandoor vanop vanuit vanwege versus via vinnen '
'vlakbij volgens voor voor- voorbij voordat voort voren vòòr vóór waaraan '
'waarbij waardoor waaronder weg wegens weleens zijdens zoals zodat zonder '
'zónder à').split())
(
"aan aangaande aanwezig achter af afgezien al als an annex anno anti "
"behalve behoudens beneden benevens benoorden beoosten betreffende bewesten "
"bezijden bezuiden bij binnen binnenuit binst bladzij blijkens boven bovenop "
"buiten conform contra cq daaraan daarbij daarbuiten daarin daarnaar "
"daaronder daartegenover daarvan dankzij deure dichtbij door doordat doorheen "
"echter eraf erop erover errond eruit ervoor evenals exclusief gedaan "
"gedurende gegeven getuige gezien halfweg halverwege heen hierdoorheen hierop "
"houdende in inclusief indien ingaande ingevolge inzake jegens kortweg "
"krachtens kralj langs langsheen langst lastens linksom lopende luidens mede "
"mee met middels midden middenop mits na naan naar naartoe naast naat nabij "
"nadat namens neer neffe neffen neven nevenst niettegenstaande nopens "
"officieel om omheen omstreeks omtrent onafgezien ondanks onder onderaan "
"ondere ongeacht ooit op open over per plus pro qua rechtover rond rondom "
"sedert sinds spijts strekkende te tegen tegenaan tegenop tegenover telde "
"teneinde terug tijdens toe tot totdat trots tussen tégen uit uitgenomen "
"ultimo van vanaf vandaan vandoor vanop vanuit vanwege versus via vinnen "
"vlakbij volgens voor voor- voorbij voordat voort voren vòòr vóór waaraan "
"waarbij waardoor waaronder weg wegens weleens zijdens zoals zodat zonder "
"zónder à"
).split()
)

View File

@ -3,10 +3,10 @@ from __future__ import unicode_literals
ADPOSITIONS_IRREG = {
"'t": ('te',),
'me': ('mee',),
'meer': ('mee',),
'on': ('om',),
'ten': ('te',),
'ter': ('te',)
"'t": ("te",),
"me": ("mee",),
"meer": ("mee",),
"on": ("om",),
"ten": ("te",),
"ter": ("te",),
}

View File

@ -3,17 +3,17 @@ from __future__ import unicode_literals
ADVERBS_IRREG = {
"'ns": ('eens',),
"'s": ('eens',),
"'t": ('het',),
"d'r": ('er',),
"d'raf": ('eraf',),
"d'rbij": ('erbij',),
"d'rheen": ('erheen',),
"d'rin": ('erin',),
"d'rna": ('erna',),
"d'rnaar": ('ernaar',),
'hele': ('heel',),
'nevenst': ('nevens',),
'overend': ('overeind',)
"'ns": ("eens",),
"'s": ("eens",),
"'t": ("het",),
"d'r": ("er",),
"d'raf": ("eraf",),
"d'rbij": ("erbij",),
"d'rheen": ("erheen",),
"d'rin": ("erin",),
"d'rna": ("erna",),
"d'rnaar": ("ernaar",),
"hele": ("heel",),
"nevenst": ("nevens",),
"overend": ("overeind",),
}

View File

@ -3,15 +3,18 @@ from __future__ import unicode_literals
DETERMINERS = set(
("al allebei allerhande allerminst alletwee"
"beide clip-on d'n d'r dat datgeen datgene de dees degeen degene den dewelke "
'deze dezelfde die diegeen diegene diehien dien diene diens diezelfde dit '
'ditgene e een eene eigen elk elkens elkes enig enkel enne ettelijke eure '
'euren evenveel ewe ge geen ginds géén haar haaren halfelf het hetgeen '
'hetwelk hetzelfde heur heure hulder hulle hullen hullie hun hunder hunderen '
'ieder iederes ja je jen jouw jouwen jouwes jullie junder keiveel keiweinig '
"m'ne me meer meerder meerdere menen menig mijn mijnes minst méér niemendal "
'oe ons onse se sommig sommigeder superveel telken teveel titulair ulder '
'uldere ulderen ulle under une uw vaak veel veels véél wat weinig welk welken '
"welkene welksten z'nen ze zenen zijn zo'n zo'ne zoiet zoveel zovele zovelen "
'zuk zulk zulkdanig zulken zulks zullie zíjn àlle álle').split())
(
"al allebei allerhande allerminst alletwee"
"beide clip-on d'n d'r dat datgeen datgene de dees degeen degene den dewelke "
"deze dezelfde die diegeen diegene diehien dien diene diens diezelfde dit "
"ditgene e een eene eigen elk elkens elkes enig enkel enne ettelijke eure "
"euren evenveel ewe ge geen ginds géén haar haaren halfelf het hetgeen "
"hetwelk hetzelfde heur heure hulder hulle hullen hullie hun hunder hunderen "
"ieder iederes ja je jen jouw jouwen jouwes jullie junder keiveel keiweinig "
"m'ne me meer meerder meerdere menen menig mijn mijnes minst méér niemendal "
"oe ons onse se sommig sommigeder superveel telken teveel titulair ulder "
"uldere ulderen ulle under une uw vaak veel veels véél wat weinig welk welken "
"welkene welksten z'nen ze zenen zijn zo'n zo'ne zoiet zoveel zovele zovelen "
"zuk zulk zulkdanig zulken zulks zullie zíjn àlle álle"
).split()
)

View File

@ -3,67 +3,67 @@ from __future__ import unicode_literals
DETERMINERS_IRREG = {
"'r": ('haar',),
"'s": ('de',),
"'t": ('het',),
"'tgene": ('hetgeen',),
'alle': ('al',),
'allen': ('al',),
'aller': ('al',),
'beiden': ('beide',),
'beider': ('beide',),
"d'": ('het',),
"d'r": ('haar',),
'der': ('de',),
'des': ('de',),
'dezer': ('deze',),
'dienen': ('die',),
'dier': ('die',),
'elke': ('elk',),
'ene': ('een',),
'enen': ('een',),
'ener': ('een',),
'enige': ('enig',),
'enigen': ('enig',),
'er': ('haar',),
'gene': ('geen',),
'genen': ('geen',),
'hare': ('haar',),
'haren': ('haar',),
'harer': ('haar',),
'hunne': ('hun',),
'hunnen': ('hun',),
'jou': ('jouw',),
'jouwe': ('jouw',),
'julliejen': ('jullie',),
"m'n": ('mijn',),
'mee': ('meer',),
'meer': ('veel',),
'meerderen': ('meerdere',),
'meest': ('veel',),
'meesten': ('veel',),
'meet': ('veel',),
'menige': ('menig',),
'mij': ('mijn',),
'mijnen': ('mijn',),
'minder': ('weinig',),
'mindere': ('weinig',),
'minst': ('weinig',),
'minste': ('minst',),
'ne': ('een',),
'onze': ('ons',),
'onzent': ('ons',),
'onzer': ('ons',),
'ouw': ('uw',),
'sommige': ('sommig',),
'sommigen': ('sommig',),
'u': ('uw',),
'vaker': ('vaak',),
'vele': ('veel',),
'velen': ('veel',),
'welke': ('welk',),
'zijne': ('zijn',),
'zijnen': ('zijn',),
'zijns': ('zijn',),
'één': ('een',)
"'r": ("haar",),
"'s": ("de",),
"'t": ("het",),
"'tgene": ("hetgeen",),
"alle": ("al",),
"allen": ("al",),
"aller": ("al",),
"beiden": ("beide",),
"beider": ("beide",),
"d'": ("het",),
"d'r": ("haar",),
"der": ("de",),
"des": ("de",),
"dezer": ("deze",),
"dienen": ("die",),
"dier": ("die",),
"elke": ("elk",),
"ene": ("een",),
"enen": ("een",),
"ener": ("een",),
"enige": ("enig",),
"enigen": ("enig",),
"er": ("haar",),
"gene": ("geen",),
"genen": ("geen",),
"hare": ("haar",),
"haren": ("haar",),
"harer": ("haar",),
"hunne": ("hun",),
"hunnen": ("hun",),
"jou": ("jouw",),
"jouwe": ("jouw",),
"julliejen": ("jullie",),
"m'n": ("mijn",),
"mee": ("meer",),
"meer": ("veel",),
"meerderen": ("meerdere",),
"meest": ("veel",),
"meesten": ("veel",),
"meet": ("veel",),
"menige": ("menig",),
"mij": ("mijn",),
"mijnen": ("mijn",),
"minder": ("weinig",),
"mindere": ("weinig",),
"minst": ("weinig",),
"minste": ("minst",),
"ne": ("een",),
"onze": ("ons",),
"onzent": ("ons",),
"onzer": ("ons",),
"ouw": ("uw",),
"sommige": ("sommig",),
"sommigen": ("sommig",),
"u": ("uw",),
"vaker": ("vaak",),
"vele": ("veel",),
"velen": ("veel",),
"welke": ("welk",),
"zijne": ("zijn",),
"zijnen": ("zijn",),
"zijns": ("zijn",),
"één": ("een",),
}

View File

@ -9,7 +9,7 @@ ADJECTIVE_SUFFIX_RULES = [
["er", ""],
["en", ""],
["e", ""],
["ende", "end"]
["ende", "end"],
]
VERB_SUFFIX_RULES = [
@ -39,7 +39,7 @@ NOUN_SUFFIX_RULES = [
["ssen", "s"],
["rren", "r"],
["kken", "k"],
["bben", "b"]
["bben", "b"],
]
NUM_SUFFIX_RULES = [
@ -50,23 +50,20 @@ NUM_SUFFIX_RULES = [
["de", ""],
["er", ""],
["ër", ""],
["tjes", ""]
["tjes", ""],
]
PUNCT_SUFFIX_RULES = [
["", "\""],
["", "\""],
["\u2018", "'"],
["\u2019", "'"]
]
PUNCT_SUFFIX_RULES = [["", '"'], ["", '"'], ["\u2018", "'"], ["\u2019", "'"]]
# In-place sort guaranteeing that longer -- more specific -- rules are
# applied first.
for rule_set in (ADJECTIVE_SUFFIX_RULES,
NOUN_SUFFIX_RULES,
NUM_SUFFIX_RULES,
VERB_SUFFIX_RULES):
for rule_set in (
ADJECTIVE_SUFFIX_RULES,
NOUN_SUFFIX_RULES,
NUM_SUFFIX_RULES,
VERB_SUFFIX_RULES,
):
rule_set.sort(key=lambda r: len(r[0]), reverse=True)
@ -75,5 +72,5 @@ RULES = {
"noun": NOUN_SUFFIX_RULES,
"verb": VERB_SUFFIX_RULES,
"num": NUM_SUFFIX_RULES,
"punct": PUNCT_SUFFIX_RULES
"punct": PUNCT_SUFFIX_RULES,
}

File diff suppressed because it is too large Load Diff

View File

@ -3,29 +3,29 @@ from __future__ import unicode_literals
NUMBERS_IRREG = {
'achten': ('acht',),
'biljoenen': ('biljoen',),
'drieën': ('drie',),
'duizenden': ('duizend',),
'eentjes': ('één',),
'elven': ('elf',),
'miljoenen': ('miljoen',),
'negenen': ('negen',),
'negentiger': ('negentig',),
'tienduizenden': ('tienduizend',),
'tienen': ('tien',),
'tientjes': ('tien',),
'twaalven': ('twaalf',),
'tweeën': ('twee',),
'twintiger': ('twintig',),
'twintigsten': ('twintig',),
'vieren': ('vier',),
'vijftiger': ('vijftig',),
'vijven': ('vijf',),
'zessen': ('zes',),
'zestiger': ('zestig',),
'zevenen': ('zeven',),
'zeventiger': ('zeventig',),
'zovele': ('zoveel',),
'zovelen': ('zoveel',)
"achten": ("acht",),
"biljoenen": ("biljoen",),
"drieën": ("drie",),
"duizenden": ("duizend",),
"eentjes": ("één",),
"elven": ("elf",),
"miljoenen": ("miljoen",),
"negenen": ("negen",),
"negentiger": ("negentig",),
"tienduizenden": ("tienduizend",),
"tienen": ("tien",),
"tientjes": ("tien",),
"twaalven": ("twaalf",),
"tweeën": ("twee",),
"twintiger": ("twintig",),
"twintigsten": ("twintig",),
"vieren": ("vier",),
"vijftiger": ("vijftig",),
"vijven": ("vijf",),
"zessen": ("zes",),
"zestiger": ("zestig",),
"zevenen": ("zeven",),
"zeventiger": ("zeventig",),
"zovele": ("zoveel",),
"zovelen": ("zoveel",),
}

View File

@ -3,33 +3,33 @@ from __future__ import unicode_literals
PRONOUNS_IRREG = {
"'r": ('haar',),
"'rzelf": ('haarzelf',),
"'t": ('het',),
"d'r": ('haar',),
'da': ('dat',),
'dienen': ('die',),
'diens': ('die',),
'dies': ('die',),
'elkaars': ('elkaar',),
'elkanders': ('elkander',),
'ene': ('een',),
'enen': ('een',),
'fik': ('ik',),
'gaat': ('gaan',),
'gene': ('geen',),
'harer': ('haar',),
'ieders': ('ieder',),
'iemands': ('iemand',),
'ikke': ('ik',),
'mijnen': ('mijn',),
'oe': ('je',),
'onzer': ('ons',),
'wa': ('wat',),
'watte': ('wat',),
'wier': ('wie',),
'zijns': ('zijn',),
'zoietsken': ('zoietske',),
'zulks': ('zulk',),
'één': ('een',)
"'r": ("haar",),
"'rzelf": ("haarzelf",),
"'t": ("het",),
"d'r": ("haar",),
"da": ("dat",),
"dienen": ("die",),
"diens": ("die",),
"dies": ("die",),
"elkaars": ("elkaar",),
"elkanders": ("elkander",),
"ene": ("een",),
"enen": ("een",),
"fik": ("ik",),
"gaat": ("gaan",),
"gene": ("geen",),
"harer": ("haar",),
"ieders": ("ieder",),
"iemands": ("iemand",),
"ikke": ("ik",),
"mijnen": ("mijn",),
"oe": ("je",),
"onzer": ("ons",),
"wa": ("wat",),
"watte": ("wat",),
"wier": ("wie",),
"zijns": ("zijn",),
"zoietsken": ("zoietske",),
"zulks": ("zulk",),
"één": ("een",),
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -7,15 +7,33 @@ from ....symbols import POS, NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV
class DutchLemmatizer(object):
# Note: CGN does not distinguish AUX verbs, so we treat AUX as VERB.
univ_pos_name_variants = {
NOUN: "noun", "NOUN": "noun", "noun": "noun",
VERB: "verb", "VERB": "verb", "verb": "verb",
AUX: "verb", "AUX": "verb", "aux": "verb",
ADJ: "adj", "ADJ": "adj", "adj": "adj",
ADV: "adv", "ADV": "adv", "adv": "adv",
PRON: "pron", "PRON": "pron", "pron": "pron",
DET: "det", "DET": "det", "det": "det",
ADP: "adp", "ADP": "adp", "adp": "adp",
NUM: "num", "NUM": "num", "num": "num"
NOUN: "noun",
"NOUN": "noun",
"noun": "noun",
VERB: "verb",
"VERB": "verb",
"verb": "verb",
AUX: "verb",
"AUX": "verb",
"aux": "verb",
ADJ: "adj",
"ADJ": "adj",
"adj": "adj",
ADV: "adv",
"ADV": "adv",
"adv": "adv",
PRON: "pron",
"PRON": "pron",
"pron": "pron",
DET: "det",
"DET": "det",
"det": "det",
ADP: "adp",
"ADP": "adp",
"adp": "adp",
NUM: "num",
"NUM": "num",
"num": "num",
}
@classmethod
@ -62,10 +80,8 @@ class DutchLemmatizer(object):
return [looked_up_lemma]
forms, is_known = lemmatize(
string,
lemma_index,
exceptions,
self.rules.get(univ_pos, []))
string, lemma_index, exceptions, self.rules.get(univ_pos, [])
)
# Back-off through remaining return value candidates.
if forms:
@ -92,25 +108,25 @@ class DutchLemmatizer(object):
return self.lookup_table.get(string, string)
def noun(self, string, morphology=None):
return self(string, 'noun', morphology)
return self(string, "noun", morphology)
def verb(self, string, morphology=None):
return self(string, 'verb', morphology)
return self(string, "verb", morphology)
def adj(self, string, morphology=None):
return self(string, 'adj', morphology)
return self(string, "adj", morphology)
def det(self, string, morphology=None):
return self(string, 'det', morphology)
return self(string, "det", morphology)
def pron(self, string, morphology=None):
return self(string, 'pron', morphology)
return self(string, "pron", morphology)
def adp(self, string, morphology=None):
return self(string, 'adp', morphology)
return self(string, "adp", morphology)
def punct(self, string, morphology=None):
return self(string, 'punct', morphology)
return self(string, "punct", morphology)
# Reimplemented to focus more on application of suffix rules and to return
@ -120,7 +136,7 @@ def lemmatize(string, index, exceptions, rules):
oov_forms = []
for old, new in rules:
if string.endswith(old):
form = string[:len(string) - len(old)] + new
form = string[: len(string) - len(old)] + new
if not form:
pass
elif form in index:

View File

@ -4,18 +4,22 @@ from __future__ import unicode_literals
from ...attrs import LIKE_NUM
_num_words = set("""
_num_words = set(
"""
nul een één twee drie vier vijf zes zeven acht negen tien elf twaalf dertien
veertien twintig dertig veertig vijftig zestig zeventig tachtig negentig honderd
duizend miljoen miljard biljoen biljard triljoen triljard
""".split())
""".split()
)
_ordinal_words = set("""
_ordinal_words = set(
"""
eerste tweede derde vierde vijfde zesde zevende achtste negende tiende elfde
twaalfde dertiende veertiende twintigste dertigste veertigste vijftigste
zestigste zeventigste tachtigste negentigste honderdste duizendste miljoenste
miljardste biljoenste biljardste triljoenste triljardste
""".split())
""".split()
)
def like_num(text):
@ -23,11 +27,11 @@ def like_num(text):
# or matches one of the number words. In order to handle numbers like
# "drieëntwintig", more work is required.
# See this discussion: https://github.com/explosion/spaCy/pull/1177
text = text.replace(',', '').replace('.', '')
text = text.replace(",", "").replace(".", "")
if text.isdigit():
return True
if text.count('/') == 1:
num, denom = text.split('/')
if text.count("/") == 1:
num, denom = text.split("/")
if num.isdigit() and denom.isdigit():
return True
if text.lower() in _num_words:
@ -37,6 +41,4 @@ def like_num(text):
return False
LEX_ATTRS = {
LIKE_NUM: like_num
}
LEX_ATTRS = {LIKE_NUM: like_num}

View File

@ -10,24 +10,32 @@ from ..punctuation import TOKENIZER_SUFFIXES as DEFAULT_TOKENIZER_SUFFIXES
# Copied from `de` package. Main purpose is to ensure that hyphens are not
# split on.
_quotes = CONCAT_QUOTES.replace("'", '')
_quotes = CONCAT_QUOTES.replace("'", "")
_infixes = (LIST_ELLIPSES + LIST_ICONS +
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])'.format(a=ALPHA, q=_quotes),
r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
r'(?<=[0-9])-(?=[0-9])'])
_infixes = (
LIST_ELLIPSES
+ LIST_ICONS
+ [
r"(?<=[{}])\.(?=[{}])".format(ALPHA_LOWER, ALPHA_UPPER),
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
r"(?<=[0-9])-(?=[0-9])",
]
)
# Remove "'s" suffix from suffix list. In Dutch, "'s" is a plural ending when
# it occurs as a suffix and a clitic for "eens" in standalone use. To avoid
# ambiguity it's better to just leave it attached when it occurs as a suffix.
default_suffix_blacklist = ("'s", "'S", 's', 'S')
_suffixes = [suffix for suffix in DEFAULT_TOKENIZER_SUFFIXES
if suffix not in default_suffix_blacklist]
default_suffix_blacklist = ("'s", "'S", "s", "S")
_suffixes = [
suffix
for suffix in DEFAULT_TOKENIZER_SUFFIXES
if suffix not in default_suffix_blacklist
]
TOKENIZER_INFIXES = _infixes
TOKENIZER_SUFFIXES = _suffixes

View File

@ -16,7 +16,8 @@ from __future__ import unicode_literals
# should have a Dutch counterpart here.
STOP_WORDS = set("""
STOP_WORDS = set(
"""
aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaangde aangezien achter achterna
afgelopen aldus alhoewel anderzijds
@ -70,4 +71,5 @@ welk welke welken werd werden wiens wier wilde wordt
zal ze zei zelf zich zij zijn zo zonder zou zeer zeker zekere zelfde zelfs zichzelf zijnde zijne zon zoals zodra zouden
zoveel zowat zulk zulke zulks zullen zult
""".split())
""".split()
)

View File

@ -47,8 +47,12 @@ TAG_MAP = {
"Adj_Prep|adv|vergr|onverv_voor__Degree=Cmp|Variant=Short": {POS: ADJ},
"Adj_V_Conj_V__Degree=Pos|VerbForm=Inf": {POS: ADJ},
"Adj_V_N__Degree=Pos|Number=Sing|Tense=Past|VerbForm=Part": {POS: ADJ},
"Adj_V|adv|stell|onverv_intrans|inf__Degree=Pos|Variant=Short|VerbForm=Inf": {POS: ADJ},
"Adj_V|adv|stell|onverv_trans|imp__Degree=Pos|Mood=Imp|Variant=Short|VerbForm=Fin": {POS: ADJ},
"Adj_V|adv|stell|onverv_intrans|inf__Degree=Pos|Variant=Short|VerbForm=Inf": {
POS: ADJ
},
"Adj_V|adv|stell|onverv_trans|imp__Degree=Pos|Mood=Imp|Variant=Short|VerbForm=Fin": {
POS: ADJ
},
"Adj|adv|stell|onverv__Degree=Pos|Variant=Short": {POS: ADJ},
"Adj|adv|stell|vervneut__Case=Nom|Degree=Pos|Variant=Short": {POS: ADJ},
"Adj|adv|vergr|onverv__Degree=Cmp|Variant=Short": {POS: ADJ},
@ -133,15 +137,21 @@ TAG_MAP = {
"Art_Num__Definite=Def|Degree=Sup|Gender=Neut|PronType=Ind": {POS: DET},
"Art_Num__Definite=Def|Gender=Neut": {POS: DET},
"Art_Num__Degree=Pos|Number=Sing|PronType=Ind": {POS: DET},
"Art_N|bep|onzijd|neut_eigen|ev|neut__Definite=Def|Gender=Neut|Number=Sing": {POS: DET},
"Art_N|bep|onzijd|neut_soort|ev|neut__Definite=Def|Gender=Neut|Number=Sing": {POS: DET},
"Art_N|bep|onzijd|neut_eigen|ev|neut__Definite=Def|Gender=Neut|Number=Sing": {
POS: DET
},
"Art_N|bep|onzijd|neut_soort|ev|neut__Definite=Def|Gender=Neut|Number=Sing": {
POS: DET
},
"Art_Pron_N__Case=Gen|Number=Plur|PronType=Ind": {POS: DET},
"Art_Pron__Number=Sing|PronType=Ind": {POS: DET},
"Art_V_N__AdpType=Prep": {POS: DET},
"Art|bep|onzijd|neut__Definite=Def|Gender=Neut|PronType=Art": {POS: DET},
"Art|bep|zijdofmv|gen__Case=Gen|Definite=Def|PronType=Art": {POS: DET},
"Art|bep|zijdofmv|neut__Definite=Def|PronType=Art": {POS: DET},
"Art|bep|zijdofonzijd|gen__Case=Gen|Definite=Def|Number=Sing|PronType=Art": {POS: DET},
"Art|bep|zijdofonzijd|gen__Case=Gen|Definite=Def|Number=Sing|PronType=Art": {
POS: DET
},
"Art|bep|zijd|dat__Case=Dat|Definite=Def|Gender=Com|PronType=Art": {POS: DET},
"Art|onbep|zijdofonzijd|neut__Definite=Ind|Number=Sing|PronType=Art": {POS: DET},
"CCONJ___": {POS: CONJ},
@ -159,17 +169,23 @@ TAG_MAP = {
"Conj_Int|onder|metfin___": {POS: CONJ},
"Conj_N_Adv__AdpType=Preppron|Gender=Masc|Number=Plur": {POS: CONJ},
"Conj_N_Prep__AdpType=Preppron|Gender=Masc|Number=Plur": {POS: CONJ},
"Conj_N|onder|metfin_soort|ev|neut__AdpType=Preppron|Gender=Masc|Number=Plur": {POS: CONJ},
"Conj_N|onder|metfin_soort|ev|neut__AdpType=Preppron|Gender=Masc|Number=Plur": {
POS: CONJ
},
"Conj_Pron_Adv__Degree=Pos|Number=Sing|Person=3": {POS: CONJ},
"Conj_Pron_V__AdpType=Preppron|Gender=Masc|Number=Plur": {POS: CONJ},
"Conj_Pron|neven_aanw|neut|zelfst__AdpType=Prep": {POS: CONJ},
"Conj_Punc_Conj|neven_schuinstreep_neven__AdpType=Prep": {POS: CONJ},
"Conj_V|onder|metfin_intrans|ott|3|ev__AdpType=Preppron|Gender=Masc|Number=Plur": {POS: CONJ},
"Conj_V|onder|metfin_intrans|ott|3|ev__AdpType=Preppron|Gender=Masc|Number=Plur": {
POS: CONJ
},
"Conj|neven___": {POS: CONJ},
"Conj|onder|metfin___": {POS: CONJ},
"Conj|onder|metinf___": {POS: CONJ},
"DET__Degree=Cmp|NumType=Card|PronType=Ind": {POS: DET},
"DET__Gender=Fem|Number=Sing|Number[psor]=Plur|Person=1|Poss=Yes|PronType=Prs": {POS: DET},
"DET__Gender=Fem|Number=Sing|Number[psor]=Plur|Person=1|Poss=Yes|PronType=Prs": {
POS: DET
},
"DET__Gender=Fem|Number=Sing|PronType=Art": {POS: DET},
"DET__Gender=Masc|Number=Plur|PronType=Art": {POS: DET},
"DET__Gender=Masc|Number=Sing|PronType=Tot": {POS: DET},
@ -185,7 +201,9 @@ TAG_MAP = {
"Misc_Misc_Misc_Misc_Misc_Misc_Punc_Misc_Misc_Misc___": {POS: X},
"Misc_Misc_Misc_Misc_Misc_Misc___": {POS: X},
"Misc_Misc_Misc_Misc_Misc_N_Misc_Misc_Misc_Misc_Misc_Misc___": {POS: X},
"Misc_Misc_Misc_Misc|vreemd_vreemd_vreemd_vreemd__AdpType=Preppron|Gender=Masc|Number=Sing": {POS: X},
"Misc_Misc_Misc_Misc|vreemd_vreemd_vreemd_vreemd__AdpType=Preppron|Gender=Masc|Number=Sing": {
POS: X
},
"Misc_Misc_Misc_Misc|vreemd_vreemd_vreemd_vreemd___": {POS: X},
"Misc_Misc_Misc_N__Number=Sing": {POS: X},
"Misc_Misc_Misc|vreemd_vreemd_vreemd___": {POS: X},
@ -217,7 +235,9 @@ TAG_MAP = {
"N_Adj__Degree=Pos|Number=Plur": {POS: NOUN},
"N_Adj__Degree=Pos|Number=Sing": {POS: NOUN},
"N_Adj___": {POS: NOUN},
"N_Adv_Punc_V_Pron_V__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Inf": {POS: NOUN},
"N_Adv_Punc_V_Pron_V__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Inf": {
POS: NOUN
},
"N_Adv__Degree=Pos|Number=Sing": {POS: NOUN},
"N_Adv___": {POS: NOUN},
"N_Adv|soort|ev|neut_deelv__Number=Sing": {POS: NOUN},
@ -320,12 +340,20 @@ TAG_MAP = {
"N_N|eigen|ev|gen_soort|mv|neut___": {POS: NOUN},
"N_N|eigen|ev|neut_eigen|ev|gen___": {POS: NOUN},
"N_N|eigen|ev|neut_eigen|ev|neut__AdpType=Prep": {POS: NOUN},
"N_N|eigen|ev|neut_eigen|ev|neut__AdpType=Preppron|Gender=Fem|Number=Plur": {POS: NOUN},
"N_N|eigen|ev|neut_eigen|ev|neut__AdpType=Preppron|Gender=Masc|Number=Sing": {POS: NOUN},
"N_N|eigen|ev|neut_eigen|ev|neut__AdpType=Preppron|Gender=Fem|Number=Plur": {
POS: NOUN
},
"N_N|eigen|ev|neut_eigen|ev|neut__AdpType=Preppron|Gender=Masc|Number=Sing": {
POS: NOUN
},
"N_N|eigen|ev|neut_eigen|ev|neut__Gender=Fem|Number=Plur|PronType=Art": {POS: NOUN},
"N_N|eigen|ev|neut_eigen|ev|neut__Gender=Fem|Number=Sing|PronType=Art": {POS: NOUN},
"N_N|eigen|ev|neut_eigen|ev|neut__Gender=Masc|Number=Plur|PronType=Art": {POS: NOUN},
"N_N|eigen|ev|neut_eigen|ev|neut__Gender=Masc|Number=Sing|PronType=Art": {POS: NOUN},
"N_N|eigen|ev|neut_eigen|ev|neut__Gender=Masc|Number=Plur|PronType=Art": {
POS: NOUN
},
"N_N|eigen|ev|neut_eigen|ev|neut__Gender=Masc|Number=Sing|PronType=Art": {
POS: NOUN
},
"N_N|eigen|ev|neut_eigen|ev|neut__NumType=Card": {POS: NOUN},
"N_N|eigen|ev|neut_eigen|ev|neut__Number=Sing": {POS: NOUN},
"N_N|eigen|ev|neut_eigen|ev|neut___": {POS: NOUN},
@ -335,7 +363,9 @@ TAG_MAP = {
"N_N|eigen|ev|neut_soort|mv|neut___": {POS: NOUN},
"N_N|eigen|mv|neut_eigen|mv|neut___": {POS: NOUN},
"N_N|soort|ev|neut_eigen|ev|neut__Number=Sing": {POS: NOUN},
"N_N|soort|ev|neut_soort|ev|neut__Gender=Masc|Number=Plur|PronType=Art": {POS: NOUN},
"N_N|soort|ev|neut_soort|ev|neut__Gender=Masc|Number=Plur|PronType=Art": {
POS: NOUN
},
"N_N|soort|ev|neut_soort|ev|neut__NumForm=Digit|NumType=Card": {POS: NOUN},
"N_N|soort|ev|neut_soort|ev|neut__Number=Sing": {POS: NOUN},
"N_N|soort|ev|neut_soort|mv|neut__Number=Plur": {POS: NOUN},
@ -365,7 +395,9 @@ TAG_MAP = {
"N_Pron___": {POS: NOUN},
"N_Punc_Adj_N___": {POS: NOUN},
"N_Punc_Adj_Pron_Punc__Degree=Pos|Number=Sing|Person=2": {POS: NOUN},
"N_Punc_Adv_V_Pron_N__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {POS: NOUN},
"N_Punc_Adv_V_Pron_N__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {
POS: NOUN
},
"N_Punc_Misc_Punc_N___": {POS: NOUN},
"N_Punc_N_N_N_N__Number=Sing": {POS: NOUN},
"N_Punc_N_Punc_N__Number=Sing": {POS: NOUN},
@ -415,8 +447,12 @@ TAG_MAP = {
"Num|hoofd|bep|attr|onverv__Definite=Def|NumType=Card": {POS: NUM},
"Num|hoofd|bep|zelfst|onverv__Definite=Def|NumType=Card": {POS: NUM},
"Num|hoofd|bep|zelfst|vervmv__Definite=Def|Number=Plur|NumType=Card": {POS: NUM},
"Num|hoofd|onbep|attr|stell|onverv__Degree=Pos|NumType=Card|PronType=Ind": {POS: NUM},
"Num|hoofd|onbep|attr|vergr|onverv__Degree=Cmp|NumType=Card|PronType=Ind": {POS: NUM},
"Num|hoofd|onbep|attr|stell|onverv__Degree=Pos|NumType=Card|PronType=Ind": {
POS: NUM
},
"Num|hoofd|onbep|attr|vergr|onverv__Degree=Cmp|NumType=Card|PronType=Ind": {
POS: NUM
},
"Num|rang|bep|attr|onverv__Definite=Def|NumType=Ord": {POS: NUM},
"Num|rang|bep|zelfst|onverv__Definite=Def|NumType=Ord": {POS: NUM},
"N|eigen|ev|gen__Case=Gen|Number=Sing": {POS: NOUN},
@ -469,7 +505,9 @@ TAG_MAP = {
"Prep_N_Adv|voor_soort|ev|neut_pron|aanw__AdpType=Prep": {POS: ADP},
"Prep_N_Adv|voor_soort|ev|neut_pron|aanw__Number=Sing|PronType=Dem": {POS: ADP},
"Prep_N_Adv|voor_soort|ev|neut_pron|vrag__Number=Sing|PronType=Int": {POS: ADP},
"Prep_N_Adv|voor_soort|mv|neut_deelv__Gender=Masc|Number=Sing|PronType=Tot": {POS: ADP},
"Prep_N_Adv|voor_soort|mv|neut_deelv__Gender=Masc|Number=Sing|PronType=Tot": {
POS: ADP
},
"Prep_N_Conj_N__Number=Sing": {POS: ADP},
"Prep_N_Conj__AdpType=Prep": {POS: ADP},
"Prep_N_Prep_N__Number=Sing": {POS: ADP},
@ -489,7 +527,9 @@ TAG_MAP = {
"Prep_N|voor_soort|ev|neut__Number=Sing": {POS: ADP},
"Prep_N|voor_soort|mv|neut__AdpType=Prep": {POS: ADP},
"Prep_N|voor_soort|mv|neut__Number=Plur": {POS: ADP},
"Prep_Prep_Adj|voor_voor_adv|stell|onverv__Gender=Masc|Number=Sing|PronType=Tot": {POS: ADP},
"Prep_Prep_Adj|voor_voor_adv|stell|onverv__Gender=Masc|Number=Sing|PronType=Tot": {
POS: ADP
},
"Prep_Prep_Adv__Degree=Pos": {POS: ADP},
"Prep_Pron_Adj__Degree=Cmp|Number=Sing|Person=3": {POS: ADP},
"Prep_Pron_N_Adv__Number=Plur": {POS: ADP},
@ -503,7 +543,9 @@ TAG_MAP = {
"Prep_Pron|voor_ref|3|evofmv__Number=Plur,Sing|Person=3": {POS: ADP},
"Prep_Punc_N_Conj_N__AdpType=Prep": {POS: ADP},
"Prep_V_N__Number=Sing|Tense=Pres|VerbForm=Part": {POS: ADP},
"Prep_V_Pron_Pron_Adv__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|PronType=Dem|Tense=Pres|VerbForm=Fin": {POS: ADP},
"Prep_V_Pron_Pron_Adv__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|PronType=Dem|Tense=Pres|VerbForm=Fin": {
POS: ADP
},
"Prep_V|voor_intrans|inf__VerbForm=Inf": {POS: ADP},
"Prep_V|voorinf_trans|inf__VerbForm=Inf": {POS: ADP},
"Prep|achter__AdpType=Post": {POS: ADP},
@ -511,17 +553,25 @@ TAG_MAP = {
"Prep|voor__AdpType=Prep": {POS: ADP},
"Prep|voorinf__AdpType=Prep|PartType=Inf": {POS: ADP},
"Pron_Adj_N_Punc_Art_Adj_N_Prep_Art_Adj_N__NumType=Card": {POS: PRON},
"Pron_Adj__Case=Nom|Degree=Sup|Number=Sing|Person=2|Poss=Yes|PronType=Prs": {POS: PRON},
"Pron_Adj__Case=Nom|Degree=Sup|Number=Sing|Person=2|Poss=Yes|PronType=Prs": {
POS: PRON
},
"Pron_Adj__Degree=Cmp|PronType=Ind": {POS: PRON},
"Pron_Adv|vrag|neut|attr_deelv__PronType=Int": {POS: PRON},
"Pron_Art_N_N__Number=Plur|PronType=Ind": {POS: PRON},
"Pron_Art__Number=Sing|PronType=Int": {POS: PRON},
"Pron_N_Adv__Number=Sing|PronType=Ind": {POS: PRON},
"Pron_N_V_Adv_Num_Punc__Aspect=Imp|Definite=Def|Mood=Ind|Number=Sing|Person=3|PronType=Ind|Tense=Pres|VerbForm=Fin": {POS: PRON},
"Pron_N_V_Conj_N__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|PronType=Ind|Tense=Pres|VerbForm=Fin": {POS: PRON},
"Pron_N_V_Adv_Num_Punc__Aspect=Imp|Definite=Def|Mood=Ind|Number=Sing|Person=3|PronType=Ind|Tense=Pres|VerbForm=Fin": {
POS: PRON
},
"Pron_N_V_Conj_N__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|PronType=Ind|Tense=Pres|VerbForm=Fin": {
POS: PRON
},
"Pron_N__Case=Gen|Number=Sing|PronType=Ind": {POS: PRON},
"Pron_N__Number=Sing|PronType=Ind": {POS: PRON},
"Pron_N|aanw|gen|attr_soort|mv|neut__Case=Gen|Number=Plur|PronType=Dem": {POS: PRON},
"Pron_N|aanw|gen|attr_soort|mv|neut__Case=Gen|Number=Plur|PronType=Dem": {
POS: PRON
},
"Pron_N|onbep|neut|attr_soort|ev|neut__Number=Sing|PronType=Ind": {POS: PRON},
"Pron_Prep_Art__Number=Sing|PronType=Int": {POS: PRON},
"Pron_Prep_Art__Number=Sing|PronType=Rel": {POS: PRON},
@ -529,10 +579,16 @@ TAG_MAP = {
"Pron_Prep|betr|neut|zelfst_voor__PronType=Rel": {POS: PRON},
"Pron_Prep|onbep|neut|zelfst_voor__PronType=Ind": {POS: PRON},
"Pron_Prep|vrag|neut|attr_voor__PronType=Int": {POS: PRON},
"Pron_Pron_V__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|PronType=Rel|Tense=Pres|VerbForm=Fin": {POS: PRON},
"Pron_Pron_V__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|PronType=Rel|Tense=Pres|VerbForm=Fin": {
POS: PRON
},
"Pron_Pron__Person=3|PronType=Prs|Reflex=Yes": {POS: PRON},
"Pron_V_V__Aspect=Imp|Mood=Ind|Person=3|PronType=Dem|Tense=Pres|VerbForm=Inf": {POS: PRON},
"Pron_V__Case=Gen|Number=Sing|Person=3|Poss=Yes|PronType=Prs|VerbForm=Inf": {POS: PRON},
"Pron_V_V__Aspect=Imp|Mood=Ind|Person=3|PronType=Dem|Tense=Pres|VerbForm=Inf": {
POS: PRON
},
"Pron_V__Case=Gen|Number=Sing|Person=3|Poss=Yes|PronType=Prs|VerbForm=Inf": {
POS: PRON
},
"Pron_V__Number=Plur|Person=1|Poss=Yes|PronType=Prs|VerbForm=Inf": {POS: PRON},
"Pron|aanw|dat|attr__Case=Dat|PronType=Dem": {POS: PRON},
"Pron|aanw|gen|attr__Case=Gen|PronType=Dem": {POS: PRON},
@ -547,27 +603,47 @@ TAG_MAP = {
"Pron|bez|1|mv|neut|attr__Number=Plur|Person=1|Poss=Yes|PronType=Prs": {POS: PRON},
"Pron|bez|2|ev|neut|attr__Number=Sing|Person=2|Poss=Yes|PronType=Prs": {POS: PRON},
"Pron|bez|2|mv|neut|attr__Number=Plur|Person=2|Poss=Yes|PronType=Prs": {POS: PRON},
"Pron|bez|3|ev|gen|attr__Case=Gen|Number=Sing|Person=3|Poss=Yes|PronType=Prs": {POS: PRON},
"Pron|bez|3|ev|gen|attr__Case=Gen|Number=Sing|Person=3|Poss=Yes|PronType=Prs": {
POS: PRON
},
"Pron|bez|3|ev|neut|attr__Number=Sing|Person=3|Poss=Yes|PronType=Prs": {POS: PRON},
"Pron|bez|3|ev|neut|zelfst__Number=Sing|Person=3|Poss=Yes|PronType=Prs": {POS: PRON},
"Pron|bez|3|ev|neut|zelfst__Number=Sing|Person=3|Poss=Yes|PronType=Prs": {
POS: PRON
},
"Pron|bez|3|mv|neut|attr__Number=Plur|Person=3|Poss=Yes|PronType=Prs": {POS: PRON},
"Pron|onbep|gen|attr__Case=Gen|PronType=Ind": {POS: PRON},
"Pron|onbep|gen|zelfst__Case=Gen|PronType=Ind": {POS: PRON},
"Pron|onbep|neut|attr__PronType=Ind": {POS: PRON},
"Pron|onbep|neut|zelfst__PronType=Ind": {POS: PRON},
"Pron|per|1|ev|datofacc__Case=Acc,Dat|Number=Sing|Person=1|PronType=Prs": {POS: PRON},
"Pron|per|1|ev|datofacc__Case=Acc,Dat|Number=Sing|Person=1|PronType=Prs": {
POS: PRON
},
"Pron|per|1|ev|nom__Case=Nom|Number=Sing|Person=1|PronType=Prs": {POS: PRON},
"Pron|per|1|mv|datofacc__Case=Acc,Dat|Number=Plur|Person=1|PronType=Prs": {POS: PRON},
"Pron|per|1|mv|datofacc__Case=Acc,Dat|Number=Plur|Person=1|PronType=Prs": {
POS: PRON
},
"Pron|per|1|mv|nom__Case=Nom|Number=Plur|Person=1|PronType=Prs": {POS: PRON},
"Pron|per|2|ev|datofacc__Case=Acc,Dat|Number=Sing|Person=2|PronType=Prs": {POS: PRON},
"Pron|per|2|ev|datofacc__Case=Acc,Dat|Number=Sing|Person=2|PronType=Prs": {
POS: PRON
},
"Pron|per|2|ev|nom__Case=Nom|Number=Sing|Person=2|PronType=Prs": {POS: PRON},
"Pron|per|2|mv|datofacc__Case=Acc,Dat|Number=Plur|Person=2|PronType=Prs": {POS: PRON},
"Pron|per|2|mv|datofacc__Case=Acc,Dat|Number=Plur|Person=2|PronType=Prs": {
POS: PRON
},
"Pron|per|2|mv|nom__Case=Nom|Number=Plur|Person=2|PronType=Prs": {POS: PRON},
"Pron|per|3|evofmv|datofacc__Case=Acc,Dat|Number=Plur,Sing|Person=3|PronType=Prs": {POS: PRON},
"Pron|per|3|evofmv|nom__Case=Nom|Number=Plur,Sing|Person=3|PronType=Prs": {POS: PRON},
"Pron|per|3|ev|datofacc__Case=Acc,Dat|Number=Sing|Person=3|PronType=Prs": {POS: PRON},
"Pron|per|3|evofmv|datofacc__Case=Acc,Dat|Number=Plur,Sing|Person=3|PronType=Prs": {
POS: PRON
},
"Pron|per|3|evofmv|nom__Case=Nom|Number=Plur,Sing|Person=3|PronType=Prs": {
POS: PRON
},
"Pron|per|3|ev|datofacc__Case=Acc,Dat|Number=Sing|Person=3|PronType=Prs": {
POS: PRON
},
"Pron|per|3|ev|nom__Case=Nom|Number=Sing|Person=3|PronType=Prs": {POS: PRON},
"Pron|per|3|mv|datofacc__Case=Acc,Dat|Number=Plur|Person=3|PronType=Prs": {POS: PRON},
"Pron|per|3|mv|datofacc__Case=Acc,Dat|Number=Plur|Person=3|PronType=Prs": {
POS: PRON
},
"Pron|rec|gen__Case=Gen|PronType=Rcp": {POS: PRON},
"Pron|rec|neut__PronType=Rcp": {POS: PRON},
"Pron|ref|1|ev__Number=Sing|Person=1|PronType=Prs|Reflex=Yes": {POS: PRON},
@ -597,20 +673,34 @@ TAG_MAP = {
"Punc|vraag__PunctType=Qest": {POS: PUNCT},
"V_Adv_Art_N_Prep_Pron_N__Degree=Pos|Number=Plur|Person=2|Subcat=Tran": {POS: VERB},
"V_Adv__Degree=Pos|Subcat=Tran": {POS: VERB},
"V_Art_N_Num_N__Aspect=Imp|Definite=Def|Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|VerbType=Mod": {POS: VERB},
"V_Art_N_Num_N__Aspect=Imp|Definite=Def|Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|VerbType=Mod": {
POS: VERB
},
"V_Art_N__Number=Sing|Subcat=Tran": {POS: VERB},
"V_Conj_N_N__Number=Sing|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
"V_Conj_Pron__Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
"V_N_Conj_Adj_N_Prep_Art_N__Degree=Pos|Number=Sing|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
"V_N_Conj_Adj_N_Prep_Art_N__Degree=Pos|Number=Sing|Subcat=Tran|Tense=Past|VerbForm=Part": {
POS: VERB
},
"V_N_N__Number=Sing|Subcat=Intr|Tense=Pres|VerbForm=Part": {POS: VERB},
"V_N_N__Number=Sing|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
"V_N_V__Aspect=Imp|Mood=Ind|Number=Sing|Subcat=Intr|Tense=Pres|VerbForm=Inf": {POS: VERB},
"V_N_V__Aspect=Imp|Mood=Ind|Number=Sing|Subcat=Intr|Tense=Pres|VerbForm=Inf": {
POS: VERB
},
"V_N__Number=Plur|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
"V_N|trans|imp_eigen|ev|neut__Number=Sing|Subcat=Tran": {POS: VERB},
"V_Prep|intrans|verldw|onverv_voor__Subcat=Intr|Tense=Past|VerbForm=Part": {POS: VERB},
"V_Pron_Adv_Adv_Pron_V__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: VERB},
"V_Pron_Adv__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: VERB},
"V_Pron_V__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: VERB},
"V_Prep|intrans|verldw|onverv_voor__Subcat=Intr|Tense=Past|VerbForm=Part": {
POS: VERB
},
"V_Pron_Adv_Adv_Pron_V__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V_Pron_Adv__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V_Pron_V__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V_Pron__VerbType=Aux,Cop": {POS: VERB},
"V_V|hulp|imp_intrans|inf__VerbForm=Inf|VerbType=Mod": {POS: VERB},
"V|hulpofkopp|conj__Mood=Sub|VerbForm=Fin": {POS: VERB},
@ -620,94 +710,220 @@ TAG_MAP = {
"V|hulpofkopp|inf__VerbForm=Inf": {POS: VERB},
"V|hulpofkopp|inf__VerbForm=Inf|VerbType=Aux,Cop": {POS: VERB},
"V|hulpofkopp|inf|subst__VerbForm=Inf": {POS: VERB},
"V|hulpofkopp|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin": {POS: VERB},
"V|hulpofkopp|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin|VerbType=Aux,Cop": {POS: VERB},
"V|hulpofkopp|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {POS: VERB},
"V|hulpofkopp|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin|VerbType=Aux,Cop": {POS: VERB},
"V|hulpofkopp|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin": {POS: VERB},
"V|hulpofkopp|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin|VerbType=Aux,Cop": {POS: VERB},
"V|hulpofkopp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {POS: VERB},
"V|hulpofkopp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|VerbType=Aux,Cop": {POS: VERB},
"V|hulpofkopp|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin": {POS: VERB},
"V|hulpofkopp|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|VerbType=Aux,Cop": {POS: VERB},
"V|hulpofkopp|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin": {POS: VERB},
"V|hulpofkopp|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin|VerbType=Aux,Cop": {POS: VERB},
"V|hulpofkopp|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V|hulpofkopp|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin|VerbType=Aux,Cop": {
POS: VERB
},
"V|hulpofkopp|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V|hulpofkopp|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin|VerbType=Aux,Cop": {
POS: VERB
},
"V|hulpofkopp|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V|hulpofkopp|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin|VerbType=Aux,Cop": {
POS: VERB
},
"V|hulpofkopp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V|hulpofkopp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|VerbType=Aux,Cop": {
POS: VERB
},
"V|hulpofkopp|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin": {
POS: VERB
},
"V|hulpofkopp|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|VerbType=Aux,Cop": {
POS: VERB
},
"V|hulpofkopp|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin": {
POS: VERB
},
"V|hulpofkopp|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin|VerbType=Aux,Cop": {
POS: VERB
},
"V|hulpofkopp|tegdw|vervneut__Case=Nom|Tense=Pres|VerbForm=Part": {POS: VERB},
"V|hulpofkopp|tegdw|vervneut__Case=Nom|Tense=Pres|VerbForm=Part|VerbType=Aux,Cop": {POS: VERB},
"V|hulpofkopp|tegdw|vervneut__Case=Nom|Tense=Pres|VerbForm=Part|VerbType=Aux,Cop": {
POS: VERB
},
"V|hulpofkopp|verldw|onverv__Tense=Past|VerbForm=Part": {POS: VERB},
"V|hulpofkopp|verldw|onverv__Tense=Past|VerbForm=Part|VerbType=Aux,Cop": {POS: VERB},
"V|hulpofkopp|verldw|onverv__Tense=Past|VerbForm=Part|VerbType=Aux,Cop": {
POS: VERB
},
"V|hulp|conj__Mood=Sub|VerbForm=Fin|VerbType=Mod": {POS: VERB},
"V|hulp|inf__VerbForm=Inf": {POS: VERB},
"V|hulp|inf__VerbForm=Inf|VerbType=Mod": {POS: VERB},
"V|hulp|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin": {POS: VERB},
"V|hulp|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin|VerbType=Mod": {POS: VERB},
"V|hulp|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {POS: VERB},
"V|hulp|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin|VerbType=Mod": {POS: VERB},
"V|hulp|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin": {POS: VERB},
"V|hulp|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin|VerbType=Mod": {POS: VERB},
"V|hulp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {POS: VERB},
"V|hulp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|VerbType=Mod": {POS: VERB},
"V|hulp|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin": {POS: VERB},
"V|hulp|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|VerbType=Mod": {POS: VERB},
"V|hulp|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin": {POS: VERB},
"V|hulp|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin|VerbType=Mod": {POS: VERB},
"V|hulp|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V|hulp|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin|VerbType=Mod": {
POS: VERB
},
"V|hulp|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V|hulp|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin|VerbType=Mod": {
POS: VERB
},
"V|hulp|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V|hulp|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin|VerbType=Mod": {
POS: VERB
},
"V|hulp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V|hulp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|VerbType=Mod": {
POS: VERB
},
"V|hulp|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin": {
POS: VERB
},
"V|hulp|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|VerbType=Mod": {
POS: VERB
},
"V|hulp|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin": {
POS: VERB
},
"V|hulp|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin|VerbType=Mod": {
POS: VERB
},
"V|hulp|verldw|onverv__Tense=Past|VerbForm=Part": {POS: VERB},
"V|hulp|verldw|onverv__Tense=Past|VerbForm=Part|VerbType=Mod": {POS: VERB},
"V|intrans|conj__Mood=Sub|Subcat=Intr|VerbForm=Fin": {POS: VERB},
"V|intrans|imp__Mood=Imp|Subcat=Intr|VerbForm=Fin": {POS: VERB},
"V|intrans|inf__Subcat=Intr|VerbForm=Inf": {POS: VERB},
"V|intrans|inf|subst__Subcat=Intr|VerbForm=Inf": {POS: VERB},
"V|intrans|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Subcat=Intr|Tense=Pres|VerbForm=Fin": {POS: VERB},
"V|intrans|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Subcat=Intr|Tense=Pres|VerbForm=Fin": {POS: VERB},
"V|intrans|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Subcat=Intr|Tense=Pres|VerbForm=Fin": {POS: VERB},
"V|intrans|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Intr|Tense=Pres|VerbForm=Fin": {POS: VERB},
"V|intrans|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Subcat=Intr|Tense=Past|VerbForm=Fin": {POS: VERB},
"V|intrans|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Subcat=Intr|Tense=Past|VerbForm=Fin": {POS: VERB},
"V|intrans|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Subcat=Intr|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V|intrans|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Subcat=Intr|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V|intrans|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Subcat=Intr|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V|intrans|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Intr|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V|intrans|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Subcat=Intr|Tense=Past|VerbForm=Fin": {
POS: VERB
},
"V|intrans|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Subcat=Intr|Tense=Past|VerbForm=Fin": {
POS: VERB
},
"V|intrans|tegdw|onverv__Subcat=Intr|Tense=Pres|VerbForm=Part": {POS: VERB},
"V|intrans|tegdw|vervmv__Number=Plur|Subcat=Intr|Tense=Pres|VerbForm=Part": {POS: VERB},
"V|intrans|tegdw|vervneut__Case=Nom|Subcat=Intr|Tense=Pres|VerbForm=Part": {POS: VERB},
"V|intrans|tegdw|vervvergr__Degree=Cmp|Subcat=Intr|Tense=Pres|VerbForm=Part": {POS: VERB},
"V|intrans|tegdw|vervmv__Number=Plur|Subcat=Intr|Tense=Pres|VerbForm=Part": {
POS: VERB
},
"V|intrans|tegdw|vervneut__Case=Nom|Subcat=Intr|Tense=Pres|VerbForm=Part": {
POS: VERB
},
"V|intrans|tegdw|vervvergr__Degree=Cmp|Subcat=Intr|Tense=Pres|VerbForm=Part": {
POS: VERB
},
"V|intrans|verldw|onverv__Subcat=Intr|Tense=Past|VerbForm=Part": {POS: VERB},
"V|intrans|verldw|vervmv__Number=Plur|Subcat=Intr|Tense=Past|VerbForm=Part": {POS: VERB},
"V|intrans|verldw|vervneut__Case=Nom|Subcat=Intr|Tense=Past|VerbForm=Part": {POS: VERB},
"V|intrans|verldw|vervmv__Number=Plur|Subcat=Intr|Tense=Past|VerbForm=Part": {
POS: VERB
},
"V|intrans|verldw|vervneut__Case=Nom|Subcat=Intr|Tense=Past|VerbForm=Part": {
POS: VERB
},
"V|refl|imp__Mood=Imp|Reflex=Yes|VerbForm=Fin": {POS: VERB},
"V|refl|inf__Reflex=Yes|VerbForm=Inf": {POS: VERB},
"V|refl|inf|subst__Reflex=Yes|VerbForm=Inf": {POS: VERB},
"V|refl|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Reflex=Yes|Tense=Pres|VerbForm=Fin": {POS: VERB},
"V|refl|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Reflex=Yes|Tense=Pres|VerbForm=Fin": {POS: VERB},
"V|refl|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Reflex=Yes|Tense=Pres|VerbForm=Fin": {POS: VERB},
"V|refl|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Reflex=Yes|Tense=Pres|VerbForm=Fin": {POS: VERB},
"V|refl|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Reflex=Yes|Tense=Past|VerbForm=Fin": {POS: VERB},
"V|refl|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Reflex=Yes|Tense=Past|VerbForm=Fin": {POS: VERB},
"V|refl|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Reflex=Yes|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V|refl|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Reflex=Yes|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V|refl|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Reflex=Yes|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V|refl|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Reflex=Yes|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V|refl|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Reflex=Yes|Tense=Past|VerbForm=Fin": {
POS: VERB
},
"V|refl|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Reflex=Yes|Tense=Past|VerbForm=Fin": {
POS: VERB
},
"V|refl|tegdw|vervneut__Case=Nom|Reflex=Yes|Tense=Pres|VerbForm=Part": {POS: VERB},
"V|refl|verldw|onverv__Reflex=Yes|Tense=Past|VerbForm=Part": {POS: VERB},
"V|trans|conj__Mood=Sub|Subcat=Tran|VerbForm=Fin": {POS: VERB},
"V|trans|imp__Mood=Imp|Subcat=Tran|VerbForm=Fin": {POS: VERB},
"V|trans|inf__Subcat=Tran|VerbForm=Inf": {POS: VERB},
"V|trans|inf|subst__Subcat=Tran|VerbForm=Inf": {POS: VERB},
"V|trans|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: VERB},
"V|trans|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: VERB},
"V|trans|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: VERB},
"V|trans|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: VERB},
"V|trans|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Subcat=Tran|Tense=Past|VerbForm=Fin": {POS: VERB},
"V|trans|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Subcat=Tran|Tense=Past|VerbForm=Fin": {POS: VERB},
"V|trans|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V|trans|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V|trans|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V|trans|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
POS: VERB
},
"V|trans|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Subcat=Tran|Tense=Past|VerbForm=Fin": {
POS: VERB
},
"V|trans|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Subcat=Tran|Tense=Past|VerbForm=Fin": {
POS: VERB
},
"V|trans|tegdw|onverv__Subcat=Tran|Tense=Pres|VerbForm=Part": {POS: VERB},
"V|trans|tegdw|vervneut__Case=Nom|Subcat=Tran|Tense=Pres|VerbForm=Part": {POS: VERB},
"V|trans|tegdw|vervneut__Case=Nom|Subcat=Tran|Tense=Pres|VerbForm=Part": {
POS: VERB
},
"V|trans|verldw|onverv__Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
"V|trans|verldw|vervmv__Number=Plur|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
"V|trans|verldw|vervneut__Case=Nom|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
"V|trans|verldw|vervvergr__Degree=Cmp|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
"X__Aspect=Imp|Definite=Def|Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|VerbType=Mod": {POS: X},
"X__Aspect=Imp|Definite=Def|Mood=Ind|Number=Sing|Person=3|PronType=Ind|Tense=Pres|VerbForm=Fin": {POS: X},
"X__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: X},
"X__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Tense=Past|VerbForm=Part": {POS: X},
"X__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Inf": {POS: X},
"V|trans|verldw|vervmv__Number=Plur|Subcat=Tran|Tense=Past|VerbForm=Part": {
POS: VERB
},
"V|trans|verldw|vervneut__Case=Nom|Subcat=Tran|Tense=Past|VerbForm=Part": {
POS: VERB
},
"V|trans|verldw|vervvergr__Degree=Cmp|Subcat=Tran|Tense=Past|VerbForm=Part": {
POS: VERB
},
"X__Aspect=Imp|Definite=Def|Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|VerbType=Mod": {
POS: X
},
"X__Aspect=Imp|Definite=Def|Mood=Ind|Number=Sing|Person=3|PronType=Ind|Tense=Pres|VerbForm=Fin": {
POS: X
},
"X__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
POS: X
},
"X__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Tense=Past|VerbForm=Part": {
POS: X
},
"X__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Inf": {
POS: X
},
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {POS: X},
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|PronType=Dem|Tense=Pres|VerbForm=Fin": {POS: X},
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|PronType=Rel|Tense=Pres|VerbForm=Fin": {POS: X},
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: X},
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|PronType=Ind|Tense=Pres|VerbForm=Fin": {POS: X},
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: X},
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|PronType=Dem|Tense=Pres|VerbForm=Fin": {
POS: X
},
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|PronType=Rel|Tense=Pres|VerbForm=Fin": {
POS: X
},
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
POS: X
},
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|PronType=Ind|Tense=Pres|VerbForm=Fin": {
POS: X
},
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
POS: X
},
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {POS: X},
"X__Aspect=Imp|Mood=Ind|Number=Sing|Subcat=Intr|Tense=Pres|VerbForm=Inf": {POS: X},
"X__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin": {POS: X},
@ -808,5 +1024,5 @@ TAG_MAP = {
"X__VerbForm=Inf|VerbType=Mod": {POS: X},
"X__VerbType=Aux,Cop": {POS: X},
"X___": {POS: X},
"_SP": {POS: SPACE}
"_SP": {POS: SPACE},
}

File diff suppressed because it is too large Load Diff

View File

@ -5039,5 +5039,5 @@ TAG_MAP = {
"punc": {POS: PUNCT},
"v-pcp|M|P": {POS: VERB},
"v-pcp|M|S": {POS: VERB},
"_SP": {POS: SPACE}
"_SP": {POS: SPACE},
}

View File

@ -39,7 +39,9 @@ _infixes = (
+ LIST_ICONS
+ [
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES),
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),

View File

@ -19,7 +19,6 @@ _abbrev_exc = [
{ORTH: "вс", LEMMA: "воскресенье", NORM: "воскресенье"},
{ORTH: "вскр", LEMMA: "воскресенье", NORM: "воскресенье"},
{ORTH: "воскр", LEMMA: "воскресенье", NORM: "воскресенье"},
# Months abbreviations
{ORTH: "янв", LEMMA: "январь", NORM: "январь"},
{ORTH: "фев", LEMMA: "февраль", NORM: "февраль"},
@ -49,16 +48,18 @@ for abbrev_desc in _abbrev_exc:
abbrev = abbrev_desc[ORTH]
for orth in (abbrev, abbrev.capitalize(), abbrev.upper()):
_exc[orth] = [{ORTH: orth, LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}]
_exc[orth + '.'] = [{ORTH: orth + '.', LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}]
_exc[orth + "."] = [
{ORTH: orth + ".", LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}
]
_slang_exc = [
{ORTH: '2к15', LEMMA: '2015', NORM: '2015'},
{ORTH: '2к16', LEMMA: '2016', NORM: '2016'},
{ORTH: '2к17', LEMMA: '2017', NORM: '2017'},
{ORTH: '2к18', LEMMA: '2018', NORM: '2018'},
{ORTH: '2к19', LEMMA: '2019', NORM: '2019'},
{ORTH: '2к20', LEMMA: '2020', NORM: '2020'},
{ORTH: "2к15", LEMMA: "2015", NORM: "2015"},
{ORTH: "2к16", LEMMA: "2016", NORM: "2016"},
{ORTH: "2к17", LEMMA: "2017", NORM: "2017"},
{ORTH: "2к18", LEMMA: "2018", NORM: "2018"},
{ORTH: "2к19", LEMMA: "2019", NORM: "2019"},
{ORTH: "2к20", LEMMA: "2020", NORM: "2020"},
]
for slang_desc in _slang_exc:

View File

@ -15,7 +15,7 @@ _infixes = (
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=CONCAT_QUOTES),
r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=_hyphens_no_dash),
r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=_hyphens_no_dash),
r"(?<=[0-9])-(?=[0-9])",
]
)

File diff suppressed because it is too large Load Diff

View File

@ -16,5 +16,5 @@ sentences = [
"此外,中文还是联合国正式语文,并被上海合作组织等国际组织采用为官方语言。",
"在中国大陆,汉语通称为“汉语”。",
"在联合国、台湾、香港及澳门,通称为“中文”。",
"在新加坡及马来西亚,通称为“华语”。"
"在新加坡及马来西亚,通称为“华语”。",
]

View File

@ -47,7 +47,7 @@ _single_num_words = [
"拾陆",
"拾柒",
"拾捌",
"拾玖"
"拾玖",
]
_count_num_words = [
@ -68,27 +68,16 @@ _count_num_words = [
"",
"",
"",
""
"",
]
_base_num_words = [
"",
"",
"",
"",
"亿",
"",
"",
"",
""
]
_base_num_words = ["", "", "", "", "亿", "", "", "", ""]
def like_num(text):
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
text = text.replace(",", "").replace(
".", "").replace("", "").replace("", "")
text = text.replace(",", "").replace(".", "").replace("", "").replace("", "")
if text.isdigit():
return True
if text.count("/") == 1:
@ -97,10 +86,12 @@ def like_num(text):
return True
if text in _single_num_words:
return True
# fmt: off
if re.match('^((' + '|'.join(_count_num_words) + '){1}'
+ '(' + '|'.join(_base_num_words) + '){1})+'
+ '(' + '|'.join(_count_num_words) + ')?$', text):
return True
# fmt: on
return False

View File

@ -430,6 +430,7 @@ class Language(object):
DOCS: https://spacy.io/api/language#update
"""
expected_keys = ("words", "tags", "heads", "deps", "entities", "cats", "links")
if len(docs) != len(golds):
raise IndexError(Errors.E009.format(n_docs=len(docs), n_golds=len(golds)))
if len(docs) == 0:
@ -445,10 +446,10 @@ class Language(object):
if isinstance(doc, basestring_):
doc = self.make_doc(doc)
if not isinstance(gold, GoldParse):
expected_keys = ("words", "tags", "heads", "deps", "entities", "cats", "links")
unexpected_keys = [k for k in gold if k not in expected_keys]
if unexpected_keys:
raise ValueError(Errors.E151.format(unexpected_keys=unexpected_keys, expected_keys=expected_keys))
unexpected = [k for k in gold if k not in expected_keys]
if unexpected:
err = Errors.E151.format(unexp=unexpected, exp=expected_keys)
raise ValueError(err)
gold = GoldParse(doc, **gold)
doc_objs.append(doc)
gold_objs.append(gold)

View File

@ -5,10 +5,10 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text,lemma', [("aprox.", "aproximadament"),
("pàg.", "pàgina"),
("p.ex.", "per exemple")
])
@pytest.mark.parametrize(
"text,lemma",
[("aprox.", "aproximadament"), ("pàg.", "pàgina"), ("p.ex.", "per exemple")],
)
def test_ca_tokenizer_handles_abbr(ca_tokenizer, text, lemma):
tokens = ca_tokenizer(text)
assert len(tokens) == 1

View File

@ -21,21 +21,37 @@ def test_ca_tokenizer_handles_long_text(ca_tokenizer):
assert len(tokens) == 138
@pytest.mark.parametrize('text,length', [
("Perquè va anar-hi?", 6),
("“Ah no?”", 5),
("""Sí! "Anem", va contestar el Joan Carles""", 11),
("Van córrer aprox. 10km", 5),
("Llavors perqué...", 3)])
@pytest.mark.parametrize(
"text,length",
[
("Perquè va anar-hi?", 6),
("“Ah no?”", 5),
("""Sí! "Anem", va contestar el Joan Carles""", 11),
("Van córrer aprox. 10km", 5),
("Llavors perqué...", 3),
],
)
def test_ca_tokenizer_handles_cnts(ca_tokenizer, text, length):
tokens = ca_tokenizer(text)
assert len(tokens) == length
@pytest.mark.parametrize('text,match', [
('10', True), ('1', True), ('10,000', True), ('10,00', True),
('999.0', True), ('un', True), ('dos', True), ('bilió', True),
('gos', False), (',', False), ('1/2', True)])
@pytest.mark.parametrize(
"text,match",
[
("10", True),
("1", True),
("10,000", True),
("10,00", True),
("999.0", True),
("un", True),
("dos", True),
("bilió", True),
("gos", False),
(",", False),
("1/2", True),
],
)
def test_ca_lex_attrs_like_number(ca_tokenizer, text, match):
tokens = ca_tokenizer(text)
assert len(tokens) == 1

View File

@ -32,7 +32,7 @@ def test_de_tokenizer_norm_exceptions(de_tokenizer, text, norms):
assert [token.norm_ for token in tokens] == norms
@pytest.mark.parametrize('text,norm', [("daß", "dass")])
@pytest.mark.parametrize("text,norm", [("daß", "dass")])
def test_de_lex_attrs_norm_exceptions(de_tokenizer, text, norm):
tokens = de_tokenizer(text)
assert tokens[0].norm_ == norm

View File

@ -7,33 +7,33 @@ import pytest
@pytest.mark.parametrize(
"text",
[
u"aujourd'hui",
u"Aujourd'hui",
u"prud'hommes",
u"prudhommal",
u"audio-numérique",
u"Audio-numérique",
u"entr'amis",
u"entr'abat",
u"rentr'ouvertes",
u"grand'hamien",
u"Châteauneuf-la-Forêt",
u"Château-Guibert",
u"11-septembre",
u"11-Septembre",
u"refox-trottâmes",
"aujourd'hui",
"Aujourd'hui",
"prud'hommes",
"prudhommal",
"audio-numérique",
"Audio-numérique",
"entr'amis",
"entr'abat",
"rentr'ouvertes",
"grand'hamien",
"Châteauneuf-la-Forêt",
"Château-Guibert",
"11-septembre",
"11-Septembre",
"refox-trottâmes",
# u"K-POP",
# u"K-Pop",
# u"K-pop",
u"z'yeutes",
u"black-outeront",
u"états-unienne",
u"courtes-pattes",
u"court-pattes",
u"saut-de-ski",
u"Écourt-Saint-Quentin",
u"Bout-de-l'Îlien",
u"pet-en-l'air",
"z'yeutes",
"black-outeront",
"états-unienne",
"courtes-pattes",
"court-pattes",
"saut-de-ski",
"Écourt-Saint-Quentin",
"Bout-de-l'Îlien",
"pet-en-l'air",
],
)
def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text):

View File

@ -3,13 +3,18 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize("tokens,lemmas", [
# fmt: off
TEST_CASES = [
(["Galime", "vadinti", "gerovės", "valstybe", ",", "turime", "išvystytą", "socialinę", "apsaugą", ",",
"sveikatos", "apsaugą", "ir", "prieinamą", "švietimą", "."],
["galėti", "vadintas", "gerovė", "valstybė", ",", "turėti", "išvystytas", "socialinis",
"apsauga", ",", "sveikata", "apsauga", "ir", "prieinamas", "švietimas", "."]),
(["taip", ",", "uoliai", "tyrinėjau", "ir", "pasirinkau", "geriausią", "variantą", "."],
["taip", ",", "uolus", "tyrinėti", "ir", "pasirinkti", "geras", "variantas", "."])])
["taip", ",", "uolus", "tyrinėti", "ir", "pasirinkti", "geras", "variantas", "."])
]
# fmt: on
@pytest.mark.parametrize("tokens,lemmas", TEST_CASES)
def test_lt_lemmatizer(lt_lemmatizer, tokens, lemmas):
assert lemmas == [lt_lemmatizer.lookup(token) for token in tokens]

View File

@ -7,10 +7,21 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text,match', [
('10', True), ('1', True), ('10,000', True), ('10,00', True),
('jeden', True), ('dwa', True), ('milion', True),
('pies', False), (',', False), ('1/2', True)])
@pytest.mark.parametrize(
"text,match",
[
("10", True),
("1", True),
("10,000", True),
("10,00", True),
("jeden", True),
("dwa", True),
("milion", True),
("pies", False),
(",", False),
("1/2", True),
],
)
def test_lex_attrs_like_number(pl_tokenizer, text, match):
tokens = pl_tokenizer(text)
assert len(tokens) == 1

View File

@ -4,9 +4,7 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize(
"text", ['ہےں۔', 'کیا۔']
)
@pytest.mark.parametrize("text", ["ہےں۔", "کیا۔"])
def test_contractions(ur_tokenizer, text):
"""Test specific Urdu punctuation character"""
tokens = ur_tokenizer(text)

View File

@ -134,12 +134,12 @@ def test_matcher_end_zero_plus(en_vocab):
def test_matcher_sets_return_correct_tokens(en_vocab):
matcher = Matcher(en_vocab)
patterns = [
[{'LOWER': {'IN': ["zero"]}}],
[{'LOWER': {'IN': ["one"]}}],
[{'LOWER': {'IN': ["two"]}}],
[{"LOWER": {"IN": ["zero"]}}],
[{"LOWER": {"IN": ["one"]}}],
[{"LOWER": {"IN": ["two"]}}],
]
matcher.add('TEST', None, *patterns)
matcher.add("TEST", None, *patterns)
doc = Doc(en_vocab, words="zero one two three".split())
matches = matcher(doc)
texts = [Span(doc, s, e, label=L).text for L, s, e in matches]
assert texts == ['zero', 'one', 'two']
assert texts == ["zero", "one", "two"]

View File

@ -52,7 +52,9 @@ def test_get_pipe(nlp, name):
assert nlp.get_pipe(name) == new_pipe
@pytest.mark.parametrize("name,replacement,not_callable", [("my_component", lambda doc: doc, {})])
@pytest.mark.parametrize(
"name,replacement,not_callable", [("my_component", lambda doc: doc, {})]
)
def test_replace_pipe(nlp, name, replacement, not_callable):
with pytest.raises(ValueError):
nlp.replace_pipe(name, new_pipe)

View File

@ -358,7 +358,9 @@ def test_issue850_basic():
assert end == 4
@pytest.mark.skip(reason="French exception list is not enabled in the default tokenizer anymore")
@pytest.mark.skip(
reason="French exception list is not enabled in the default tokenizer anymore"
)
@pytest.mark.parametrize(
"text", ["au-delàs", "pair-programmâmes", "terra-formées", "σ-compacts"]
)

View File

@ -19,7 +19,7 @@ from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part
def test_issue1235():
"""Test that g is not split of if preceded by a number and a letter"""
nlp = English()
testwords = u'e2g 2g 52g'
testwords = "e2g 2g 52g"
doc = nlp(testwords)
assert len(doc) == 5
assert doc[0].text == "e2g"

View File

@ -4,15 +4,7 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize(
"word",
[
"don't",
"dont",
"I'd",
"Id",
],
)
@pytest.mark.parametrize("word", ["don't", "dont", "I'd", "Id"])
def test_issue3521(en_tokenizer, word):
tok = en_tokenizer(word)[1]
# 'not' and 'would' should be stopwords, also in their abbreviated forms

View File

@ -9,7 +9,10 @@ import numpy as np
def test_issue3540(en_vocab):
words = ["I", "live", "in", "NewYork", "right", "now"]
tensor = np.asarray([[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]], dtype="f")
tensor = np.asarray(
[[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
dtype="f",
)
doc = Doc(en_vocab, words=words)
doc.tensor = tensor
@ -25,7 +28,7 @@ def test_issue3540(en_vocab):
with doc.retokenize() as retokenizer:
heads = [(doc[3], 1), doc[2]]
attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
retokenizer.split(doc[3], [u"New", u"York"], heads=heads, attrs=attrs)
retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
gold_text = ["I", "live", "in", "New", "York", "right", "now"]
assert [token.text for token in doc] == gold_text

View File

@ -35,7 +35,9 @@ def test_issue3962(doc):
doc2_json = doc2.to_json()
assert doc2_json
assert doc2[0].head.text == "jests" # head set to itself, being the new artificial root
assert (
doc2[0].head.text == "jests"
) # head set to itself, being the new artificial root
assert doc2[0].dep_ == "dep"
assert doc2[1].head.text == "jests"
assert doc2[1].dep_ == "prep"
@ -92,7 +94,9 @@ def test_issue3962_long(two_sent_doc):
doc2_json = doc2.to_json()
assert doc2_json
assert doc2[0].head.text == "jests" # head set to itself, being the new artificial root (in sentence 1)
assert (
doc2[0].head.text == "jests"
) # head set to itself, being the new artificial root (in sentence 1)
assert doc2[0].dep_ == "ROOT"
assert doc2[1].head.text == "jests"
assert doc2[1].dep_ == "prep"
@ -100,9 +104,13 @@ def test_issue3962_long(two_sent_doc):
assert doc2[2].dep_ == "pobj"
assert doc2[3].head.text == "jests"
assert doc2[3].dep_ == "punct"
assert doc2[4].head.text == "They" # head set to itself, being the new artificial root (in sentence 2)
assert (
doc2[4].head.text == "They"
) # head set to itself, being the new artificial root (in sentence 2)
assert doc2[4].dep_ == "dep"
assert doc2[4].head.text == "They" # head set to the new artificial head (in sentence 2)
assert (
doc2[4].head.text == "They"
) # head set to the new artificial head (in sentence 2)
assert doc2[4].dep_ == "dep"
# We should still have 2 sentences

View File

@ -30,14 +30,18 @@ def test_serialize_kb_disk(en_vocab):
def _get_dummy_kb(vocab):
kb = KnowledgeBase(vocab=vocab, entity_vector_length=3)
kb.add_entity(entity='Q53', freq=33, entity_vector=[0, 5, 3])
kb.add_entity(entity='Q17', freq=2, entity_vector=[7, 1, 0])
kb.add_entity(entity='Q007', freq=7, entity_vector=[0, 0, 7])
kb.add_entity(entity='Q44', freq=342, entity_vector=[4, 4, 4])
kb.add_entity(entity="Q53", freq=33, entity_vector=[0, 5, 3])
kb.add_entity(entity="Q17", freq=2, entity_vector=[7, 1, 0])
kb.add_entity(entity="Q007", freq=7, entity_vector=[0, 0, 7])
kb.add_entity(entity="Q44", freq=342, entity_vector=[4, 4, 4])
kb.add_alias(alias='double07', entities=['Q17', 'Q007'], probabilities=[0.1, 0.9])
kb.add_alias(alias='guy', entities=['Q53', 'Q007', 'Q17', 'Q44'], probabilities=[0.3, 0.3, 0.2, 0.1])
kb.add_alias(alias='random', entities=['Q007'], probabilities=[1.0])
kb.add_alias(alias="double07", entities=["Q17", "Q007"], probabilities=[0.1, 0.9])
kb.add_alias(
alias="guy",
entities=["Q53", "Q007", "Q17", "Q44"],
probabilities=[0.3, 0.3, 0.2, 0.1],
)
kb.add_alias(alias="random", entities=["Q007"], probabilities=[1.0])
return kb
@ -45,30 +49,30 @@ def _get_dummy_kb(vocab):
def _check_kb(kb):
# check entities
assert kb.get_size_entities() == 4
for entity_string in ['Q53', 'Q17', 'Q007', 'Q44']:
for entity_string in ["Q53", "Q17", "Q007", "Q44"]:
assert entity_string in kb.get_entity_strings()
for entity_string in ['', 'Q0']:
for entity_string in ["", "Q0"]:
assert entity_string not in kb.get_entity_strings()
# check aliases
assert kb.get_size_aliases() == 3
for alias_string in ['double07', 'guy', 'random']:
for alias_string in ["double07", "guy", "random"]:
assert alias_string in kb.get_alias_strings()
for alias_string in ['nothingness', '', 'randomnoise']:
for alias_string in ["nothingness", "", "randomnoise"]:
assert alias_string not in kb.get_alias_strings()
# check candidates & probabilities
candidates = sorted(kb.get_candidates('double07'), key=lambda x: x.entity_)
candidates = sorted(kb.get_candidates("double07"), key=lambda x: x.entity_)
assert len(candidates) == 2
assert candidates[0].entity_ == 'Q007'
assert candidates[0].entity_ == "Q007"
assert 6.999 < candidates[0].entity_freq < 7.01
assert candidates[0].entity_vector == [0, 0, 7]
assert candidates[0].alias_ == 'double07'
assert candidates[0].alias_ == "double07"
assert 0.899 < candidates[0].prior_prob < 0.901
assert candidates[1].entity_ == 'Q17'
assert candidates[1].entity_ == "Q17"
assert 1.99 < candidates[1].entity_freq < 2.01
assert candidates[1].entity_vector == [7, 1, 0]
assert candidates[1].alias_ == 'double07'
assert candidates[1].alias_ == "double07"
assert 0.099 < candidates[1].prior_prob < 0.101