mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
Tidy up and auto-format
This commit is contained in:
parent
364aaf5bc2
commit
f580302673
|
@ -430,8 +430,7 @@ class Errors(object):
|
|||
E150 = ("The language of the `nlp` object and the `vocab` should be the "
|
||||
"same, but found '{nlp}' and '{vocab}' respectively.")
|
||||
E151 = ("Trying to call nlp.update without required annotation types. "
|
||||
"Expected top-level keys: {expected_keys}."
|
||||
" Got: {unexpected_keys}.")
|
||||
"Expected top-level keys: {exp}. Got: {unexp}.")
|
||||
E152 = ("The `nlp` object should have a pre-trained `ner` component.")
|
||||
E153 = ("Either provide a path to a preprocessed training directory, "
|
||||
"or to the original Wikipedia XML dump.")
|
||||
|
|
|
@ -10,8 +10,4 @@ Example sentences to test spaCy and its language models.
|
|||
"""
|
||||
|
||||
|
||||
sentences = [
|
||||
'তুই খুব ভালো',
|
||||
'আজ আমরা ডাক্তার দেখতে যাবো',
|
||||
'আমি জানি না '
|
||||
]
|
||||
sentences = ["তুই খুব ভালো", "আজ আমরা ডাক্তার দেখতে যাবো", "আমি জানি না "]
|
||||
|
|
|
@ -22,7 +22,9 @@ _suffixes = (
|
|||
r"(?<=°[FfCcKk])\.",
|
||||
r"(?<=[0-9])(?:[{c}])".format(c=_currency),
|
||||
r"(?<=[0-9])(?:{u})".format(u=UNITS),
|
||||
r"(?<=[{al}{e}{q}(?:{c})])\.".format(al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency),
|
||||
r"(?<=[{al}{e}{q}(?:{c})])\.".format(
|
||||
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
|
@ -35,8 +37,8 @@ _infixes = (
|
|||
),
|
||||
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}])({h})(?=[{ae}])".format(a=ALPHA, h=HYPHENS, ae="এ"),
|
||||
r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
|
||||
r'(?<=[{a}])[:<>=/](?=[{a}])'.format(a=ALPHA),
|
||||
r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
|
||||
r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
|
||||
]
|
||||
)
|
||||
|
||||
|
|
|
@ -13,7 +13,7 @@ _infixes = (
|
|||
+ [
|
||||
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
|
||||
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
|
||||
r'(?<=[{a}])[:<>=](?=[{a}])'.format(a=ALPHA),
|
||||
r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
|
||||
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
|
||||
|
|
|
@ -59,7 +59,9 @@ _suffixes = (
|
|||
r"([0-9])+\&", # 12&
|
||||
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
|
||||
r"(?<=[0-9])(?:{u})".format(u=UNITS),
|
||||
r"(?<=[0-9{al}{e}(?:{q})])\.".format(al=ALPHA_LOWER, e=r"²\-\+", q=CONCAT_QUOTES),
|
||||
r"(?<=[0-9{al}{e}(?:{q})])\.".format(
|
||||
al=ALPHA_LOWER, e=r"²\-\+", q=CONCAT_QUOTES
|
||||
),
|
||||
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
|
||||
r"(?<=[Α-Ωα-ωίϊΐόάέύϋΰήώ])\-", # όνομα-
|
||||
r"(?<=[Α-Ωα-ωίϊΐόάέύϋΰήώ])\.",
|
||||
|
@ -87,8 +89,8 @@ _infixes = (
|
|||
r"([a-zA-Z]+)(\-([a-zA-Z]+))+", # abc-abc
|
||||
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
|
||||
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||
r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
|
||||
r'(?<=[{a}])[:<>=/](?=[{a}])'.format(a=ALPHA),
|
||||
r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
|
||||
r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
|
||||
]
|
||||
)
|
||||
|
||||
|
|
|
@ -27,5 +27,5 @@ ADVERBS_IRREG = {
|
|||
"slower": ("slow",),
|
||||
"slowest": ("slowest",),
|
||||
"sooner": ("soon",),
|
||||
"soonest": ("soon",)
|
||||
"soonest": ("soon",),
|
||||
}
|
||||
|
|
|
@ -44,7 +44,7 @@ NOUNS_IRREG = {
|
|||
"allodia": ("allodium",),
|
||||
"alluvia": ("alluvium",),
|
||||
"alodia": ("alodium",),
|
||||
"alto-relievos": ("alto-relievo", "alto-rilievo",),
|
||||
"alto-relievos": ("alto-relievo", "alto-rilievo"),
|
||||
"altocumuli": ("altocumulus",),
|
||||
"altostrati": ("altostratus",),
|
||||
"alulae": ("alula",),
|
||||
|
@ -81,7 +81,7 @@ NOUNS_IRREG = {
|
|||
"anamorphoses": ("anamorphosis",),
|
||||
"anastomoses": ("anastomosis",),
|
||||
"anatyxes": ("anaptyxis",),
|
||||
"ancones": ("ancon", "ancone",),
|
||||
"ancones": ("ancon", "ancone"),
|
||||
"androclinia": ("androclinium",),
|
||||
"androecia": ("androecium",),
|
||||
"androsphinges": ("androsphinx",),
|
||||
|
@ -90,7 +90,7 @@ NOUNS_IRREG = {
|
|||
"angiomata": ("angioma",),
|
||||
"animalcula": ("animalculum",),
|
||||
"anlagen": ("anlage",),
|
||||
"annattos": ("anatto", "annatto",),
|
||||
"annattos": ("anatto", "annatto"),
|
||||
"annuli": ("annulus",),
|
||||
"antae": ("anta",),
|
||||
"antalkalies": ("antalkali",),
|
||||
|
@ -158,7 +158,7 @@ NOUNS_IRREG = {
|
|||
"aspergilli": ("aspergillus",),
|
||||
"aspergilloses": ("aspergillosis",),
|
||||
"aspersoria": ("aspersorium",),
|
||||
"assegais": ("assagai", "assegai",),
|
||||
"assegais": ("assagai", "assegai"),
|
||||
"astragali": ("astragalus",),
|
||||
"asyndeta": ("asyndeton",),
|
||||
"atheromata": ("atheroma",),
|
||||
|
@ -172,15 +172,15 @@ NOUNS_IRREG = {
|
|||
"aurei": ("aureus",),
|
||||
"auriculae": ("auricula",),
|
||||
"aurorae": ("aurora",),
|
||||
"auspices": ("auspex", "auspice",),
|
||||
"auspices": ("auspex", "auspice"),
|
||||
"autocatalyses": ("autocatalysis",),
|
||||
"autochthones": ("autochthon",),
|
||||
"automata": ("automaton",),
|
||||
"autos-da-fe": ("auto-da-fe",),
|
||||
"avitaminoses": ("avitaminosis",),
|
||||
"axes": ("ax", "axis",),
|
||||
"axes": ("ax", "axis"),
|
||||
"axillae": ("axilla",),
|
||||
"bacchantes": ("bacchant", "bacchante",),
|
||||
"bacchantes": ("bacchant", "bacchante"),
|
||||
"bacchii": ("bacchius",),
|
||||
"bacilli": ("bacillus",),
|
||||
"bacteriostases": ("bacteriostasis",),
|
||||
|
@ -195,7 +195,7 @@ NOUNS_IRREG = {
|
|||
"banjoes": ("banjo",),
|
||||
"barklice": ("barklouse",),
|
||||
"barramundies": ("barramundi",),
|
||||
"bases": ("base", "basis",),
|
||||
"bases": ("base", "basis"),
|
||||
"bases-on-balls": ("base_on_balls",),
|
||||
"bases_on_balls": ("base_on_balls",),
|
||||
"basidia": ("basidium",),
|
||||
|
@ -204,15 +204,15 @@ NOUNS_IRREG = {
|
|||
"bastinadoes": ("bastinado",),
|
||||
"bateaux": ("bateau",),
|
||||
"batfishes": ("batfish",),
|
||||
"beadsmen": ("beadsman", "bedesman",),
|
||||
"beadsmen": ("beadsman", "bedesman"),
|
||||
"beaux": ("beau",),
|
||||
"beches-de-mer": ("beche-de-mer",),
|
||||
"beeves": ("beef",),
|
||||
"behooves": ("behoof",),
|
||||
"bersaglieri": ("bersagliere",),
|
||||
"bhishties": ("bheesty", "bhishti",),
|
||||
"bhishties": ("bheesty", "bhishti"),
|
||||
"bibliothecae": ("bibliotheca",),
|
||||
"bicennaries": ("bicentenary", "bicentennial",),
|
||||
"bicennaries": ("bicentenary", "bicentennial"),
|
||||
"bijoux": ("bijou",),
|
||||
"bilboes": ("bilbo",),
|
||||
"billets-doux": ("billet-doux",),
|
||||
|
@ -245,7 +245,7 @@ NOUNS_IRREG = {
|
|||
"brachia": ("brachium",),
|
||||
"brainchildren": ("brainchild",),
|
||||
"branchiae": ("branchia",),
|
||||
"brants": ("brant", "brent",),
|
||||
"brants": ("brant", "brent"),
|
||||
"bravadoes": ("bravado",),
|
||||
"bravoes": ("bravo",),
|
||||
"bregmata": ("bregma",),
|
||||
|
@ -275,7 +275,7 @@ NOUNS_IRREG = {
|
|||
"caesurae": ("caesura",),
|
||||
"calami": ("calamus",),
|
||||
"calathi": ("calathus",),
|
||||
"calcanei": ("calcaneum", "calcaneus",),
|
||||
"calcanei": ("calcaneum", "calcaneus"),
|
||||
"calces": ("calx",),
|
||||
"calculi": ("calculus",),
|
||||
"caldaria": ("caldarium",),
|
||||
|
@ -421,7 +421,7 @@ NOUNS_IRREG = {
|
|||
"comae": ("coma",),
|
||||
"comatulae": ("comatula",),
|
||||
"comedones": ("comedo",),
|
||||
"comics": ("comic_strip", "comic",),
|
||||
"comics": ("comic_strip", "comic"),
|
||||
"commandoes": ("commando",),
|
||||
"concertanti": ("concertante",),
|
||||
"concerti": ("concerto",),
|
||||
|
@ -549,11 +549,11 @@ NOUNS_IRREG = {
|
|||
"diplococci": ("diplococcus",),
|
||||
"directors-general": ("director-general",),
|
||||
"disci": ("discus",),
|
||||
"discoboli": ("discobolos", "discobolus",),
|
||||
"discoboli": ("discobolos", "discobolus"),
|
||||
"dive": ("diva",),
|
||||
"diverticula": ("diverticulum",),
|
||||
"divertimenti": ("divertimento",),
|
||||
"djinn": ("djinni", "djinny",),
|
||||
"djinn": ("djinni", "djinny"),
|
||||
"dodoes": ("dodo",),
|
||||
"dogfishes": ("dogfish",),
|
||||
"dogmata": ("dogma",),
|
||||
|
@ -593,7 +593,7 @@ NOUNS_IRREG = {
|
|||
"ellipses": ("ellipsis",),
|
||||
"eluvia": ("eluvium",),
|
||||
"elves": ("elf",),
|
||||
"elytra": ("elytron", "elytrum",),
|
||||
"elytra": ("elytron", "elytrum"),
|
||||
"embargoes": ("embargo",),
|
||||
"emboli": ("embolus",),
|
||||
"emphases": ("emphasis",),
|
||||
|
@ -623,7 +623,7 @@ NOUNS_IRREG = {
|
|||
"entases": ("entasis",),
|
||||
"entera": ("enteron",),
|
||||
"entia": ("ens",),
|
||||
"entozoa": ("entozoan", "entozoon",),
|
||||
"entozoa": ("entozoan", "entozoon"),
|
||||
"epencephala": ("epencephalon",),
|
||||
"epentheses": ("epenthesis",),
|
||||
"epexegeses": ("epexegesis",),
|
||||
|
@ -643,10 +643,10 @@ NOUNS_IRREG = {
|
|||
"epiphenomena": ("epiphenomenon",),
|
||||
"epiphyses": ("epiphysis",),
|
||||
"episterna": ("episternum",),
|
||||
"epithalamia": ("epithalamion", "epithalamium",),
|
||||
"epithalamia": ("epithalamion", "epithalamium"),
|
||||
"epithelia": ("epithelium",),
|
||||
"epitheliomata": ("epithelioma",),
|
||||
"epizoa": ("epizoan", "epizoon",),
|
||||
"epizoa": ("epizoan", "epizoon"),
|
||||
"epyllia": ("epyllion",),
|
||||
"equilibria": ("equilibrium",),
|
||||
"equiseta": ("equisetum",),
|
||||
|
@ -845,11 +845,11 @@ NOUNS_IRREG = {
|
|||
"groszy": ("grosz",),
|
||||
"grottoes": ("grotto",),
|
||||
"guilder": ("guilde",),
|
||||
"guilders": ("guilde", "guilder",),
|
||||
"guilders": ("guilde", "guilder"),
|
||||
"guitarfishes": ("guitarfish",),
|
||||
"gummata": ("gumma",),
|
||||
"gurnard": ("gurnar",),
|
||||
"gurnards": ("gurnar", "gurnard",),
|
||||
"gurnards": ("gurnar", "gurnard"),
|
||||
"guttae": ("gutta",),
|
||||
"gymnasia": ("gymnasium",),
|
||||
"gynaecea": ("gynaeceum",),
|
||||
|
@ -870,7 +870,7 @@ NOUNS_IRREG = {
|
|||
"haeredes": ("haeres",),
|
||||
"haftaroth": ("haftarah",),
|
||||
"hagfishes": ("hagfish",),
|
||||
"haggadas": ("haggada", "haggadah",),
|
||||
"haggadas": ("haggada", "haggadah"),
|
||||
"haggadoth": ("haggada",),
|
||||
"hajjes": ("hajj",),
|
||||
"haleru": ("haler",),
|
||||
|
@ -879,7 +879,7 @@ NOUNS_IRREG = {
|
|||
"halloth": ("hallah",),
|
||||
"halluces": ("hallux",),
|
||||
"haloes": ("halo",),
|
||||
"halteres": ("halter", "haltere",),
|
||||
"halteres": ("halter", "haltere"),
|
||||
"halves": ("half",),
|
||||
"hamuli": ("hamulus",),
|
||||
"hangers-on": ("hanger-on",),
|
||||
|
@ -909,7 +909,7 @@ NOUNS_IRREG = {
|
|||
"heraclidae": ("heraclid",),
|
||||
"heraklidae": ("heraklid",),
|
||||
"herbaria": ("herbarium",),
|
||||
"hermae": ("herm", "herma",),
|
||||
"hermae": ("herm", "herma"),
|
||||
"hermai": ("herma",),
|
||||
"herniae": ("hernia",),
|
||||
"heroes": ("hero",),
|
||||
|
@ -955,8 +955,8 @@ NOUNS_IRREG = {
|
|||
"ibices": ("ibex",),
|
||||
"ibo": ("igbo",),
|
||||
"ichthyosauri": ("ichthyosaurus",),
|
||||
"ichthyosauruses": ("ichthyosaur", "ichthyosaurus",),
|
||||
"iconostases": ("iconostas", "iconostasis",),
|
||||
"ichthyosauruses": ("ichthyosaur", "ichthyosaurus"),
|
||||
"iconostases": ("iconostas", "iconostasis"),
|
||||
"icosahedra": ("icosahedron",),
|
||||
"ideata": ("ideatum",),
|
||||
"igorrorote": ("igorrote",),
|
||||
|
@ -991,7 +991,7 @@ NOUNS_IRREG = {
|
|||
"is": ("is",),
|
||||
"ischia": ("ischium",),
|
||||
"isthmi": ("isthmus",),
|
||||
"jackeroos": ("jackaroo", "jackeroo",),
|
||||
"jackeroos": ("jackaroo", "jackeroo"),
|
||||
"jackfishes": ("jackfish",),
|
||||
"jackknives": ("jackknife",),
|
||||
"jacks-in-the-box": ("jack-in-the-box",),
|
||||
|
@ -1001,12 +1001,12 @@ NOUNS_IRREG = {
|
|||
"jewfishes": ("jewfish",),
|
||||
"jingoes": ("jingo",),
|
||||
"jinn": ("jinni",),
|
||||
"joes": ("jo", "joe",),
|
||||
"joes": ("jo", "joe"),
|
||||
"judge_advocates_general": ("judge_advocate_general",),
|
||||
"jura": ("jus",),
|
||||
"kaddishim": ("kaddish",),
|
||||
"kalmuck": ("kalmuc",),
|
||||
"kalmucks": ("kalmuc", "kalmuck",),
|
||||
"kalmucks": ("kalmuc", "kalmuck"),
|
||||
"katabases": ("katabasis",),
|
||||
"keeshonden": ("keeshond",),
|
||||
"kibbutzim": ("kibbutz",),
|
||||
|
@ -1045,7 +1045,7 @@ NOUNS_IRREG = {
|
|||
"latifundia": ("latifundium",),
|
||||
"latu": ("lat",),
|
||||
"lavaboes": ("lavabo",),
|
||||
"leaves": ("leaf", "leave",),
|
||||
"leaves": ("leaf", "leave"),
|
||||
"lecythi": ("lecythus",),
|
||||
"leges": ("lex",),
|
||||
"lei": ("leu",),
|
||||
|
@ -1078,7 +1078,7 @@ NOUNS_IRREG = {
|
|||
"liriodendra": ("liriodendron",),
|
||||
"lisente": ("sente",),
|
||||
"listente": ("sente",),
|
||||
"litai": ("lit", "litas",),
|
||||
"litai": ("lit", "litas"),
|
||||
"litu": ("litas",),
|
||||
"lives": ("life",),
|
||||
"lixivia": ("lixivium",),
|
||||
|
@ -1098,7 +1098,7 @@ NOUNS_IRREG = {
|
|||
"lumpfishes": ("lumpfish",),
|
||||
"lungfishes": ("lungfish",),
|
||||
"lunulae": ("lunula",),
|
||||
"lures": ("lur", "lure",),
|
||||
"lures": ("lur", "lure"),
|
||||
"lustra": ("lustre",),
|
||||
"lyings-in": ("lying-in",),
|
||||
"lymphangitides": ("lymphangitis",),
|
||||
|
@ -1142,7 +1142,7 @@ NOUNS_IRREG = {
|
|||
"marsupia": ("marsupium",),
|
||||
"marvels-of-peru": ("marvel-of-peru",),
|
||||
"mass_media": ("mass_medium",),
|
||||
"masses": ("mass", "masse",),
|
||||
"masses": ("mass", "masse"),
|
||||
"masters-at-arms": ("master-at-arms",),
|
||||
"matrices": ("matrix",),
|
||||
"matzoth": ("matzo",),
|
||||
|
@ -1210,7 +1210,7 @@ NOUNS_IRREG = {
|
|||
"mioses": ("miosis",),
|
||||
"miracidia": ("miracidium",),
|
||||
"miri": ("mir",),
|
||||
"mishnayoth": ("mishna", "mishnah",),
|
||||
"mishnayoth": ("mishna", "mishnah"),
|
||||
"mitochondria": ("mitochondrion",),
|
||||
"mitzvoth": ("mitzvah",),
|
||||
"modioli": ("modiolus",),
|
||||
|
@ -1218,7 +1218,7 @@ NOUNS_IRREG = {
|
|||
"momenta": ("momentum",),
|
||||
"moments_of_truth": ("moment_of_truth",),
|
||||
"momi": ("momus",),
|
||||
"monades": ("monad", "monas",),
|
||||
"monades": ("monad", "monas"),
|
||||
"monkfishes": ("monkfish",),
|
||||
"monochasia": ("monochasium",),
|
||||
"monopodia": ("monopodium",),
|
||||
|
@ -1235,7 +1235,7 @@ NOUNS_IRREG = {
|
|||
"moriscoes": ("morisco",),
|
||||
"morphallaxes": ("morphallaxis",),
|
||||
"morphoses": ("morphosis",),
|
||||
"morses": ("morse", "mors",),
|
||||
"morses": ("morse", "mors"),
|
||||
"morulae": ("morula",),
|
||||
"mosasauri": ("mosasaurus",),
|
||||
"moshavim": ("moshav",),
|
||||
|
@ -1328,13 +1328,13 @@ NOUNS_IRREG = {
|
|||
"oceanides": ("oceanid",),
|
||||
"ocelli": ("ocellus",),
|
||||
"ochreae": ("ochrea",),
|
||||
"ocreae": ("ochrea", "ocrea",),
|
||||
"ocreae": ("ochrea", "ocrea"),
|
||||
"octahedra": ("octahedron",),
|
||||
"octopi": ("octopus",),
|
||||
"oculi": ("oculus",),
|
||||
"odea": ("odeum",),
|
||||
"oedemata": ("edema", "oedema",),
|
||||
"oesophagi": ("esophagus", "oesophagus",),
|
||||
"oedemata": ("edema", "oedema"),
|
||||
"oesophagi": ("esophagus", "oesophagus"),
|
||||
"oldwives": ("oldwife",),
|
||||
"olea": ("oleum",),
|
||||
"omasa": ("omasum",),
|
||||
|
@ -1350,15 +1350,15 @@ NOUNS_IRREG = {
|
|||
"optic_axes": ("optic_axis",),
|
||||
"optima": ("optimum",),
|
||||
"ora": ("os",),
|
||||
"organa": ("organon", "organum",),
|
||||
"organums": ("organa", "organum",),
|
||||
"organa": ("organon", "organum"),
|
||||
"organums": ("organa", "organum"),
|
||||
"orthoptera": ("orthopteron",),
|
||||
"osar": ("os",),
|
||||
"oscula": ("osculum",),
|
||||
"ossa": ("os",),
|
||||
"osteomata": ("osteoma",),
|
||||
"ostia": ("ostium",),
|
||||
"ottomans": ("othman", "ottoman",),
|
||||
"ottomans": ("othman", "ottoman"),
|
||||
"ova": ("ovum",),
|
||||
"ovoli": ("ovolo",),
|
||||
"ovotestes": ("ovotestis",),
|
||||
|
@ -1382,7 +1382,7 @@ NOUNS_IRREG = {
|
|||
"papulae": ("papula",),
|
||||
"papyri": ("papyrus",),
|
||||
"parabases": ("parabasis",),
|
||||
"paraleipses": ("paraleipsis", "paralipsis",),
|
||||
"paraleipses": ("paraleipsis", "paralipsis"),
|
||||
"paralyses": ("paralysis",),
|
||||
"paramecia": ("paramecium",),
|
||||
"paramenta": ("parament",),
|
||||
|
@ -1442,13 +1442,13 @@ NOUNS_IRREG = {
|
|||
"personae": ("persona",),
|
||||
"petechiae": ("petechia",),
|
||||
"pfennige": ("pfennig",),
|
||||
"phalanges": ("phalange", "phalanx",),
|
||||
"phalanges": ("phalange", "phalanx"),
|
||||
"phalli": ("phallus",),
|
||||
"pharynges": ("pharynx",),
|
||||
"phenomena": ("phenomenon",),
|
||||
"phi-phenomena": ("phi-phenomenon",),
|
||||
"philodendra": ("philodendron",),
|
||||
"phlyctenae": ("phlyctaena", "phlyctena",),
|
||||
"phlyctenae": ("phlyctaena", "phlyctena"),
|
||||
"phyla": ("phylum",),
|
||||
"phylae": ("phyle",),
|
||||
"phyllotaxes": ("phyllotaxis",),
|
||||
|
@ -1475,12 +1475,12 @@ NOUNS_IRREG = {
|
|||
"plasmodesmata": ("plasmodesma",),
|
||||
"plasmodia": ("plasmodium",),
|
||||
"plateaux": ("plateau",),
|
||||
"plectra": ("plectron", "plectrum",),
|
||||
"plectra": ("plectron", "plectrum"),
|
||||
"plena": ("plenum",),
|
||||
"pleura": ("pleuron",),
|
||||
"pleurae": ("pleura",),
|
||||
"plicae": ("plica",),
|
||||
"ploughmen": ("ploughman", "plowman",),
|
||||
"ploughmen": ("ploughman", "plowman"),
|
||||
"pneumobacilli": ("pneumobacillus",),
|
||||
"pneumococci": ("pneumococcus",),
|
||||
"pocketknives": ("pocketknife",),
|
||||
|
@ -1515,7 +1515,7 @@ NOUNS_IRREG = {
|
|||
"principia": ("principium",),
|
||||
"proboscides": ("proboscis",),
|
||||
"proces-verbaux": ("proces-verbal",),
|
||||
"proglottides": ("proglottid", "proglottis",),
|
||||
"proglottides": ("proglottid", "proglottis"),
|
||||
"prognoses": ("prognosis",),
|
||||
"prolegomena": ("prolegomenon",),
|
||||
"prolepses": ("prolepsis",),
|
||||
|
@ -1532,7 +1532,7 @@ NOUNS_IRREG = {
|
|||
"prostheses": ("prosthesis",),
|
||||
"prostomia": ("prostomium",),
|
||||
"protases": ("protasis",),
|
||||
"prothalamia": ("prothalamion", "prothalamium",),
|
||||
"prothalamia": ("prothalamion", "prothalamium"),
|
||||
"prothalli": ("prothallus",),
|
||||
"prothallia": ("prothallium",),
|
||||
"prothoraces": ("prothorax",),
|
||||
|
@ -1572,7 +1572,7 @@ NOUNS_IRREG = {
|
|||
"quezales": ("quezal",),
|
||||
"quinquennia": ("quinquennium",),
|
||||
"quizzes": ("quiz",),
|
||||
"rabatos": ("rabato", "rebato",),
|
||||
"rabatos": ("rabato", "rebato"),
|
||||
"rabbitfishes": ("rabbitfish",),
|
||||
"rachides": ("rhachis",),
|
||||
"radices": ("radix",),
|
||||
|
@ -1583,7 +1583,7 @@ NOUNS_IRREG = {
|
|||
"ranulae": ("ranula",),
|
||||
"ranunculi": ("ranunculus",),
|
||||
"raphae": ("raphe",),
|
||||
"raphides": ("raphide", "raphis",),
|
||||
"raphides": ("raphide", "raphis"),
|
||||
"ratfishes": ("ratfish",),
|
||||
"reales": ("real",),
|
||||
"rearmice": ("rearmouse",),
|
||||
|
@ -1598,7 +1598,7 @@ NOUNS_IRREG = {
|
|||
"reis": ("real",),
|
||||
"relata": ("relatum",),
|
||||
"remiges": ("remex",),
|
||||
"reremice": ("rearmouse", "reremouse",),
|
||||
"reremice": ("rearmouse", "reremouse"),
|
||||
"reseaux": ("reseau",),
|
||||
"residua": ("residuum",),
|
||||
"responsa": ("responsum",),
|
||||
|
@ -1609,7 +1609,7 @@ NOUNS_IRREG = {
|
|||
"retinae": ("retina",),
|
||||
"rhabdomyomata": ("rhabdomyoma",),
|
||||
"rhachides": ("rhachis",),
|
||||
"rhachises": ("rachis", "rhachis",),
|
||||
"rhachises": ("rachis", "rhachis"),
|
||||
"rhinencephala": ("rhinencephalon",),
|
||||
"rhizobia": ("rhizobium",),
|
||||
"rhombi": ("rhombus",),
|
||||
|
@ -1636,7 +1636,7 @@ NOUNS_IRREG = {
|
|||
"runners-up": ("runner-up",),
|
||||
"sacra": ("sacrum",),
|
||||
"sacraria": ("sacrarium",),
|
||||
"saguaros": ("saguaro", "sahuaro",),
|
||||
"saguaros": ("saguaro", "sahuaro"),
|
||||
"sailfishes": ("sailfish",),
|
||||
"salespeople": ("salesperson",),
|
||||
"salmonellae": ("salmonella",),
|
||||
|
@ -1657,7 +1657,7 @@ NOUNS_IRREG = {
|
|||
"scapulae": ("scapula",),
|
||||
"scarabaei": ("scarabaeus",),
|
||||
"scarves": ("scarf",),
|
||||
"schatchonim": ("schatchen", "shadchan",),
|
||||
"schatchonim": ("schatchen", "shadchan"),
|
||||
"schemata": ("schema",),
|
||||
"scherzandi": ("scherzando",),
|
||||
"scherzi": ("scherzo",),
|
||||
|
@ -1690,7 +1690,7 @@ NOUNS_IRREG = {
|
|||
"senores": ("senor",),
|
||||
"sensilla": ("sensillum",),
|
||||
"senti": ("sent",),
|
||||
"senussis": ("senusi", "senussi",),
|
||||
"senussis": ("senusi", "senussi"),
|
||||
"separatrices": ("separatrix",),
|
||||
"sephardim": ("sephardi",),
|
||||
"septa": ("septum",),
|
||||
|
@ -1707,9 +1707,9 @@ NOUNS_IRREG = {
|
|||
"shabbatim": ("shabbat",),
|
||||
"shackoes": ("shacko",),
|
||||
"shadchanim": ("shadchan",),
|
||||
"shadchans": ("schatchen", "shadchan",),
|
||||
"shadchans": ("schatchen", "shadchan"),
|
||||
"shakoes": ("shako",),
|
||||
"shammosim": ("shammas", "shammes",),
|
||||
"shammosim": ("shammas", "shammes"),
|
||||
"sheatfishes": ("sheatfish",),
|
||||
"sheaves": ("sheaf",),
|
||||
"shellfishes": ("shellfish",),
|
||||
|
@ -1717,14 +1717,14 @@ NOUNS_IRREG = {
|
|||
"shinleaves": ("shinleaf",),
|
||||
"shittim": ("shittah",),
|
||||
"shmoes": ("shmo",),
|
||||
"shofroth": ("shofar", "shophar",),
|
||||
"shofroth": ("shofar", "shophar"),
|
||||
"shophroth": ("shophar",),
|
||||
"shrewmice": ("shrewmouse",),
|
||||
"shuln": ("shul",),
|
||||
"siddurim": ("siddur",),
|
||||
"sigloi": ("siglos",),
|
||||
"signore": ("signora",),
|
||||
"signori": ("signior", "signore",),
|
||||
"signori": ("signior", "signore"),
|
||||
"signorine": ("signorina",),
|
||||
"siliquae": ("siliqua",),
|
||||
"silvae": ("silva",),
|
||||
|
@ -1739,12 +1739,12 @@ NOUNS_IRREG = {
|
|||
"snaggleteeth": ("snaggletooth",),
|
||||
"snailfishes": ("snailfish",),
|
||||
"snipefishes": ("snipefish",),
|
||||
"socmen": ("socman", "sokeman",),
|
||||
"socmen": ("socman", "sokeman"),
|
||||
"sola": ("solum",),
|
||||
"solaria": ("solarium",),
|
||||
"solatia": ("solatium",),
|
||||
"soldi": ("soldo",),
|
||||
"soles": ("sol", "sole",),
|
||||
"soles": ("sol", "sole"),
|
||||
"solfeggi": ("solfeggio",),
|
||||
"soli": ("solo",),
|
||||
"solidi": ("solidus",),
|
||||
|
@ -1864,7 +1864,7 @@ NOUNS_IRREG = {
|
|||
"syringes": ("syrinx",),
|
||||
"syssarcoses": ("syssarcosis",),
|
||||
"tableaux": ("tableau",),
|
||||
"taeniae": ("taenia", "tenia",),
|
||||
"taeniae": ("taenia", "tenia"),
|
||||
"tali": ("talus",),
|
||||
"tallaisim": ("tallith",),
|
||||
"tallithes": ("tallith",),
|
||||
|
@ -1874,14 +1874,14 @@ NOUNS_IRREG = {
|
|||
"tarsi": ("tarsus",),
|
||||
"tarsometatarsi": ("tarsometatarsus",),
|
||||
"taxa": ("taxon",),
|
||||
"taxes": ("tax", "taxis",),
|
||||
"taxes": ("tax", "taxis"),
|
||||
"taxies": ("taxi",),
|
||||
"tectrices": ("tectrix",),
|
||||
"teeth": ("tooth",),
|
||||
"tegmina": ("tegmen",),
|
||||
"telae": ("tela",),
|
||||
"telamones": ("telamon",),
|
||||
"telangiectases": ("telangiectasia", "telangiectasis",),
|
||||
"telangiectases": ("telangiectasia", "telangiectasis"),
|
||||
"telia": ("telium",),
|
||||
"tempi": ("tempo",),
|
||||
"tenacula": ("tenaculum",),
|
||||
|
@ -1932,7 +1932,7 @@ NOUNS_IRREG = {
|
|||
"tornadoes": ("tornado",),
|
||||
"torpedoes": ("torpedo",),
|
||||
"torsi": ("torso",),
|
||||
"touracos": ("touraco", "turaco",),
|
||||
"touracos": ("touraco", "turaco"),
|
||||
"trabeculae": ("trabecula",),
|
||||
"tracheae": ("trachea",),
|
||||
"traditores": ("traditor",),
|
||||
|
@ -1960,7 +1960,7 @@ NOUNS_IRREG = {
|
|||
"tubae": ("tuba",),
|
||||
"turves": ("turf",),
|
||||
"tympana": ("tympanum",),
|
||||
"tyros": ("tiro", "tyro",),
|
||||
"tyros": ("tiro", "tyro"),
|
||||
"ubermenschen": ("ubermensch",),
|
||||
"uglies": ("ugli",),
|
||||
"uigurs": ("uighur",),
|
||||
|
@ -1980,7 +1980,7 @@ NOUNS_IRREG = {
|
|||
"utriculi": ("utriculus",),
|
||||
"uvulae": ("uvula",),
|
||||
"vacua": ("vacuum",),
|
||||
"vagi": ("vagus", "vagus",),
|
||||
"vagi": ("vagus", "vagus"),
|
||||
"vaginae": ("vagina",),
|
||||
"valleculae": ("vallecula",),
|
||||
"vaporetti": ("vaporetto",),
|
||||
|
@ -2026,7 +2026,7 @@ NOUNS_IRREG = {
|
|||
"vortices": ("vortex",),
|
||||
"vulvae": ("vulva",),
|
||||
"wagons-lits": ("wagon-lit",),
|
||||
"wahhabis": ("wahabi", "wahhabi",),
|
||||
"wahhabis": ("wahabi", "wahhabi"),
|
||||
"wanderjahre": ("wanderjahr",),
|
||||
"weakfishes": ("weakfish",),
|
||||
"werewolves": ("werewolf",),
|
||||
|
@ -2044,13 +2044,13 @@ NOUNS_IRREG = {
|
|||
"yeshivoth": ("yeshiva",),
|
||||
"yogin": ("yogi",),
|
||||
"yourselves": ("yourself",),
|
||||
"zamindaris": ("zamindari", "zemindari",),
|
||||
"zamindaris": ("zamindari", "zemindari"),
|
||||
"zecchini": ("zecchino",),
|
||||
"zeroes": ("zero",),
|
||||
"zoa": ("zoon",),
|
||||
"zoaeae": ("zoaea", "zoea",),
|
||||
"zoaeae": ("zoaea", "zoea"),
|
||||
"zoeae": ("zoea",),
|
||||
"zoeas": ("zoaea",),
|
||||
"zoonoses": ("zoonosis",),
|
||||
"zoosporangia": ("zoosporangium",)
|
||||
"zoosporangia": ("zoosporangium",),
|
||||
}
|
||||
|
|
|
@ -42,8 +42,8 @@ VERBS_IRREG = {
|
|||
"anglified": ("anglify",),
|
||||
"annulled": ("annul",),
|
||||
"annulling": ("annul",),
|
||||
"appalled": ("appal", "appall",),
|
||||
"appalling": ("appal", "appall",),
|
||||
"appalled": ("appal", "appall"),
|
||||
"appalling": ("appal", "appall"),
|
||||
"applied": ("apply",),
|
||||
"arcked": ("arc",),
|
||||
"arcking": ("arc",),
|
||||
|
@ -244,9 +244,9 @@ VERBS_IRREG = {
|
|||
"bypast": ("bypass",),
|
||||
"caballed": ("cabal",),
|
||||
"caballing": ("cabal",),
|
||||
"caddied": ("caddie", "caddy",),
|
||||
"caddies": ("caddie", "caddy",),
|
||||
"caddying": ("caddie", "caddy",),
|
||||
"caddied": ("caddie", "caddy"),
|
||||
"caddies": ("caddie", "caddy"),
|
||||
"caddying": ("caddie", "caddy"),
|
||||
"calcified": ("calcify",),
|
||||
"came": ("come",),
|
||||
"canalled": ("canal",),
|
||||
|
@ -506,8 +506,8 @@ VERBS_IRREG = {
|
|||
"disembodied": ("disembody",),
|
||||
"disembowelled": ("disembowel",),
|
||||
"disembowelling": ("disembowel",),
|
||||
"disenthralled": ("disenthral", "disenthrall",),
|
||||
"disenthralling": ("disenthral", "disenthrall",),
|
||||
"disenthralled": ("disenthral", "disenthrall"),
|
||||
"disenthralling": ("disenthral", "disenthrall"),
|
||||
"disenthralls": ("disenthral",),
|
||||
"disenthrals": ("disenthrall",),
|
||||
"dishevelled": ("dishevel",),
|
||||
|
@ -518,8 +518,8 @@ VERBS_IRREG = {
|
|||
"dispelling": ("dispel",),
|
||||
"disqualified": ("disqualify",),
|
||||
"dissatisfied": ("dissatisfy",),
|
||||
"distilled": ("distil", "distill",),
|
||||
"distilling": ("distil", "distill",),
|
||||
"distilled": ("distil", "distill"),
|
||||
"distilling": ("distil", "distill"),
|
||||
"diversified": ("diversify",),
|
||||
"divvied": ("divvy",),
|
||||
"dizzied": ("dizzy",),
|
||||
|
@ -595,10 +595,10 @@ VERBS_IRREG = {
|
|||
"enamelling": ("enamel",),
|
||||
"englutted": ("englut",),
|
||||
"englutting": ("englut",),
|
||||
"enrolled": ("enrol", "enroll",),
|
||||
"enrolling": ("enrol", "enroll",),
|
||||
"enthralled": ("enthral", "enthrall",),
|
||||
"enthralling": ("enthral", "enthrall",),
|
||||
"enrolled": ("enrol", "enroll"),
|
||||
"enrolling": ("enrol", "enroll"),
|
||||
"enthralled": ("enthral", "enthrall"),
|
||||
"enthralling": ("enthral", "enthrall"),
|
||||
"entrammelled": ("entrammel",),
|
||||
"entrammelling": ("entrammel",),
|
||||
"entrapped": ("entrap",),
|
||||
|
@ -621,8 +621,8 @@ VERBS_IRREG = {
|
|||
"exemplified": ("exemplify",),
|
||||
"expelled": ("expel",),
|
||||
"expelling": ("expel",),
|
||||
"extolled": ("extol", "extoll",),
|
||||
"extolling": ("extol", "extoll",),
|
||||
"extolled": ("extol", "extoll"),
|
||||
"extolling": ("extol", "extoll"),
|
||||
"facetted": ("facet",),
|
||||
"facetting": ("facet",),
|
||||
"fagged": ("fag",),
|
||||
|
@ -638,7 +638,7 @@ VERBS_IRREG = {
|
|||
"featherbedded": ("featherbed",),
|
||||
"featherbedding": ("featherbed",),
|
||||
"fed": ("feed",),
|
||||
"feed": ("feed", "fee",),
|
||||
"feed": ("feed", "fee"),
|
||||
"fell": ("fall",),
|
||||
"felt": ("feel",),
|
||||
"ferried": ("ferry",),
|
||||
|
@ -744,8 +744,8 @@ VERBS_IRREG = {
|
|||
"fried": ("fry",),
|
||||
"frigged": ("frig",),
|
||||
"frigging": ("frig",),
|
||||
"fritted": ("frit", "fritt",),
|
||||
"fritting": ("frit", "fritt",),
|
||||
"fritted": ("frit", "fritt"),
|
||||
"fritting": ("frit", "fritt"),
|
||||
"frivolled": ("frivol",),
|
||||
"frivolling": ("frivol",),
|
||||
"frogged": ("frog",),
|
||||
|
@ -757,8 +757,8 @@ VERBS_IRREG = {
|
|||
"fructified": ("fructify",),
|
||||
"fuelled": ("fuel",),
|
||||
"fuelling": ("fuel",),
|
||||
"fulfilled": ("fulfil", "fulfill",),
|
||||
"fulfilling": ("fulfil", "fulfill",),
|
||||
"fulfilled": ("fulfil", "fulfill"),
|
||||
"fulfilling": ("fulfil", "fulfill"),
|
||||
"funned": ("fun",),
|
||||
"funnelled": ("funnel",),
|
||||
"funnelling": ("funnel",),
|
||||
|
@ -955,8 +955,8 @@ VERBS_IRREG = {
|
|||
"insetting": ("inset",),
|
||||
"inspanned": ("inspan",),
|
||||
"inspanning": ("inspan",),
|
||||
"installed": ("instal", "install",),
|
||||
"installing": ("instal", "install",),
|
||||
"installed": ("instal", "install"),
|
||||
"installing": ("instal", "install"),
|
||||
"intensified": ("intensify",),
|
||||
"interbred": ("interbreed",),
|
||||
"intercropped": ("intercrop",),
|
||||
|
@ -1303,7 +1303,7 @@ VERBS_IRREG = {
|
|||
"overdriven": ("overdrive",),
|
||||
"overdrove": ("overdrive",),
|
||||
"overflew": ("overfly",),
|
||||
"overflown": ("overflow", "overfly",),
|
||||
"overflown": ("overflow", "overfly"),
|
||||
"overgrew": ("overgrow",),
|
||||
"overgrown": ("overgrow",),
|
||||
"overheard": ("overhear",),
|
||||
|
@ -1547,8 +1547,8 @@ VERBS_IRREG = {
|
|||
"red": ("red",),
|
||||
"red-pencilled": ("red-pencil",),
|
||||
"red-pencilling": ("red-pencil",),
|
||||
"redded": ("red", "redd",),
|
||||
"redding": ("red", "redd",),
|
||||
"redded": ("red", "redd"),
|
||||
"redding": ("red", "redd"),
|
||||
"redid": ("redo",),
|
||||
"redone": ("redo",),
|
||||
"referred": ("refer",),
|
||||
|
@ -1763,7 +1763,7 @@ VERBS_IRREG = {
|
|||
"signified": ("signify",),
|
||||
"silicified": ("silicify",),
|
||||
"simplified": ("simplify",),
|
||||
"singing": ("sing", "singe",),
|
||||
"singing": ("sing", "singe"),
|
||||
"single-stepped": ("single-step",),
|
||||
"single-stepping": ("single-step",),
|
||||
"sinned": ("sin",),
|
||||
|
@ -2404,5 +2404,5 @@ VERBS_IRREG = {
|
|||
"zigzagged": ("zigzag",),
|
||||
"zigzagging": ("zigzag",),
|
||||
"zipped": ("zip",),
|
||||
"zipping": ("zip",)
|
||||
"zipping": ("zip",),
|
||||
}
|
||||
|
|
|
@ -538,7 +538,7 @@ for orth in [
|
|||
"Sen.",
|
||||
"St.",
|
||||
"vs.",
|
||||
"v.s."
|
||||
"v.s.",
|
||||
]:
|
||||
_exc[orth] = [{ORTH: orth}]
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -18,16 +18,24 @@ from ._verbs import VERBS
|
|||
|
||||
from ....util import load_language_data
|
||||
|
||||
BASE_PATH = Path(__file__).parent
|
||||
BASE_PATH = Path(__file__).parent
|
||||
|
||||
LOOKUP = load_language_data(BASE_PATH / 'lookup.json')
|
||||
VERBS_IRREG = load_language_data(BASE_PATH / '_verbs_irreg.json')
|
||||
ADJECTIVES_IRREG = load_language_data(BASE_PATH / '_adjectives_irreg.json')
|
||||
LOOKUP = load_language_data(BASE_PATH / "lookup.json")
|
||||
VERBS_IRREG = load_language_data(BASE_PATH / "_verbs_irreg.json")
|
||||
ADJECTIVES_IRREG = load_language_data(BASE_PATH / "_adjectives_irreg.json")
|
||||
|
||||
LEMMA_INDEX = {'adj': ADJECTIVES, 'adv': ADVERBS, 'noun': NOUNS, 'verb': VERBS}
|
||||
LEMMA_INDEX = {"adj": ADJECTIVES, "adv": ADVERBS, "noun": NOUNS, "verb": VERBS}
|
||||
|
||||
LEMMA_EXC = {'adj': ADJECTIVES_IRREG, 'adp': ADP_IRREG, 'aux': AUXILIARY_VERBS_IRREG,
|
||||
'cconj': CCONJ_IRREG, 'det': DETS_IRREG, 'noun': NOUNS_IRREG, 'verb': VERBS_IRREG,
|
||||
'pron': PRONOUNS_IRREG, 'sconj': SCONJ_IRREG}
|
||||
LEMMA_EXC = {
|
||||
"adj": ADJECTIVES_IRREG,
|
||||
"adp": ADP_IRREG,
|
||||
"aux": AUXILIARY_VERBS_IRREG,
|
||||
"cconj": CCONJ_IRREG,
|
||||
"det": DETS_IRREG,
|
||||
"noun": NOUNS_IRREG,
|
||||
"verb": VERBS_IRREG,
|
||||
"pron": PRONOUNS_IRREG,
|
||||
"sconj": SCONJ_IRREG,
|
||||
}
|
||||
|
||||
LEMMA_RULES = {'adj': ADJECTIVE_RULES, 'noun': NOUN_RULES, 'verb': VERB_RULES}
|
||||
LEMMA_RULES = {"adj": ADJECTIVE_RULES, "noun": NOUN_RULES, "verb": VERB_RULES}
|
||||
|
|
|
@ -3,22 +3,22 @@ from __future__ import unicode_literals
|
|||
|
||||
|
||||
ADP_IRREG = {
|
||||
"a": ("à",),
|
||||
"apr.": ("après",),
|
||||
"aux": ("à",),
|
||||
"av.": ("avant",),
|
||||
"avt": ("avant",),
|
||||
"cf.": ("cf",),
|
||||
"conf.": ("cf",),
|
||||
"confer": ("cf",),
|
||||
"d'": ("de",),
|
||||
"des": ("de",),
|
||||
"du": ("de",),
|
||||
"jusqu'": ("jusque",),
|
||||
"pdt": ("pendant",),
|
||||
"+": ("plus",),
|
||||
"pr": ("pour",),
|
||||
"/": ("sur",),
|
||||
"versus": ("vs",),
|
||||
"vs.": ("vs",)
|
||||
"a": ("à",),
|
||||
"apr.": ("après",),
|
||||
"aux": ("à",),
|
||||
"av.": ("avant",),
|
||||
"avt": ("avant",),
|
||||
"cf.": ("cf",),
|
||||
"conf.": ("cf",),
|
||||
"confer": ("cf",),
|
||||
"d'": ("de",),
|
||||
"des": ("de",),
|
||||
"du": ("de",),
|
||||
"jusqu'": ("jusque",),
|
||||
"pdt": ("pendant",),
|
||||
"+": ("plus",),
|
||||
"pr": ("pour",),
|
||||
"/": ("sur",),
|
||||
"versus": ("vs",),
|
||||
"vs.": ("vs",),
|
||||
}
|
||||
|
|
|
@ -365,5 +365,5 @@ AUXILIARY_VERBS_IRREG = {
|
|||
"va": ("aller",),
|
||||
"vais": ("aller",),
|
||||
"vas": ("aller",),
|
||||
"vont": ("aller",)
|
||||
"vont": ("aller",),
|
||||
}
|
||||
|
|
|
@ -3,15 +3,15 @@ from __future__ import unicode_literals
|
|||
|
||||
|
||||
CCONJ_IRREG = {
|
||||
"&": ("et",),
|
||||
"c-à-d": ("c'est-à-dire",),
|
||||
"c.-à.-d.": ("c'est-à-dire",),
|
||||
"càd": ("c'est-à-dire",),
|
||||
"&": ("et",),
|
||||
"et|ou": ("et-ou",),
|
||||
"et/ou": ("et-ou",),
|
||||
"i.e.": ("c'est-à-dire",),
|
||||
"ie": ("c'est-à-dire",),
|
||||
"ou/et": ("et-ou",),
|
||||
"+": ("plus",)
|
||||
"&": ("et",),
|
||||
"c-à-d": ("c'est-à-dire",),
|
||||
"c.-à.-d.": ("c'est-à-dire",),
|
||||
"càd": ("c'est-à-dire",),
|
||||
"&": ("et",),
|
||||
"et|ou": ("et-ou",),
|
||||
"et/ou": ("et-ou",),
|
||||
"i.e.": ("c'est-à-dire",),
|
||||
"ie": ("c'est-à-dire",),
|
||||
"ou/et": ("et-ou",),
|
||||
"+": ("plus",),
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -3,17 +3,17 @@ from __future__ import unicode_literals
|
|||
|
||||
|
||||
SCONJ_IRREG = {
|
||||
"lorsqu'": ("lorsque",),
|
||||
"pac'que": ("parce que",),
|
||||
"pac'qu'": ("parce que",),
|
||||
"parc'que": ("parce que",),
|
||||
"parc'qu'": ("parce que",),
|
||||
"paske": ("parce que",),
|
||||
"pask'": ("parce que",),
|
||||
"pcq": ("parce que",),
|
||||
"+": ("plus",),
|
||||
"puisqu'": ("puisque",),
|
||||
"qd": ("quand",),
|
||||
"quoiqu'": ("quoique",),
|
||||
"qu'": ("que",)
|
||||
"lorsqu'": ("lorsque",),
|
||||
"pac'que": ("parce que",),
|
||||
"pac'qu'": ("parce que",),
|
||||
"parc'que": ("parce que",),
|
||||
"parc'qu'": ("parce que",),
|
||||
"paske": ("parce que",),
|
||||
"pask'": ("parce que",),
|
||||
"pcq": ("parce que",),
|
||||
"+": ("plus",),
|
||||
"puisqu'": ("puisque",),
|
||||
"qd": ("quand",),
|
||||
"quoiqu'": ("quoique",),
|
||||
"qu'": ("que",),
|
||||
}
|
||||
|
|
|
@ -3,20 +3,22 @@ from __future__ import unicode_literals
|
|||
|
||||
from pathlib import Path
|
||||
|
||||
from ....symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP, SCONJ, CCONJ
|
||||
from ....symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
|
||||
from ....symbols import SCONJ, CCONJ
|
||||
from ....symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
|
||||
from ....util import load_language_data
|
||||
|
||||
LOOKUP = load_language_data(Path(__file__).parent / 'lookup.json')
|
||||
LOOKUP = load_language_data(Path(__file__).parent / "lookup.json")
|
||||
|
||||
'''
|
||||
"""
|
||||
The French language lemmatizer applies the default rule-based lemmatization
|
||||
procedure with some modifications for better French language support.
|
||||
|
||||
The parts of speech 'ADV', 'PRON', 'DET', 'ADP' and 'AUX' are added to use the
|
||||
rule-based lemmatization. As a last resort, the lemmatizer checks in
|
||||
the lookup table.
|
||||
'''
|
||||
"""
|
||||
|
||||
|
||||
class FrenchLemmatizer(object):
|
||||
@classmethod
|
||||
|
@ -32,36 +34,39 @@ class FrenchLemmatizer(object):
|
|||
def __call__(self, string, univ_pos, morphology=None):
|
||||
if not self.rules:
|
||||
return [self.lookup_table.get(string, string)]
|
||||
if univ_pos in (NOUN, 'NOUN', 'noun'):
|
||||
univ_pos = 'noun'
|
||||
elif univ_pos in (VERB, 'VERB', 'verb'):
|
||||
univ_pos = 'verb'
|
||||
elif univ_pos in (ADJ, 'ADJ', 'adj'):
|
||||
univ_pos = 'adj'
|
||||
elif univ_pos in (ADP, 'ADP', 'adp'):
|
||||
univ_pos = 'adp'
|
||||
elif univ_pos in (ADV, 'ADV', 'adv'):
|
||||
univ_pos = 'adv'
|
||||
elif univ_pos in (AUX, 'AUX', 'aux'):
|
||||
univ_pos = 'aux'
|
||||
elif univ_pos in (CCONJ, 'CCONJ', 'cconj'):
|
||||
univ_pos = 'cconj'
|
||||
elif univ_pos in (DET, 'DET', 'det'):
|
||||
univ_pos = 'det'
|
||||
elif univ_pos in (PRON, 'PRON', 'pron'):
|
||||
univ_pos = 'pron'
|
||||
elif univ_pos in (PUNCT, 'PUNCT', 'punct'):
|
||||
univ_pos = 'punct'
|
||||
elif univ_pos in (SCONJ, 'SCONJ', 'sconj'):
|
||||
univ_pos = 'sconj'
|
||||
if univ_pos in (NOUN, "NOUN", "noun"):
|
||||
univ_pos = "noun"
|
||||
elif univ_pos in (VERB, "VERB", "verb"):
|
||||
univ_pos = "verb"
|
||||
elif univ_pos in (ADJ, "ADJ", "adj"):
|
||||
univ_pos = "adj"
|
||||
elif univ_pos in (ADP, "ADP", "adp"):
|
||||
univ_pos = "adp"
|
||||
elif univ_pos in (ADV, "ADV", "adv"):
|
||||
univ_pos = "adv"
|
||||
elif univ_pos in (AUX, "AUX", "aux"):
|
||||
univ_pos = "aux"
|
||||
elif univ_pos in (CCONJ, "CCONJ", "cconj"):
|
||||
univ_pos = "cconj"
|
||||
elif univ_pos in (DET, "DET", "det"):
|
||||
univ_pos = "det"
|
||||
elif univ_pos in (PRON, "PRON", "pron"):
|
||||
univ_pos = "pron"
|
||||
elif univ_pos in (PUNCT, "PUNCT", "punct"):
|
||||
univ_pos = "punct"
|
||||
elif univ_pos in (SCONJ, "SCONJ", "sconj"):
|
||||
univ_pos = "sconj"
|
||||
else:
|
||||
return [self.lookup(string)]
|
||||
# See Issue #435 for an example of where this logic is required.
|
||||
if self.is_base_form(univ_pos, morphology):
|
||||
return list(set([string.lower()]))
|
||||
lemmas = lemmatize(string, self.index.get(univ_pos, {}),
|
||||
self.exc.get(univ_pos, {}),
|
||||
self.rules.get(univ_pos, []))
|
||||
lemmas = lemmatize(
|
||||
string,
|
||||
self.index.get(univ_pos, {}),
|
||||
self.exc.get(univ_pos, {}),
|
||||
self.rules.get(univ_pos, []),
|
||||
)
|
||||
return lemmas
|
||||
|
||||
def is_base_form(self, univ_pos, morphology=None):
|
||||
|
@ -70,20 +75,25 @@ class FrenchLemmatizer(object):
|
|||
avoid lemmatization entirely.
|
||||
"""
|
||||
morphology = {} if morphology is None else morphology
|
||||
others = [key for key in morphology
|
||||
if key not in (POS, 'Number', 'POS', 'VerbForm', 'Tense')]
|
||||
if univ_pos == 'noun' and morphology.get('Number') == 'sing':
|
||||
others = [
|
||||
key
|
||||
for key in morphology
|
||||
if key not in (POS, "Number", "POS", "VerbForm", "Tense")
|
||||
]
|
||||
if univ_pos == "noun" and morphology.get("Number") == "sing":
|
||||
return True
|
||||
elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf':
|
||||
elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
|
||||
return True
|
||||
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
|
||||
# morphology
|
||||
elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and
|
||||
morphology.get('Tense') == 'pres' and
|
||||
morphology.get('Number') is None and
|
||||
not others):
|
||||
elif univ_pos == "verb" and (
|
||||
morphology.get("VerbForm") == "fin"
|
||||
and morphology.get("Tense") == "pres"
|
||||
and morphology.get("Number") is None
|
||||
and not others
|
||||
):
|
||||
return True
|
||||
elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
|
||||
elif univ_pos == "adj" and morphology.get("Degree") == "pos":
|
||||
return True
|
||||
elif VerbForm_inf in morphology:
|
||||
return True
|
||||
|
@ -97,16 +107,16 @@ class FrenchLemmatizer(object):
|
|||
return False
|
||||
|
||||
def noun(self, string, morphology=None):
|
||||
return self(string, 'noun', morphology)
|
||||
return self(string, "noun", morphology)
|
||||
|
||||
def verb(self, string, morphology=None):
|
||||
return self(string, 'verb', morphology)
|
||||
return self(string, "verb", morphology)
|
||||
|
||||
def adj(self, string, morphology=None):
|
||||
return self(string, 'adj', morphology)
|
||||
return self(string, "adj", morphology)
|
||||
|
||||
def punct(self, string, morphology=None):
|
||||
return self(string, 'punct', morphology)
|
||||
return self(string, "punct", morphology)
|
||||
|
||||
def lookup(self, string):
|
||||
if string in self.lookup_table:
|
||||
|
@ -117,7 +127,7 @@ class FrenchLemmatizer(object):
|
|||
def lemmatize(string, index, exceptions, rules):
|
||||
string = string.lower()
|
||||
forms = []
|
||||
if (string in index):
|
||||
if string in index:
|
||||
forms.append(string)
|
||||
return forms
|
||||
forms.extend(exceptions.get(string, []))
|
||||
|
@ -125,7 +135,7 @@ def lemmatize(string, index, exceptions, rules):
|
|||
if not forms:
|
||||
for old, new in rules:
|
||||
if string.endswith(old):
|
||||
form = string[:len(string) - len(old)] + new
|
||||
form = string[: len(string) - len(old)] + new
|
||||
if not form:
|
||||
pass
|
||||
elif form in index or not form.isalpha():
|
||||
|
|
|
@ -2,8 +2,6 @@
|
|||
from __future__ import unicode_literals
|
||||
from ...symbols import ORTH, LEMMA
|
||||
|
||||
_exc = {
|
||||
"po'": [{ORTH: "po'", LEMMA: 'poco'}]
|
||||
}
|
||||
_exc = {"po'": [{ORTH: "po'", LEMMA: "poco"}]}
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
@ -11,5 +12,5 @@ sentences = [
|
|||
"애플이 영국의 신생 기업을 10억 달러에 구매를 고려중이다.",
|
||||
"자동 운전 자동차의 손해 배상 책임에 자동차 메이커에 일정한 부담을 요구하겠다.",
|
||||
"자동 배달 로봇이 보도를 주행하는 것을 샌프란시스코시가 금지를 검토중이라고 합니다.",
|
||||
"런던은 영국의 수도이자 가장 큰 도시입니다."
|
||||
"런던은 영국의 수도이자 가장 큰 도시입니다.",
|
||||
]
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
STOP_WORDS = set("""
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
이
|
||||
있
|
||||
하
|
||||
|
@ -65,4 +66,5 @@ STOP_WORDS = set("""
|
|||
원
|
||||
잘
|
||||
놓
|
||||
""".split())
|
||||
""".split()
|
||||
)
|
||||
|
|
|
@ -20,10 +20,10 @@ LEMMA_INDEX = {"adj": ADJECTIVES, "adv": ADVERBS, "noun": NOUNS, "verb": VERBS}
|
|||
BASE_PATH = Path(__file__).parent
|
||||
|
||||
LEMMA_EXC = {
|
||||
"adj": load_language_data(BASE_PATH / '_adjectives_wordforms.json'),
|
||||
"adj": load_language_data(BASE_PATH / "_adjectives_wordforms.json"),
|
||||
"adv": ADVERBS_WORDFORMS,
|
||||
"noun": load_language_data(BASE_PATH / '_nouns_wordforms.json'),
|
||||
"verb": load_language_data(BASE_PATH / '_verbs_wordforms.json'),
|
||||
"noun": load_language_data(BASE_PATH / "_nouns_wordforms.json"),
|
||||
"verb": load_language_data(BASE_PATH / "_verbs_wordforms.json"),
|
||||
}
|
||||
|
||||
LEMMA_RULES = {
|
||||
|
@ -39,5 +39,3 @@ LEMMA_RULES = {
|
|||
# https://www.nb.no/sprakbanken/show?serial=oai%3Anb.no%3Asbr-5&lang=en
|
||||
# License:
|
||||
# Creative_Commons-BY (CC-BY) (https://creativecommons.org/licenses/by/4.0/)
|
||||
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -14,7 +14,7 @@ _infixes = (
|
|||
+ [
|
||||
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
|
||||
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
|
||||
r'(?<=[{a}])[:<>=](?=[{a}])'.format(a=ALPHA),
|
||||
r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
|
||||
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
|
||||
|
|
|
@ -118,7 +118,7 @@ for orth in [
|
|||
"o.l.",
|
||||
"on.",
|
||||
"op.",
|
||||
"org."
|
||||
"org.",
|
||||
"osv.",
|
||||
"ovf.",
|
||||
"p.",
|
||||
|
|
|
@ -14,5 +14,5 @@ sentences = [
|
|||
"Apple overweegt om voor 1 miljard een U.K. startup te kopen",
|
||||
"Autonome auto's verschuiven de verzekeringverantwoordelijkheid naar producenten",
|
||||
"San Francisco overweegt robots op voetpaden te verbieden",
|
||||
"Londen is een grote stad in het Verenigd Koninkrijk"
|
||||
"Londen is een grote stad in het Verenigd Koninkrijk",
|
||||
]
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -3,22 +3,25 @@ from __future__ import unicode_literals
|
|||
|
||||
|
||||
ADPOSITIONS = set(
|
||||
('aan aangaande aanwezig achter af afgezien al als an annex anno anti '
|
||||
'behalve behoudens beneden benevens benoorden beoosten betreffende bewesten '
|
||||
'bezijden bezuiden bij binnen binnenuit binst bladzij blijkens boven bovenop '
|
||||
'buiten conform contra cq daaraan daarbij daarbuiten daarin daarnaar '
|
||||
'daaronder daartegenover daarvan dankzij deure dichtbij door doordat doorheen '
|
||||
'echter eraf erop erover errond eruit ervoor evenals exclusief gedaan '
|
||||
'gedurende gegeven getuige gezien halfweg halverwege heen hierdoorheen hierop '
|
||||
'houdende in inclusief indien ingaande ingevolge inzake jegens kortweg '
|
||||
'krachtens kralj langs langsheen langst lastens linksom lopende luidens mede '
|
||||
'mee met middels midden middenop mits na naan naar naartoe naast naat nabij '
|
||||
'nadat namens neer neffe neffen neven nevenst niettegenstaande nopens '
|
||||
'officieel om omheen omstreeks omtrent onafgezien ondanks onder onderaan '
|
||||
'ondere ongeacht ooit op open over per plus pro qua rechtover rond rondom '
|
||||
"sedert sinds spijts strekkende te tegen tegenaan tegenop tegenover telde "
|
||||
'teneinde terug tijdens toe tot totdat trots tussen tégen uit uitgenomen '
|
||||
'ultimo van vanaf vandaan vandoor vanop vanuit vanwege versus via vinnen '
|
||||
'vlakbij volgens voor voor- voorbij voordat voort voren vòòr vóór waaraan '
|
||||
'waarbij waardoor waaronder weg wegens weleens zijdens zoals zodat zonder '
|
||||
'zónder à').split())
|
||||
(
|
||||
"aan aangaande aanwezig achter af afgezien al als an annex anno anti "
|
||||
"behalve behoudens beneden benevens benoorden beoosten betreffende bewesten "
|
||||
"bezijden bezuiden bij binnen binnenuit binst bladzij blijkens boven bovenop "
|
||||
"buiten conform contra cq daaraan daarbij daarbuiten daarin daarnaar "
|
||||
"daaronder daartegenover daarvan dankzij deure dichtbij door doordat doorheen "
|
||||
"echter eraf erop erover errond eruit ervoor evenals exclusief gedaan "
|
||||
"gedurende gegeven getuige gezien halfweg halverwege heen hierdoorheen hierop "
|
||||
"houdende in inclusief indien ingaande ingevolge inzake jegens kortweg "
|
||||
"krachtens kralj langs langsheen langst lastens linksom lopende luidens mede "
|
||||
"mee met middels midden middenop mits na naan naar naartoe naast naat nabij "
|
||||
"nadat namens neer neffe neffen neven nevenst niettegenstaande nopens "
|
||||
"officieel om omheen omstreeks omtrent onafgezien ondanks onder onderaan "
|
||||
"ondere ongeacht ooit op open over per plus pro qua rechtover rond rondom "
|
||||
"sedert sinds spijts strekkende te tegen tegenaan tegenop tegenover telde "
|
||||
"teneinde terug tijdens toe tot totdat trots tussen tégen uit uitgenomen "
|
||||
"ultimo van vanaf vandaan vandoor vanop vanuit vanwege versus via vinnen "
|
||||
"vlakbij volgens voor voor- voorbij voordat voort voren vòòr vóór waaraan "
|
||||
"waarbij waardoor waaronder weg wegens weleens zijdens zoals zodat zonder "
|
||||
"zónder à"
|
||||
).split()
|
||||
)
|
||||
|
|
|
@ -3,10 +3,10 @@ from __future__ import unicode_literals
|
|||
|
||||
|
||||
ADPOSITIONS_IRREG = {
|
||||
"'t": ('te',),
|
||||
'me': ('mee',),
|
||||
'meer': ('mee',),
|
||||
'on': ('om',),
|
||||
'ten': ('te',),
|
||||
'ter': ('te',)
|
||||
"'t": ("te",),
|
||||
"me": ("mee",),
|
||||
"meer": ("mee",),
|
||||
"on": ("om",),
|
||||
"ten": ("te",),
|
||||
"ter": ("te",),
|
||||
}
|
||||
|
|
|
@ -3,17 +3,17 @@ from __future__ import unicode_literals
|
|||
|
||||
|
||||
ADVERBS_IRREG = {
|
||||
"'ns": ('eens',),
|
||||
"'s": ('eens',),
|
||||
"'t": ('het',),
|
||||
"d'r": ('er',),
|
||||
"d'raf": ('eraf',),
|
||||
"d'rbij": ('erbij',),
|
||||
"d'rheen": ('erheen',),
|
||||
"d'rin": ('erin',),
|
||||
"d'rna": ('erna',),
|
||||
"d'rnaar": ('ernaar',),
|
||||
'hele': ('heel',),
|
||||
'nevenst': ('nevens',),
|
||||
'overend': ('overeind',)
|
||||
"'ns": ("eens",),
|
||||
"'s": ("eens",),
|
||||
"'t": ("het",),
|
||||
"d'r": ("er",),
|
||||
"d'raf": ("eraf",),
|
||||
"d'rbij": ("erbij",),
|
||||
"d'rheen": ("erheen",),
|
||||
"d'rin": ("erin",),
|
||||
"d'rna": ("erna",),
|
||||
"d'rnaar": ("ernaar",),
|
||||
"hele": ("heel",),
|
||||
"nevenst": ("nevens",),
|
||||
"overend": ("overeind",),
|
||||
}
|
||||
|
|
|
@ -3,15 +3,18 @@ from __future__ import unicode_literals
|
|||
|
||||
|
||||
DETERMINERS = set(
|
||||
("al allebei allerhande allerminst alletwee"
|
||||
"beide clip-on d'n d'r dat datgeen datgene de dees degeen degene den dewelke "
|
||||
'deze dezelfde die diegeen diegene diehien dien diene diens diezelfde dit '
|
||||
'ditgene e een eene eigen elk elkens elkes enig enkel enne ettelijke eure '
|
||||
'euren evenveel ewe ge geen ginds géén haar haaren halfelf het hetgeen '
|
||||
'hetwelk hetzelfde heur heure hulder hulle hullen hullie hun hunder hunderen '
|
||||
'ieder iederes ja je jen jouw jouwen jouwes jullie junder keiveel keiweinig '
|
||||
"m'ne me meer meerder meerdere menen menig mijn mijnes minst méér niemendal "
|
||||
'oe ons onse se sommig sommigeder superveel telken teveel titulair ulder '
|
||||
'uldere ulderen ulle under une uw vaak veel veels véél wat weinig welk welken '
|
||||
"welkene welksten z'nen ze zenen zijn zo'n zo'ne zoiet zoveel zovele zovelen "
|
||||
'zuk zulk zulkdanig zulken zulks zullie zíjn àlle álle').split())
|
||||
(
|
||||
"al allebei allerhande allerminst alletwee"
|
||||
"beide clip-on d'n d'r dat datgeen datgene de dees degeen degene den dewelke "
|
||||
"deze dezelfde die diegeen diegene diehien dien diene diens diezelfde dit "
|
||||
"ditgene e een eene eigen elk elkens elkes enig enkel enne ettelijke eure "
|
||||
"euren evenveel ewe ge geen ginds géén haar haaren halfelf het hetgeen "
|
||||
"hetwelk hetzelfde heur heure hulder hulle hullen hullie hun hunder hunderen "
|
||||
"ieder iederes ja je jen jouw jouwen jouwes jullie junder keiveel keiweinig "
|
||||
"m'ne me meer meerder meerdere menen menig mijn mijnes minst méér niemendal "
|
||||
"oe ons onse se sommig sommigeder superveel telken teveel titulair ulder "
|
||||
"uldere ulderen ulle under une uw vaak veel veels véél wat weinig welk welken "
|
||||
"welkene welksten z'nen ze zenen zijn zo'n zo'ne zoiet zoveel zovele zovelen "
|
||||
"zuk zulk zulkdanig zulken zulks zullie zíjn àlle álle"
|
||||
).split()
|
||||
)
|
||||
|
|
|
@ -3,67 +3,67 @@ from __future__ import unicode_literals
|
|||
|
||||
|
||||
DETERMINERS_IRREG = {
|
||||
"'r": ('haar',),
|
||||
"'s": ('de',),
|
||||
"'t": ('het',),
|
||||
"'tgene": ('hetgeen',),
|
||||
'alle': ('al',),
|
||||
'allen': ('al',),
|
||||
'aller': ('al',),
|
||||
'beiden': ('beide',),
|
||||
'beider': ('beide',),
|
||||
"d'": ('het',),
|
||||
"d'r": ('haar',),
|
||||
'der': ('de',),
|
||||
'des': ('de',),
|
||||
'dezer': ('deze',),
|
||||
'dienen': ('die',),
|
||||
'dier': ('die',),
|
||||
'elke': ('elk',),
|
||||
'ene': ('een',),
|
||||
'enen': ('een',),
|
||||
'ener': ('een',),
|
||||
'enige': ('enig',),
|
||||
'enigen': ('enig',),
|
||||
'er': ('haar',),
|
||||
'gene': ('geen',),
|
||||
'genen': ('geen',),
|
||||
'hare': ('haar',),
|
||||
'haren': ('haar',),
|
||||
'harer': ('haar',),
|
||||
'hunne': ('hun',),
|
||||
'hunnen': ('hun',),
|
||||
'jou': ('jouw',),
|
||||
'jouwe': ('jouw',),
|
||||
'julliejen': ('jullie',),
|
||||
"m'n": ('mijn',),
|
||||
'mee': ('meer',),
|
||||
'meer': ('veel',),
|
||||
'meerderen': ('meerdere',),
|
||||
'meest': ('veel',),
|
||||
'meesten': ('veel',),
|
||||
'meet': ('veel',),
|
||||
'menige': ('menig',),
|
||||
'mij': ('mijn',),
|
||||
'mijnen': ('mijn',),
|
||||
'minder': ('weinig',),
|
||||
'mindere': ('weinig',),
|
||||
'minst': ('weinig',),
|
||||
'minste': ('minst',),
|
||||
'ne': ('een',),
|
||||
'onze': ('ons',),
|
||||
'onzent': ('ons',),
|
||||
'onzer': ('ons',),
|
||||
'ouw': ('uw',),
|
||||
'sommige': ('sommig',),
|
||||
'sommigen': ('sommig',),
|
||||
'u': ('uw',),
|
||||
'vaker': ('vaak',),
|
||||
'vele': ('veel',),
|
||||
'velen': ('veel',),
|
||||
'welke': ('welk',),
|
||||
'zijne': ('zijn',),
|
||||
'zijnen': ('zijn',),
|
||||
'zijns': ('zijn',),
|
||||
'één': ('een',)
|
||||
"'r": ("haar",),
|
||||
"'s": ("de",),
|
||||
"'t": ("het",),
|
||||
"'tgene": ("hetgeen",),
|
||||
"alle": ("al",),
|
||||
"allen": ("al",),
|
||||
"aller": ("al",),
|
||||
"beiden": ("beide",),
|
||||
"beider": ("beide",),
|
||||
"d'": ("het",),
|
||||
"d'r": ("haar",),
|
||||
"der": ("de",),
|
||||
"des": ("de",),
|
||||
"dezer": ("deze",),
|
||||
"dienen": ("die",),
|
||||
"dier": ("die",),
|
||||
"elke": ("elk",),
|
||||
"ene": ("een",),
|
||||
"enen": ("een",),
|
||||
"ener": ("een",),
|
||||
"enige": ("enig",),
|
||||
"enigen": ("enig",),
|
||||
"er": ("haar",),
|
||||
"gene": ("geen",),
|
||||
"genen": ("geen",),
|
||||
"hare": ("haar",),
|
||||
"haren": ("haar",),
|
||||
"harer": ("haar",),
|
||||
"hunne": ("hun",),
|
||||
"hunnen": ("hun",),
|
||||
"jou": ("jouw",),
|
||||
"jouwe": ("jouw",),
|
||||
"julliejen": ("jullie",),
|
||||
"m'n": ("mijn",),
|
||||
"mee": ("meer",),
|
||||
"meer": ("veel",),
|
||||
"meerderen": ("meerdere",),
|
||||
"meest": ("veel",),
|
||||
"meesten": ("veel",),
|
||||
"meet": ("veel",),
|
||||
"menige": ("menig",),
|
||||
"mij": ("mijn",),
|
||||
"mijnen": ("mijn",),
|
||||
"minder": ("weinig",),
|
||||
"mindere": ("weinig",),
|
||||
"minst": ("weinig",),
|
||||
"minste": ("minst",),
|
||||
"ne": ("een",),
|
||||
"onze": ("ons",),
|
||||
"onzent": ("ons",),
|
||||
"onzer": ("ons",),
|
||||
"ouw": ("uw",),
|
||||
"sommige": ("sommig",),
|
||||
"sommigen": ("sommig",),
|
||||
"u": ("uw",),
|
||||
"vaker": ("vaak",),
|
||||
"vele": ("veel",),
|
||||
"velen": ("veel",),
|
||||
"welke": ("welk",),
|
||||
"zijne": ("zijn",),
|
||||
"zijnen": ("zijn",),
|
||||
"zijns": ("zijn",),
|
||||
"één": ("een",),
|
||||
}
|
||||
|
|
|
@ -9,7 +9,7 @@ ADJECTIVE_SUFFIX_RULES = [
|
|||
["er", ""],
|
||||
["en", ""],
|
||||
["e", ""],
|
||||
["ende", "end"]
|
||||
["ende", "end"],
|
||||
]
|
||||
|
||||
VERB_SUFFIX_RULES = [
|
||||
|
@ -39,7 +39,7 @@ NOUN_SUFFIX_RULES = [
|
|||
["ssen", "s"],
|
||||
["rren", "r"],
|
||||
["kken", "k"],
|
||||
["bben", "b"]
|
||||
["bben", "b"],
|
||||
]
|
||||
|
||||
NUM_SUFFIX_RULES = [
|
||||
|
@ -50,23 +50,20 @@ NUM_SUFFIX_RULES = [
|
|||
["de", ""],
|
||||
["er", ""],
|
||||
["ër", ""],
|
||||
["tjes", ""]
|
||||
["tjes", ""],
|
||||
]
|
||||
|
||||
PUNCT_SUFFIX_RULES = [
|
||||
["“", "\""],
|
||||
["”", "\""],
|
||||
["\u2018", "'"],
|
||||
["\u2019", "'"]
|
||||
]
|
||||
PUNCT_SUFFIX_RULES = [["“", '"'], ["”", '"'], ["\u2018", "'"], ["\u2019", "'"]]
|
||||
|
||||
|
||||
# In-place sort guaranteeing that longer -- more specific -- rules are
|
||||
# applied first.
|
||||
for rule_set in (ADJECTIVE_SUFFIX_RULES,
|
||||
NOUN_SUFFIX_RULES,
|
||||
NUM_SUFFIX_RULES,
|
||||
VERB_SUFFIX_RULES):
|
||||
for rule_set in (
|
||||
ADJECTIVE_SUFFIX_RULES,
|
||||
NOUN_SUFFIX_RULES,
|
||||
NUM_SUFFIX_RULES,
|
||||
VERB_SUFFIX_RULES,
|
||||
):
|
||||
rule_set.sort(key=lambda r: len(r[0]), reverse=True)
|
||||
|
||||
|
||||
|
@ -75,5 +72,5 @@ RULES = {
|
|||
"noun": NOUN_SUFFIX_RULES,
|
||||
"verb": VERB_SUFFIX_RULES,
|
||||
"num": NUM_SUFFIX_RULES,
|
||||
"punct": PUNCT_SUFFIX_RULES
|
||||
"punct": PUNCT_SUFFIX_RULES,
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -3,29 +3,29 @@ from __future__ import unicode_literals
|
|||
|
||||
|
||||
NUMBERS_IRREG = {
|
||||
'achten': ('acht',),
|
||||
'biljoenen': ('biljoen',),
|
||||
'drieën': ('drie',),
|
||||
'duizenden': ('duizend',),
|
||||
'eentjes': ('één',),
|
||||
'elven': ('elf',),
|
||||
'miljoenen': ('miljoen',),
|
||||
'negenen': ('negen',),
|
||||
'negentiger': ('negentig',),
|
||||
'tienduizenden': ('tienduizend',),
|
||||
'tienen': ('tien',),
|
||||
'tientjes': ('tien',),
|
||||
'twaalven': ('twaalf',),
|
||||
'tweeën': ('twee',),
|
||||
'twintiger': ('twintig',),
|
||||
'twintigsten': ('twintig',),
|
||||
'vieren': ('vier',),
|
||||
'vijftiger': ('vijftig',),
|
||||
'vijven': ('vijf',),
|
||||
'zessen': ('zes',),
|
||||
'zestiger': ('zestig',),
|
||||
'zevenen': ('zeven',),
|
||||
'zeventiger': ('zeventig',),
|
||||
'zovele': ('zoveel',),
|
||||
'zovelen': ('zoveel',)
|
||||
"achten": ("acht",),
|
||||
"biljoenen": ("biljoen",),
|
||||
"drieën": ("drie",),
|
||||
"duizenden": ("duizend",),
|
||||
"eentjes": ("één",),
|
||||
"elven": ("elf",),
|
||||
"miljoenen": ("miljoen",),
|
||||
"negenen": ("negen",),
|
||||
"negentiger": ("negentig",),
|
||||
"tienduizenden": ("tienduizend",),
|
||||
"tienen": ("tien",),
|
||||
"tientjes": ("tien",),
|
||||
"twaalven": ("twaalf",),
|
||||
"tweeën": ("twee",),
|
||||
"twintiger": ("twintig",),
|
||||
"twintigsten": ("twintig",),
|
||||
"vieren": ("vier",),
|
||||
"vijftiger": ("vijftig",),
|
||||
"vijven": ("vijf",),
|
||||
"zessen": ("zes",),
|
||||
"zestiger": ("zestig",),
|
||||
"zevenen": ("zeven",),
|
||||
"zeventiger": ("zeventig",),
|
||||
"zovele": ("zoveel",),
|
||||
"zovelen": ("zoveel",),
|
||||
}
|
||||
|
|
|
@ -3,33 +3,33 @@ from __future__ import unicode_literals
|
|||
|
||||
|
||||
PRONOUNS_IRREG = {
|
||||
"'r": ('haar',),
|
||||
"'rzelf": ('haarzelf',),
|
||||
"'t": ('het',),
|
||||
"d'r": ('haar',),
|
||||
'da': ('dat',),
|
||||
'dienen': ('die',),
|
||||
'diens': ('die',),
|
||||
'dies': ('die',),
|
||||
'elkaars': ('elkaar',),
|
||||
'elkanders': ('elkander',),
|
||||
'ene': ('een',),
|
||||
'enen': ('een',),
|
||||
'fik': ('ik',),
|
||||
'gaat': ('gaan',),
|
||||
'gene': ('geen',),
|
||||
'harer': ('haar',),
|
||||
'ieders': ('ieder',),
|
||||
'iemands': ('iemand',),
|
||||
'ikke': ('ik',),
|
||||
'mijnen': ('mijn',),
|
||||
'oe': ('je',),
|
||||
'onzer': ('ons',),
|
||||
'wa': ('wat',),
|
||||
'watte': ('wat',),
|
||||
'wier': ('wie',),
|
||||
'zijns': ('zijn',),
|
||||
'zoietsken': ('zoietske',),
|
||||
'zulks': ('zulk',),
|
||||
'één': ('een',)
|
||||
"'r": ("haar",),
|
||||
"'rzelf": ("haarzelf",),
|
||||
"'t": ("het",),
|
||||
"d'r": ("haar",),
|
||||
"da": ("dat",),
|
||||
"dienen": ("die",),
|
||||
"diens": ("die",),
|
||||
"dies": ("die",),
|
||||
"elkaars": ("elkaar",),
|
||||
"elkanders": ("elkander",),
|
||||
"ene": ("een",),
|
||||
"enen": ("een",),
|
||||
"fik": ("ik",),
|
||||
"gaat": ("gaan",),
|
||||
"gene": ("geen",),
|
||||
"harer": ("haar",),
|
||||
"ieders": ("ieder",),
|
||||
"iemands": ("iemand",),
|
||||
"ikke": ("ik",),
|
||||
"mijnen": ("mijn",),
|
||||
"oe": ("je",),
|
||||
"onzer": ("ons",),
|
||||
"wa": ("wat",),
|
||||
"watte": ("wat",),
|
||||
"wier": ("wie",),
|
||||
"zijns": ("zijn",),
|
||||
"zoietsken": ("zoietske",),
|
||||
"zulks": ("zulk",),
|
||||
"één": ("een",),
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -7,15 +7,33 @@ from ....symbols import POS, NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV
|
|||
class DutchLemmatizer(object):
|
||||
# Note: CGN does not distinguish AUX verbs, so we treat AUX as VERB.
|
||||
univ_pos_name_variants = {
|
||||
NOUN: "noun", "NOUN": "noun", "noun": "noun",
|
||||
VERB: "verb", "VERB": "verb", "verb": "verb",
|
||||
AUX: "verb", "AUX": "verb", "aux": "verb",
|
||||
ADJ: "adj", "ADJ": "adj", "adj": "adj",
|
||||
ADV: "adv", "ADV": "adv", "adv": "adv",
|
||||
PRON: "pron", "PRON": "pron", "pron": "pron",
|
||||
DET: "det", "DET": "det", "det": "det",
|
||||
ADP: "adp", "ADP": "adp", "adp": "adp",
|
||||
NUM: "num", "NUM": "num", "num": "num"
|
||||
NOUN: "noun",
|
||||
"NOUN": "noun",
|
||||
"noun": "noun",
|
||||
VERB: "verb",
|
||||
"VERB": "verb",
|
||||
"verb": "verb",
|
||||
AUX: "verb",
|
||||
"AUX": "verb",
|
||||
"aux": "verb",
|
||||
ADJ: "adj",
|
||||
"ADJ": "adj",
|
||||
"adj": "adj",
|
||||
ADV: "adv",
|
||||
"ADV": "adv",
|
||||
"adv": "adv",
|
||||
PRON: "pron",
|
||||
"PRON": "pron",
|
||||
"pron": "pron",
|
||||
DET: "det",
|
||||
"DET": "det",
|
||||
"det": "det",
|
||||
ADP: "adp",
|
||||
"ADP": "adp",
|
||||
"adp": "adp",
|
||||
NUM: "num",
|
||||
"NUM": "num",
|
||||
"num": "num",
|
||||
}
|
||||
|
||||
@classmethod
|
||||
|
@ -62,10 +80,8 @@ class DutchLemmatizer(object):
|
|||
return [looked_up_lemma]
|
||||
|
||||
forms, is_known = lemmatize(
|
||||
string,
|
||||
lemma_index,
|
||||
exceptions,
|
||||
self.rules.get(univ_pos, []))
|
||||
string, lemma_index, exceptions, self.rules.get(univ_pos, [])
|
||||
)
|
||||
|
||||
# Back-off through remaining return value candidates.
|
||||
if forms:
|
||||
|
@ -92,25 +108,25 @@ class DutchLemmatizer(object):
|
|||
return self.lookup_table.get(string, string)
|
||||
|
||||
def noun(self, string, morphology=None):
|
||||
return self(string, 'noun', morphology)
|
||||
return self(string, "noun", morphology)
|
||||
|
||||
def verb(self, string, morphology=None):
|
||||
return self(string, 'verb', morphology)
|
||||
return self(string, "verb", morphology)
|
||||
|
||||
def adj(self, string, morphology=None):
|
||||
return self(string, 'adj', morphology)
|
||||
return self(string, "adj", morphology)
|
||||
|
||||
def det(self, string, morphology=None):
|
||||
return self(string, 'det', morphology)
|
||||
return self(string, "det", morphology)
|
||||
|
||||
def pron(self, string, morphology=None):
|
||||
return self(string, 'pron', morphology)
|
||||
return self(string, "pron", morphology)
|
||||
|
||||
def adp(self, string, morphology=None):
|
||||
return self(string, 'adp', morphology)
|
||||
return self(string, "adp", morphology)
|
||||
|
||||
def punct(self, string, morphology=None):
|
||||
return self(string, 'punct', morphology)
|
||||
return self(string, "punct", morphology)
|
||||
|
||||
|
||||
# Reimplemented to focus more on application of suffix rules and to return
|
||||
|
@ -120,7 +136,7 @@ def lemmatize(string, index, exceptions, rules):
|
|||
oov_forms = []
|
||||
for old, new in rules:
|
||||
if string.endswith(old):
|
||||
form = string[:len(string) - len(old)] + new
|
||||
form = string[: len(string) - len(old)] + new
|
||||
if not form:
|
||||
pass
|
||||
elif form in index:
|
||||
|
|
|
@ -4,18 +4,22 @@ from __future__ import unicode_literals
|
|||
from ...attrs import LIKE_NUM
|
||||
|
||||
|
||||
_num_words = set("""
|
||||
_num_words = set(
|
||||
"""
|
||||
nul een één twee drie vier vijf zes zeven acht negen tien elf twaalf dertien
|
||||
veertien twintig dertig veertig vijftig zestig zeventig tachtig negentig honderd
|
||||
duizend miljoen miljard biljoen biljard triljoen triljard
|
||||
""".split())
|
||||
""".split()
|
||||
)
|
||||
|
||||
_ordinal_words = set("""
|
||||
_ordinal_words = set(
|
||||
"""
|
||||
eerste tweede derde vierde vijfde zesde zevende achtste negende tiende elfde
|
||||
twaalfde dertiende veertiende twintigste dertigste veertigste vijftigste
|
||||
zestigste zeventigste tachtigste negentigste honderdste duizendste miljoenste
|
||||
miljardste biljoenste biljardste triljoenste triljardste
|
||||
""".split())
|
||||
""".split()
|
||||
)
|
||||
|
||||
|
||||
def like_num(text):
|
||||
|
@ -23,11 +27,11 @@ def like_num(text):
|
|||
# or matches one of the number words. In order to handle numbers like
|
||||
# "drieëntwintig", more work is required.
|
||||
# See this discussion: https://github.com/explosion/spaCy/pull/1177
|
||||
text = text.replace(',', '').replace('.', '')
|
||||
text = text.replace(",", "").replace(".", "")
|
||||
if text.isdigit():
|
||||
return True
|
||||
if text.count('/') == 1:
|
||||
num, denom = text.split('/')
|
||||
if text.count("/") == 1:
|
||||
num, denom = text.split("/")
|
||||
if num.isdigit() and denom.isdigit():
|
||||
return True
|
||||
if text.lower() in _num_words:
|
||||
|
@ -37,6 +41,4 @@ def like_num(text):
|
|||
return False
|
||||
|
||||
|
||||
LEX_ATTRS = {
|
||||
LIKE_NUM: like_num
|
||||
}
|
||||
LEX_ATTRS = {LIKE_NUM: like_num}
|
||||
|
|
|
@ -10,24 +10,32 @@ from ..punctuation import TOKENIZER_SUFFIXES as DEFAULT_TOKENIZER_SUFFIXES
|
|||
# Copied from `de` package. Main purpose is to ensure that hyphens are not
|
||||
# split on.
|
||||
|
||||
_quotes = CONCAT_QUOTES.replace("'", '')
|
||||
_quotes = CONCAT_QUOTES.replace("'", "")
|
||||
|
||||
_infixes = (LIST_ELLIPSES + LIST_ICONS +
|
||||
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
|
||||
r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
|
||||
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
|
||||
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
||||
r'(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])'.format(a=ALPHA, q=_quotes),
|
||||
r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
|
||||
r'(?<=[0-9])-(?=[0-9])'])
|
||||
_infixes = (
|
||||
LIST_ELLIPSES
|
||||
+ LIST_ICONS
|
||||
+ [
|
||||
r"(?<=[{}])\.(?=[{}])".format(ALPHA_LOWER, ALPHA_UPPER),
|
||||
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
|
||||
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
|
||||
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
|
||||
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[0-9])-(?=[0-9])",
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
# Remove "'s" suffix from suffix list. In Dutch, "'s" is a plural ending when
|
||||
# it occurs as a suffix and a clitic for "eens" in standalone use. To avoid
|
||||
# ambiguity it's better to just leave it attached when it occurs as a suffix.
|
||||
default_suffix_blacklist = ("'s", "'S", '’s', '’S')
|
||||
_suffixes = [suffix for suffix in DEFAULT_TOKENIZER_SUFFIXES
|
||||
if suffix not in default_suffix_blacklist]
|
||||
default_suffix_blacklist = ("'s", "'S", "’s", "’S")
|
||||
_suffixes = [
|
||||
suffix
|
||||
for suffix in DEFAULT_TOKENIZER_SUFFIXES
|
||||
if suffix not in default_suffix_blacklist
|
||||
]
|
||||
|
||||
TOKENIZER_INFIXES = _infixes
|
||||
TOKENIZER_SUFFIXES = _suffixes
|
||||
|
|
|
@ -16,7 +16,8 @@ from __future__ import unicode_literals
|
|||
# should have a Dutch counterpart here.
|
||||
|
||||
|
||||
STOP_WORDS = set("""
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaangde aangezien achter achterna
|
||||
afgelopen aldus alhoewel anderzijds
|
||||
|
||||
|
@ -70,4 +71,5 @@ welk welke welken werd werden wiens wier wilde wordt
|
|||
|
||||
zal ze zei zelf zich zij zijn zo zonder zou zeer zeker zekere zelfde zelfs zichzelf zijnde zijne zo’n zoals zodra zouden
|
||||
zoveel zowat zulk zulke zulks zullen zult
|
||||
""".split())
|
||||
""".split()
|
||||
)
|
||||
|
|
|
@ -47,8 +47,12 @@ TAG_MAP = {
|
|||
"Adj_Prep|adv|vergr|onverv_voor__Degree=Cmp|Variant=Short": {POS: ADJ},
|
||||
"Adj_V_Conj_V__Degree=Pos|VerbForm=Inf": {POS: ADJ},
|
||||
"Adj_V_N__Degree=Pos|Number=Sing|Tense=Past|VerbForm=Part": {POS: ADJ},
|
||||
"Adj_V|adv|stell|onverv_intrans|inf__Degree=Pos|Variant=Short|VerbForm=Inf": {POS: ADJ},
|
||||
"Adj_V|adv|stell|onverv_trans|imp__Degree=Pos|Mood=Imp|Variant=Short|VerbForm=Fin": {POS: ADJ},
|
||||
"Adj_V|adv|stell|onverv_intrans|inf__Degree=Pos|Variant=Short|VerbForm=Inf": {
|
||||
POS: ADJ
|
||||
},
|
||||
"Adj_V|adv|stell|onverv_trans|imp__Degree=Pos|Mood=Imp|Variant=Short|VerbForm=Fin": {
|
||||
POS: ADJ
|
||||
},
|
||||
"Adj|adv|stell|onverv__Degree=Pos|Variant=Short": {POS: ADJ},
|
||||
"Adj|adv|stell|vervneut__Case=Nom|Degree=Pos|Variant=Short": {POS: ADJ},
|
||||
"Adj|adv|vergr|onverv__Degree=Cmp|Variant=Short": {POS: ADJ},
|
||||
|
@ -133,15 +137,21 @@ TAG_MAP = {
|
|||
"Art_Num__Definite=Def|Degree=Sup|Gender=Neut|PronType=Ind": {POS: DET},
|
||||
"Art_Num__Definite=Def|Gender=Neut": {POS: DET},
|
||||
"Art_Num__Degree=Pos|Number=Sing|PronType=Ind": {POS: DET},
|
||||
"Art_N|bep|onzijd|neut_eigen|ev|neut__Definite=Def|Gender=Neut|Number=Sing": {POS: DET},
|
||||
"Art_N|bep|onzijd|neut_soort|ev|neut__Definite=Def|Gender=Neut|Number=Sing": {POS: DET},
|
||||
"Art_N|bep|onzijd|neut_eigen|ev|neut__Definite=Def|Gender=Neut|Number=Sing": {
|
||||
POS: DET
|
||||
},
|
||||
"Art_N|bep|onzijd|neut_soort|ev|neut__Definite=Def|Gender=Neut|Number=Sing": {
|
||||
POS: DET
|
||||
},
|
||||
"Art_Pron_N__Case=Gen|Number=Plur|PronType=Ind": {POS: DET},
|
||||
"Art_Pron__Number=Sing|PronType=Ind": {POS: DET},
|
||||
"Art_V_N__AdpType=Prep": {POS: DET},
|
||||
"Art|bep|onzijd|neut__Definite=Def|Gender=Neut|PronType=Art": {POS: DET},
|
||||
"Art|bep|zijdofmv|gen__Case=Gen|Definite=Def|PronType=Art": {POS: DET},
|
||||
"Art|bep|zijdofmv|neut__Definite=Def|PronType=Art": {POS: DET},
|
||||
"Art|bep|zijdofonzijd|gen__Case=Gen|Definite=Def|Number=Sing|PronType=Art": {POS: DET},
|
||||
"Art|bep|zijdofonzijd|gen__Case=Gen|Definite=Def|Number=Sing|PronType=Art": {
|
||||
POS: DET
|
||||
},
|
||||
"Art|bep|zijd|dat__Case=Dat|Definite=Def|Gender=Com|PronType=Art": {POS: DET},
|
||||
"Art|onbep|zijdofonzijd|neut__Definite=Ind|Number=Sing|PronType=Art": {POS: DET},
|
||||
"CCONJ___": {POS: CONJ},
|
||||
|
@ -159,17 +169,23 @@ TAG_MAP = {
|
|||
"Conj_Int|onder|metfin___": {POS: CONJ},
|
||||
"Conj_N_Adv__AdpType=Preppron|Gender=Masc|Number=Plur": {POS: CONJ},
|
||||
"Conj_N_Prep__AdpType=Preppron|Gender=Masc|Number=Plur": {POS: CONJ},
|
||||
"Conj_N|onder|metfin_soort|ev|neut__AdpType=Preppron|Gender=Masc|Number=Plur": {POS: CONJ},
|
||||
"Conj_N|onder|metfin_soort|ev|neut__AdpType=Preppron|Gender=Masc|Number=Plur": {
|
||||
POS: CONJ
|
||||
},
|
||||
"Conj_Pron_Adv__Degree=Pos|Number=Sing|Person=3": {POS: CONJ},
|
||||
"Conj_Pron_V__AdpType=Preppron|Gender=Masc|Number=Plur": {POS: CONJ},
|
||||
"Conj_Pron|neven_aanw|neut|zelfst__AdpType=Prep": {POS: CONJ},
|
||||
"Conj_Punc_Conj|neven_schuinstreep_neven__AdpType=Prep": {POS: CONJ},
|
||||
"Conj_V|onder|metfin_intrans|ott|3|ev__AdpType=Preppron|Gender=Masc|Number=Plur": {POS: CONJ},
|
||||
"Conj_V|onder|metfin_intrans|ott|3|ev__AdpType=Preppron|Gender=Masc|Number=Plur": {
|
||||
POS: CONJ
|
||||
},
|
||||
"Conj|neven___": {POS: CONJ},
|
||||
"Conj|onder|metfin___": {POS: CONJ},
|
||||
"Conj|onder|metinf___": {POS: CONJ},
|
||||
"DET__Degree=Cmp|NumType=Card|PronType=Ind": {POS: DET},
|
||||
"DET__Gender=Fem|Number=Sing|Number[psor]=Plur|Person=1|Poss=Yes|PronType=Prs": {POS: DET},
|
||||
"DET__Gender=Fem|Number=Sing|Number[psor]=Plur|Person=1|Poss=Yes|PronType=Prs": {
|
||||
POS: DET
|
||||
},
|
||||
"DET__Gender=Fem|Number=Sing|PronType=Art": {POS: DET},
|
||||
"DET__Gender=Masc|Number=Plur|PronType=Art": {POS: DET},
|
||||
"DET__Gender=Masc|Number=Sing|PronType=Tot": {POS: DET},
|
||||
|
@ -185,7 +201,9 @@ TAG_MAP = {
|
|||
"Misc_Misc_Misc_Misc_Misc_Misc_Punc_Misc_Misc_Misc___": {POS: X},
|
||||
"Misc_Misc_Misc_Misc_Misc_Misc___": {POS: X},
|
||||
"Misc_Misc_Misc_Misc_Misc_N_Misc_Misc_Misc_Misc_Misc_Misc___": {POS: X},
|
||||
"Misc_Misc_Misc_Misc|vreemd_vreemd_vreemd_vreemd__AdpType=Preppron|Gender=Masc|Number=Sing": {POS: X},
|
||||
"Misc_Misc_Misc_Misc|vreemd_vreemd_vreemd_vreemd__AdpType=Preppron|Gender=Masc|Number=Sing": {
|
||||
POS: X
|
||||
},
|
||||
"Misc_Misc_Misc_Misc|vreemd_vreemd_vreemd_vreemd___": {POS: X},
|
||||
"Misc_Misc_Misc_N__Number=Sing": {POS: X},
|
||||
"Misc_Misc_Misc|vreemd_vreemd_vreemd___": {POS: X},
|
||||
|
@ -217,7 +235,9 @@ TAG_MAP = {
|
|||
"N_Adj__Degree=Pos|Number=Plur": {POS: NOUN},
|
||||
"N_Adj__Degree=Pos|Number=Sing": {POS: NOUN},
|
||||
"N_Adj___": {POS: NOUN},
|
||||
"N_Adv_Punc_V_Pron_V__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Inf": {POS: NOUN},
|
||||
"N_Adv_Punc_V_Pron_V__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Inf": {
|
||||
POS: NOUN
|
||||
},
|
||||
"N_Adv__Degree=Pos|Number=Sing": {POS: NOUN},
|
||||
"N_Adv___": {POS: NOUN},
|
||||
"N_Adv|soort|ev|neut_deelv__Number=Sing": {POS: NOUN},
|
||||
|
@ -320,12 +340,20 @@ TAG_MAP = {
|
|||
"N_N|eigen|ev|gen_soort|mv|neut___": {POS: NOUN},
|
||||
"N_N|eigen|ev|neut_eigen|ev|gen___": {POS: NOUN},
|
||||
"N_N|eigen|ev|neut_eigen|ev|neut__AdpType=Prep": {POS: NOUN},
|
||||
"N_N|eigen|ev|neut_eigen|ev|neut__AdpType=Preppron|Gender=Fem|Number=Plur": {POS: NOUN},
|
||||
"N_N|eigen|ev|neut_eigen|ev|neut__AdpType=Preppron|Gender=Masc|Number=Sing": {POS: NOUN},
|
||||
"N_N|eigen|ev|neut_eigen|ev|neut__AdpType=Preppron|Gender=Fem|Number=Plur": {
|
||||
POS: NOUN
|
||||
},
|
||||
"N_N|eigen|ev|neut_eigen|ev|neut__AdpType=Preppron|Gender=Masc|Number=Sing": {
|
||||
POS: NOUN
|
||||
},
|
||||
"N_N|eigen|ev|neut_eigen|ev|neut__Gender=Fem|Number=Plur|PronType=Art": {POS: NOUN},
|
||||
"N_N|eigen|ev|neut_eigen|ev|neut__Gender=Fem|Number=Sing|PronType=Art": {POS: NOUN},
|
||||
"N_N|eigen|ev|neut_eigen|ev|neut__Gender=Masc|Number=Plur|PronType=Art": {POS: NOUN},
|
||||
"N_N|eigen|ev|neut_eigen|ev|neut__Gender=Masc|Number=Sing|PronType=Art": {POS: NOUN},
|
||||
"N_N|eigen|ev|neut_eigen|ev|neut__Gender=Masc|Number=Plur|PronType=Art": {
|
||||
POS: NOUN
|
||||
},
|
||||
"N_N|eigen|ev|neut_eigen|ev|neut__Gender=Masc|Number=Sing|PronType=Art": {
|
||||
POS: NOUN
|
||||
},
|
||||
"N_N|eigen|ev|neut_eigen|ev|neut__NumType=Card": {POS: NOUN},
|
||||
"N_N|eigen|ev|neut_eigen|ev|neut__Number=Sing": {POS: NOUN},
|
||||
"N_N|eigen|ev|neut_eigen|ev|neut___": {POS: NOUN},
|
||||
|
@ -335,7 +363,9 @@ TAG_MAP = {
|
|||
"N_N|eigen|ev|neut_soort|mv|neut___": {POS: NOUN},
|
||||
"N_N|eigen|mv|neut_eigen|mv|neut___": {POS: NOUN},
|
||||
"N_N|soort|ev|neut_eigen|ev|neut__Number=Sing": {POS: NOUN},
|
||||
"N_N|soort|ev|neut_soort|ev|neut__Gender=Masc|Number=Plur|PronType=Art": {POS: NOUN},
|
||||
"N_N|soort|ev|neut_soort|ev|neut__Gender=Masc|Number=Plur|PronType=Art": {
|
||||
POS: NOUN
|
||||
},
|
||||
"N_N|soort|ev|neut_soort|ev|neut__NumForm=Digit|NumType=Card": {POS: NOUN},
|
||||
"N_N|soort|ev|neut_soort|ev|neut__Number=Sing": {POS: NOUN},
|
||||
"N_N|soort|ev|neut_soort|mv|neut__Number=Plur": {POS: NOUN},
|
||||
|
@ -365,7 +395,9 @@ TAG_MAP = {
|
|||
"N_Pron___": {POS: NOUN},
|
||||
"N_Punc_Adj_N___": {POS: NOUN},
|
||||
"N_Punc_Adj_Pron_Punc__Degree=Pos|Number=Sing|Person=2": {POS: NOUN},
|
||||
"N_Punc_Adv_V_Pron_N__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {POS: NOUN},
|
||||
"N_Punc_Adv_V_Pron_N__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {
|
||||
POS: NOUN
|
||||
},
|
||||
"N_Punc_Misc_Punc_N___": {POS: NOUN},
|
||||
"N_Punc_N_N_N_N__Number=Sing": {POS: NOUN},
|
||||
"N_Punc_N_Punc_N__Number=Sing": {POS: NOUN},
|
||||
|
@ -415,8 +447,12 @@ TAG_MAP = {
|
|||
"Num|hoofd|bep|attr|onverv__Definite=Def|NumType=Card": {POS: NUM},
|
||||
"Num|hoofd|bep|zelfst|onverv__Definite=Def|NumType=Card": {POS: NUM},
|
||||
"Num|hoofd|bep|zelfst|vervmv__Definite=Def|Number=Plur|NumType=Card": {POS: NUM},
|
||||
"Num|hoofd|onbep|attr|stell|onverv__Degree=Pos|NumType=Card|PronType=Ind": {POS: NUM},
|
||||
"Num|hoofd|onbep|attr|vergr|onverv__Degree=Cmp|NumType=Card|PronType=Ind": {POS: NUM},
|
||||
"Num|hoofd|onbep|attr|stell|onverv__Degree=Pos|NumType=Card|PronType=Ind": {
|
||||
POS: NUM
|
||||
},
|
||||
"Num|hoofd|onbep|attr|vergr|onverv__Degree=Cmp|NumType=Card|PronType=Ind": {
|
||||
POS: NUM
|
||||
},
|
||||
"Num|rang|bep|attr|onverv__Definite=Def|NumType=Ord": {POS: NUM},
|
||||
"Num|rang|bep|zelfst|onverv__Definite=Def|NumType=Ord": {POS: NUM},
|
||||
"N|eigen|ev|gen__Case=Gen|Number=Sing": {POS: NOUN},
|
||||
|
@ -469,7 +505,9 @@ TAG_MAP = {
|
|||
"Prep_N_Adv|voor_soort|ev|neut_pron|aanw__AdpType=Prep": {POS: ADP},
|
||||
"Prep_N_Adv|voor_soort|ev|neut_pron|aanw__Number=Sing|PronType=Dem": {POS: ADP},
|
||||
"Prep_N_Adv|voor_soort|ev|neut_pron|vrag__Number=Sing|PronType=Int": {POS: ADP},
|
||||
"Prep_N_Adv|voor_soort|mv|neut_deelv__Gender=Masc|Number=Sing|PronType=Tot": {POS: ADP},
|
||||
"Prep_N_Adv|voor_soort|mv|neut_deelv__Gender=Masc|Number=Sing|PronType=Tot": {
|
||||
POS: ADP
|
||||
},
|
||||
"Prep_N_Conj_N__Number=Sing": {POS: ADP},
|
||||
"Prep_N_Conj__AdpType=Prep": {POS: ADP},
|
||||
"Prep_N_Prep_N__Number=Sing": {POS: ADP},
|
||||
|
@ -489,7 +527,9 @@ TAG_MAP = {
|
|||
"Prep_N|voor_soort|ev|neut__Number=Sing": {POS: ADP},
|
||||
"Prep_N|voor_soort|mv|neut__AdpType=Prep": {POS: ADP},
|
||||
"Prep_N|voor_soort|mv|neut__Number=Plur": {POS: ADP},
|
||||
"Prep_Prep_Adj|voor_voor_adv|stell|onverv__Gender=Masc|Number=Sing|PronType=Tot": {POS: ADP},
|
||||
"Prep_Prep_Adj|voor_voor_adv|stell|onverv__Gender=Masc|Number=Sing|PronType=Tot": {
|
||||
POS: ADP
|
||||
},
|
||||
"Prep_Prep_Adv__Degree=Pos": {POS: ADP},
|
||||
"Prep_Pron_Adj__Degree=Cmp|Number=Sing|Person=3": {POS: ADP},
|
||||
"Prep_Pron_N_Adv__Number=Plur": {POS: ADP},
|
||||
|
@ -503,7 +543,9 @@ TAG_MAP = {
|
|||
"Prep_Pron|voor_ref|3|evofmv__Number=Plur,Sing|Person=3": {POS: ADP},
|
||||
"Prep_Punc_N_Conj_N__AdpType=Prep": {POS: ADP},
|
||||
"Prep_V_N__Number=Sing|Tense=Pres|VerbForm=Part": {POS: ADP},
|
||||
"Prep_V_Pron_Pron_Adv__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|PronType=Dem|Tense=Pres|VerbForm=Fin": {POS: ADP},
|
||||
"Prep_V_Pron_Pron_Adv__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|PronType=Dem|Tense=Pres|VerbForm=Fin": {
|
||||
POS: ADP
|
||||
},
|
||||
"Prep_V|voor_intrans|inf__VerbForm=Inf": {POS: ADP},
|
||||
"Prep_V|voorinf_trans|inf__VerbForm=Inf": {POS: ADP},
|
||||
"Prep|achter__AdpType=Post": {POS: ADP},
|
||||
|
@ -511,17 +553,25 @@ TAG_MAP = {
|
|||
"Prep|voor__AdpType=Prep": {POS: ADP},
|
||||
"Prep|voorinf__AdpType=Prep|PartType=Inf": {POS: ADP},
|
||||
"Pron_Adj_N_Punc_Art_Adj_N_Prep_Art_Adj_N__NumType=Card": {POS: PRON},
|
||||
"Pron_Adj__Case=Nom|Degree=Sup|Number=Sing|Person=2|Poss=Yes|PronType=Prs": {POS: PRON},
|
||||
"Pron_Adj__Case=Nom|Degree=Sup|Number=Sing|Person=2|Poss=Yes|PronType=Prs": {
|
||||
POS: PRON
|
||||
},
|
||||
"Pron_Adj__Degree=Cmp|PronType=Ind": {POS: PRON},
|
||||
"Pron_Adv|vrag|neut|attr_deelv__PronType=Int": {POS: PRON},
|
||||
"Pron_Art_N_N__Number=Plur|PronType=Ind": {POS: PRON},
|
||||
"Pron_Art__Number=Sing|PronType=Int": {POS: PRON},
|
||||
"Pron_N_Adv__Number=Sing|PronType=Ind": {POS: PRON},
|
||||
"Pron_N_V_Adv_Num_Punc__Aspect=Imp|Definite=Def|Mood=Ind|Number=Sing|Person=3|PronType=Ind|Tense=Pres|VerbForm=Fin": {POS: PRON},
|
||||
"Pron_N_V_Conj_N__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|PronType=Ind|Tense=Pres|VerbForm=Fin": {POS: PRON},
|
||||
"Pron_N_V_Adv_Num_Punc__Aspect=Imp|Definite=Def|Mood=Ind|Number=Sing|Person=3|PronType=Ind|Tense=Pres|VerbForm=Fin": {
|
||||
POS: PRON
|
||||
},
|
||||
"Pron_N_V_Conj_N__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|PronType=Ind|Tense=Pres|VerbForm=Fin": {
|
||||
POS: PRON
|
||||
},
|
||||
"Pron_N__Case=Gen|Number=Sing|PronType=Ind": {POS: PRON},
|
||||
"Pron_N__Number=Sing|PronType=Ind": {POS: PRON},
|
||||
"Pron_N|aanw|gen|attr_soort|mv|neut__Case=Gen|Number=Plur|PronType=Dem": {POS: PRON},
|
||||
"Pron_N|aanw|gen|attr_soort|mv|neut__Case=Gen|Number=Plur|PronType=Dem": {
|
||||
POS: PRON
|
||||
},
|
||||
"Pron_N|onbep|neut|attr_soort|ev|neut__Number=Sing|PronType=Ind": {POS: PRON},
|
||||
"Pron_Prep_Art__Number=Sing|PronType=Int": {POS: PRON},
|
||||
"Pron_Prep_Art__Number=Sing|PronType=Rel": {POS: PRON},
|
||||
|
@ -529,10 +579,16 @@ TAG_MAP = {
|
|||
"Pron_Prep|betr|neut|zelfst_voor__PronType=Rel": {POS: PRON},
|
||||
"Pron_Prep|onbep|neut|zelfst_voor__PronType=Ind": {POS: PRON},
|
||||
"Pron_Prep|vrag|neut|attr_voor__PronType=Int": {POS: PRON},
|
||||
"Pron_Pron_V__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|PronType=Rel|Tense=Pres|VerbForm=Fin": {POS: PRON},
|
||||
"Pron_Pron_V__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|PronType=Rel|Tense=Pres|VerbForm=Fin": {
|
||||
POS: PRON
|
||||
},
|
||||
"Pron_Pron__Person=3|PronType=Prs|Reflex=Yes": {POS: PRON},
|
||||
"Pron_V_V__Aspect=Imp|Mood=Ind|Person=3|PronType=Dem|Tense=Pres|VerbForm=Inf": {POS: PRON},
|
||||
"Pron_V__Case=Gen|Number=Sing|Person=3|Poss=Yes|PronType=Prs|VerbForm=Inf": {POS: PRON},
|
||||
"Pron_V_V__Aspect=Imp|Mood=Ind|Person=3|PronType=Dem|Tense=Pres|VerbForm=Inf": {
|
||||
POS: PRON
|
||||
},
|
||||
"Pron_V__Case=Gen|Number=Sing|Person=3|Poss=Yes|PronType=Prs|VerbForm=Inf": {
|
||||
POS: PRON
|
||||
},
|
||||
"Pron_V__Number=Plur|Person=1|Poss=Yes|PronType=Prs|VerbForm=Inf": {POS: PRON},
|
||||
"Pron|aanw|dat|attr__Case=Dat|PronType=Dem": {POS: PRON},
|
||||
"Pron|aanw|gen|attr__Case=Gen|PronType=Dem": {POS: PRON},
|
||||
|
@ -547,27 +603,47 @@ TAG_MAP = {
|
|||
"Pron|bez|1|mv|neut|attr__Number=Plur|Person=1|Poss=Yes|PronType=Prs": {POS: PRON},
|
||||
"Pron|bez|2|ev|neut|attr__Number=Sing|Person=2|Poss=Yes|PronType=Prs": {POS: PRON},
|
||||
"Pron|bez|2|mv|neut|attr__Number=Plur|Person=2|Poss=Yes|PronType=Prs": {POS: PRON},
|
||||
"Pron|bez|3|ev|gen|attr__Case=Gen|Number=Sing|Person=3|Poss=Yes|PronType=Prs": {POS: PRON},
|
||||
"Pron|bez|3|ev|gen|attr__Case=Gen|Number=Sing|Person=3|Poss=Yes|PronType=Prs": {
|
||||
POS: PRON
|
||||
},
|
||||
"Pron|bez|3|ev|neut|attr__Number=Sing|Person=3|Poss=Yes|PronType=Prs": {POS: PRON},
|
||||
"Pron|bez|3|ev|neut|zelfst__Number=Sing|Person=3|Poss=Yes|PronType=Prs": {POS: PRON},
|
||||
"Pron|bez|3|ev|neut|zelfst__Number=Sing|Person=3|Poss=Yes|PronType=Prs": {
|
||||
POS: PRON
|
||||
},
|
||||
"Pron|bez|3|mv|neut|attr__Number=Plur|Person=3|Poss=Yes|PronType=Prs": {POS: PRON},
|
||||
"Pron|onbep|gen|attr__Case=Gen|PronType=Ind": {POS: PRON},
|
||||
"Pron|onbep|gen|zelfst__Case=Gen|PronType=Ind": {POS: PRON},
|
||||
"Pron|onbep|neut|attr__PronType=Ind": {POS: PRON},
|
||||
"Pron|onbep|neut|zelfst__PronType=Ind": {POS: PRON},
|
||||
"Pron|per|1|ev|datofacc__Case=Acc,Dat|Number=Sing|Person=1|PronType=Prs": {POS: PRON},
|
||||
"Pron|per|1|ev|datofacc__Case=Acc,Dat|Number=Sing|Person=1|PronType=Prs": {
|
||||
POS: PRON
|
||||
},
|
||||
"Pron|per|1|ev|nom__Case=Nom|Number=Sing|Person=1|PronType=Prs": {POS: PRON},
|
||||
"Pron|per|1|mv|datofacc__Case=Acc,Dat|Number=Plur|Person=1|PronType=Prs": {POS: PRON},
|
||||
"Pron|per|1|mv|datofacc__Case=Acc,Dat|Number=Plur|Person=1|PronType=Prs": {
|
||||
POS: PRON
|
||||
},
|
||||
"Pron|per|1|mv|nom__Case=Nom|Number=Plur|Person=1|PronType=Prs": {POS: PRON},
|
||||
"Pron|per|2|ev|datofacc__Case=Acc,Dat|Number=Sing|Person=2|PronType=Prs": {POS: PRON},
|
||||
"Pron|per|2|ev|datofacc__Case=Acc,Dat|Number=Sing|Person=2|PronType=Prs": {
|
||||
POS: PRON
|
||||
},
|
||||
"Pron|per|2|ev|nom__Case=Nom|Number=Sing|Person=2|PronType=Prs": {POS: PRON},
|
||||
"Pron|per|2|mv|datofacc__Case=Acc,Dat|Number=Plur|Person=2|PronType=Prs": {POS: PRON},
|
||||
"Pron|per|2|mv|datofacc__Case=Acc,Dat|Number=Plur|Person=2|PronType=Prs": {
|
||||
POS: PRON
|
||||
},
|
||||
"Pron|per|2|mv|nom__Case=Nom|Number=Plur|Person=2|PronType=Prs": {POS: PRON},
|
||||
"Pron|per|3|evofmv|datofacc__Case=Acc,Dat|Number=Plur,Sing|Person=3|PronType=Prs": {POS: PRON},
|
||||
"Pron|per|3|evofmv|nom__Case=Nom|Number=Plur,Sing|Person=3|PronType=Prs": {POS: PRON},
|
||||
"Pron|per|3|ev|datofacc__Case=Acc,Dat|Number=Sing|Person=3|PronType=Prs": {POS: PRON},
|
||||
"Pron|per|3|evofmv|datofacc__Case=Acc,Dat|Number=Plur,Sing|Person=3|PronType=Prs": {
|
||||
POS: PRON
|
||||
},
|
||||
"Pron|per|3|evofmv|nom__Case=Nom|Number=Plur,Sing|Person=3|PronType=Prs": {
|
||||
POS: PRON
|
||||
},
|
||||
"Pron|per|3|ev|datofacc__Case=Acc,Dat|Number=Sing|Person=3|PronType=Prs": {
|
||||
POS: PRON
|
||||
},
|
||||
"Pron|per|3|ev|nom__Case=Nom|Number=Sing|Person=3|PronType=Prs": {POS: PRON},
|
||||
"Pron|per|3|mv|datofacc__Case=Acc,Dat|Number=Plur|Person=3|PronType=Prs": {POS: PRON},
|
||||
"Pron|per|3|mv|datofacc__Case=Acc,Dat|Number=Plur|Person=3|PronType=Prs": {
|
||||
POS: PRON
|
||||
},
|
||||
"Pron|rec|gen__Case=Gen|PronType=Rcp": {POS: PRON},
|
||||
"Pron|rec|neut__PronType=Rcp": {POS: PRON},
|
||||
"Pron|ref|1|ev__Number=Sing|Person=1|PronType=Prs|Reflex=Yes": {POS: PRON},
|
||||
|
@ -597,20 +673,34 @@ TAG_MAP = {
|
|||
"Punc|vraag__PunctType=Qest": {POS: PUNCT},
|
||||
"V_Adv_Art_N_Prep_Pron_N__Degree=Pos|Number=Plur|Person=2|Subcat=Tran": {POS: VERB},
|
||||
"V_Adv__Degree=Pos|Subcat=Tran": {POS: VERB},
|
||||
"V_Art_N_Num_N__Aspect=Imp|Definite=Def|Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|VerbType=Mod": {POS: VERB},
|
||||
"V_Art_N_Num_N__Aspect=Imp|Definite=Def|Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|VerbType=Mod": {
|
||||
POS: VERB
|
||||
},
|
||||
"V_Art_N__Number=Sing|Subcat=Tran": {POS: VERB},
|
||||
"V_Conj_N_N__Number=Sing|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
|
||||
"V_Conj_Pron__Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
|
||||
"V_N_Conj_Adj_N_Prep_Art_N__Degree=Pos|Number=Sing|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
|
||||
"V_N_Conj_Adj_N_Prep_Art_N__Degree=Pos|Number=Sing|Subcat=Tran|Tense=Past|VerbForm=Part": {
|
||||
POS: VERB
|
||||
},
|
||||
"V_N_N__Number=Sing|Subcat=Intr|Tense=Pres|VerbForm=Part": {POS: VERB},
|
||||
"V_N_N__Number=Sing|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
|
||||
"V_N_V__Aspect=Imp|Mood=Ind|Number=Sing|Subcat=Intr|Tense=Pres|VerbForm=Inf": {POS: VERB},
|
||||
"V_N_V__Aspect=Imp|Mood=Ind|Number=Sing|Subcat=Intr|Tense=Pres|VerbForm=Inf": {
|
||||
POS: VERB
|
||||
},
|
||||
"V_N__Number=Plur|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
|
||||
"V_N|trans|imp_eigen|ev|neut__Number=Sing|Subcat=Tran": {POS: VERB},
|
||||
"V_Prep|intrans|verldw|onverv_voor__Subcat=Intr|Tense=Past|VerbForm=Part": {POS: VERB},
|
||||
"V_Pron_Adv_Adv_Pron_V__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
||||
"V_Pron_Adv__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
||||
"V_Pron_V__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
||||
"V_Prep|intrans|verldw|onverv_voor__Subcat=Intr|Tense=Past|VerbForm=Part": {
|
||||
POS: VERB
|
||||
},
|
||||
"V_Pron_Adv_Adv_Pron_V__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V_Pron_Adv__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V_Pron_V__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V_Pron__VerbType=Aux,Cop": {POS: VERB},
|
||||
"V_V|hulp|imp_intrans|inf__VerbForm=Inf|VerbType=Mod": {POS: VERB},
|
||||
"V|hulpofkopp|conj__Mood=Sub|VerbForm=Fin": {POS: VERB},
|
||||
|
@ -620,94 +710,220 @@ TAG_MAP = {
|
|||
"V|hulpofkopp|inf__VerbForm=Inf": {POS: VERB},
|
||||
"V|hulpofkopp|inf__VerbForm=Inf|VerbType=Aux,Cop": {POS: VERB},
|
||||
"V|hulpofkopp|inf|subst__VerbForm=Inf": {POS: VERB},
|
||||
"V|hulpofkopp|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
||||
"V|hulpofkopp|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin|VerbType=Aux,Cop": {POS: VERB},
|
||||
"V|hulpofkopp|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
||||
"V|hulpofkopp|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin|VerbType=Aux,Cop": {POS: VERB},
|
||||
"V|hulpofkopp|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
||||
"V|hulpofkopp|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin|VerbType=Aux,Cop": {POS: VERB},
|
||||
"V|hulpofkopp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
||||
"V|hulpofkopp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|VerbType=Aux,Cop": {POS: VERB},
|
||||
"V|hulpofkopp|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin": {POS: VERB},
|
||||
"V|hulpofkopp|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|VerbType=Aux,Cop": {POS: VERB},
|
||||
"V|hulpofkopp|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin": {POS: VERB},
|
||||
"V|hulpofkopp|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin|VerbType=Aux,Cop": {POS: VERB},
|
||||
"V|hulpofkopp|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|hulpofkopp|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin|VerbType=Aux,Cop": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|hulpofkopp|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|hulpofkopp|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin|VerbType=Aux,Cop": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|hulpofkopp|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|hulpofkopp|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin|VerbType=Aux,Cop": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|hulpofkopp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|hulpofkopp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|VerbType=Aux,Cop": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|hulpofkopp|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|hulpofkopp|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|VerbType=Aux,Cop": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|hulpofkopp|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|hulpofkopp|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin|VerbType=Aux,Cop": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|hulpofkopp|tegdw|vervneut__Case=Nom|Tense=Pres|VerbForm=Part": {POS: VERB},
|
||||
"V|hulpofkopp|tegdw|vervneut__Case=Nom|Tense=Pres|VerbForm=Part|VerbType=Aux,Cop": {POS: VERB},
|
||||
"V|hulpofkopp|tegdw|vervneut__Case=Nom|Tense=Pres|VerbForm=Part|VerbType=Aux,Cop": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|hulpofkopp|verldw|onverv__Tense=Past|VerbForm=Part": {POS: VERB},
|
||||
"V|hulpofkopp|verldw|onverv__Tense=Past|VerbForm=Part|VerbType=Aux,Cop": {POS: VERB},
|
||||
"V|hulpofkopp|verldw|onverv__Tense=Past|VerbForm=Part|VerbType=Aux,Cop": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|hulp|conj__Mood=Sub|VerbForm=Fin|VerbType=Mod": {POS: VERB},
|
||||
"V|hulp|inf__VerbForm=Inf": {POS: VERB},
|
||||
"V|hulp|inf__VerbForm=Inf|VerbType=Mod": {POS: VERB},
|
||||
"V|hulp|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
||||
"V|hulp|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin|VerbType=Mod": {POS: VERB},
|
||||
"V|hulp|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
||||
"V|hulp|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin|VerbType=Mod": {POS: VERB},
|
||||
"V|hulp|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
||||
"V|hulp|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin|VerbType=Mod": {POS: VERB},
|
||||
"V|hulp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
||||
"V|hulp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|VerbType=Mod": {POS: VERB},
|
||||
"V|hulp|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin": {POS: VERB},
|
||||
"V|hulp|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|VerbType=Mod": {POS: VERB},
|
||||
"V|hulp|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin": {POS: VERB},
|
||||
"V|hulp|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin|VerbType=Mod": {POS: VERB},
|
||||
"V|hulp|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|hulp|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin|VerbType=Mod": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|hulp|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|hulp|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin|VerbType=Mod": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|hulp|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|hulp|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin|VerbType=Mod": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|hulp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|hulp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|VerbType=Mod": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|hulp|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|hulp|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|VerbType=Mod": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|hulp|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|hulp|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin|VerbType=Mod": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|hulp|verldw|onverv__Tense=Past|VerbForm=Part": {POS: VERB},
|
||||
"V|hulp|verldw|onverv__Tense=Past|VerbForm=Part|VerbType=Mod": {POS: VERB},
|
||||
"V|intrans|conj__Mood=Sub|Subcat=Intr|VerbForm=Fin": {POS: VERB},
|
||||
"V|intrans|imp__Mood=Imp|Subcat=Intr|VerbForm=Fin": {POS: VERB},
|
||||
"V|intrans|inf__Subcat=Intr|VerbForm=Inf": {POS: VERB},
|
||||
"V|intrans|inf|subst__Subcat=Intr|VerbForm=Inf": {POS: VERB},
|
||||
"V|intrans|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Subcat=Intr|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
||||
"V|intrans|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Subcat=Intr|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
||||
"V|intrans|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Subcat=Intr|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
||||
"V|intrans|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Intr|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
||||
"V|intrans|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Subcat=Intr|Tense=Past|VerbForm=Fin": {POS: VERB},
|
||||
"V|intrans|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Subcat=Intr|Tense=Past|VerbForm=Fin": {POS: VERB},
|
||||
"V|intrans|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Subcat=Intr|Tense=Pres|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|intrans|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Subcat=Intr|Tense=Pres|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|intrans|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Subcat=Intr|Tense=Pres|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|intrans|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Intr|Tense=Pres|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|intrans|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Subcat=Intr|Tense=Past|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|intrans|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Subcat=Intr|Tense=Past|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|intrans|tegdw|onverv__Subcat=Intr|Tense=Pres|VerbForm=Part": {POS: VERB},
|
||||
"V|intrans|tegdw|vervmv__Number=Plur|Subcat=Intr|Tense=Pres|VerbForm=Part": {POS: VERB},
|
||||
"V|intrans|tegdw|vervneut__Case=Nom|Subcat=Intr|Tense=Pres|VerbForm=Part": {POS: VERB},
|
||||
"V|intrans|tegdw|vervvergr__Degree=Cmp|Subcat=Intr|Tense=Pres|VerbForm=Part": {POS: VERB},
|
||||
"V|intrans|tegdw|vervmv__Number=Plur|Subcat=Intr|Tense=Pres|VerbForm=Part": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|intrans|tegdw|vervneut__Case=Nom|Subcat=Intr|Tense=Pres|VerbForm=Part": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|intrans|tegdw|vervvergr__Degree=Cmp|Subcat=Intr|Tense=Pres|VerbForm=Part": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|intrans|verldw|onverv__Subcat=Intr|Tense=Past|VerbForm=Part": {POS: VERB},
|
||||
"V|intrans|verldw|vervmv__Number=Plur|Subcat=Intr|Tense=Past|VerbForm=Part": {POS: VERB},
|
||||
"V|intrans|verldw|vervneut__Case=Nom|Subcat=Intr|Tense=Past|VerbForm=Part": {POS: VERB},
|
||||
"V|intrans|verldw|vervmv__Number=Plur|Subcat=Intr|Tense=Past|VerbForm=Part": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|intrans|verldw|vervneut__Case=Nom|Subcat=Intr|Tense=Past|VerbForm=Part": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|refl|imp__Mood=Imp|Reflex=Yes|VerbForm=Fin": {POS: VERB},
|
||||
"V|refl|inf__Reflex=Yes|VerbForm=Inf": {POS: VERB},
|
||||
"V|refl|inf|subst__Reflex=Yes|VerbForm=Inf": {POS: VERB},
|
||||
"V|refl|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Reflex=Yes|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
||||
"V|refl|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Reflex=Yes|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
||||
"V|refl|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Reflex=Yes|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
||||
"V|refl|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Reflex=Yes|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
||||
"V|refl|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Reflex=Yes|Tense=Past|VerbForm=Fin": {POS: VERB},
|
||||
"V|refl|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Reflex=Yes|Tense=Past|VerbForm=Fin": {POS: VERB},
|
||||
"V|refl|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Reflex=Yes|Tense=Pres|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|refl|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Reflex=Yes|Tense=Pres|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|refl|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Reflex=Yes|Tense=Pres|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|refl|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Reflex=Yes|Tense=Pres|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|refl|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Reflex=Yes|Tense=Past|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|refl|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Reflex=Yes|Tense=Past|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|refl|tegdw|vervneut__Case=Nom|Reflex=Yes|Tense=Pres|VerbForm=Part": {POS: VERB},
|
||||
"V|refl|verldw|onverv__Reflex=Yes|Tense=Past|VerbForm=Part": {POS: VERB},
|
||||
"V|trans|conj__Mood=Sub|Subcat=Tran|VerbForm=Fin": {POS: VERB},
|
||||
"V|trans|imp__Mood=Imp|Subcat=Tran|VerbForm=Fin": {POS: VERB},
|
||||
"V|trans|inf__Subcat=Tran|VerbForm=Inf": {POS: VERB},
|
||||
"V|trans|inf|subst__Subcat=Tran|VerbForm=Inf": {POS: VERB},
|
||||
"V|trans|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
||||
"V|trans|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
||||
"V|trans|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
||||
"V|trans|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
||||
"V|trans|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Subcat=Tran|Tense=Past|VerbForm=Fin": {POS: VERB},
|
||||
"V|trans|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Subcat=Tran|Tense=Past|VerbForm=Fin": {POS: VERB},
|
||||
"V|trans|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|trans|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|trans|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|trans|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|trans|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Subcat=Tran|Tense=Past|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|trans|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Subcat=Tran|Tense=Past|VerbForm=Fin": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|trans|tegdw|onverv__Subcat=Tran|Tense=Pres|VerbForm=Part": {POS: VERB},
|
||||
"V|trans|tegdw|vervneut__Case=Nom|Subcat=Tran|Tense=Pres|VerbForm=Part": {POS: VERB},
|
||||
"V|trans|tegdw|vervneut__Case=Nom|Subcat=Tran|Tense=Pres|VerbForm=Part": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|trans|verldw|onverv__Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
|
||||
"V|trans|verldw|vervmv__Number=Plur|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
|
||||
"V|trans|verldw|vervneut__Case=Nom|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
|
||||
"V|trans|verldw|vervvergr__Degree=Cmp|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
|
||||
"X__Aspect=Imp|Definite=Def|Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|VerbType=Mod": {POS: X},
|
||||
"X__Aspect=Imp|Definite=Def|Mood=Ind|Number=Sing|Person=3|PronType=Ind|Tense=Pres|VerbForm=Fin": {POS: X},
|
||||
"X__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: X},
|
||||
"X__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Tense=Past|VerbForm=Part": {POS: X},
|
||||
"X__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Inf": {POS: X},
|
||||
"V|trans|verldw|vervmv__Number=Plur|Subcat=Tran|Tense=Past|VerbForm=Part": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|trans|verldw|vervneut__Case=Nom|Subcat=Tran|Tense=Past|VerbForm=Part": {
|
||||
POS: VERB
|
||||
},
|
||||
"V|trans|verldw|vervvergr__Degree=Cmp|Subcat=Tran|Tense=Past|VerbForm=Part": {
|
||||
POS: VERB
|
||||
},
|
||||
"X__Aspect=Imp|Definite=Def|Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|VerbType=Mod": {
|
||||
POS: X
|
||||
},
|
||||
"X__Aspect=Imp|Definite=Def|Mood=Ind|Number=Sing|Person=3|PronType=Ind|Tense=Pres|VerbForm=Fin": {
|
||||
POS: X
|
||||
},
|
||||
"X__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
|
||||
POS: X
|
||||
},
|
||||
"X__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Tense=Past|VerbForm=Part": {
|
||||
POS: X
|
||||
},
|
||||
"X__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Inf": {
|
||||
POS: X
|
||||
},
|
||||
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {POS: X},
|
||||
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|PronType=Dem|Tense=Pres|VerbForm=Fin": {POS: X},
|
||||
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|PronType=Rel|Tense=Pres|VerbForm=Fin": {POS: X},
|
||||
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: X},
|
||||
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|PronType=Ind|Tense=Pres|VerbForm=Fin": {POS: X},
|
||||
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: X},
|
||||
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|PronType=Dem|Tense=Pres|VerbForm=Fin": {
|
||||
POS: X
|
||||
},
|
||||
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|PronType=Rel|Tense=Pres|VerbForm=Fin": {
|
||||
POS: X
|
||||
},
|
||||
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
|
||||
POS: X
|
||||
},
|
||||
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|PronType=Ind|Tense=Pres|VerbForm=Fin": {
|
||||
POS: X
|
||||
},
|
||||
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
|
||||
POS: X
|
||||
},
|
||||
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {POS: X},
|
||||
"X__Aspect=Imp|Mood=Ind|Number=Sing|Subcat=Intr|Tense=Pres|VerbForm=Inf": {POS: X},
|
||||
"X__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin": {POS: X},
|
||||
|
@ -808,5 +1024,5 @@ TAG_MAP = {
|
|||
"X__VerbForm=Inf|VerbType=Mod": {POS: X},
|
||||
"X__VerbType=Aux,Cop": {POS: X},
|
||||
"X___": {POS: X},
|
||||
"_SP": {POS: SPACE}
|
||||
"_SP": {POS: SPACE},
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -5039,5 +5039,5 @@ TAG_MAP = {
|
|||
"punc": {POS: PUNCT},
|
||||
"v-pcp|M|P": {POS: VERB},
|
||||
"v-pcp|M|S": {POS: VERB},
|
||||
"_SP": {POS: SPACE}
|
||||
"_SP": {POS: SPACE},
|
||||
}
|
||||
|
|
|
@ -39,7 +39,9 @@ _infixes = (
|
|||
+ LIST_ICONS
|
||||
+ [
|
||||
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
|
||||
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES),
|
||||
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
|
||||
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
|
||||
),
|
||||
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
|
||||
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
|
||||
|
|
|
@ -19,7 +19,6 @@ _abbrev_exc = [
|
|||
{ORTH: "вс", LEMMA: "воскресенье", NORM: "воскресенье"},
|
||||
{ORTH: "вскр", LEMMA: "воскресенье", NORM: "воскресенье"},
|
||||
{ORTH: "воскр", LEMMA: "воскресенье", NORM: "воскресенье"},
|
||||
|
||||
# Months abbreviations
|
||||
{ORTH: "янв", LEMMA: "январь", NORM: "январь"},
|
||||
{ORTH: "фев", LEMMA: "февраль", NORM: "февраль"},
|
||||
|
@ -49,16 +48,18 @@ for abbrev_desc in _abbrev_exc:
|
|||
abbrev = abbrev_desc[ORTH]
|
||||
for orth in (abbrev, abbrev.capitalize(), abbrev.upper()):
|
||||
_exc[orth] = [{ORTH: orth, LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}]
|
||||
_exc[orth + '.'] = [{ORTH: orth + '.', LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}]
|
||||
_exc[orth + "."] = [
|
||||
{ORTH: orth + ".", LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}
|
||||
]
|
||||
|
||||
|
||||
_slang_exc = [
|
||||
{ORTH: '2к15', LEMMA: '2015', NORM: '2015'},
|
||||
{ORTH: '2к16', LEMMA: '2016', NORM: '2016'},
|
||||
{ORTH: '2к17', LEMMA: '2017', NORM: '2017'},
|
||||
{ORTH: '2к18', LEMMA: '2018', NORM: '2018'},
|
||||
{ORTH: '2к19', LEMMA: '2019', NORM: '2019'},
|
||||
{ORTH: '2к20', LEMMA: '2020', NORM: '2020'},
|
||||
{ORTH: "2к15", LEMMA: "2015", NORM: "2015"},
|
||||
{ORTH: "2к16", LEMMA: "2016", NORM: "2016"},
|
||||
{ORTH: "2к17", LEMMA: "2017", NORM: "2017"},
|
||||
{ORTH: "2к18", LEMMA: "2018", NORM: "2018"},
|
||||
{ORTH: "2к19", LEMMA: "2019", NORM: "2019"},
|
||||
{ORTH: "2к20", LEMMA: "2020", NORM: "2020"},
|
||||
]
|
||||
|
||||
for slang_desc in _slang_exc:
|
||||
|
|
|
@ -15,7 +15,7 @@ _infixes = (
|
|||
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=CONCAT_QUOTES),
|
||||
r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=_hyphens_no_dash),
|
||||
r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=_hyphens_no_dash),
|
||||
r"(?<=[0-9])-(?=[0-9])",
|
||||
]
|
||||
)
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -16,5 +16,5 @@ sentences = [
|
|||
"此外,中文还是联合国正式语文,并被上海合作组织等国际组织采用为官方语言。",
|
||||
"在中国大陆,汉语通称为“汉语”。",
|
||||
"在联合国、台湾、香港及澳门,通称为“中文”。",
|
||||
"在新加坡及马来西亚,通称为“华语”。"
|
||||
"在新加坡及马来西亚,通称为“华语”。",
|
||||
]
|
||||
|
|
|
@ -47,7 +47,7 @@ _single_num_words = [
|
|||
"拾陆",
|
||||
"拾柒",
|
||||
"拾捌",
|
||||
"拾玖"
|
||||
"拾玖",
|
||||
]
|
||||
|
||||
_count_num_words = [
|
||||
|
@ -68,27 +68,16 @@ _count_num_words = [
|
|||
"陆",
|
||||
"柒",
|
||||
"捌",
|
||||
"玖"
|
||||
"玖",
|
||||
]
|
||||
|
||||
_base_num_words = [
|
||||
"十",
|
||||
"百",
|
||||
"千",
|
||||
"万",
|
||||
"亿",
|
||||
"兆",
|
||||
"拾",
|
||||
"佰",
|
||||
"仟"
|
||||
]
|
||||
_base_num_words = ["十", "百", "千", "万", "亿", "兆", "拾", "佰", "仟"]
|
||||
|
||||
|
||||
def like_num(text):
|
||||
if text.startswith(("+", "-", "±", "~")):
|
||||
text = text[1:]
|
||||
text = text.replace(",", "").replace(
|
||||
".", "").replace(",", "").replace("。", "")
|
||||
text = text.replace(",", "").replace(".", "").replace(",", "").replace("。", "")
|
||||
if text.isdigit():
|
||||
return True
|
||||
if text.count("/") == 1:
|
||||
|
@ -97,10 +86,12 @@ def like_num(text):
|
|||
return True
|
||||
if text in _single_num_words:
|
||||
return True
|
||||
# fmt: off
|
||||
if re.match('^((' + '|'.join(_count_num_words) + '){1}'
|
||||
+ '(' + '|'.join(_base_num_words) + '){1})+'
|
||||
+ '(' + '|'.join(_count_num_words) + ')?$', text):
|
||||
return True
|
||||
# fmt: on
|
||||
return False
|
||||
|
||||
|
||||
|
|
|
@ -430,6 +430,7 @@ class Language(object):
|
|||
|
||||
DOCS: https://spacy.io/api/language#update
|
||||
"""
|
||||
expected_keys = ("words", "tags", "heads", "deps", "entities", "cats", "links")
|
||||
if len(docs) != len(golds):
|
||||
raise IndexError(Errors.E009.format(n_docs=len(docs), n_golds=len(golds)))
|
||||
if len(docs) == 0:
|
||||
|
@ -445,10 +446,10 @@ class Language(object):
|
|||
if isinstance(doc, basestring_):
|
||||
doc = self.make_doc(doc)
|
||||
if not isinstance(gold, GoldParse):
|
||||
expected_keys = ("words", "tags", "heads", "deps", "entities", "cats", "links")
|
||||
unexpected_keys = [k for k in gold if k not in expected_keys]
|
||||
if unexpected_keys:
|
||||
raise ValueError(Errors.E151.format(unexpected_keys=unexpected_keys, expected_keys=expected_keys))
|
||||
unexpected = [k for k in gold if k not in expected_keys]
|
||||
if unexpected:
|
||||
err = Errors.E151.format(unexp=unexpected, exp=expected_keys)
|
||||
raise ValueError(err)
|
||||
gold = GoldParse(doc, **gold)
|
||||
doc_objs.append(doc)
|
||||
gold_objs.append(gold)
|
||||
|
|
|
@ -5,10 +5,10 @@ from __future__ import unicode_literals
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,lemma', [("aprox.", "aproximadament"),
|
||||
("pàg.", "pàgina"),
|
||||
("p.ex.", "per exemple")
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"text,lemma",
|
||||
[("aprox.", "aproximadament"), ("pàg.", "pàgina"), ("p.ex.", "per exemple")],
|
||||
)
|
||||
def test_ca_tokenizer_handles_abbr(ca_tokenizer, text, lemma):
|
||||
tokens = ca_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
|
|
|
@ -21,21 +21,37 @@ def test_ca_tokenizer_handles_long_text(ca_tokenizer):
|
|||
assert len(tokens) == 138
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,length', [
|
||||
("Perquè va anar-hi?", 6),
|
||||
("“Ah no?”", 5),
|
||||
("""Sí! "Anem", va contestar el Joan Carles""", 11),
|
||||
("Van córrer aprox. 10km", 5),
|
||||
("Llavors perqué...", 3)])
|
||||
@pytest.mark.parametrize(
|
||||
"text,length",
|
||||
[
|
||||
("Perquè va anar-hi?", 6),
|
||||
("“Ah no?”", 5),
|
||||
("""Sí! "Anem", va contestar el Joan Carles""", 11),
|
||||
("Van córrer aprox. 10km", 5),
|
||||
("Llavors perqué...", 3),
|
||||
],
|
||||
)
|
||||
def test_ca_tokenizer_handles_cnts(ca_tokenizer, text, length):
|
||||
tokens = ca_tokenizer(text)
|
||||
assert len(tokens) == length
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,match', [
|
||||
('10', True), ('1', True), ('10,000', True), ('10,00', True),
|
||||
('999.0', True), ('un', True), ('dos', True), ('bilió', True),
|
||||
('gos', False), (',', False), ('1/2', True)])
|
||||
@pytest.mark.parametrize(
|
||||
"text,match",
|
||||
[
|
||||
("10", True),
|
||||
("1", True),
|
||||
("10,000", True),
|
||||
("10,00", True),
|
||||
("999.0", True),
|
||||
("un", True),
|
||||
("dos", True),
|
||||
("bilió", True),
|
||||
("gos", False),
|
||||
(",", False),
|
||||
("1/2", True),
|
||||
],
|
||||
)
|
||||
def test_ca_lex_attrs_like_number(ca_tokenizer, text, match):
|
||||
tokens = ca_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
|
|
|
@ -32,7 +32,7 @@ def test_de_tokenizer_norm_exceptions(de_tokenizer, text, norms):
|
|||
assert [token.norm_ for token in tokens] == norms
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,norm', [("daß", "dass")])
|
||||
@pytest.mark.parametrize("text,norm", [("daß", "dass")])
|
||||
def test_de_lex_attrs_norm_exceptions(de_tokenizer, text, norm):
|
||||
tokens = de_tokenizer(text)
|
||||
assert tokens[0].norm_ == norm
|
||||
|
|
|
@ -7,33 +7,33 @@ import pytest
|
|||
@pytest.mark.parametrize(
|
||||
"text",
|
||||
[
|
||||
u"aujourd'hui",
|
||||
u"Aujourd'hui",
|
||||
u"prud'hommes",
|
||||
u"prud’hommal",
|
||||
u"audio-numérique",
|
||||
u"Audio-numérique",
|
||||
u"entr'amis",
|
||||
u"entr'abat",
|
||||
u"rentr'ouvertes",
|
||||
u"grand'hamien",
|
||||
u"Châteauneuf-la-Forêt",
|
||||
u"Château-Guibert",
|
||||
u"11-septembre",
|
||||
u"11-Septembre",
|
||||
u"refox-trottâmes",
|
||||
"aujourd'hui",
|
||||
"Aujourd'hui",
|
||||
"prud'hommes",
|
||||
"prud’hommal",
|
||||
"audio-numérique",
|
||||
"Audio-numérique",
|
||||
"entr'amis",
|
||||
"entr'abat",
|
||||
"rentr'ouvertes",
|
||||
"grand'hamien",
|
||||
"Châteauneuf-la-Forêt",
|
||||
"Château-Guibert",
|
||||
"11-septembre",
|
||||
"11-Septembre",
|
||||
"refox-trottâmes",
|
||||
# u"K-POP",
|
||||
# u"K-Pop",
|
||||
# u"K-pop",
|
||||
u"z'yeutes",
|
||||
u"black-outeront",
|
||||
u"états-unienne",
|
||||
u"courtes-pattes",
|
||||
u"court-pattes",
|
||||
u"saut-de-ski",
|
||||
u"Écourt-Saint-Quentin",
|
||||
u"Bout-de-l'Îlien",
|
||||
u"pet-en-l'air",
|
||||
"z'yeutes",
|
||||
"black-outeront",
|
||||
"états-unienne",
|
||||
"courtes-pattes",
|
||||
"court-pattes",
|
||||
"saut-de-ski",
|
||||
"Écourt-Saint-Quentin",
|
||||
"Bout-de-l'Îlien",
|
||||
"pet-en-l'air",
|
||||
],
|
||||
)
|
||||
def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text):
|
||||
|
|
|
@ -3,13 +3,18 @@ from __future__ import unicode_literals
|
|||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tokens,lemmas", [
|
||||
# fmt: off
|
||||
TEST_CASES = [
|
||||
(["Galime", "vadinti", "gerovės", "valstybe", ",", "turime", "išvystytą", "socialinę", "apsaugą", ",",
|
||||
"sveikatos", "apsaugą", "ir", "prieinamą", "švietimą", "."],
|
||||
["galėti", "vadintas", "gerovė", "valstybė", ",", "turėti", "išvystytas", "socialinis",
|
||||
"apsauga", ",", "sveikata", "apsauga", "ir", "prieinamas", "švietimas", "."]),
|
||||
(["taip", ",", "uoliai", "tyrinėjau", "ir", "pasirinkau", "geriausią", "variantą", "."],
|
||||
["taip", ",", "uolus", "tyrinėti", "ir", "pasirinkti", "geras", "variantas", "."])])
|
||||
["taip", ",", "uolus", "tyrinėti", "ir", "pasirinkti", "geras", "variantas", "."])
|
||||
]
|
||||
# fmt: on
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tokens,lemmas", TEST_CASES)
|
||||
def test_lt_lemmatizer(lt_lemmatizer, tokens, lemmas):
|
||||
assert lemmas == [lt_lemmatizer.lookup(token) for token in tokens]
|
||||
|
|
|
@ -7,10 +7,21 @@ from __future__ import unicode_literals
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,match', [
|
||||
('10', True), ('1', True), ('10,000', True), ('10,00', True),
|
||||
('jeden', True), ('dwa', True), ('milion', True),
|
||||
('pies', False), (',', False), ('1/2', True)])
|
||||
@pytest.mark.parametrize(
|
||||
"text,match",
|
||||
[
|
||||
("10", True),
|
||||
("1", True),
|
||||
("10,000", True),
|
||||
("10,00", True),
|
||||
("jeden", True),
|
||||
("dwa", True),
|
||||
("milion", True),
|
||||
("pies", False),
|
||||
(",", False),
|
||||
("1/2", True),
|
||||
],
|
||||
)
|
||||
def test_lex_attrs_like_number(pl_tokenizer, text, match):
|
||||
tokens = pl_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
|
|
|
@ -4,9 +4,7 @@ from __future__ import unicode_literals
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text", ['ہےں۔', 'کیا۔']
|
||||
)
|
||||
@pytest.mark.parametrize("text", ["ہےں۔", "کیا۔"])
|
||||
def test_contractions(ur_tokenizer, text):
|
||||
"""Test specific Urdu punctuation character"""
|
||||
tokens = ur_tokenizer(text)
|
||||
|
|
|
@ -134,12 +134,12 @@ def test_matcher_end_zero_plus(en_vocab):
|
|||
def test_matcher_sets_return_correct_tokens(en_vocab):
|
||||
matcher = Matcher(en_vocab)
|
||||
patterns = [
|
||||
[{'LOWER': {'IN': ["zero"]}}],
|
||||
[{'LOWER': {'IN': ["one"]}}],
|
||||
[{'LOWER': {'IN': ["two"]}}],
|
||||
[{"LOWER": {"IN": ["zero"]}}],
|
||||
[{"LOWER": {"IN": ["one"]}}],
|
||||
[{"LOWER": {"IN": ["two"]}}],
|
||||
]
|
||||
matcher.add('TEST', None, *patterns)
|
||||
matcher.add("TEST", None, *patterns)
|
||||
doc = Doc(en_vocab, words="zero one two three".split())
|
||||
matches = matcher(doc)
|
||||
texts = [Span(doc, s, e, label=L).text for L, s, e in matches]
|
||||
assert texts == ['zero', 'one', 'two']
|
||||
assert texts == ["zero", "one", "two"]
|
||||
|
|
|
@ -52,7 +52,9 @@ def test_get_pipe(nlp, name):
|
|||
assert nlp.get_pipe(name) == new_pipe
|
||||
|
||||
|
||||
@pytest.mark.parametrize("name,replacement,not_callable", [("my_component", lambda doc: doc, {})])
|
||||
@pytest.mark.parametrize(
|
||||
"name,replacement,not_callable", [("my_component", lambda doc: doc, {})]
|
||||
)
|
||||
def test_replace_pipe(nlp, name, replacement, not_callable):
|
||||
with pytest.raises(ValueError):
|
||||
nlp.replace_pipe(name, new_pipe)
|
||||
|
|
|
@ -358,7 +358,9 @@ def test_issue850_basic():
|
|||
assert end == 4
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="French exception list is not enabled in the default tokenizer anymore")
|
||||
@pytest.mark.skip(
|
||||
reason="French exception list is not enabled in the default tokenizer anymore"
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"text", ["au-delàs", "pair-programmâmes", "terra-formées", "σ-compacts"]
|
||||
)
|
||||
|
|
|
@ -19,7 +19,7 @@ from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part
|
|||
def test_issue1235():
|
||||
"""Test that g is not split of if preceded by a number and a letter"""
|
||||
nlp = English()
|
||||
testwords = u'e2g 2g 52g'
|
||||
testwords = "e2g 2g 52g"
|
||||
doc = nlp(testwords)
|
||||
assert len(doc) == 5
|
||||
assert doc[0].text == "e2g"
|
||||
|
|
|
@ -4,15 +4,7 @@ from __future__ import unicode_literals
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"word",
|
||||
[
|
||||
"don't",
|
||||
"don’t",
|
||||
"I'd",
|
||||
"I’d",
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"])
|
||||
def test_issue3521(en_tokenizer, word):
|
||||
tok = en_tokenizer(word)[1]
|
||||
# 'not' and 'would' should be stopwords, also in their abbreviated forms
|
||||
|
|
|
@ -9,7 +9,10 @@ import numpy as np
|
|||
def test_issue3540(en_vocab):
|
||||
|
||||
words = ["I", "live", "in", "NewYork", "right", "now"]
|
||||
tensor = np.asarray([[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]], dtype="f")
|
||||
tensor = np.asarray(
|
||||
[[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
|
||||
dtype="f",
|
||||
)
|
||||
doc = Doc(en_vocab, words=words)
|
||||
doc.tensor = tensor
|
||||
|
||||
|
@ -25,7 +28,7 @@ def test_issue3540(en_vocab):
|
|||
with doc.retokenize() as retokenizer:
|
||||
heads = [(doc[3], 1), doc[2]]
|
||||
attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
|
||||
retokenizer.split(doc[3], [u"New", u"York"], heads=heads, attrs=attrs)
|
||||
retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
|
||||
|
||||
gold_text = ["I", "live", "in", "New", "York", "right", "now"]
|
||||
assert [token.text for token in doc] == gold_text
|
||||
|
|
|
@ -35,7 +35,9 @@ def test_issue3962(doc):
|
|||
doc2_json = doc2.to_json()
|
||||
assert doc2_json
|
||||
|
||||
assert doc2[0].head.text == "jests" # head set to itself, being the new artificial root
|
||||
assert (
|
||||
doc2[0].head.text == "jests"
|
||||
) # head set to itself, being the new artificial root
|
||||
assert doc2[0].dep_ == "dep"
|
||||
assert doc2[1].head.text == "jests"
|
||||
assert doc2[1].dep_ == "prep"
|
||||
|
@ -92,7 +94,9 @@ def test_issue3962_long(two_sent_doc):
|
|||
doc2_json = doc2.to_json()
|
||||
assert doc2_json
|
||||
|
||||
assert doc2[0].head.text == "jests" # head set to itself, being the new artificial root (in sentence 1)
|
||||
assert (
|
||||
doc2[0].head.text == "jests"
|
||||
) # head set to itself, being the new artificial root (in sentence 1)
|
||||
assert doc2[0].dep_ == "ROOT"
|
||||
assert doc2[1].head.text == "jests"
|
||||
assert doc2[1].dep_ == "prep"
|
||||
|
@ -100,9 +104,13 @@ def test_issue3962_long(two_sent_doc):
|
|||
assert doc2[2].dep_ == "pobj"
|
||||
assert doc2[3].head.text == "jests"
|
||||
assert doc2[3].dep_ == "punct"
|
||||
assert doc2[4].head.text == "They" # head set to itself, being the new artificial root (in sentence 2)
|
||||
assert (
|
||||
doc2[4].head.text == "They"
|
||||
) # head set to itself, being the new artificial root (in sentence 2)
|
||||
assert doc2[4].dep_ == "dep"
|
||||
assert doc2[4].head.text == "They" # head set to the new artificial head (in sentence 2)
|
||||
assert (
|
||||
doc2[4].head.text == "They"
|
||||
) # head set to the new artificial head (in sentence 2)
|
||||
assert doc2[4].dep_ == "dep"
|
||||
|
||||
# We should still have 2 sentences
|
||||
|
|
|
@ -30,14 +30,18 @@ def test_serialize_kb_disk(en_vocab):
|
|||
def _get_dummy_kb(vocab):
|
||||
kb = KnowledgeBase(vocab=vocab, entity_vector_length=3)
|
||||
|
||||
kb.add_entity(entity='Q53', freq=33, entity_vector=[0, 5, 3])
|
||||
kb.add_entity(entity='Q17', freq=2, entity_vector=[7, 1, 0])
|
||||
kb.add_entity(entity='Q007', freq=7, entity_vector=[0, 0, 7])
|
||||
kb.add_entity(entity='Q44', freq=342, entity_vector=[4, 4, 4])
|
||||
kb.add_entity(entity="Q53", freq=33, entity_vector=[0, 5, 3])
|
||||
kb.add_entity(entity="Q17", freq=2, entity_vector=[7, 1, 0])
|
||||
kb.add_entity(entity="Q007", freq=7, entity_vector=[0, 0, 7])
|
||||
kb.add_entity(entity="Q44", freq=342, entity_vector=[4, 4, 4])
|
||||
|
||||
kb.add_alias(alias='double07', entities=['Q17', 'Q007'], probabilities=[0.1, 0.9])
|
||||
kb.add_alias(alias='guy', entities=['Q53', 'Q007', 'Q17', 'Q44'], probabilities=[0.3, 0.3, 0.2, 0.1])
|
||||
kb.add_alias(alias='random', entities=['Q007'], probabilities=[1.0])
|
||||
kb.add_alias(alias="double07", entities=["Q17", "Q007"], probabilities=[0.1, 0.9])
|
||||
kb.add_alias(
|
||||
alias="guy",
|
||||
entities=["Q53", "Q007", "Q17", "Q44"],
|
||||
probabilities=[0.3, 0.3, 0.2, 0.1],
|
||||
)
|
||||
kb.add_alias(alias="random", entities=["Q007"], probabilities=[1.0])
|
||||
|
||||
return kb
|
||||
|
||||
|
@ -45,30 +49,30 @@ def _get_dummy_kb(vocab):
|
|||
def _check_kb(kb):
|
||||
# check entities
|
||||
assert kb.get_size_entities() == 4
|
||||
for entity_string in ['Q53', 'Q17', 'Q007', 'Q44']:
|
||||
for entity_string in ["Q53", "Q17", "Q007", "Q44"]:
|
||||
assert entity_string in kb.get_entity_strings()
|
||||
for entity_string in ['', 'Q0']:
|
||||
for entity_string in ["", "Q0"]:
|
||||
assert entity_string not in kb.get_entity_strings()
|
||||
|
||||
# check aliases
|
||||
assert kb.get_size_aliases() == 3
|
||||
for alias_string in ['double07', 'guy', 'random']:
|
||||
for alias_string in ["double07", "guy", "random"]:
|
||||
assert alias_string in kb.get_alias_strings()
|
||||
for alias_string in ['nothingness', '', 'randomnoise']:
|
||||
for alias_string in ["nothingness", "", "randomnoise"]:
|
||||
assert alias_string not in kb.get_alias_strings()
|
||||
|
||||
# check candidates & probabilities
|
||||
candidates = sorted(kb.get_candidates('double07'), key=lambda x: x.entity_)
|
||||
candidates = sorted(kb.get_candidates("double07"), key=lambda x: x.entity_)
|
||||
assert len(candidates) == 2
|
||||
|
||||
assert candidates[0].entity_ == 'Q007'
|
||||
assert candidates[0].entity_ == "Q007"
|
||||
assert 6.999 < candidates[0].entity_freq < 7.01
|
||||
assert candidates[0].entity_vector == [0, 0, 7]
|
||||
assert candidates[0].alias_ == 'double07'
|
||||
assert candidates[0].alias_ == "double07"
|
||||
assert 0.899 < candidates[0].prior_prob < 0.901
|
||||
|
||||
assert candidates[1].entity_ == 'Q17'
|
||||
assert candidates[1].entity_ == "Q17"
|
||||
assert 1.99 < candidates[1].entity_freq < 2.01
|
||||
assert candidates[1].entity_vector == [7, 1, 0]
|
||||
assert candidates[1].alias_ == 'double07'
|
||||
assert candidates[1].alias_ == "double07"
|
||||
assert 0.099 < candidates[1].prior_prob < 0.101
|
||||
|
|
Loading…
Reference in New Issue
Block a user