mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 09:26:27 +03:00
Tidy up and auto-format
This commit is contained in:
parent
364aaf5bc2
commit
f580302673
|
@ -430,8 +430,7 @@ class Errors(object):
|
||||||
E150 = ("The language of the `nlp` object and the `vocab` should be the "
|
E150 = ("The language of the `nlp` object and the `vocab` should be the "
|
||||||
"same, but found '{nlp}' and '{vocab}' respectively.")
|
"same, but found '{nlp}' and '{vocab}' respectively.")
|
||||||
E151 = ("Trying to call nlp.update without required annotation types. "
|
E151 = ("Trying to call nlp.update without required annotation types. "
|
||||||
"Expected top-level keys: {expected_keys}."
|
"Expected top-level keys: {exp}. Got: {unexp}.")
|
||||||
" Got: {unexpected_keys}.")
|
|
||||||
E152 = ("The `nlp` object should have a pre-trained `ner` component.")
|
E152 = ("The `nlp` object should have a pre-trained `ner` component.")
|
||||||
E153 = ("Either provide a path to a preprocessed training directory, "
|
E153 = ("Either provide a path to a preprocessed training directory, "
|
||||||
"or to the original Wikipedia XML dump.")
|
"or to the original Wikipedia XML dump.")
|
||||||
|
|
|
@ -10,8 +10,4 @@ Example sentences to test spaCy and its language models.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
sentences = [
|
sentences = ["তুই খুব ভালো", "আজ আমরা ডাক্তার দেখতে যাবো", "আমি জানি না "]
|
||||||
'তুই খুব ভালো',
|
|
||||||
'আজ আমরা ডাক্তার দেখতে যাবো',
|
|
||||||
'আমি জানি না '
|
|
||||||
]
|
|
||||||
|
|
|
@ -22,7 +22,9 @@ _suffixes = (
|
||||||
r"(?<=°[FfCcKk])\.",
|
r"(?<=°[FfCcKk])\.",
|
||||||
r"(?<=[0-9])(?:[{c}])".format(c=_currency),
|
r"(?<=[0-9])(?:[{c}])".format(c=_currency),
|
||||||
r"(?<=[0-9])(?:{u})".format(u=UNITS),
|
r"(?<=[0-9])(?:{u})".format(u=UNITS),
|
||||||
r"(?<=[{al}{e}{q}(?:{c})])\.".format(al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency),
|
r"(?<=[{al}{e}{q}(?:{c})])\.".format(
|
||||||
|
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
|
||||||
|
),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -35,8 +37,8 @@ _infixes = (
|
||||||
),
|
),
|
||||||
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||||
r"(?<=[{a}])({h})(?=[{ae}])".format(a=ALPHA, h=HYPHENS, ae="এ"),
|
r"(?<=[{a}])({h})(?=[{ae}])".format(a=ALPHA, h=HYPHENS, ae="এ"),
|
||||||
r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
|
r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
|
||||||
r'(?<=[{a}])[:<>=/](?=[{a}])'.format(a=ALPHA),
|
r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -13,7 +13,7 @@ _infixes = (
|
||||||
+ [
|
+ [
|
||||||
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
|
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
|
||||||
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
|
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
|
||||||
r'(?<=[{a}])[:<>=](?=[{a}])'.format(a=ALPHA),
|
r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
|
||||||
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||||
r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
|
r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
|
||||||
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
|
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
|
||||||
|
|
|
@ -59,7 +59,9 @@ _suffixes = (
|
||||||
r"([0-9])+\&", # 12&
|
r"([0-9])+\&", # 12&
|
||||||
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
|
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
|
||||||
r"(?<=[0-9])(?:{u})".format(u=UNITS),
|
r"(?<=[0-9])(?:{u})".format(u=UNITS),
|
||||||
r"(?<=[0-9{al}{e}(?:{q})])\.".format(al=ALPHA_LOWER, e=r"²\-\+", q=CONCAT_QUOTES),
|
r"(?<=[0-9{al}{e}(?:{q})])\.".format(
|
||||||
|
al=ALPHA_LOWER, e=r"²\-\+", q=CONCAT_QUOTES
|
||||||
|
),
|
||||||
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
|
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
|
||||||
r"(?<=[Α-Ωα-ωίϊΐόάέύϋΰήώ])\-", # όνομα-
|
r"(?<=[Α-Ωα-ωίϊΐόάέύϋΰήώ])\-", # όνομα-
|
||||||
r"(?<=[Α-Ωα-ωίϊΐόάέύϋΰήώ])\.",
|
r"(?<=[Α-Ωα-ωίϊΐόάέύϋΰήώ])\.",
|
||||||
|
@ -87,8 +89,8 @@ _infixes = (
|
||||||
r"([a-zA-Z]+)(\-([a-zA-Z]+))+", # abc-abc
|
r"([a-zA-Z]+)(\-([a-zA-Z]+))+", # abc-abc
|
||||||
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
|
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
|
||||||
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||||
r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
|
r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
|
||||||
r'(?<=[{a}])[:<>=/](?=[{a}])'.format(a=ALPHA),
|
r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -27,5 +27,5 @@ ADVERBS_IRREG = {
|
||||||
"slower": ("slow",),
|
"slower": ("slow",),
|
||||||
"slowest": ("slowest",),
|
"slowest": ("slowest",),
|
||||||
"sooner": ("soon",),
|
"sooner": ("soon",),
|
||||||
"soonest": ("soon",)
|
"soonest": ("soon",),
|
||||||
}
|
}
|
||||||
|
|
|
@ -44,7 +44,7 @@ NOUNS_IRREG = {
|
||||||
"allodia": ("allodium",),
|
"allodia": ("allodium",),
|
||||||
"alluvia": ("alluvium",),
|
"alluvia": ("alluvium",),
|
||||||
"alodia": ("alodium",),
|
"alodia": ("alodium",),
|
||||||
"alto-relievos": ("alto-relievo", "alto-rilievo",),
|
"alto-relievos": ("alto-relievo", "alto-rilievo"),
|
||||||
"altocumuli": ("altocumulus",),
|
"altocumuli": ("altocumulus",),
|
||||||
"altostrati": ("altostratus",),
|
"altostrati": ("altostratus",),
|
||||||
"alulae": ("alula",),
|
"alulae": ("alula",),
|
||||||
|
@ -81,7 +81,7 @@ NOUNS_IRREG = {
|
||||||
"anamorphoses": ("anamorphosis",),
|
"anamorphoses": ("anamorphosis",),
|
||||||
"anastomoses": ("anastomosis",),
|
"anastomoses": ("anastomosis",),
|
||||||
"anatyxes": ("anaptyxis",),
|
"anatyxes": ("anaptyxis",),
|
||||||
"ancones": ("ancon", "ancone",),
|
"ancones": ("ancon", "ancone"),
|
||||||
"androclinia": ("androclinium",),
|
"androclinia": ("androclinium",),
|
||||||
"androecia": ("androecium",),
|
"androecia": ("androecium",),
|
||||||
"androsphinges": ("androsphinx",),
|
"androsphinges": ("androsphinx",),
|
||||||
|
@ -90,7 +90,7 @@ NOUNS_IRREG = {
|
||||||
"angiomata": ("angioma",),
|
"angiomata": ("angioma",),
|
||||||
"animalcula": ("animalculum",),
|
"animalcula": ("animalculum",),
|
||||||
"anlagen": ("anlage",),
|
"anlagen": ("anlage",),
|
||||||
"annattos": ("anatto", "annatto",),
|
"annattos": ("anatto", "annatto"),
|
||||||
"annuli": ("annulus",),
|
"annuli": ("annulus",),
|
||||||
"antae": ("anta",),
|
"antae": ("anta",),
|
||||||
"antalkalies": ("antalkali",),
|
"antalkalies": ("antalkali",),
|
||||||
|
@ -158,7 +158,7 @@ NOUNS_IRREG = {
|
||||||
"aspergilli": ("aspergillus",),
|
"aspergilli": ("aspergillus",),
|
||||||
"aspergilloses": ("aspergillosis",),
|
"aspergilloses": ("aspergillosis",),
|
||||||
"aspersoria": ("aspersorium",),
|
"aspersoria": ("aspersorium",),
|
||||||
"assegais": ("assagai", "assegai",),
|
"assegais": ("assagai", "assegai"),
|
||||||
"astragali": ("astragalus",),
|
"astragali": ("astragalus",),
|
||||||
"asyndeta": ("asyndeton",),
|
"asyndeta": ("asyndeton",),
|
||||||
"atheromata": ("atheroma",),
|
"atheromata": ("atheroma",),
|
||||||
|
@ -172,15 +172,15 @@ NOUNS_IRREG = {
|
||||||
"aurei": ("aureus",),
|
"aurei": ("aureus",),
|
||||||
"auriculae": ("auricula",),
|
"auriculae": ("auricula",),
|
||||||
"aurorae": ("aurora",),
|
"aurorae": ("aurora",),
|
||||||
"auspices": ("auspex", "auspice",),
|
"auspices": ("auspex", "auspice"),
|
||||||
"autocatalyses": ("autocatalysis",),
|
"autocatalyses": ("autocatalysis",),
|
||||||
"autochthones": ("autochthon",),
|
"autochthones": ("autochthon",),
|
||||||
"automata": ("automaton",),
|
"automata": ("automaton",),
|
||||||
"autos-da-fe": ("auto-da-fe",),
|
"autos-da-fe": ("auto-da-fe",),
|
||||||
"avitaminoses": ("avitaminosis",),
|
"avitaminoses": ("avitaminosis",),
|
||||||
"axes": ("ax", "axis",),
|
"axes": ("ax", "axis"),
|
||||||
"axillae": ("axilla",),
|
"axillae": ("axilla",),
|
||||||
"bacchantes": ("bacchant", "bacchante",),
|
"bacchantes": ("bacchant", "bacchante"),
|
||||||
"bacchii": ("bacchius",),
|
"bacchii": ("bacchius",),
|
||||||
"bacilli": ("bacillus",),
|
"bacilli": ("bacillus",),
|
||||||
"bacteriostases": ("bacteriostasis",),
|
"bacteriostases": ("bacteriostasis",),
|
||||||
|
@ -195,7 +195,7 @@ NOUNS_IRREG = {
|
||||||
"banjoes": ("banjo",),
|
"banjoes": ("banjo",),
|
||||||
"barklice": ("barklouse",),
|
"barklice": ("barklouse",),
|
||||||
"barramundies": ("barramundi",),
|
"barramundies": ("barramundi",),
|
||||||
"bases": ("base", "basis",),
|
"bases": ("base", "basis"),
|
||||||
"bases-on-balls": ("base_on_balls",),
|
"bases-on-balls": ("base_on_balls",),
|
||||||
"bases_on_balls": ("base_on_balls",),
|
"bases_on_balls": ("base_on_balls",),
|
||||||
"basidia": ("basidium",),
|
"basidia": ("basidium",),
|
||||||
|
@ -204,15 +204,15 @@ NOUNS_IRREG = {
|
||||||
"bastinadoes": ("bastinado",),
|
"bastinadoes": ("bastinado",),
|
||||||
"bateaux": ("bateau",),
|
"bateaux": ("bateau",),
|
||||||
"batfishes": ("batfish",),
|
"batfishes": ("batfish",),
|
||||||
"beadsmen": ("beadsman", "bedesman",),
|
"beadsmen": ("beadsman", "bedesman"),
|
||||||
"beaux": ("beau",),
|
"beaux": ("beau",),
|
||||||
"beches-de-mer": ("beche-de-mer",),
|
"beches-de-mer": ("beche-de-mer",),
|
||||||
"beeves": ("beef",),
|
"beeves": ("beef",),
|
||||||
"behooves": ("behoof",),
|
"behooves": ("behoof",),
|
||||||
"bersaglieri": ("bersagliere",),
|
"bersaglieri": ("bersagliere",),
|
||||||
"bhishties": ("bheesty", "bhishti",),
|
"bhishties": ("bheesty", "bhishti"),
|
||||||
"bibliothecae": ("bibliotheca",),
|
"bibliothecae": ("bibliotheca",),
|
||||||
"bicennaries": ("bicentenary", "bicentennial",),
|
"bicennaries": ("bicentenary", "bicentennial"),
|
||||||
"bijoux": ("bijou",),
|
"bijoux": ("bijou",),
|
||||||
"bilboes": ("bilbo",),
|
"bilboes": ("bilbo",),
|
||||||
"billets-doux": ("billet-doux",),
|
"billets-doux": ("billet-doux",),
|
||||||
|
@ -245,7 +245,7 @@ NOUNS_IRREG = {
|
||||||
"brachia": ("brachium",),
|
"brachia": ("brachium",),
|
||||||
"brainchildren": ("brainchild",),
|
"brainchildren": ("brainchild",),
|
||||||
"branchiae": ("branchia",),
|
"branchiae": ("branchia",),
|
||||||
"brants": ("brant", "brent",),
|
"brants": ("brant", "brent"),
|
||||||
"bravadoes": ("bravado",),
|
"bravadoes": ("bravado",),
|
||||||
"bravoes": ("bravo",),
|
"bravoes": ("bravo",),
|
||||||
"bregmata": ("bregma",),
|
"bregmata": ("bregma",),
|
||||||
|
@ -275,7 +275,7 @@ NOUNS_IRREG = {
|
||||||
"caesurae": ("caesura",),
|
"caesurae": ("caesura",),
|
||||||
"calami": ("calamus",),
|
"calami": ("calamus",),
|
||||||
"calathi": ("calathus",),
|
"calathi": ("calathus",),
|
||||||
"calcanei": ("calcaneum", "calcaneus",),
|
"calcanei": ("calcaneum", "calcaneus"),
|
||||||
"calces": ("calx",),
|
"calces": ("calx",),
|
||||||
"calculi": ("calculus",),
|
"calculi": ("calculus",),
|
||||||
"caldaria": ("caldarium",),
|
"caldaria": ("caldarium",),
|
||||||
|
@ -421,7 +421,7 @@ NOUNS_IRREG = {
|
||||||
"comae": ("coma",),
|
"comae": ("coma",),
|
||||||
"comatulae": ("comatula",),
|
"comatulae": ("comatula",),
|
||||||
"comedones": ("comedo",),
|
"comedones": ("comedo",),
|
||||||
"comics": ("comic_strip", "comic",),
|
"comics": ("comic_strip", "comic"),
|
||||||
"commandoes": ("commando",),
|
"commandoes": ("commando",),
|
||||||
"concertanti": ("concertante",),
|
"concertanti": ("concertante",),
|
||||||
"concerti": ("concerto",),
|
"concerti": ("concerto",),
|
||||||
|
@ -549,11 +549,11 @@ NOUNS_IRREG = {
|
||||||
"diplococci": ("diplococcus",),
|
"diplococci": ("diplococcus",),
|
||||||
"directors-general": ("director-general",),
|
"directors-general": ("director-general",),
|
||||||
"disci": ("discus",),
|
"disci": ("discus",),
|
||||||
"discoboli": ("discobolos", "discobolus",),
|
"discoboli": ("discobolos", "discobolus"),
|
||||||
"dive": ("diva",),
|
"dive": ("diva",),
|
||||||
"diverticula": ("diverticulum",),
|
"diverticula": ("diverticulum",),
|
||||||
"divertimenti": ("divertimento",),
|
"divertimenti": ("divertimento",),
|
||||||
"djinn": ("djinni", "djinny",),
|
"djinn": ("djinni", "djinny"),
|
||||||
"dodoes": ("dodo",),
|
"dodoes": ("dodo",),
|
||||||
"dogfishes": ("dogfish",),
|
"dogfishes": ("dogfish",),
|
||||||
"dogmata": ("dogma",),
|
"dogmata": ("dogma",),
|
||||||
|
@ -593,7 +593,7 @@ NOUNS_IRREG = {
|
||||||
"ellipses": ("ellipsis",),
|
"ellipses": ("ellipsis",),
|
||||||
"eluvia": ("eluvium",),
|
"eluvia": ("eluvium",),
|
||||||
"elves": ("elf",),
|
"elves": ("elf",),
|
||||||
"elytra": ("elytron", "elytrum",),
|
"elytra": ("elytron", "elytrum"),
|
||||||
"embargoes": ("embargo",),
|
"embargoes": ("embargo",),
|
||||||
"emboli": ("embolus",),
|
"emboli": ("embolus",),
|
||||||
"emphases": ("emphasis",),
|
"emphases": ("emphasis",),
|
||||||
|
@ -623,7 +623,7 @@ NOUNS_IRREG = {
|
||||||
"entases": ("entasis",),
|
"entases": ("entasis",),
|
||||||
"entera": ("enteron",),
|
"entera": ("enteron",),
|
||||||
"entia": ("ens",),
|
"entia": ("ens",),
|
||||||
"entozoa": ("entozoan", "entozoon",),
|
"entozoa": ("entozoan", "entozoon"),
|
||||||
"epencephala": ("epencephalon",),
|
"epencephala": ("epencephalon",),
|
||||||
"epentheses": ("epenthesis",),
|
"epentheses": ("epenthesis",),
|
||||||
"epexegeses": ("epexegesis",),
|
"epexegeses": ("epexegesis",),
|
||||||
|
@ -643,10 +643,10 @@ NOUNS_IRREG = {
|
||||||
"epiphenomena": ("epiphenomenon",),
|
"epiphenomena": ("epiphenomenon",),
|
||||||
"epiphyses": ("epiphysis",),
|
"epiphyses": ("epiphysis",),
|
||||||
"episterna": ("episternum",),
|
"episterna": ("episternum",),
|
||||||
"epithalamia": ("epithalamion", "epithalamium",),
|
"epithalamia": ("epithalamion", "epithalamium"),
|
||||||
"epithelia": ("epithelium",),
|
"epithelia": ("epithelium",),
|
||||||
"epitheliomata": ("epithelioma",),
|
"epitheliomata": ("epithelioma",),
|
||||||
"epizoa": ("epizoan", "epizoon",),
|
"epizoa": ("epizoan", "epizoon"),
|
||||||
"epyllia": ("epyllion",),
|
"epyllia": ("epyllion",),
|
||||||
"equilibria": ("equilibrium",),
|
"equilibria": ("equilibrium",),
|
||||||
"equiseta": ("equisetum",),
|
"equiseta": ("equisetum",),
|
||||||
|
@ -845,11 +845,11 @@ NOUNS_IRREG = {
|
||||||
"groszy": ("grosz",),
|
"groszy": ("grosz",),
|
||||||
"grottoes": ("grotto",),
|
"grottoes": ("grotto",),
|
||||||
"guilder": ("guilde",),
|
"guilder": ("guilde",),
|
||||||
"guilders": ("guilde", "guilder",),
|
"guilders": ("guilde", "guilder"),
|
||||||
"guitarfishes": ("guitarfish",),
|
"guitarfishes": ("guitarfish",),
|
||||||
"gummata": ("gumma",),
|
"gummata": ("gumma",),
|
||||||
"gurnard": ("gurnar",),
|
"gurnard": ("gurnar",),
|
||||||
"gurnards": ("gurnar", "gurnard",),
|
"gurnards": ("gurnar", "gurnard"),
|
||||||
"guttae": ("gutta",),
|
"guttae": ("gutta",),
|
||||||
"gymnasia": ("gymnasium",),
|
"gymnasia": ("gymnasium",),
|
||||||
"gynaecea": ("gynaeceum",),
|
"gynaecea": ("gynaeceum",),
|
||||||
|
@ -870,7 +870,7 @@ NOUNS_IRREG = {
|
||||||
"haeredes": ("haeres",),
|
"haeredes": ("haeres",),
|
||||||
"haftaroth": ("haftarah",),
|
"haftaroth": ("haftarah",),
|
||||||
"hagfishes": ("hagfish",),
|
"hagfishes": ("hagfish",),
|
||||||
"haggadas": ("haggada", "haggadah",),
|
"haggadas": ("haggada", "haggadah"),
|
||||||
"haggadoth": ("haggada",),
|
"haggadoth": ("haggada",),
|
||||||
"hajjes": ("hajj",),
|
"hajjes": ("hajj",),
|
||||||
"haleru": ("haler",),
|
"haleru": ("haler",),
|
||||||
|
@ -879,7 +879,7 @@ NOUNS_IRREG = {
|
||||||
"halloth": ("hallah",),
|
"halloth": ("hallah",),
|
||||||
"halluces": ("hallux",),
|
"halluces": ("hallux",),
|
||||||
"haloes": ("halo",),
|
"haloes": ("halo",),
|
||||||
"halteres": ("halter", "haltere",),
|
"halteres": ("halter", "haltere"),
|
||||||
"halves": ("half",),
|
"halves": ("half",),
|
||||||
"hamuli": ("hamulus",),
|
"hamuli": ("hamulus",),
|
||||||
"hangers-on": ("hanger-on",),
|
"hangers-on": ("hanger-on",),
|
||||||
|
@ -909,7 +909,7 @@ NOUNS_IRREG = {
|
||||||
"heraclidae": ("heraclid",),
|
"heraclidae": ("heraclid",),
|
||||||
"heraklidae": ("heraklid",),
|
"heraklidae": ("heraklid",),
|
||||||
"herbaria": ("herbarium",),
|
"herbaria": ("herbarium",),
|
||||||
"hermae": ("herm", "herma",),
|
"hermae": ("herm", "herma"),
|
||||||
"hermai": ("herma",),
|
"hermai": ("herma",),
|
||||||
"herniae": ("hernia",),
|
"herniae": ("hernia",),
|
||||||
"heroes": ("hero",),
|
"heroes": ("hero",),
|
||||||
|
@ -955,8 +955,8 @@ NOUNS_IRREG = {
|
||||||
"ibices": ("ibex",),
|
"ibices": ("ibex",),
|
||||||
"ibo": ("igbo",),
|
"ibo": ("igbo",),
|
||||||
"ichthyosauri": ("ichthyosaurus",),
|
"ichthyosauri": ("ichthyosaurus",),
|
||||||
"ichthyosauruses": ("ichthyosaur", "ichthyosaurus",),
|
"ichthyosauruses": ("ichthyosaur", "ichthyosaurus"),
|
||||||
"iconostases": ("iconostas", "iconostasis",),
|
"iconostases": ("iconostas", "iconostasis"),
|
||||||
"icosahedra": ("icosahedron",),
|
"icosahedra": ("icosahedron",),
|
||||||
"ideata": ("ideatum",),
|
"ideata": ("ideatum",),
|
||||||
"igorrorote": ("igorrote",),
|
"igorrorote": ("igorrote",),
|
||||||
|
@ -991,7 +991,7 @@ NOUNS_IRREG = {
|
||||||
"is": ("is",),
|
"is": ("is",),
|
||||||
"ischia": ("ischium",),
|
"ischia": ("ischium",),
|
||||||
"isthmi": ("isthmus",),
|
"isthmi": ("isthmus",),
|
||||||
"jackeroos": ("jackaroo", "jackeroo",),
|
"jackeroos": ("jackaroo", "jackeroo"),
|
||||||
"jackfishes": ("jackfish",),
|
"jackfishes": ("jackfish",),
|
||||||
"jackknives": ("jackknife",),
|
"jackknives": ("jackknife",),
|
||||||
"jacks-in-the-box": ("jack-in-the-box",),
|
"jacks-in-the-box": ("jack-in-the-box",),
|
||||||
|
@ -1001,12 +1001,12 @@ NOUNS_IRREG = {
|
||||||
"jewfishes": ("jewfish",),
|
"jewfishes": ("jewfish",),
|
||||||
"jingoes": ("jingo",),
|
"jingoes": ("jingo",),
|
||||||
"jinn": ("jinni",),
|
"jinn": ("jinni",),
|
||||||
"joes": ("jo", "joe",),
|
"joes": ("jo", "joe"),
|
||||||
"judge_advocates_general": ("judge_advocate_general",),
|
"judge_advocates_general": ("judge_advocate_general",),
|
||||||
"jura": ("jus",),
|
"jura": ("jus",),
|
||||||
"kaddishim": ("kaddish",),
|
"kaddishim": ("kaddish",),
|
||||||
"kalmuck": ("kalmuc",),
|
"kalmuck": ("kalmuc",),
|
||||||
"kalmucks": ("kalmuc", "kalmuck",),
|
"kalmucks": ("kalmuc", "kalmuck"),
|
||||||
"katabases": ("katabasis",),
|
"katabases": ("katabasis",),
|
||||||
"keeshonden": ("keeshond",),
|
"keeshonden": ("keeshond",),
|
||||||
"kibbutzim": ("kibbutz",),
|
"kibbutzim": ("kibbutz",),
|
||||||
|
@ -1045,7 +1045,7 @@ NOUNS_IRREG = {
|
||||||
"latifundia": ("latifundium",),
|
"latifundia": ("latifundium",),
|
||||||
"latu": ("lat",),
|
"latu": ("lat",),
|
||||||
"lavaboes": ("lavabo",),
|
"lavaboes": ("lavabo",),
|
||||||
"leaves": ("leaf", "leave",),
|
"leaves": ("leaf", "leave"),
|
||||||
"lecythi": ("lecythus",),
|
"lecythi": ("lecythus",),
|
||||||
"leges": ("lex",),
|
"leges": ("lex",),
|
||||||
"lei": ("leu",),
|
"lei": ("leu",),
|
||||||
|
@ -1078,7 +1078,7 @@ NOUNS_IRREG = {
|
||||||
"liriodendra": ("liriodendron",),
|
"liriodendra": ("liriodendron",),
|
||||||
"lisente": ("sente",),
|
"lisente": ("sente",),
|
||||||
"listente": ("sente",),
|
"listente": ("sente",),
|
||||||
"litai": ("lit", "litas",),
|
"litai": ("lit", "litas"),
|
||||||
"litu": ("litas",),
|
"litu": ("litas",),
|
||||||
"lives": ("life",),
|
"lives": ("life",),
|
||||||
"lixivia": ("lixivium",),
|
"lixivia": ("lixivium",),
|
||||||
|
@ -1098,7 +1098,7 @@ NOUNS_IRREG = {
|
||||||
"lumpfishes": ("lumpfish",),
|
"lumpfishes": ("lumpfish",),
|
||||||
"lungfishes": ("lungfish",),
|
"lungfishes": ("lungfish",),
|
||||||
"lunulae": ("lunula",),
|
"lunulae": ("lunula",),
|
||||||
"lures": ("lur", "lure",),
|
"lures": ("lur", "lure"),
|
||||||
"lustra": ("lustre",),
|
"lustra": ("lustre",),
|
||||||
"lyings-in": ("lying-in",),
|
"lyings-in": ("lying-in",),
|
||||||
"lymphangitides": ("lymphangitis",),
|
"lymphangitides": ("lymphangitis",),
|
||||||
|
@ -1142,7 +1142,7 @@ NOUNS_IRREG = {
|
||||||
"marsupia": ("marsupium",),
|
"marsupia": ("marsupium",),
|
||||||
"marvels-of-peru": ("marvel-of-peru",),
|
"marvels-of-peru": ("marvel-of-peru",),
|
||||||
"mass_media": ("mass_medium",),
|
"mass_media": ("mass_medium",),
|
||||||
"masses": ("mass", "masse",),
|
"masses": ("mass", "masse"),
|
||||||
"masters-at-arms": ("master-at-arms",),
|
"masters-at-arms": ("master-at-arms",),
|
||||||
"matrices": ("matrix",),
|
"matrices": ("matrix",),
|
||||||
"matzoth": ("matzo",),
|
"matzoth": ("matzo",),
|
||||||
|
@ -1210,7 +1210,7 @@ NOUNS_IRREG = {
|
||||||
"mioses": ("miosis",),
|
"mioses": ("miosis",),
|
||||||
"miracidia": ("miracidium",),
|
"miracidia": ("miracidium",),
|
||||||
"miri": ("mir",),
|
"miri": ("mir",),
|
||||||
"mishnayoth": ("mishna", "mishnah",),
|
"mishnayoth": ("mishna", "mishnah"),
|
||||||
"mitochondria": ("mitochondrion",),
|
"mitochondria": ("mitochondrion",),
|
||||||
"mitzvoth": ("mitzvah",),
|
"mitzvoth": ("mitzvah",),
|
||||||
"modioli": ("modiolus",),
|
"modioli": ("modiolus",),
|
||||||
|
@ -1218,7 +1218,7 @@ NOUNS_IRREG = {
|
||||||
"momenta": ("momentum",),
|
"momenta": ("momentum",),
|
||||||
"moments_of_truth": ("moment_of_truth",),
|
"moments_of_truth": ("moment_of_truth",),
|
||||||
"momi": ("momus",),
|
"momi": ("momus",),
|
||||||
"monades": ("monad", "monas",),
|
"monades": ("monad", "monas"),
|
||||||
"monkfishes": ("monkfish",),
|
"monkfishes": ("monkfish",),
|
||||||
"monochasia": ("monochasium",),
|
"monochasia": ("monochasium",),
|
||||||
"monopodia": ("monopodium",),
|
"monopodia": ("monopodium",),
|
||||||
|
@ -1235,7 +1235,7 @@ NOUNS_IRREG = {
|
||||||
"moriscoes": ("morisco",),
|
"moriscoes": ("morisco",),
|
||||||
"morphallaxes": ("morphallaxis",),
|
"morphallaxes": ("morphallaxis",),
|
||||||
"morphoses": ("morphosis",),
|
"morphoses": ("morphosis",),
|
||||||
"morses": ("morse", "mors",),
|
"morses": ("morse", "mors"),
|
||||||
"morulae": ("morula",),
|
"morulae": ("morula",),
|
||||||
"mosasauri": ("mosasaurus",),
|
"mosasauri": ("mosasaurus",),
|
||||||
"moshavim": ("moshav",),
|
"moshavim": ("moshav",),
|
||||||
|
@ -1328,13 +1328,13 @@ NOUNS_IRREG = {
|
||||||
"oceanides": ("oceanid",),
|
"oceanides": ("oceanid",),
|
||||||
"ocelli": ("ocellus",),
|
"ocelli": ("ocellus",),
|
||||||
"ochreae": ("ochrea",),
|
"ochreae": ("ochrea",),
|
||||||
"ocreae": ("ochrea", "ocrea",),
|
"ocreae": ("ochrea", "ocrea"),
|
||||||
"octahedra": ("octahedron",),
|
"octahedra": ("octahedron",),
|
||||||
"octopi": ("octopus",),
|
"octopi": ("octopus",),
|
||||||
"oculi": ("oculus",),
|
"oculi": ("oculus",),
|
||||||
"odea": ("odeum",),
|
"odea": ("odeum",),
|
||||||
"oedemata": ("edema", "oedema",),
|
"oedemata": ("edema", "oedema"),
|
||||||
"oesophagi": ("esophagus", "oesophagus",),
|
"oesophagi": ("esophagus", "oesophagus"),
|
||||||
"oldwives": ("oldwife",),
|
"oldwives": ("oldwife",),
|
||||||
"olea": ("oleum",),
|
"olea": ("oleum",),
|
||||||
"omasa": ("omasum",),
|
"omasa": ("omasum",),
|
||||||
|
@ -1350,15 +1350,15 @@ NOUNS_IRREG = {
|
||||||
"optic_axes": ("optic_axis",),
|
"optic_axes": ("optic_axis",),
|
||||||
"optima": ("optimum",),
|
"optima": ("optimum",),
|
||||||
"ora": ("os",),
|
"ora": ("os",),
|
||||||
"organa": ("organon", "organum",),
|
"organa": ("organon", "organum"),
|
||||||
"organums": ("organa", "organum",),
|
"organums": ("organa", "organum"),
|
||||||
"orthoptera": ("orthopteron",),
|
"orthoptera": ("orthopteron",),
|
||||||
"osar": ("os",),
|
"osar": ("os",),
|
||||||
"oscula": ("osculum",),
|
"oscula": ("osculum",),
|
||||||
"ossa": ("os",),
|
"ossa": ("os",),
|
||||||
"osteomata": ("osteoma",),
|
"osteomata": ("osteoma",),
|
||||||
"ostia": ("ostium",),
|
"ostia": ("ostium",),
|
||||||
"ottomans": ("othman", "ottoman",),
|
"ottomans": ("othman", "ottoman"),
|
||||||
"ova": ("ovum",),
|
"ova": ("ovum",),
|
||||||
"ovoli": ("ovolo",),
|
"ovoli": ("ovolo",),
|
||||||
"ovotestes": ("ovotestis",),
|
"ovotestes": ("ovotestis",),
|
||||||
|
@ -1382,7 +1382,7 @@ NOUNS_IRREG = {
|
||||||
"papulae": ("papula",),
|
"papulae": ("papula",),
|
||||||
"papyri": ("papyrus",),
|
"papyri": ("papyrus",),
|
||||||
"parabases": ("parabasis",),
|
"parabases": ("parabasis",),
|
||||||
"paraleipses": ("paraleipsis", "paralipsis",),
|
"paraleipses": ("paraleipsis", "paralipsis"),
|
||||||
"paralyses": ("paralysis",),
|
"paralyses": ("paralysis",),
|
||||||
"paramecia": ("paramecium",),
|
"paramecia": ("paramecium",),
|
||||||
"paramenta": ("parament",),
|
"paramenta": ("parament",),
|
||||||
|
@ -1442,13 +1442,13 @@ NOUNS_IRREG = {
|
||||||
"personae": ("persona",),
|
"personae": ("persona",),
|
||||||
"petechiae": ("petechia",),
|
"petechiae": ("petechia",),
|
||||||
"pfennige": ("pfennig",),
|
"pfennige": ("pfennig",),
|
||||||
"phalanges": ("phalange", "phalanx",),
|
"phalanges": ("phalange", "phalanx"),
|
||||||
"phalli": ("phallus",),
|
"phalli": ("phallus",),
|
||||||
"pharynges": ("pharynx",),
|
"pharynges": ("pharynx",),
|
||||||
"phenomena": ("phenomenon",),
|
"phenomena": ("phenomenon",),
|
||||||
"phi-phenomena": ("phi-phenomenon",),
|
"phi-phenomena": ("phi-phenomenon",),
|
||||||
"philodendra": ("philodendron",),
|
"philodendra": ("philodendron",),
|
||||||
"phlyctenae": ("phlyctaena", "phlyctena",),
|
"phlyctenae": ("phlyctaena", "phlyctena"),
|
||||||
"phyla": ("phylum",),
|
"phyla": ("phylum",),
|
||||||
"phylae": ("phyle",),
|
"phylae": ("phyle",),
|
||||||
"phyllotaxes": ("phyllotaxis",),
|
"phyllotaxes": ("phyllotaxis",),
|
||||||
|
@ -1475,12 +1475,12 @@ NOUNS_IRREG = {
|
||||||
"plasmodesmata": ("plasmodesma",),
|
"plasmodesmata": ("plasmodesma",),
|
||||||
"plasmodia": ("plasmodium",),
|
"plasmodia": ("plasmodium",),
|
||||||
"plateaux": ("plateau",),
|
"plateaux": ("plateau",),
|
||||||
"plectra": ("plectron", "plectrum",),
|
"plectra": ("plectron", "plectrum"),
|
||||||
"plena": ("plenum",),
|
"plena": ("plenum",),
|
||||||
"pleura": ("pleuron",),
|
"pleura": ("pleuron",),
|
||||||
"pleurae": ("pleura",),
|
"pleurae": ("pleura",),
|
||||||
"plicae": ("plica",),
|
"plicae": ("plica",),
|
||||||
"ploughmen": ("ploughman", "plowman",),
|
"ploughmen": ("ploughman", "plowman"),
|
||||||
"pneumobacilli": ("pneumobacillus",),
|
"pneumobacilli": ("pneumobacillus",),
|
||||||
"pneumococci": ("pneumococcus",),
|
"pneumococci": ("pneumococcus",),
|
||||||
"pocketknives": ("pocketknife",),
|
"pocketknives": ("pocketknife",),
|
||||||
|
@ -1515,7 +1515,7 @@ NOUNS_IRREG = {
|
||||||
"principia": ("principium",),
|
"principia": ("principium",),
|
||||||
"proboscides": ("proboscis",),
|
"proboscides": ("proboscis",),
|
||||||
"proces-verbaux": ("proces-verbal",),
|
"proces-verbaux": ("proces-verbal",),
|
||||||
"proglottides": ("proglottid", "proglottis",),
|
"proglottides": ("proglottid", "proglottis"),
|
||||||
"prognoses": ("prognosis",),
|
"prognoses": ("prognosis",),
|
||||||
"prolegomena": ("prolegomenon",),
|
"prolegomena": ("prolegomenon",),
|
||||||
"prolepses": ("prolepsis",),
|
"prolepses": ("prolepsis",),
|
||||||
|
@ -1532,7 +1532,7 @@ NOUNS_IRREG = {
|
||||||
"prostheses": ("prosthesis",),
|
"prostheses": ("prosthesis",),
|
||||||
"prostomia": ("prostomium",),
|
"prostomia": ("prostomium",),
|
||||||
"protases": ("protasis",),
|
"protases": ("protasis",),
|
||||||
"prothalamia": ("prothalamion", "prothalamium",),
|
"prothalamia": ("prothalamion", "prothalamium"),
|
||||||
"prothalli": ("prothallus",),
|
"prothalli": ("prothallus",),
|
||||||
"prothallia": ("prothallium",),
|
"prothallia": ("prothallium",),
|
||||||
"prothoraces": ("prothorax",),
|
"prothoraces": ("prothorax",),
|
||||||
|
@ -1572,7 +1572,7 @@ NOUNS_IRREG = {
|
||||||
"quezales": ("quezal",),
|
"quezales": ("quezal",),
|
||||||
"quinquennia": ("quinquennium",),
|
"quinquennia": ("quinquennium",),
|
||||||
"quizzes": ("quiz",),
|
"quizzes": ("quiz",),
|
||||||
"rabatos": ("rabato", "rebato",),
|
"rabatos": ("rabato", "rebato"),
|
||||||
"rabbitfishes": ("rabbitfish",),
|
"rabbitfishes": ("rabbitfish",),
|
||||||
"rachides": ("rhachis",),
|
"rachides": ("rhachis",),
|
||||||
"radices": ("radix",),
|
"radices": ("radix",),
|
||||||
|
@ -1583,7 +1583,7 @@ NOUNS_IRREG = {
|
||||||
"ranulae": ("ranula",),
|
"ranulae": ("ranula",),
|
||||||
"ranunculi": ("ranunculus",),
|
"ranunculi": ("ranunculus",),
|
||||||
"raphae": ("raphe",),
|
"raphae": ("raphe",),
|
||||||
"raphides": ("raphide", "raphis",),
|
"raphides": ("raphide", "raphis"),
|
||||||
"ratfishes": ("ratfish",),
|
"ratfishes": ("ratfish",),
|
||||||
"reales": ("real",),
|
"reales": ("real",),
|
||||||
"rearmice": ("rearmouse",),
|
"rearmice": ("rearmouse",),
|
||||||
|
@ -1598,7 +1598,7 @@ NOUNS_IRREG = {
|
||||||
"reis": ("real",),
|
"reis": ("real",),
|
||||||
"relata": ("relatum",),
|
"relata": ("relatum",),
|
||||||
"remiges": ("remex",),
|
"remiges": ("remex",),
|
||||||
"reremice": ("rearmouse", "reremouse",),
|
"reremice": ("rearmouse", "reremouse"),
|
||||||
"reseaux": ("reseau",),
|
"reseaux": ("reseau",),
|
||||||
"residua": ("residuum",),
|
"residua": ("residuum",),
|
||||||
"responsa": ("responsum",),
|
"responsa": ("responsum",),
|
||||||
|
@ -1609,7 +1609,7 @@ NOUNS_IRREG = {
|
||||||
"retinae": ("retina",),
|
"retinae": ("retina",),
|
||||||
"rhabdomyomata": ("rhabdomyoma",),
|
"rhabdomyomata": ("rhabdomyoma",),
|
||||||
"rhachides": ("rhachis",),
|
"rhachides": ("rhachis",),
|
||||||
"rhachises": ("rachis", "rhachis",),
|
"rhachises": ("rachis", "rhachis"),
|
||||||
"rhinencephala": ("rhinencephalon",),
|
"rhinencephala": ("rhinencephalon",),
|
||||||
"rhizobia": ("rhizobium",),
|
"rhizobia": ("rhizobium",),
|
||||||
"rhombi": ("rhombus",),
|
"rhombi": ("rhombus",),
|
||||||
|
@ -1636,7 +1636,7 @@ NOUNS_IRREG = {
|
||||||
"runners-up": ("runner-up",),
|
"runners-up": ("runner-up",),
|
||||||
"sacra": ("sacrum",),
|
"sacra": ("sacrum",),
|
||||||
"sacraria": ("sacrarium",),
|
"sacraria": ("sacrarium",),
|
||||||
"saguaros": ("saguaro", "sahuaro",),
|
"saguaros": ("saguaro", "sahuaro"),
|
||||||
"sailfishes": ("sailfish",),
|
"sailfishes": ("sailfish",),
|
||||||
"salespeople": ("salesperson",),
|
"salespeople": ("salesperson",),
|
||||||
"salmonellae": ("salmonella",),
|
"salmonellae": ("salmonella",),
|
||||||
|
@ -1657,7 +1657,7 @@ NOUNS_IRREG = {
|
||||||
"scapulae": ("scapula",),
|
"scapulae": ("scapula",),
|
||||||
"scarabaei": ("scarabaeus",),
|
"scarabaei": ("scarabaeus",),
|
||||||
"scarves": ("scarf",),
|
"scarves": ("scarf",),
|
||||||
"schatchonim": ("schatchen", "shadchan",),
|
"schatchonim": ("schatchen", "shadchan"),
|
||||||
"schemata": ("schema",),
|
"schemata": ("schema",),
|
||||||
"scherzandi": ("scherzando",),
|
"scherzandi": ("scherzando",),
|
||||||
"scherzi": ("scherzo",),
|
"scherzi": ("scherzo",),
|
||||||
|
@ -1690,7 +1690,7 @@ NOUNS_IRREG = {
|
||||||
"senores": ("senor",),
|
"senores": ("senor",),
|
||||||
"sensilla": ("sensillum",),
|
"sensilla": ("sensillum",),
|
||||||
"senti": ("sent",),
|
"senti": ("sent",),
|
||||||
"senussis": ("senusi", "senussi",),
|
"senussis": ("senusi", "senussi"),
|
||||||
"separatrices": ("separatrix",),
|
"separatrices": ("separatrix",),
|
||||||
"sephardim": ("sephardi",),
|
"sephardim": ("sephardi",),
|
||||||
"septa": ("septum",),
|
"septa": ("septum",),
|
||||||
|
@ -1707,9 +1707,9 @@ NOUNS_IRREG = {
|
||||||
"shabbatim": ("shabbat",),
|
"shabbatim": ("shabbat",),
|
||||||
"shackoes": ("shacko",),
|
"shackoes": ("shacko",),
|
||||||
"shadchanim": ("shadchan",),
|
"shadchanim": ("shadchan",),
|
||||||
"shadchans": ("schatchen", "shadchan",),
|
"shadchans": ("schatchen", "shadchan"),
|
||||||
"shakoes": ("shako",),
|
"shakoes": ("shako",),
|
||||||
"shammosim": ("shammas", "shammes",),
|
"shammosim": ("shammas", "shammes"),
|
||||||
"sheatfishes": ("sheatfish",),
|
"sheatfishes": ("sheatfish",),
|
||||||
"sheaves": ("sheaf",),
|
"sheaves": ("sheaf",),
|
||||||
"shellfishes": ("shellfish",),
|
"shellfishes": ("shellfish",),
|
||||||
|
@ -1717,14 +1717,14 @@ NOUNS_IRREG = {
|
||||||
"shinleaves": ("shinleaf",),
|
"shinleaves": ("shinleaf",),
|
||||||
"shittim": ("shittah",),
|
"shittim": ("shittah",),
|
||||||
"shmoes": ("shmo",),
|
"shmoes": ("shmo",),
|
||||||
"shofroth": ("shofar", "shophar",),
|
"shofroth": ("shofar", "shophar"),
|
||||||
"shophroth": ("shophar",),
|
"shophroth": ("shophar",),
|
||||||
"shrewmice": ("shrewmouse",),
|
"shrewmice": ("shrewmouse",),
|
||||||
"shuln": ("shul",),
|
"shuln": ("shul",),
|
||||||
"siddurim": ("siddur",),
|
"siddurim": ("siddur",),
|
||||||
"sigloi": ("siglos",),
|
"sigloi": ("siglos",),
|
||||||
"signore": ("signora",),
|
"signore": ("signora",),
|
||||||
"signori": ("signior", "signore",),
|
"signori": ("signior", "signore"),
|
||||||
"signorine": ("signorina",),
|
"signorine": ("signorina",),
|
||||||
"siliquae": ("siliqua",),
|
"siliquae": ("siliqua",),
|
||||||
"silvae": ("silva",),
|
"silvae": ("silva",),
|
||||||
|
@ -1739,12 +1739,12 @@ NOUNS_IRREG = {
|
||||||
"snaggleteeth": ("snaggletooth",),
|
"snaggleteeth": ("snaggletooth",),
|
||||||
"snailfishes": ("snailfish",),
|
"snailfishes": ("snailfish",),
|
||||||
"snipefishes": ("snipefish",),
|
"snipefishes": ("snipefish",),
|
||||||
"socmen": ("socman", "sokeman",),
|
"socmen": ("socman", "sokeman"),
|
||||||
"sola": ("solum",),
|
"sola": ("solum",),
|
||||||
"solaria": ("solarium",),
|
"solaria": ("solarium",),
|
||||||
"solatia": ("solatium",),
|
"solatia": ("solatium",),
|
||||||
"soldi": ("soldo",),
|
"soldi": ("soldo",),
|
||||||
"soles": ("sol", "sole",),
|
"soles": ("sol", "sole"),
|
||||||
"solfeggi": ("solfeggio",),
|
"solfeggi": ("solfeggio",),
|
||||||
"soli": ("solo",),
|
"soli": ("solo",),
|
||||||
"solidi": ("solidus",),
|
"solidi": ("solidus",),
|
||||||
|
@ -1864,7 +1864,7 @@ NOUNS_IRREG = {
|
||||||
"syringes": ("syrinx",),
|
"syringes": ("syrinx",),
|
||||||
"syssarcoses": ("syssarcosis",),
|
"syssarcoses": ("syssarcosis",),
|
||||||
"tableaux": ("tableau",),
|
"tableaux": ("tableau",),
|
||||||
"taeniae": ("taenia", "tenia",),
|
"taeniae": ("taenia", "tenia"),
|
||||||
"tali": ("talus",),
|
"tali": ("talus",),
|
||||||
"tallaisim": ("tallith",),
|
"tallaisim": ("tallith",),
|
||||||
"tallithes": ("tallith",),
|
"tallithes": ("tallith",),
|
||||||
|
@ -1874,14 +1874,14 @@ NOUNS_IRREG = {
|
||||||
"tarsi": ("tarsus",),
|
"tarsi": ("tarsus",),
|
||||||
"tarsometatarsi": ("tarsometatarsus",),
|
"tarsometatarsi": ("tarsometatarsus",),
|
||||||
"taxa": ("taxon",),
|
"taxa": ("taxon",),
|
||||||
"taxes": ("tax", "taxis",),
|
"taxes": ("tax", "taxis"),
|
||||||
"taxies": ("taxi",),
|
"taxies": ("taxi",),
|
||||||
"tectrices": ("tectrix",),
|
"tectrices": ("tectrix",),
|
||||||
"teeth": ("tooth",),
|
"teeth": ("tooth",),
|
||||||
"tegmina": ("tegmen",),
|
"tegmina": ("tegmen",),
|
||||||
"telae": ("tela",),
|
"telae": ("tela",),
|
||||||
"telamones": ("telamon",),
|
"telamones": ("telamon",),
|
||||||
"telangiectases": ("telangiectasia", "telangiectasis",),
|
"telangiectases": ("telangiectasia", "telangiectasis"),
|
||||||
"telia": ("telium",),
|
"telia": ("telium",),
|
||||||
"tempi": ("tempo",),
|
"tempi": ("tempo",),
|
||||||
"tenacula": ("tenaculum",),
|
"tenacula": ("tenaculum",),
|
||||||
|
@ -1932,7 +1932,7 @@ NOUNS_IRREG = {
|
||||||
"tornadoes": ("tornado",),
|
"tornadoes": ("tornado",),
|
||||||
"torpedoes": ("torpedo",),
|
"torpedoes": ("torpedo",),
|
||||||
"torsi": ("torso",),
|
"torsi": ("torso",),
|
||||||
"touracos": ("touraco", "turaco",),
|
"touracos": ("touraco", "turaco"),
|
||||||
"trabeculae": ("trabecula",),
|
"trabeculae": ("trabecula",),
|
||||||
"tracheae": ("trachea",),
|
"tracheae": ("trachea",),
|
||||||
"traditores": ("traditor",),
|
"traditores": ("traditor",),
|
||||||
|
@ -1960,7 +1960,7 @@ NOUNS_IRREG = {
|
||||||
"tubae": ("tuba",),
|
"tubae": ("tuba",),
|
||||||
"turves": ("turf",),
|
"turves": ("turf",),
|
||||||
"tympana": ("tympanum",),
|
"tympana": ("tympanum",),
|
||||||
"tyros": ("tiro", "tyro",),
|
"tyros": ("tiro", "tyro"),
|
||||||
"ubermenschen": ("ubermensch",),
|
"ubermenschen": ("ubermensch",),
|
||||||
"uglies": ("ugli",),
|
"uglies": ("ugli",),
|
||||||
"uigurs": ("uighur",),
|
"uigurs": ("uighur",),
|
||||||
|
@ -1980,7 +1980,7 @@ NOUNS_IRREG = {
|
||||||
"utriculi": ("utriculus",),
|
"utriculi": ("utriculus",),
|
||||||
"uvulae": ("uvula",),
|
"uvulae": ("uvula",),
|
||||||
"vacua": ("vacuum",),
|
"vacua": ("vacuum",),
|
||||||
"vagi": ("vagus", "vagus",),
|
"vagi": ("vagus", "vagus"),
|
||||||
"vaginae": ("vagina",),
|
"vaginae": ("vagina",),
|
||||||
"valleculae": ("vallecula",),
|
"valleculae": ("vallecula",),
|
||||||
"vaporetti": ("vaporetto",),
|
"vaporetti": ("vaporetto",),
|
||||||
|
@ -2026,7 +2026,7 @@ NOUNS_IRREG = {
|
||||||
"vortices": ("vortex",),
|
"vortices": ("vortex",),
|
||||||
"vulvae": ("vulva",),
|
"vulvae": ("vulva",),
|
||||||
"wagons-lits": ("wagon-lit",),
|
"wagons-lits": ("wagon-lit",),
|
||||||
"wahhabis": ("wahabi", "wahhabi",),
|
"wahhabis": ("wahabi", "wahhabi"),
|
||||||
"wanderjahre": ("wanderjahr",),
|
"wanderjahre": ("wanderjahr",),
|
||||||
"weakfishes": ("weakfish",),
|
"weakfishes": ("weakfish",),
|
||||||
"werewolves": ("werewolf",),
|
"werewolves": ("werewolf",),
|
||||||
|
@ -2044,13 +2044,13 @@ NOUNS_IRREG = {
|
||||||
"yeshivoth": ("yeshiva",),
|
"yeshivoth": ("yeshiva",),
|
||||||
"yogin": ("yogi",),
|
"yogin": ("yogi",),
|
||||||
"yourselves": ("yourself",),
|
"yourselves": ("yourself",),
|
||||||
"zamindaris": ("zamindari", "zemindari",),
|
"zamindaris": ("zamindari", "zemindari"),
|
||||||
"zecchini": ("zecchino",),
|
"zecchini": ("zecchino",),
|
||||||
"zeroes": ("zero",),
|
"zeroes": ("zero",),
|
||||||
"zoa": ("zoon",),
|
"zoa": ("zoon",),
|
||||||
"zoaeae": ("zoaea", "zoea",),
|
"zoaeae": ("zoaea", "zoea"),
|
||||||
"zoeae": ("zoea",),
|
"zoeae": ("zoea",),
|
||||||
"zoeas": ("zoaea",),
|
"zoeas": ("zoaea",),
|
||||||
"zoonoses": ("zoonosis",),
|
"zoonoses": ("zoonosis",),
|
||||||
"zoosporangia": ("zoosporangium",)
|
"zoosporangia": ("zoosporangium",),
|
||||||
}
|
}
|
||||||
|
|
|
@ -42,8 +42,8 @@ VERBS_IRREG = {
|
||||||
"anglified": ("anglify",),
|
"anglified": ("anglify",),
|
||||||
"annulled": ("annul",),
|
"annulled": ("annul",),
|
||||||
"annulling": ("annul",),
|
"annulling": ("annul",),
|
||||||
"appalled": ("appal", "appall",),
|
"appalled": ("appal", "appall"),
|
||||||
"appalling": ("appal", "appall",),
|
"appalling": ("appal", "appall"),
|
||||||
"applied": ("apply",),
|
"applied": ("apply",),
|
||||||
"arcked": ("arc",),
|
"arcked": ("arc",),
|
||||||
"arcking": ("arc",),
|
"arcking": ("arc",),
|
||||||
|
@ -244,9 +244,9 @@ VERBS_IRREG = {
|
||||||
"bypast": ("bypass",),
|
"bypast": ("bypass",),
|
||||||
"caballed": ("cabal",),
|
"caballed": ("cabal",),
|
||||||
"caballing": ("cabal",),
|
"caballing": ("cabal",),
|
||||||
"caddied": ("caddie", "caddy",),
|
"caddied": ("caddie", "caddy"),
|
||||||
"caddies": ("caddie", "caddy",),
|
"caddies": ("caddie", "caddy"),
|
||||||
"caddying": ("caddie", "caddy",),
|
"caddying": ("caddie", "caddy"),
|
||||||
"calcified": ("calcify",),
|
"calcified": ("calcify",),
|
||||||
"came": ("come",),
|
"came": ("come",),
|
||||||
"canalled": ("canal",),
|
"canalled": ("canal",),
|
||||||
|
@ -506,8 +506,8 @@ VERBS_IRREG = {
|
||||||
"disembodied": ("disembody",),
|
"disembodied": ("disembody",),
|
||||||
"disembowelled": ("disembowel",),
|
"disembowelled": ("disembowel",),
|
||||||
"disembowelling": ("disembowel",),
|
"disembowelling": ("disembowel",),
|
||||||
"disenthralled": ("disenthral", "disenthrall",),
|
"disenthralled": ("disenthral", "disenthrall"),
|
||||||
"disenthralling": ("disenthral", "disenthrall",),
|
"disenthralling": ("disenthral", "disenthrall"),
|
||||||
"disenthralls": ("disenthral",),
|
"disenthralls": ("disenthral",),
|
||||||
"disenthrals": ("disenthrall",),
|
"disenthrals": ("disenthrall",),
|
||||||
"dishevelled": ("dishevel",),
|
"dishevelled": ("dishevel",),
|
||||||
|
@ -518,8 +518,8 @@ VERBS_IRREG = {
|
||||||
"dispelling": ("dispel",),
|
"dispelling": ("dispel",),
|
||||||
"disqualified": ("disqualify",),
|
"disqualified": ("disqualify",),
|
||||||
"dissatisfied": ("dissatisfy",),
|
"dissatisfied": ("dissatisfy",),
|
||||||
"distilled": ("distil", "distill",),
|
"distilled": ("distil", "distill"),
|
||||||
"distilling": ("distil", "distill",),
|
"distilling": ("distil", "distill"),
|
||||||
"diversified": ("diversify",),
|
"diversified": ("diversify",),
|
||||||
"divvied": ("divvy",),
|
"divvied": ("divvy",),
|
||||||
"dizzied": ("dizzy",),
|
"dizzied": ("dizzy",),
|
||||||
|
@ -595,10 +595,10 @@ VERBS_IRREG = {
|
||||||
"enamelling": ("enamel",),
|
"enamelling": ("enamel",),
|
||||||
"englutted": ("englut",),
|
"englutted": ("englut",),
|
||||||
"englutting": ("englut",),
|
"englutting": ("englut",),
|
||||||
"enrolled": ("enrol", "enroll",),
|
"enrolled": ("enrol", "enroll"),
|
||||||
"enrolling": ("enrol", "enroll",),
|
"enrolling": ("enrol", "enroll"),
|
||||||
"enthralled": ("enthral", "enthrall",),
|
"enthralled": ("enthral", "enthrall"),
|
||||||
"enthralling": ("enthral", "enthrall",),
|
"enthralling": ("enthral", "enthrall"),
|
||||||
"entrammelled": ("entrammel",),
|
"entrammelled": ("entrammel",),
|
||||||
"entrammelling": ("entrammel",),
|
"entrammelling": ("entrammel",),
|
||||||
"entrapped": ("entrap",),
|
"entrapped": ("entrap",),
|
||||||
|
@ -621,8 +621,8 @@ VERBS_IRREG = {
|
||||||
"exemplified": ("exemplify",),
|
"exemplified": ("exemplify",),
|
||||||
"expelled": ("expel",),
|
"expelled": ("expel",),
|
||||||
"expelling": ("expel",),
|
"expelling": ("expel",),
|
||||||
"extolled": ("extol", "extoll",),
|
"extolled": ("extol", "extoll"),
|
||||||
"extolling": ("extol", "extoll",),
|
"extolling": ("extol", "extoll"),
|
||||||
"facetted": ("facet",),
|
"facetted": ("facet",),
|
||||||
"facetting": ("facet",),
|
"facetting": ("facet",),
|
||||||
"fagged": ("fag",),
|
"fagged": ("fag",),
|
||||||
|
@ -638,7 +638,7 @@ VERBS_IRREG = {
|
||||||
"featherbedded": ("featherbed",),
|
"featherbedded": ("featherbed",),
|
||||||
"featherbedding": ("featherbed",),
|
"featherbedding": ("featherbed",),
|
||||||
"fed": ("feed",),
|
"fed": ("feed",),
|
||||||
"feed": ("feed", "fee",),
|
"feed": ("feed", "fee"),
|
||||||
"fell": ("fall",),
|
"fell": ("fall",),
|
||||||
"felt": ("feel",),
|
"felt": ("feel",),
|
||||||
"ferried": ("ferry",),
|
"ferried": ("ferry",),
|
||||||
|
@ -744,8 +744,8 @@ VERBS_IRREG = {
|
||||||
"fried": ("fry",),
|
"fried": ("fry",),
|
||||||
"frigged": ("frig",),
|
"frigged": ("frig",),
|
||||||
"frigging": ("frig",),
|
"frigging": ("frig",),
|
||||||
"fritted": ("frit", "fritt",),
|
"fritted": ("frit", "fritt"),
|
||||||
"fritting": ("frit", "fritt",),
|
"fritting": ("frit", "fritt"),
|
||||||
"frivolled": ("frivol",),
|
"frivolled": ("frivol",),
|
||||||
"frivolling": ("frivol",),
|
"frivolling": ("frivol",),
|
||||||
"frogged": ("frog",),
|
"frogged": ("frog",),
|
||||||
|
@ -757,8 +757,8 @@ VERBS_IRREG = {
|
||||||
"fructified": ("fructify",),
|
"fructified": ("fructify",),
|
||||||
"fuelled": ("fuel",),
|
"fuelled": ("fuel",),
|
||||||
"fuelling": ("fuel",),
|
"fuelling": ("fuel",),
|
||||||
"fulfilled": ("fulfil", "fulfill",),
|
"fulfilled": ("fulfil", "fulfill"),
|
||||||
"fulfilling": ("fulfil", "fulfill",),
|
"fulfilling": ("fulfil", "fulfill"),
|
||||||
"funned": ("fun",),
|
"funned": ("fun",),
|
||||||
"funnelled": ("funnel",),
|
"funnelled": ("funnel",),
|
||||||
"funnelling": ("funnel",),
|
"funnelling": ("funnel",),
|
||||||
|
@ -955,8 +955,8 @@ VERBS_IRREG = {
|
||||||
"insetting": ("inset",),
|
"insetting": ("inset",),
|
||||||
"inspanned": ("inspan",),
|
"inspanned": ("inspan",),
|
||||||
"inspanning": ("inspan",),
|
"inspanning": ("inspan",),
|
||||||
"installed": ("instal", "install",),
|
"installed": ("instal", "install"),
|
||||||
"installing": ("instal", "install",),
|
"installing": ("instal", "install"),
|
||||||
"intensified": ("intensify",),
|
"intensified": ("intensify",),
|
||||||
"interbred": ("interbreed",),
|
"interbred": ("interbreed",),
|
||||||
"intercropped": ("intercrop",),
|
"intercropped": ("intercrop",),
|
||||||
|
@ -1303,7 +1303,7 @@ VERBS_IRREG = {
|
||||||
"overdriven": ("overdrive",),
|
"overdriven": ("overdrive",),
|
||||||
"overdrove": ("overdrive",),
|
"overdrove": ("overdrive",),
|
||||||
"overflew": ("overfly",),
|
"overflew": ("overfly",),
|
||||||
"overflown": ("overflow", "overfly",),
|
"overflown": ("overflow", "overfly"),
|
||||||
"overgrew": ("overgrow",),
|
"overgrew": ("overgrow",),
|
||||||
"overgrown": ("overgrow",),
|
"overgrown": ("overgrow",),
|
||||||
"overheard": ("overhear",),
|
"overheard": ("overhear",),
|
||||||
|
@ -1547,8 +1547,8 @@ VERBS_IRREG = {
|
||||||
"red": ("red",),
|
"red": ("red",),
|
||||||
"red-pencilled": ("red-pencil",),
|
"red-pencilled": ("red-pencil",),
|
||||||
"red-pencilling": ("red-pencil",),
|
"red-pencilling": ("red-pencil",),
|
||||||
"redded": ("red", "redd",),
|
"redded": ("red", "redd"),
|
||||||
"redding": ("red", "redd",),
|
"redding": ("red", "redd"),
|
||||||
"redid": ("redo",),
|
"redid": ("redo",),
|
||||||
"redone": ("redo",),
|
"redone": ("redo",),
|
||||||
"referred": ("refer",),
|
"referred": ("refer",),
|
||||||
|
@ -1763,7 +1763,7 @@ VERBS_IRREG = {
|
||||||
"signified": ("signify",),
|
"signified": ("signify",),
|
||||||
"silicified": ("silicify",),
|
"silicified": ("silicify",),
|
||||||
"simplified": ("simplify",),
|
"simplified": ("simplify",),
|
||||||
"singing": ("sing", "singe",),
|
"singing": ("sing", "singe"),
|
||||||
"single-stepped": ("single-step",),
|
"single-stepped": ("single-step",),
|
||||||
"single-stepping": ("single-step",),
|
"single-stepping": ("single-step",),
|
||||||
"sinned": ("sin",),
|
"sinned": ("sin",),
|
||||||
|
@ -2404,5 +2404,5 @@ VERBS_IRREG = {
|
||||||
"zigzagged": ("zigzag",),
|
"zigzagged": ("zigzag",),
|
||||||
"zigzagging": ("zigzag",),
|
"zigzagging": ("zigzag",),
|
||||||
"zipped": ("zip",),
|
"zipped": ("zip",),
|
||||||
"zipping": ("zip",)
|
"zipping": ("zip",),
|
||||||
}
|
}
|
||||||
|
|
|
@ -538,7 +538,7 @@ for orth in [
|
||||||
"Sen.",
|
"Sen.",
|
||||||
"St.",
|
"St.",
|
||||||
"vs.",
|
"vs.",
|
||||||
"v.s."
|
"v.s.",
|
||||||
]:
|
]:
|
||||||
_exc[orth] = [{ORTH: orth}]
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -20,14 +20,22 @@ from ....util import load_language_data
|
||||||
|
|
||||||
BASE_PATH = Path(__file__).parent
|
BASE_PATH = Path(__file__).parent
|
||||||
|
|
||||||
LOOKUP = load_language_data(BASE_PATH / 'lookup.json')
|
LOOKUP = load_language_data(BASE_PATH / "lookup.json")
|
||||||
VERBS_IRREG = load_language_data(BASE_PATH / '_verbs_irreg.json')
|
VERBS_IRREG = load_language_data(BASE_PATH / "_verbs_irreg.json")
|
||||||
ADJECTIVES_IRREG = load_language_data(BASE_PATH / '_adjectives_irreg.json')
|
ADJECTIVES_IRREG = load_language_data(BASE_PATH / "_adjectives_irreg.json")
|
||||||
|
|
||||||
LEMMA_INDEX = {'adj': ADJECTIVES, 'adv': ADVERBS, 'noun': NOUNS, 'verb': VERBS}
|
LEMMA_INDEX = {"adj": ADJECTIVES, "adv": ADVERBS, "noun": NOUNS, "verb": VERBS}
|
||||||
|
|
||||||
LEMMA_EXC = {'adj': ADJECTIVES_IRREG, 'adp': ADP_IRREG, 'aux': AUXILIARY_VERBS_IRREG,
|
LEMMA_EXC = {
|
||||||
'cconj': CCONJ_IRREG, 'det': DETS_IRREG, 'noun': NOUNS_IRREG, 'verb': VERBS_IRREG,
|
"adj": ADJECTIVES_IRREG,
|
||||||
'pron': PRONOUNS_IRREG, 'sconj': SCONJ_IRREG}
|
"adp": ADP_IRREG,
|
||||||
|
"aux": AUXILIARY_VERBS_IRREG,
|
||||||
|
"cconj": CCONJ_IRREG,
|
||||||
|
"det": DETS_IRREG,
|
||||||
|
"noun": NOUNS_IRREG,
|
||||||
|
"verb": VERBS_IRREG,
|
||||||
|
"pron": PRONOUNS_IRREG,
|
||||||
|
"sconj": SCONJ_IRREG,
|
||||||
|
}
|
||||||
|
|
||||||
LEMMA_RULES = {'adj': ADJECTIVE_RULES, 'noun': NOUN_RULES, 'verb': VERB_RULES}
|
LEMMA_RULES = {"adj": ADJECTIVE_RULES, "noun": NOUN_RULES, "verb": VERB_RULES}
|
||||||
|
|
|
@ -20,5 +20,5 @@ ADP_IRREG = {
|
||||||
"pr": ("pour",),
|
"pr": ("pour",),
|
||||||
"/": ("sur",),
|
"/": ("sur",),
|
||||||
"versus": ("vs",),
|
"versus": ("vs",),
|
||||||
"vs.": ("vs",)
|
"vs.": ("vs",),
|
||||||
}
|
}
|
||||||
|
|
|
@ -365,5 +365,5 @@ AUXILIARY_VERBS_IRREG = {
|
||||||
"va": ("aller",),
|
"va": ("aller",),
|
||||||
"vais": ("aller",),
|
"vais": ("aller",),
|
||||||
"vas": ("aller",),
|
"vas": ("aller",),
|
||||||
"vont": ("aller",)
|
"vont": ("aller",),
|
||||||
}
|
}
|
||||||
|
|
|
@ -13,5 +13,5 @@ CCONJ_IRREG = {
|
||||||
"i.e.": ("c'est-à-dire",),
|
"i.e.": ("c'est-à-dire",),
|
||||||
"ie": ("c'est-à-dire",),
|
"ie": ("c'est-à-dire",),
|
||||||
"ou/et": ("et-ou",),
|
"ou/et": ("et-ou",),
|
||||||
"+": ("plus",)
|
"+": ("plus",),
|
||||||
}
|
}
|
||||||
|
|
|
@ -9963,5 +9963,5 @@ NOUNS_IRREG = {
|
||||||
"zurichoises": ("zurichois",),
|
"zurichoises": ("zurichois",),
|
||||||
"zurichois": ("zurichois",),
|
"zurichois": ("zurichois",),
|
||||||
"zyras": ("zyras",),
|
"zyras": ("zyras",),
|
||||||
"zyzomys": ("zyzomys",)
|
"zyzomys": ("zyzomys",),
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,5 +15,5 @@ SCONJ_IRREG = {
|
||||||
"puisqu'": ("puisque",),
|
"puisqu'": ("puisque",),
|
||||||
"qd": ("quand",),
|
"qd": ("quand",),
|
||||||
"quoiqu'": ("quoique",),
|
"quoiqu'": ("quoique",),
|
||||||
"qu'": ("que",)
|
"qu'": ("que",),
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,20 +3,22 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from ....symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP, SCONJ, CCONJ
|
from ....symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
|
||||||
|
from ....symbols import SCONJ, CCONJ
|
||||||
from ....symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
|
from ....symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
|
||||||
from ....util import load_language_data
|
from ....util import load_language_data
|
||||||
|
|
||||||
LOOKUP = load_language_data(Path(__file__).parent / 'lookup.json')
|
LOOKUP = load_language_data(Path(__file__).parent / "lookup.json")
|
||||||
|
|
||||||
'''
|
"""
|
||||||
French language lemmatizer applies the default rule based lemmatization
|
French language lemmatizer applies the default rule based lemmatization
|
||||||
procedure with some modifications for better French language support.
|
procedure with some modifications for better French language support.
|
||||||
|
|
||||||
The parts of speech 'ADV', 'PRON', 'DET', 'ADP' and 'AUX' are added to use the
|
The parts of speech 'ADV', 'PRON', 'DET', 'ADP' and 'AUX' are added to use the
|
||||||
rule-based lemmatization. As a last resort, the lemmatizer checks in
|
rule-based lemmatization. As a last resort, the lemmatizer checks in
|
||||||
the lookup table.
|
the lookup table.
|
||||||
'''
|
"""
|
||||||
|
|
||||||
|
|
||||||
class FrenchLemmatizer(object):
|
class FrenchLemmatizer(object):
|
||||||
@classmethod
|
@classmethod
|
||||||
|
@ -32,36 +34,39 @@ class FrenchLemmatizer(object):
|
||||||
def __call__(self, string, univ_pos, morphology=None):
|
def __call__(self, string, univ_pos, morphology=None):
|
||||||
if not self.rules:
|
if not self.rules:
|
||||||
return [self.lookup_table.get(string, string)]
|
return [self.lookup_table.get(string, string)]
|
||||||
if univ_pos in (NOUN, 'NOUN', 'noun'):
|
if univ_pos in (NOUN, "NOUN", "noun"):
|
||||||
univ_pos = 'noun'
|
univ_pos = "noun"
|
||||||
elif univ_pos in (VERB, 'VERB', 'verb'):
|
elif univ_pos in (VERB, "VERB", "verb"):
|
||||||
univ_pos = 'verb'
|
univ_pos = "verb"
|
||||||
elif univ_pos in (ADJ, 'ADJ', 'adj'):
|
elif univ_pos in (ADJ, "ADJ", "adj"):
|
||||||
univ_pos = 'adj'
|
univ_pos = "adj"
|
||||||
elif univ_pos in (ADP, 'ADP', 'adp'):
|
elif univ_pos in (ADP, "ADP", "adp"):
|
||||||
univ_pos = 'adp'
|
univ_pos = "adp"
|
||||||
elif univ_pos in (ADV, 'ADV', 'adv'):
|
elif univ_pos in (ADV, "ADV", "adv"):
|
||||||
univ_pos = 'adv'
|
univ_pos = "adv"
|
||||||
elif univ_pos in (AUX, 'AUX', 'aux'):
|
elif univ_pos in (AUX, "AUX", "aux"):
|
||||||
univ_pos = 'aux'
|
univ_pos = "aux"
|
||||||
elif univ_pos in (CCONJ, 'CCONJ', 'cconj'):
|
elif univ_pos in (CCONJ, "CCONJ", "cconj"):
|
||||||
univ_pos = 'cconj'
|
univ_pos = "cconj"
|
||||||
elif univ_pos in (DET, 'DET', 'det'):
|
elif univ_pos in (DET, "DET", "det"):
|
||||||
univ_pos = 'det'
|
univ_pos = "det"
|
||||||
elif univ_pos in (PRON, 'PRON', 'pron'):
|
elif univ_pos in (PRON, "PRON", "pron"):
|
||||||
univ_pos = 'pron'
|
univ_pos = "pron"
|
||||||
elif univ_pos in (PUNCT, 'PUNCT', 'punct'):
|
elif univ_pos in (PUNCT, "PUNCT", "punct"):
|
||||||
univ_pos = 'punct'
|
univ_pos = "punct"
|
||||||
elif univ_pos in (SCONJ, 'SCONJ', 'sconj'):
|
elif univ_pos in (SCONJ, "SCONJ", "sconj"):
|
||||||
univ_pos = 'sconj'
|
univ_pos = "sconj"
|
||||||
else:
|
else:
|
||||||
return [self.lookup(string)]
|
return [self.lookup(string)]
|
||||||
# See Issue #435 for example of where this logic is requied.
|
# See Issue #435 for example of where this logic is requied.
|
||||||
if self.is_base_form(univ_pos, morphology):
|
if self.is_base_form(univ_pos, morphology):
|
||||||
return list(set([string.lower()]))
|
return list(set([string.lower()]))
|
||||||
lemmas = lemmatize(string, self.index.get(univ_pos, {}),
|
lemmas = lemmatize(
|
||||||
|
string,
|
||||||
|
self.index.get(univ_pos, {}),
|
||||||
self.exc.get(univ_pos, {}),
|
self.exc.get(univ_pos, {}),
|
||||||
self.rules.get(univ_pos, []))
|
self.rules.get(univ_pos, []),
|
||||||
|
)
|
||||||
return lemmas
|
return lemmas
|
||||||
|
|
||||||
def is_base_form(self, univ_pos, morphology=None):
|
def is_base_form(self, univ_pos, morphology=None):
|
||||||
|
@ -70,20 +75,25 @@ class FrenchLemmatizer(object):
|
||||||
avoid lemmatization entirely.
|
avoid lemmatization entirely.
|
||||||
"""
|
"""
|
||||||
morphology = {} if morphology is None else morphology
|
morphology = {} if morphology is None else morphology
|
||||||
others = [key for key in morphology
|
others = [
|
||||||
if key not in (POS, 'Number', 'POS', 'VerbForm', 'Tense')]
|
key
|
||||||
if univ_pos == 'noun' and morphology.get('Number') == 'sing':
|
for key in morphology
|
||||||
|
if key not in (POS, "Number", "POS", "VerbForm", "Tense")
|
||||||
|
]
|
||||||
|
if univ_pos == "noun" and morphology.get("Number") == "sing":
|
||||||
return True
|
return True
|
||||||
elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf':
|
elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
|
||||||
return True
|
return True
|
||||||
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
|
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
|
||||||
# morphology
|
# morphology
|
||||||
elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and
|
elif univ_pos == "verb" and (
|
||||||
morphology.get('Tense') == 'pres' and
|
morphology.get("VerbForm") == "fin"
|
||||||
morphology.get('Number') is None and
|
and morphology.get("Tense") == "pres"
|
||||||
not others):
|
and morphology.get("Number") is None
|
||||||
|
and not others
|
||||||
|
):
|
||||||
return True
|
return True
|
||||||
elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
|
elif univ_pos == "adj" and morphology.get("Degree") == "pos":
|
||||||
return True
|
return True
|
||||||
elif VerbForm_inf in morphology:
|
elif VerbForm_inf in morphology:
|
||||||
return True
|
return True
|
||||||
|
@ -97,16 +107,16 @@ class FrenchLemmatizer(object):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def noun(self, string, morphology=None):
|
def noun(self, string, morphology=None):
|
||||||
return self(string, 'noun', morphology)
|
return self(string, "noun", morphology)
|
||||||
|
|
||||||
def verb(self, string, morphology=None):
|
def verb(self, string, morphology=None):
|
||||||
return self(string, 'verb', morphology)
|
return self(string, "verb", morphology)
|
||||||
|
|
||||||
def adj(self, string, morphology=None):
|
def adj(self, string, morphology=None):
|
||||||
return self(string, 'adj', morphology)
|
return self(string, "adj", morphology)
|
||||||
|
|
||||||
def punct(self, string, morphology=None):
|
def punct(self, string, morphology=None):
|
||||||
return self(string, 'punct', morphology)
|
return self(string, "punct", morphology)
|
||||||
|
|
||||||
def lookup(self, string):
|
def lookup(self, string):
|
||||||
if string in self.lookup_table:
|
if string in self.lookup_table:
|
||||||
|
@ -117,7 +127,7 @@ class FrenchLemmatizer(object):
|
||||||
def lemmatize(string, index, exceptions, rules):
|
def lemmatize(string, index, exceptions, rules):
|
||||||
string = string.lower()
|
string = string.lower()
|
||||||
forms = []
|
forms = []
|
||||||
if (string in index):
|
if string in index:
|
||||||
forms.append(string)
|
forms.append(string)
|
||||||
return forms
|
return forms
|
||||||
forms.extend(exceptions.get(string, []))
|
forms.extend(exceptions.get(string, []))
|
||||||
|
|
|
@ -2,8 +2,6 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
from ...symbols import ORTH, LEMMA
|
from ...symbols import ORTH, LEMMA
|
||||||
|
|
||||||
_exc = {
|
_exc = {"po'": [{ORTH: "po'", LEMMA: "poco"}]}
|
||||||
"po'": [{ORTH: "po'", LEMMA: 'poco'}]
|
|
||||||
}
|
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = _exc
|
TOKENIZER_EXCEPTIONS = _exc
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
@ -11,5 +12,5 @@ sentences = [
|
||||||
"애플이 영국의 신생 기업을 10억 달러에 구매를 고려중이다.",
|
"애플이 영국의 신생 기업을 10억 달러에 구매를 고려중이다.",
|
||||||
"자동 운전 자동차의 손해 배상 책임에 자동차 메이커에 일정한 부담을 요구하겠다.",
|
"자동 운전 자동차의 손해 배상 책임에 자동차 메이커에 일정한 부담을 요구하겠다.",
|
||||||
"자동 배달 로봇이 보도를 주행하는 것을 샌프란시스코시가 금지를 검토중이라고 합니다.",
|
"자동 배달 로봇이 보도를 주행하는 것을 샌프란시스코시가 금지를 검토중이라고 합니다.",
|
||||||
"런던은 영국의 수도이자 가장 큰 도시입니다."
|
"런던은 영국의 수도이자 가장 큰 도시입니다.",
|
||||||
]
|
]
|
||||||
|
|
|
@ -1,7 +1,8 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
STOP_WORDS = set("""
|
STOP_WORDS = set(
|
||||||
|
"""
|
||||||
이
|
이
|
||||||
있
|
있
|
||||||
하
|
하
|
||||||
|
@ -65,4 +66,5 @@ STOP_WORDS = set("""
|
||||||
원
|
원
|
||||||
잘
|
잘
|
||||||
놓
|
놓
|
||||||
""".split())
|
""".split()
|
||||||
|
)
|
||||||
|
|
|
@ -20,10 +20,10 @@ LEMMA_INDEX = {"adj": ADJECTIVES, "adv": ADVERBS, "noun": NOUNS, "verb": VERBS}
|
||||||
BASE_PATH = Path(__file__).parent
|
BASE_PATH = Path(__file__).parent
|
||||||
|
|
||||||
LEMMA_EXC = {
|
LEMMA_EXC = {
|
||||||
"adj": load_language_data(BASE_PATH / '_adjectives_wordforms.json'),
|
"adj": load_language_data(BASE_PATH / "_adjectives_wordforms.json"),
|
||||||
"adv": ADVERBS_WORDFORMS,
|
"adv": ADVERBS_WORDFORMS,
|
||||||
"noun": load_language_data(BASE_PATH / '_nouns_wordforms.json'),
|
"noun": load_language_data(BASE_PATH / "_nouns_wordforms.json"),
|
||||||
"verb": load_language_data(BASE_PATH / '_verbs_wordforms.json'),
|
"verb": load_language_data(BASE_PATH / "_verbs_wordforms.json"),
|
||||||
}
|
}
|
||||||
|
|
||||||
LEMMA_RULES = {
|
LEMMA_RULES = {
|
||||||
|
@ -39,5 +39,3 @@ LEMMA_RULES = {
|
||||||
# https://www.nb.no/sprakbanken/show?serial=oai%3Anb.no%3Asbr-5&lang=en
|
# https://www.nb.no/sprakbanken/show?serial=oai%3Anb.no%3Asbr-5&lang=en
|
||||||
# License:
|
# License:
|
||||||
# Creative_Commons-BY (CC-BY) (https://creativecommons.org/licenses/by/4.0/)
|
# Creative_Commons-BY (CC-BY) (https://creativecommons.org/licenses/by/4.0/)
|
||||||
|
|
||||||
|
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -14,7 +14,7 @@ _infixes = (
|
||||||
+ [
|
+ [
|
||||||
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
|
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
|
||||||
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
|
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
|
||||||
r'(?<=[{a}])[:<>=](?=[{a}])'.format(a=ALPHA),
|
r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
|
||||||
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||||
r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
|
r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
|
||||||
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
|
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
|
||||||
|
|
|
@ -118,7 +118,7 @@ for orth in [
|
||||||
"o.l.",
|
"o.l.",
|
||||||
"on.",
|
"on.",
|
||||||
"op.",
|
"op.",
|
||||||
"org."
|
"org.",
|
||||||
"osv.",
|
"osv.",
|
||||||
"ovf.",
|
"ovf.",
|
||||||
"p.",
|
"p.",
|
||||||
|
|
|
@ -14,5 +14,5 @@ sentences = [
|
||||||
"Apple overweegt om voor 1 miljard een U.K. startup te kopen",
|
"Apple overweegt om voor 1 miljard een U.K. startup te kopen",
|
||||||
"Autonome auto's verschuiven de verzekeringverantwoordelijkheid naar producenten",
|
"Autonome auto's verschuiven de verzekeringverantwoordelijkheid naar producenten",
|
||||||
"San Francisco overweegt robots op voetpaden te verbieden",
|
"San Francisco overweegt robots op voetpaden te verbieden",
|
||||||
"Londen is een grote stad in het Verenigd Koninkrijk"
|
"Londen is een grote stad in het Verenigd Koninkrijk",
|
||||||
]
|
]
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -3,22 +3,25 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
ADPOSITIONS = set(
|
ADPOSITIONS = set(
|
||||||
('aan aangaande aanwezig achter af afgezien al als an annex anno anti '
|
(
|
||||||
'behalve behoudens beneden benevens benoorden beoosten betreffende bewesten '
|
"aan aangaande aanwezig achter af afgezien al als an annex anno anti "
|
||||||
'bezijden bezuiden bij binnen binnenuit binst bladzij blijkens boven bovenop '
|
"behalve behoudens beneden benevens benoorden beoosten betreffende bewesten "
|
||||||
'buiten conform contra cq daaraan daarbij daarbuiten daarin daarnaar '
|
"bezijden bezuiden bij binnen binnenuit binst bladzij blijkens boven bovenop "
|
||||||
'daaronder daartegenover daarvan dankzij deure dichtbij door doordat doorheen '
|
"buiten conform contra cq daaraan daarbij daarbuiten daarin daarnaar "
|
||||||
'echter eraf erop erover errond eruit ervoor evenals exclusief gedaan '
|
"daaronder daartegenover daarvan dankzij deure dichtbij door doordat doorheen "
|
||||||
'gedurende gegeven getuige gezien halfweg halverwege heen hierdoorheen hierop '
|
"echter eraf erop erover errond eruit ervoor evenals exclusief gedaan "
|
||||||
'houdende in inclusief indien ingaande ingevolge inzake jegens kortweg '
|
"gedurende gegeven getuige gezien halfweg halverwege heen hierdoorheen hierop "
|
||||||
'krachtens kralj langs langsheen langst lastens linksom lopende luidens mede '
|
"houdende in inclusief indien ingaande ingevolge inzake jegens kortweg "
|
||||||
'mee met middels midden middenop mits na naan naar naartoe naast naat nabij '
|
"krachtens kralj langs langsheen langst lastens linksom lopende luidens mede "
|
||||||
'nadat namens neer neffe neffen neven nevenst niettegenstaande nopens '
|
"mee met middels midden middenop mits na naan naar naartoe naast naat nabij "
|
||||||
'officieel om omheen omstreeks omtrent onafgezien ondanks onder onderaan '
|
"nadat namens neer neffe neffen neven nevenst niettegenstaande nopens "
|
||||||
'ondere ongeacht ooit op open over per plus pro qua rechtover rond rondom '
|
"officieel om omheen omstreeks omtrent onafgezien ondanks onder onderaan "
|
||||||
|
"ondere ongeacht ooit op open over per plus pro qua rechtover rond rondom "
|
||||||
"sedert sinds spijts strekkende te tegen tegenaan tegenop tegenover telde "
|
"sedert sinds spijts strekkende te tegen tegenaan tegenop tegenover telde "
|
||||||
'teneinde terug tijdens toe tot totdat trots tussen tégen uit uitgenomen '
|
"teneinde terug tijdens toe tot totdat trots tussen tégen uit uitgenomen "
|
||||||
'ultimo van vanaf vandaan vandoor vanop vanuit vanwege versus via vinnen '
|
"ultimo van vanaf vandaan vandoor vanop vanuit vanwege versus via vinnen "
|
||||||
'vlakbij volgens voor voor- voorbij voordat voort voren vòòr vóór waaraan '
|
"vlakbij volgens voor voor- voorbij voordat voort voren vòòr vóór waaraan "
|
||||||
'waarbij waardoor waaronder weg wegens weleens zijdens zoals zodat zonder '
|
"waarbij waardoor waaronder weg wegens weleens zijdens zoals zodat zonder "
|
||||||
'zónder à').split())
|
"zónder à"
|
||||||
|
).split()
|
||||||
|
)
|
||||||
|
|
|
@ -3,10 +3,10 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
ADPOSITIONS_IRREG = {
|
ADPOSITIONS_IRREG = {
|
||||||
"'t": ('te',),
|
"'t": ("te",),
|
||||||
'me': ('mee',),
|
"me": ("mee",),
|
||||||
'meer': ('mee',),
|
"meer": ("mee",),
|
||||||
'on': ('om',),
|
"on": ("om",),
|
||||||
'ten': ('te',),
|
"ten": ("te",),
|
||||||
'ter': ('te',)
|
"ter": ("te",),
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,17 +3,17 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
ADVERBS_IRREG = {
|
ADVERBS_IRREG = {
|
||||||
"'ns": ('eens',),
|
"'ns": ("eens",),
|
||||||
"'s": ('eens',),
|
"'s": ("eens",),
|
||||||
"'t": ('het',),
|
"'t": ("het",),
|
||||||
"d'r": ('er',),
|
"d'r": ("er",),
|
||||||
"d'raf": ('eraf',),
|
"d'raf": ("eraf",),
|
||||||
"d'rbij": ('erbij',),
|
"d'rbij": ("erbij",),
|
||||||
"d'rheen": ('erheen',),
|
"d'rheen": ("erheen",),
|
||||||
"d'rin": ('erin',),
|
"d'rin": ("erin",),
|
||||||
"d'rna": ('erna',),
|
"d'rna": ("erna",),
|
||||||
"d'rnaar": ('ernaar',),
|
"d'rnaar": ("ernaar",),
|
||||||
'hele': ('heel',),
|
"hele": ("heel",),
|
||||||
'nevenst': ('nevens',),
|
"nevenst": ("nevens",),
|
||||||
'overend': ('overeind',)
|
"overend": ("overeind",),
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,15 +3,18 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
DETERMINERS = set(
|
DETERMINERS = set(
|
||||||
("al allebei allerhande allerminst alletwee"
|
(
|
||||||
|
"al allebei allerhande allerminst alletwee"
|
||||||
"beide clip-on d'n d'r dat datgeen datgene de dees degeen degene den dewelke "
|
"beide clip-on d'n d'r dat datgeen datgene de dees degeen degene den dewelke "
|
||||||
'deze dezelfde die diegeen diegene diehien dien diene diens diezelfde dit '
|
"deze dezelfde die diegeen diegene diehien dien diene diens diezelfde dit "
|
||||||
'ditgene e een eene eigen elk elkens elkes enig enkel enne ettelijke eure '
|
"ditgene e een eene eigen elk elkens elkes enig enkel enne ettelijke eure "
|
||||||
'euren evenveel ewe ge geen ginds géén haar haaren halfelf het hetgeen '
|
"euren evenveel ewe ge geen ginds géén haar haaren halfelf het hetgeen "
|
||||||
'hetwelk hetzelfde heur heure hulder hulle hullen hullie hun hunder hunderen '
|
"hetwelk hetzelfde heur heure hulder hulle hullen hullie hun hunder hunderen "
|
||||||
'ieder iederes ja je jen jouw jouwen jouwes jullie junder keiveel keiweinig '
|
"ieder iederes ja je jen jouw jouwen jouwes jullie junder keiveel keiweinig "
|
||||||
"m'ne me meer meerder meerdere menen menig mijn mijnes minst méér niemendal "
|
"m'ne me meer meerder meerdere menen menig mijn mijnes minst méér niemendal "
|
||||||
'oe ons onse se sommig sommigeder superveel telken teveel titulair ulder '
|
"oe ons onse se sommig sommigeder superveel telken teveel titulair ulder "
|
||||||
'uldere ulderen ulle under une uw vaak veel veels véél wat weinig welk welken '
|
"uldere ulderen ulle under une uw vaak veel veels véél wat weinig welk welken "
|
||||||
"welkene welksten z'nen ze zenen zijn zo'n zo'ne zoiet zoveel zovele zovelen "
|
"welkene welksten z'nen ze zenen zijn zo'n zo'ne zoiet zoveel zovele zovelen "
|
||||||
'zuk zulk zulkdanig zulken zulks zullie zíjn àlle álle').split())
|
"zuk zulk zulkdanig zulken zulks zullie zíjn àlle álle"
|
||||||
|
).split()
|
||||||
|
)
|
||||||
|
|
|
@ -3,67 +3,67 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
DETERMINERS_IRREG = {
|
DETERMINERS_IRREG = {
|
||||||
"'r": ('haar',),
|
"'r": ("haar",),
|
||||||
"'s": ('de',),
|
"'s": ("de",),
|
||||||
"'t": ('het',),
|
"'t": ("het",),
|
||||||
"'tgene": ('hetgeen',),
|
"'tgene": ("hetgeen",),
|
||||||
'alle': ('al',),
|
"alle": ("al",),
|
||||||
'allen': ('al',),
|
"allen": ("al",),
|
||||||
'aller': ('al',),
|
"aller": ("al",),
|
||||||
'beiden': ('beide',),
|
"beiden": ("beide",),
|
||||||
'beider': ('beide',),
|
"beider": ("beide",),
|
||||||
"d'": ('het',),
|
"d'": ("het",),
|
||||||
"d'r": ('haar',),
|
"d'r": ("haar",),
|
||||||
'der': ('de',),
|
"der": ("de",),
|
||||||
'des': ('de',),
|
"des": ("de",),
|
||||||
'dezer': ('deze',),
|
"dezer": ("deze",),
|
||||||
'dienen': ('die',),
|
"dienen": ("die",),
|
||||||
'dier': ('die',),
|
"dier": ("die",),
|
||||||
'elke': ('elk',),
|
"elke": ("elk",),
|
||||||
'ene': ('een',),
|
"ene": ("een",),
|
||||||
'enen': ('een',),
|
"enen": ("een",),
|
||||||
'ener': ('een',),
|
"ener": ("een",),
|
||||||
'enige': ('enig',),
|
"enige": ("enig",),
|
||||||
'enigen': ('enig',),
|
"enigen": ("enig",),
|
||||||
'er': ('haar',),
|
"er": ("haar",),
|
||||||
'gene': ('geen',),
|
"gene": ("geen",),
|
||||||
'genen': ('geen',),
|
"genen": ("geen",),
|
||||||
'hare': ('haar',),
|
"hare": ("haar",),
|
||||||
'haren': ('haar',),
|
"haren": ("haar",),
|
||||||
'harer': ('haar',),
|
"harer": ("haar",),
|
||||||
'hunne': ('hun',),
|
"hunne": ("hun",),
|
||||||
'hunnen': ('hun',),
|
"hunnen": ("hun",),
|
||||||
'jou': ('jouw',),
|
"jou": ("jouw",),
|
||||||
'jouwe': ('jouw',),
|
"jouwe": ("jouw",),
|
||||||
'julliejen': ('jullie',),
|
"julliejen": ("jullie",),
|
||||||
"m'n": ('mijn',),
|
"m'n": ("mijn",),
|
||||||
'mee': ('meer',),
|
"mee": ("meer",),
|
||||||
'meer': ('veel',),
|
"meer": ("veel",),
|
||||||
'meerderen': ('meerdere',),
|
"meerderen": ("meerdere",),
|
||||||
'meest': ('veel',),
|
"meest": ("veel",),
|
||||||
'meesten': ('veel',),
|
"meesten": ("veel",),
|
||||||
'meet': ('veel',),
|
"meet": ("veel",),
|
||||||
'menige': ('menig',),
|
"menige": ("menig",),
|
||||||
'mij': ('mijn',),
|
"mij": ("mijn",),
|
||||||
'mijnen': ('mijn',),
|
"mijnen": ("mijn",),
|
||||||
'minder': ('weinig',),
|
"minder": ("weinig",),
|
||||||
'mindere': ('weinig',),
|
"mindere": ("weinig",),
|
||||||
'minst': ('weinig',),
|
"minst": ("weinig",),
|
||||||
'minste': ('minst',),
|
"minste": ("minst",),
|
||||||
'ne': ('een',),
|
"ne": ("een",),
|
||||||
'onze': ('ons',),
|
"onze": ("ons",),
|
||||||
'onzent': ('ons',),
|
"onzent": ("ons",),
|
||||||
'onzer': ('ons',),
|
"onzer": ("ons",),
|
||||||
'ouw': ('uw',),
|
"ouw": ("uw",),
|
||||||
'sommige': ('sommig',),
|
"sommige": ("sommig",),
|
||||||
'sommigen': ('sommig',),
|
"sommigen": ("sommig",),
|
||||||
'u': ('uw',),
|
"u": ("uw",),
|
||||||
'vaker': ('vaak',),
|
"vaker": ("vaak",),
|
||||||
'vele': ('veel',),
|
"vele": ("veel",),
|
||||||
'velen': ('veel',),
|
"velen": ("veel",),
|
||||||
'welke': ('welk',),
|
"welke": ("welk",),
|
||||||
'zijne': ('zijn',),
|
"zijne": ("zijn",),
|
||||||
'zijnen': ('zijn',),
|
"zijnen": ("zijn",),
|
||||||
'zijns': ('zijn',),
|
"zijns": ("zijn",),
|
||||||
'één': ('een',)
|
"één": ("een",),
|
||||||
}
|
}
|
||||||
|
|
|
@ -9,7 +9,7 @@ ADJECTIVE_SUFFIX_RULES = [
|
||||||
["er", ""],
|
["er", ""],
|
||||||
["en", ""],
|
["en", ""],
|
||||||
["e", ""],
|
["e", ""],
|
||||||
["ende", "end"]
|
["ende", "end"],
|
||||||
]
|
]
|
||||||
|
|
||||||
VERB_SUFFIX_RULES = [
|
VERB_SUFFIX_RULES = [
|
||||||
|
@ -39,7 +39,7 @@ NOUN_SUFFIX_RULES = [
|
||||||
["ssen", "s"],
|
["ssen", "s"],
|
||||||
["rren", "r"],
|
["rren", "r"],
|
||||||
["kken", "k"],
|
["kken", "k"],
|
||||||
["bben", "b"]
|
["bben", "b"],
|
||||||
]
|
]
|
||||||
|
|
||||||
NUM_SUFFIX_RULES = [
|
NUM_SUFFIX_RULES = [
|
||||||
|
@ -50,23 +50,20 @@ NUM_SUFFIX_RULES = [
|
||||||
["de", ""],
|
["de", ""],
|
||||||
["er", ""],
|
["er", ""],
|
||||||
["ër", ""],
|
["ër", ""],
|
||||||
["tjes", ""]
|
["tjes", ""],
|
||||||
]
|
]
|
||||||
|
|
||||||
PUNCT_SUFFIX_RULES = [
|
PUNCT_SUFFIX_RULES = [["“", '"'], ["”", '"'], ["\u2018", "'"], ["\u2019", "'"]]
|
||||||
["“", "\""],
|
|
||||||
["”", "\""],
|
|
||||||
["\u2018", "'"],
|
|
||||||
["\u2019", "'"]
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
# In-place sort guaranteeing that longer -- more specific -- rules are
|
# In-place sort guaranteeing that longer -- more specific -- rules are
|
||||||
# applied first.
|
# applied first.
|
||||||
for rule_set in (ADJECTIVE_SUFFIX_RULES,
|
for rule_set in (
|
||||||
|
ADJECTIVE_SUFFIX_RULES,
|
||||||
NOUN_SUFFIX_RULES,
|
NOUN_SUFFIX_RULES,
|
||||||
NUM_SUFFIX_RULES,
|
NUM_SUFFIX_RULES,
|
||||||
VERB_SUFFIX_RULES):
|
VERB_SUFFIX_RULES,
|
||||||
|
):
|
||||||
rule_set.sort(key=lambda r: len(r[0]), reverse=True)
|
rule_set.sort(key=lambda r: len(r[0]), reverse=True)
|
||||||
|
|
||||||
|
|
||||||
|
@ -75,5 +72,5 @@ RULES = {
|
||||||
"noun": NOUN_SUFFIX_RULES,
|
"noun": NOUN_SUFFIX_RULES,
|
||||||
"verb": VERB_SUFFIX_RULES,
|
"verb": VERB_SUFFIX_RULES,
|
||||||
"num": NUM_SUFFIX_RULES,
|
"num": NUM_SUFFIX_RULES,
|
||||||
"punct": PUNCT_SUFFIX_RULES
|
"punct": PUNCT_SUFFIX_RULES,
|
||||||
}
|
}
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -3,29 +3,29 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
NUMBERS_IRREG = {
|
NUMBERS_IRREG = {
|
||||||
'achten': ('acht',),
|
"achten": ("acht",),
|
||||||
'biljoenen': ('biljoen',),
|
"biljoenen": ("biljoen",),
|
||||||
'drieën': ('drie',),
|
"drieën": ("drie",),
|
||||||
'duizenden': ('duizend',),
|
"duizenden": ("duizend",),
|
||||||
'eentjes': ('één',),
|
"eentjes": ("één",),
|
||||||
'elven': ('elf',),
|
"elven": ("elf",),
|
||||||
'miljoenen': ('miljoen',),
|
"miljoenen": ("miljoen",),
|
||||||
'negenen': ('negen',),
|
"negenen": ("negen",),
|
||||||
'negentiger': ('negentig',),
|
"negentiger": ("negentig",),
|
||||||
'tienduizenden': ('tienduizend',),
|
"tienduizenden": ("tienduizend",),
|
||||||
'tienen': ('tien',),
|
"tienen": ("tien",),
|
||||||
'tientjes': ('tien',),
|
"tientjes": ("tien",),
|
||||||
'twaalven': ('twaalf',),
|
"twaalven": ("twaalf",),
|
||||||
'tweeën': ('twee',),
|
"tweeën": ("twee",),
|
||||||
'twintiger': ('twintig',),
|
"twintiger": ("twintig",),
|
||||||
'twintigsten': ('twintig',),
|
"twintigsten": ("twintig",),
|
||||||
'vieren': ('vier',),
|
"vieren": ("vier",),
|
||||||
'vijftiger': ('vijftig',),
|
"vijftiger": ("vijftig",),
|
||||||
'vijven': ('vijf',),
|
"vijven": ("vijf",),
|
||||||
'zessen': ('zes',),
|
"zessen": ("zes",),
|
||||||
'zestiger': ('zestig',),
|
"zestiger": ("zestig",),
|
||||||
'zevenen': ('zeven',),
|
"zevenen": ("zeven",),
|
||||||
'zeventiger': ('zeventig',),
|
"zeventiger": ("zeventig",),
|
||||||
'zovele': ('zoveel',),
|
"zovele": ("zoveel",),
|
||||||
'zovelen': ('zoveel',)
|
"zovelen": ("zoveel",),
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,33 +3,33 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
PRONOUNS_IRREG = {
|
PRONOUNS_IRREG = {
|
||||||
"'r": ('haar',),
|
"'r": ("haar",),
|
||||||
"'rzelf": ('haarzelf',),
|
"'rzelf": ("haarzelf",),
|
||||||
"'t": ('het',),
|
"'t": ("het",),
|
||||||
"d'r": ('haar',),
|
"d'r": ("haar",),
|
||||||
'da': ('dat',),
|
"da": ("dat",),
|
||||||
'dienen': ('die',),
|
"dienen": ("die",),
|
||||||
'diens': ('die',),
|
"diens": ("die",),
|
||||||
'dies': ('die',),
|
"dies": ("die",),
|
||||||
'elkaars': ('elkaar',),
|
"elkaars": ("elkaar",),
|
||||||
'elkanders': ('elkander',),
|
"elkanders": ("elkander",),
|
||||||
'ene': ('een',),
|
"ene": ("een",),
|
||||||
'enen': ('een',),
|
"enen": ("een",),
|
||||||
'fik': ('ik',),
|
"fik": ("ik",),
|
||||||
'gaat': ('gaan',),
|
"gaat": ("gaan",),
|
||||||
'gene': ('geen',),
|
"gene": ("geen",),
|
||||||
'harer': ('haar',),
|
"harer": ("haar",),
|
||||||
'ieders': ('ieder',),
|
"ieders": ("ieder",),
|
||||||
'iemands': ('iemand',),
|
"iemands": ("iemand",),
|
||||||
'ikke': ('ik',),
|
"ikke": ("ik",),
|
||||||
'mijnen': ('mijn',),
|
"mijnen": ("mijn",),
|
||||||
'oe': ('je',),
|
"oe": ("je",),
|
||||||
'onzer': ('ons',),
|
"onzer": ("ons",),
|
||||||
'wa': ('wat',),
|
"wa": ("wat",),
|
||||||
'watte': ('wat',),
|
"watte": ("wat",),
|
||||||
'wier': ('wie',),
|
"wier": ("wie",),
|
||||||
'zijns': ('zijn',),
|
"zijns": ("zijn",),
|
||||||
'zoietsken': ('zoietske',),
|
"zoietsken": ("zoietske",),
|
||||||
'zulks': ('zulk',),
|
"zulks": ("zulk",),
|
||||||
'één': ('een',)
|
"één": ("een",),
|
||||||
}
|
}
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -7,15 +7,33 @@ from ....symbols import POS, NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV
|
||||||
class DutchLemmatizer(object):
|
class DutchLemmatizer(object):
|
||||||
# Note: CGN does not distinguish AUX verbs, so we treat AUX as VERB.
|
# Note: CGN does not distinguish AUX verbs, so we treat AUX as VERB.
|
||||||
univ_pos_name_variants = {
|
univ_pos_name_variants = {
|
||||||
NOUN: "noun", "NOUN": "noun", "noun": "noun",
|
NOUN: "noun",
|
||||||
VERB: "verb", "VERB": "verb", "verb": "verb",
|
"NOUN": "noun",
|
||||||
AUX: "verb", "AUX": "verb", "aux": "verb",
|
"noun": "noun",
|
||||||
ADJ: "adj", "ADJ": "adj", "adj": "adj",
|
VERB: "verb",
|
||||||
ADV: "adv", "ADV": "adv", "adv": "adv",
|
"VERB": "verb",
|
||||||
PRON: "pron", "PRON": "pron", "pron": "pron",
|
"verb": "verb",
|
||||||
DET: "det", "DET": "det", "det": "det",
|
AUX: "verb",
|
||||||
ADP: "adp", "ADP": "adp", "adp": "adp",
|
"AUX": "verb",
|
||||||
NUM: "num", "NUM": "num", "num": "num"
|
"aux": "verb",
|
||||||
|
ADJ: "adj",
|
||||||
|
"ADJ": "adj",
|
||||||
|
"adj": "adj",
|
||||||
|
ADV: "adv",
|
||||||
|
"ADV": "adv",
|
||||||
|
"adv": "adv",
|
||||||
|
PRON: "pron",
|
||||||
|
"PRON": "pron",
|
||||||
|
"pron": "pron",
|
||||||
|
DET: "det",
|
||||||
|
"DET": "det",
|
||||||
|
"det": "det",
|
||||||
|
ADP: "adp",
|
||||||
|
"ADP": "adp",
|
||||||
|
"adp": "adp",
|
||||||
|
NUM: "num",
|
||||||
|
"NUM": "num",
|
||||||
|
"num": "num",
|
||||||
}
|
}
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
@ -62,10 +80,8 @@ class DutchLemmatizer(object):
|
||||||
return [looked_up_lemma]
|
return [looked_up_lemma]
|
||||||
|
|
||||||
forms, is_known = lemmatize(
|
forms, is_known = lemmatize(
|
||||||
string,
|
string, lemma_index, exceptions, self.rules.get(univ_pos, [])
|
||||||
lemma_index,
|
)
|
||||||
exceptions,
|
|
||||||
self.rules.get(univ_pos, []))
|
|
||||||
|
|
||||||
# Back-off through remaining return value candidates.
|
# Back-off through remaining return value candidates.
|
||||||
if forms:
|
if forms:
|
||||||
|
@ -92,25 +108,25 @@ class DutchLemmatizer(object):
|
||||||
return self.lookup_table.get(string, string)
|
return self.lookup_table.get(string, string)
|
||||||
|
|
||||||
def noun(self, string, morphology=None):
|
def noun(self, string, morphology=None):
|
||||||
return self(string, 'noun', morphology)
|
return self(string, "noun", morphology)
|
||||||
|
|
||||||
def verb(self, string, morphology=None):
|
def verb(self, string, morphology=None):
|
||||||
return self(string, 'verb', morphology)
|
return self(string, "verb", morphology)
|
||||||
|
|
||||||
def adj(self, string, morphology=None):
|
def adj(self, string, morphology=None):
|
||||||
return self(string, 'adj', morphology)
|
return self(string, "adj", morphology)
|
||||||
|
|
||||||
def det(self, string, morphology=None):
|
def det(self, string, morphology=None):
|
||||||
return self(string, 'det', morphology)
|
return self(string, "det", morphology)
|
||||||
|
|
||||||
def pron(self, string, morphology=None):
|
def pron(self, string, morphology=None):
|
||||||
return self(string, 'pron', morphology)
|
return self(string, "pron", morphology)
|
||||||
|
|
||||||
def adp(self, string, morphology=None):
|
def adp(self, string, morphology=None):
|
||||||
return self(string, 'adp', morphology)
|
return self(string, "adp", morphology)
|
||||||
|
|
||||||
def punct(self, string, morphology=None):
|
def punct(self, string, morphology=None):
|
||||||
return self(string, 'punct', morphology)
|
return self(string, "punct", morphology)
|
||||||
|
|
||||||
|
|
||||||
# Reimplemented to focus more on application of suffix rules and to return
|
# Reimplemented to focus more on application of suffix rules and to return
|
||||||
|
|
|
@ -4,18 +4,22 @@ from __future__ import unicode_literals
|
||||||
from ...attrs import LIKE_NUM
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
|
||||||
_num_words = set("""
|
_num_words = set(
|
||||||
|
"""
|
||||||
nul een één twee drie vier vijf zes zeven acht negen tien elf twaalf dertien
|
nul een één twee drie vier vijf zes zeven acht negen tien elf twaalf dertien
|
||||||
veertien twintig dertig veertig vijftig zestig zeventig tachtig negentig honderd
|
veertien twintig dertig veertig vijftig zestig zeventig tachtig negentig honderd
|
||||||
duizend miljoen miljard biljoen biljard triljoen triljard
|
duizend miljoen miljard biljoen biljard triljoen triljard
|
||||||
""".split())
|
""".split()
|
||||||
|
)
|
||||||
|
|
||||||
_ordinal_words = set("""
|
_ordinal_words = set(
|
||||||
|
"""
|
||||||
eerste tweede derde vierde vijfde zesde zevende achtste negende tiende elfde
|
eerste tweede derde vierde vijfde zesde zevende achtste negende tiende elfde
|
||||||
twaalfde dertiende veertiende twintigste dertigste veertigste vijftigste
|
twaalfde dertiende veertiende twintigste dertigste veertigste vijftigste
|
||||||
zestigste zeventigste tachtigste negentigste honderdste duizendste miljoenste
|
zestigste zeventigste tachtigste negentigste honderdste duizendste miljoenste
|
||||||
miljardste biljoenste biljardste triljoenste triljardste
|
miljardste biljoenste biljardste triljoenste triljardste
|
||||||
""".split())
|
""".split()
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
|
@ -23,11 +27,11 @@ def like_num(text):
|
||||||
# or matches one of the number words. In order to handle numbers like
|
# or matches one of the number words. In order to handle numbers like
|
||||||
# "drieëntwintig", more work is required.
|
# "drieëntwintig", more work is required.
|
||||||
# See this discussion: https://github.com/explosion/spaCy/pull/1177
|
# See this discussion: https://github.com/explosion/spaCy/pull/1177
|
||||||
text = text.replace(',', '').replace('.', '')
|
text = text.replace(",", "").replace(".", "")
|
||||||
if text.isdigit():
|
if text.isdigit():
|
||||||
return True
|
return True
|
||||||
if text.count('/') == 1:
|
if text.count("/") == 1:
|
||||||
num, denom = text.split('/')
|
num, denom = text.split("/")
|
||||||
if num.isdigit() and denom.isdigit():
|
if num.isdigit() and denom.isdigit():
|
||||||
return True
|
return True
|
||||||
if text.lower() in _num_words:
|
if text.lower() in _num_words:
|
||||||
|
@ -37,6 +41,4 @@ def like_num(text):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
LEX_ATTRS = {
|
LEX_ATTRS = {LIKE_NUM: like_num}
|
||||||
LIKE_NUM: like_num
|
|
||||||
}
|
|
||||||
|
|
|
@ -10,24 +10,32 @@ from ..punctuation import TOKENIZER_SUFFIXES as DEFAULT_TOKENIZER_SUFFIXES
|
||||||
# Copied from `de` package. Main purpose is to ensure that hyphens are not
|
# Copied from `de` package. Main purpose is to ensure that hyphens are not
|
||||||
# split on.
|
# split on.
|
||||||
|
|
||||||
_quotes = CONCAT_QUOTES.replace("'", '')
|
_quotes = CONCAT_QUOTES.replace("'", "")
|
||||||
|
|
||||||
_infixes = (LIST_ELLIPSES + LIST_ICONS +
|
_infixes = (
|
||||||
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
|
LIST_ELLIPSES
|
||||||
r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
|
+ LIST_ICONS
|
||||||
|
+ [
|
||||||
|
r"(?<=[{}])\.(?=[{}])".format(ALPHA_LOWER, ALPHA_UPPER),
|
||||||
|
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
|
||||||
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
|
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
|
||||||
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||||
r'(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])'.format(a=ALPHA, q=_quotes),
|
r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
|
||||||
r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
|
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
|
||||||
r'(?<=[0-9])-(?=[0-9])'])
|
r"(?<=[0-9])-(?=[0-9])",
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# Remove "'s" suffix from suffix list. In Dutch, "'s" is a plural ending when
|
# Remove "'s" suffix from suffix list. In Dutch, "'s" is a plural ending when
|
||||||
# it occurs as a suffix and a clitic for "eens" in standalone use. To avoid
|
# it occurs as a suffix and a clitic for "eens" in standalone use. To avoid
|
||||||
# ambiguity it's better to just leave it attached when it occurs as a suffix.
|
# ambiguity it's better to just leave it attached when it occurs as a suffix.
|
||||||
default_suffix_blacklist = ("'s", "'S", '’s', '’S')
|
default_suffix_blacklist = ("'s", "'S", "’s", "’S")
|
||||||
_suffixes = [suffix for suffix in DEFAULT_TOKENIZER_SUFFIXES
|
_suffixes = [
|
||||||
if suffix not in default_suffix_blacklist]
|
suffix
|
||||||
|
for suffix in DEFAULT_TOKENIZER_SUFFIXES
|
||||||
|
if suffix not in default_suffix_blacklist
|
||||||
|
]
|
||||||
|
|
||||||
TOKENIZER_INFIXES = _infixes
|
TOKENIZER_INFIXES = _infixes
|
||||||
TOKENIZER_SUFFIXES = _suffixes
|
TOKENIZER_SUFFIXES = _suffixes
|
||||||
|
|
|
@ -16,7 +16,8 @@ from __future__ import unicode_literals
|
||||||
# should have a Dutch counterpart here.
|
# should have a Dutch counterpart here.
|
||||||
|
|
||||||
|
|
||||||
STOP_WORDS = set("""
|
STOP_WORDS = set(
|
||||||
|
"""
|
||||||
aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaangde aangezien achter achterna
|
aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaangde aangezien achter achterna
|
||||||
afgelopen aldus alhoewel anderzijds
|
afgelopen aldus alhoewel anderzijds
|
||||||
|
|
||||||
|
@ -70,4 +71,5 @@ welk welke welken werd werden wiens wier wilde wordt
|
||||||
|
|
||||||
zal ze zei zelf zich zij zijn zo zonder zou zeer zeker zekere zelfde zelfs zichzelf zijnde zijne zo’n zoals zodra zouden
|
zal ze zei zelf zich zij zijn zo zonder zou zeer zeker zekere zelfde zelfs zichzelf zijnde zijne zo’n zoals zodra zouden
|
||||||
zoveel zowat zulk zulke zulks zullen zult
|
zoveel zowat zulk zulke zulks zullen zult
|
||||||
""".split())
|
""".split()
|
||||||
|
)
|
||||||
|
|
|
@ -47,8 +47,12 @@ TAG_MAP = {
|
||||||
"Adj_Prep|adv|vergr|onverv_voor__Degree=Cmp|Variant=Short": {POS: ADJ},
|
"Adj_Prep|adv|vergr|onverv_voor__Degree=Cmp|Variant=Short": {POS: ADJ},
|
||||||
"Adj_V_Conj_V__Degree=Pos|VerbForm=Inf": {POS: ADJ},
|
"Adj_V_Conj_V__Degree=Pos|VerbForm=Inf": {POS: ADJ},
|
||||||
"Adj_V_N__Degree=Pos|Number=Sing|Tense=Past|VerbForm=Part": {POS: ADJ},
|
"Adj_V_N__Degree=Pos|Number=Sing|Tense=Past|VerbForm=Part": {POS: ADJ},
|
||||||
"Adj_V|adv|stell|onverv_intrans|inf__Degree=Pos|Variant=Short|VerbForm=Inf": {POS: ADJ},
|
"Adj_V|adv|stell|onverv_intrans|inf__Degree=Pos|Variant=Short|VerbForm=Inf": {
|
||||||
"Adj_V|adv|stell|onverv_trans|imp__Degree=Pos|Mood=Imp|Variant=Short|VerbForm=Fin": {POS: ADJ},
|
POS: ADJ
|
||||||
|
},
|
||||||
|
"Adj_V|adv|stell|onverv_trans|imp__Degree=Pos|Mood=Imp|Variant=Short|VerbForm=Fin": {
|
||||||
|
POS: ADJ
|
||||||
|
},
|
||||||
"Adj|adv|stell|onverv__Degree=Pos|Variant=Short": {POS: ADJ},
|
"Adj|adv|stell|onverv__Degree=Pos|Variant=Short": {POS: ADJ},
|
||||||
"Adj|adv|stell|vervneut__Case=Nom|Degree=Pos|Variant=Short": {POS: ADJ},
|
"Adj|adv|stell|vervneut__Case=Nom|Degree=Pos|Variant=Short": {POS: ADJ},
|
||||||
"Adj|adv|vergr|onverv__Degree=Cmp|Variant=Short": {POS: ADJ},
|
"Adj|adv|vergr|onverv__Degree=Cmp|Variant=Short": {POS: ADJ},
|
||||||
|
@ -133,15 +137,21 @@ TAG_MAP = {
|
||||||
"Art_Num__Definite=Def|Degree=Sup|Gender=Neut|PronType=Ind": {POS: DET},
|
"Art_Num__Definite=Def|Degree=Sup|Gender=Neut|PronType=Ind": {POS: DET},
|
||||||
"Art_Num__Definite=Def|Gender=Neut": {POS: DET},
|
"Art_Num__Definite=Def|Gender=Neut": {POS: DET},
|
||||||
"Art_Num__Degree=Pos|Number=Sing|PronType=Ind": {POS: DET},
|
"Art_Num__Degree=Pos|Number=Sing|PronType=Ind": {POS: DET},
|
||||||
"Art_N|bep|onzijd|neut_eigen|ev|neut__Definite=Def|Gender=Neut|Number=Sing": {POS: DET},
|
"Art_N|bep|onzijd|neut_eigen|ev|neut__Definite=Def|Gender=Neut|Number=Sing": {
|
||||||
"Art_N|bep|onzijd|neut_soort|ev|neut__Definite=Def|Gender=Neut|Number=Sing": {POS: DET},
|
POS: DET
|
||||||
|
},
|
||||||
|
"Art_N|bep|onzijd|neut_soort|ev|neut__Definite=Def|Gender=Neut|Number=Sing": {
|
||||||
|
POS: DET
|
||||||
|
},
|
||||||
"Art_Pron_N__Case=Gen|Number=Plur|PronType=Ind": {POS: DET},
|
"Art_Pron_N__Case=Gen|Number=Plur|PronType=Ind": {POS: DET},
|
||||||
"Art_Pron__Number=Sing|PronType=Ind": {POS: DET},
|
"Art_Pron__Number=Sing|PronType=Ind": {POS: DET},
|
||||||
"Art_V_N__AdpType=Prep": {POS: DET},
|
"Art_V_N__AdpType=Prep": {POS: DET},
|
||||||
"Art|bep|onzijd|neut__Definite=Def|Gender=Neut|PronType=Art": {POS: DET},
|
"Art|bep|onzijd|neut__Definite=Def|Gender=Neut|PronType=Art": {POS: DET},
|
||||||
"Art|bep|zijdofmv|gen__Case=Gen|Definite=Def|PronType=Art": {POS: DET},
|
"Art|bep|zijdofmv|gen__Case=Gen|Definite=Def|PronType=Art": {POS: DET},
|
||||||
"Art|bep|zijdofmv|neut__Definite=Def|PronType=Art": {POS: DET},
|
"Art|bep|zijdofmv|neut__Definite=Def|PronType=Art": {POS: DET},
|
||||||
"Art|bep|zijdofonzijd|gen__Case=Gen|Definite=Def|Number=Sing|PronType=Art": {POS: DET},
|
"Art|bep|zijdofonzijd|gen__Case=Gen|Definite=Def|Number=Sing|PronType=Art": {
|
||||||
|
POS: DET
|
||||||
|
},
|
||||||
"Art|bep|zijd|dat__Case=Dat|Definite=Def|Gender=Com|PronType=Art": {POS: DET},
|
"Art|bep|zijd|dat__Case=Dat|Definite=Def|Gender=Com|PronType=Art": {POS: DET},
|
||||||
"Art|onbep|zijdofonzijd|neut__Definite=Ind|Number=Sing|PronType=Art": {POS: DET},
|
"Art|onbep|zijdofonzijd|neut__Definite=Ind|Number=Sing|PronType=Art": {POS: DET},
|
||||||
"CCONJ___": {POS: CONJ},
|
"CCONJ___": {POS: CONJ},
|
||||||
|
@ -159,17 +169,23 @@ TAG_MAP = {
|
||||||
"Conj_Int|onder|metfin___": {POS: CONJ},
|
"Conj_Int|onder|metfin___": {POS: CONJ},
|
||||||
"Conj_N_Adv__AdpType=Preppron|Gender=Masc|Number=Plur": {POS: CONJ},
|
"Conj_N_Adv__AdpType=Preppron|Gender=Masc|Number=Plur": {POS: CONJ},
|
||||||
"Conj_N_Prep__AdpType=Preppron|Gender=Masc|Number=Plur": {POS: CONJ},
|
"Conj_N_Prep__AdpType=Preppron|Gender=Masc|Number=Plur": {POS: CONJ},
|
||||||
"Conj_N|onder|metfin_soort|ev|neut__AdpType=Preppron|Gender=Masc|Number=Plur": {POS: CONJ},
|
"Conj_N|onder|metfin_soort|ev|neut__AdpType=Preppron|Gender=Masc|Number=Plur": {
|
||||||
|
POS: CONJ
|
||||||
|
},
|
||||||
"Conj_Pron_Adv__Degree=Pos|Number=Sing|Person=3": {POS: CONJ},
|
"Conj_Pron_Adv__Degree=Pos|Number=Sing|Person=3": {POS: CONJ},
|
||||||
"Conj_Pron_V__AdpType=Preppron|Gender=Masc|Number=Plur": {POS: CONJ},
|
"Conj_Pron_V__AdpType=Preppron|Gender=Masc|Number=Plur": {POS: CONJ},
|
||||||
"Conj_Pron|neven_aanw|neut|zelfst__AdpType=Prep": {POS: CONJ},
|
"Conj_Pron|neven_aanw|neut|zelfst__AdpType=Prep": {POS: CONJ},
|
||||||
"Conj_Punc_Conj|neven_schuinstreep_neven__AdpType=Prep": {POS: CONJ},
|
"Conj_Punc_Conj|neven_schuinstreep_neven__AdpType=Prep": {POS: CONJ},
|
||||||
"Conj_V|onder|metfin_intrans|ott|3|ev__AdpType=Preppron|Gender=Masc|Number=Plur": {POS: CONJ},
|
"Conj_V|onder|metfin_intrans|ott|3|ev__AdpType=Preppron|Gender=Masc|Number=Plur": {
|
||||||
|
POS: CONJ
|
||||||
|
},
|
||||||
"Conj|neven___": {POS: CONJ},
|
"Conj|neven___": {POS: CONJ},
|
||||||
"Conj|onder|metfin___": {POS: CONJ},
|
"Conj|onder|metfin___": {POS: CONJ},
|
||||||
"Conj|onder|metinf___": {POS: CONJ},
|
"Conj|onder|metinf___": {POS: CONJ},
|
||||||
"DET__Degree=Cmp|NumType=Card|PronType=Ind": {POS: DET},
|
"DET__Degree=Cmp|NumType=Card|PronType=Ind": {POS: DET},
|
||||||
"DET__Gender=Fem|Number=Sing|Number[psor]=Plur|Person=1|Poss=Yes|PronType=Prs": {POS: DET},
|
"DET__Gender=Fem|Number=Sing|Number[psor]=Plur|Person=1|Poss=Yes|PronType=Prs": {
|
||||||
|
POS: DET
|
||||||
|
},
|
||||||
"DET__Gender=Fem|Number=Sing|PronType=Art": {POS: DET},
|
"DET__Gender=Fem|Number=Sing|PronType=Art": {POS: DET},
|
||||||
"DET__Gender=Masc|Number=Plur|PronType=Art": {POS: DET},
|
"DET__Gender=Masc|Number=Plur|PronType=Art": {POS: DET},
|
||||||
"DET__Gender=Masc|Number=Sing|PronType=Tot": {POS: DET},
|
"DET__Gender=Masc|Number=Sing|PronType=Tot": {POS: DET},
|
||||||
|
@ -185,7 +201,9 @@ TAG_MAP = {
|
||||||
"Misc_Misc_Misc_Misc_Misc_Misc_Punc_Misc_Misc_Misc___": {POS: X},
|
"Misc_Misc_Misc_Misc_Misc_Misc_Punc_Misc_Misc_Misc___": {POS: X},
|
||||||
"Misc_Misc_Misc_Misc_Misc_Misc___": {POS: X},
|
"Misc_Misc_Misc_Misc_Misc_Misc___": {POS: X},
|
||||||
"Misc_Misc_Misc_Misc_Misc_N_Misc_Misc_Misc_Misc_Misc_Misc___": {POS: X},
|
"Misc_Misc_Misc_Misc_Misc_N_Misc_Misc_Misc_Misc_Misc_Misc___": {POS: X},
|
||||||
"Misc_Misc_Misc_Misc|vreemd_vreemd_vreemd_vreemd__AdpType=Preppron|Gender=Masc|Number=Sing": {POS: X},
|
"Misc_Misc_Misc_Misc|vreemd_vreemd_vreemd_vreemd__AdpType=Preppron|Gender=Masc|Number=Sing": {
|
||||||
|
POS: X
|
||||||
|
},
|
||||||
"Misc_Misc_Misc_Misc|vreemd_vreemd_vreemd_vreemd___": {POS: X},
|
"Misc_Misc_Misc_Misc|vreemd_vreemd_vreemd_vreemd___": {POS: X},
|
||||||
"Misc_Misc_Misc_N__Number=Sing": {POS: X},
|
"Misc_Misc_Misc_N__Number=Sing": {POS: X},
|
||||||
"Misc_Misc_Misc|vreemd_vreemd_vreemd___": {POS: X},
|
"Misc_Misc_Misc|vreemd_vreemd_vreemd___": {POS: X},
|
||||||
|
@ -217,7 +235,9 @@ TAG_MAP = {
|
||||||
"N_Adj__Degree=Pos|Number=Plur": {POS: NOUN},
|
"N_Adj__Degree=Pos|Number=Plur": {POS: NOUN},
|
||||||
"N_Adj__Degree=Pos|Number=Sing": {POS: NOUN},
|
"N_Adj__Degree=Pos|Number=Sing": {POS: NOUN},
|
||||||
"N_Adj___": {POS: NOUN},
|
"N_Adj___": {POS: NOUN},
|
||||||
"N_Adv_Punc_V_Pron_V__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Inf": {POS: NOUN},
|
"N_Adv_Punc_V_Pron_V__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Inf": {
|
||||||
|
POS: NOUN
|
||||||
|
},
|
||||||
"N_Adv__Degree=Pos|Number=Sing": {POS: NOUN},
|
"N_Adv__Degree=Pos|Number=Sing": {POS: NOUN},
|
||||||
"N_Adv___": {POS: NOUN},
|
"N_Adv___": {POS: NOUN},
|
||||||
"N_Adv|soort|ev|neut_deelv__Number=Sing": {POS: NOUN},
|
"N_Adv|soort|ev|neut_deelv__Number=Sing": {POS: NOUN},
|
||||||
|
@ -320,12 +340,20 @@ TAG_MAP = {
|
||||||
"N_N|eigen|ev|gen_soort|mv|neut___": {POS: NOUN},
|
"N_N|eigen|ev|gen_soort|mv|neut___": {POS: NOUN},
|
||||||
"N_N|eigen|ev|neut_eigen|ev|gen___": {POS: NOUN},
|
"N_N|eigen|ev|neut_eigen|ev|gen___": {POS: NOUN},
|
||||||
"N_N|eigen|ev|neut_eigen|ev|neut__AdpType=Prep": {POS: NOUN},
|
"N_N|eigen|ev|neut_eigen|ev|neut__AdpType=Prep": {POS: NOUN},
|
||||||
"N_N|eigen|ev|neut_eigen|ev|neut__AdpType=Preppron|Gender=Fem|Number=Plur": {POS: NOUN},
|
"N_N|eigen|ev|neut_eigen|ev|neut__AdpType=Preppron|Gender=Fem|Number=Plur": {
|
||||||
"N_N|eigen|ev|neut_eigen|ev|neut__AdpType=Preppron|Gender=Masc|Number=Sing": {POS: NOUN},
|
POS: NOUN
|
||||||
|
},
|
||||||
|
"N_N|eigen|ev|neut_eigen|ev|neut__AdpType=Preppron|Gender=Masc|Number=Sing": {
|
||||||
|
POS: NOUN
|
||||||
|
},
|
||||||
"N_N|eigen|ev|neut_eigen|ev|neut__Gender=Fem|Number=Plur|PronType=Art": {POS: NOUN},
|
"N_N|eigen|ev|neut_eigen|ev|neut__Gender=Fem|Number=Plur|PronType=Art": {POS: NOUN},
|
||||||
"N_N|eigen|ev|neut_eigen|ev|neut__Gender=Fem|Number=Sing|PronType=Art": {POS: NOUN},
|
"N_N|eigen|ev|neut_eigen|ev|neut__Gender=Fem|Number=Sing|PronType=Art": {POS: NOUN},
|
||||||
"N_N|eigen|ev|neut_eigen|ev|neut__Gender=Masc|Number=Plur|PronType=Art": {POS: NOUN},
|
"N_N|eigen|ev|neut_eigen|ev|neut__Gender=Masc|Number=Plur|PronType=Art": {
|
||||||
"N_N|eigen|ev|neut_eigen|ev|neut__Gender=Masc|Number=Sing|PronType=Art": {POS: NOUN},
|
POS: NOUN
|
||||||
|
},
|
||||||
|
"N_N|eigen|ev|neut_eigen|ev|neut__Gender=Masc|Number=Sing|PronType=Art": {
|
||||||
|
POS: NOUN
|
||||||
|
},
|
||||||
"N_N|eigen|ev|neut_eigen|ev|neut__NumType=Card": {POS: NOUN},
|
"N_N|eigen|ev|neut_eigen|ev|neut__NumType=Card": {POS: NOUN},
|
||||||
"N_N|eigen|ev|neut_eigen|ev|neut__Number=Sing": {POS: NOUN},
|
"N_N|eigen|ev|neut_eigen|ev|neut__Number=Sing": {POS: NOUN},
|
||||||
"N_N|eigen|ev|neut_eigen|ev|neut___": {POS: NOUN},
|
"N_N|eigen|ev|neut_eigen|ev|neut___": {POS: NOUN},
|
||||||
|
@ -335,7 +363,9 @@ TAG_MAP = {
|
||||||
"N_N|eigen|ev|neut_soort|mv|neut___": {POS: NOUN},
|
"N_N|eigen|ev|neut_soort|mv|neut___": {POS: NOUN},
|
||||||
"N_N|eigen|mv|neut_eigen|mv|neut___": {POS: NOUN},
|
"N_N|eigen|mv|neut_eigen|mv|neut___": {POS: NOUN},
|
||||||
"N_N|soort|ev|neut_eigen|ev|neut__Number=Sing": {POS: NOUN},
|
"N_N|soort|ev|neut_eigen|ev|neut__Number=Sing": {POS: NOUN},
|
||||||
"N_N|soort|ev|neut_soort|ev|neut__Gender=Masc|Number=Plur|PronType=Art": {POS: NOUN},
|
"N_N|soort|ev|neut_soort|ev|neut__Gender=Masc|Number=Plur|PronType=Art": {
|
||||||
|
POS: NOUN
|
||||||
|
},
|
||||||
"N_N|soort|ev|neut_soort|ev|neut__NumForm=Digit|NumType=Card": {POS: NOUN},
|
"N_N|soort|ev|neut_soort|ev|neut__NumForm=Digit|NumType=Card": {POS: NOUN},
|
||||||
"N_N|soort|ev|neut_soort|ev|neut__Number=Sing": {POS: NOUN},
|
"N_N|soort|ev|neut_soort|ev|neut__Number=Sing": {POS: NOUN},
|
||||||
"N_N|soort|ev|neut_soort|mv|neut__Number=Plur": {POS: NOUN},
|
"N_N|soort|ev|neut_soort|mv|neut__Number=Plur": {POS: NOUN},
|
||||||
|
@ -365,7 +395,9 @@ TAG_MAP = {
|
||||||
"N_Pron___": {POS: NOUN},
|
"N_Pron___": {POS: NOUN},
|
||||||
"N_Punc_Adj_N___": {POS: NOUN},
|
"N_Punc_Adj_N___": {POS: NOUN},
|
||||||
"N_Punc_Adj_Pron_Punc__Degree=Pos|Number=Sing|Person=2": {POS: NOUN},
|
"N_Punc_Adj_Pron_Punc__Degree=Pos|Number=Sing|Person=2": {POS: NOUN},
|
||||||
"N_Punc_Adv_V_Pron_N__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {POS: NOUN},
|
"N_Punc_Adv_V_Pron_N__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {
|
||||||
|
POS: NOUN
|
||||||
|
},
|
||||||
"N_Punc_Misc_Punc_N___": {POS: NOUN},
|
"N_Punc_Misc_Punc_N___": {POS: NOUN},
|
||||||
"N_Punc_N_N_N_N__Number=Sing": {POS: NOUN},
|
"N_Punc_N_N_N_N__Number=Sing": {POS: NOUN},
|
||||||
"N_Punc_N_Punc_N__Number=Sing": {POS: NOUN},
|
"N_Punc_N_Punc_N__Number=Sing": {POS: NOUN},
|
||||||
|
@ -415,8 +447,12 @@ TAG_MAP = {
|
||||||
"Num|hoofd|bep|attr|onverv__Definite=Def|NumType=Card": {POS: NUM},
|
"Num|hoofd|bep|attr|onverv__Definite=Def|NumType=Card": {POS: NUM},
|
||||||
"Num|hoofd|bep|zelfst|onverv__Definite=Def|NumType=Card": {POS: NUM},
|
"Num|hoofd|bep|zelfst|onverv__Definite=Def|NumType=Card": {POS: NUM},
|
||||||
"Num|hoofd|bep|zelfst|vervmv__Definite=Def|Number=Plur|NumType=Card": {POS: NUM},
|
"Num|hoofd|bep|zelfst|vervmv__Definite=Def|Number=Plur|NumType=Card": {POS: NUM},
|
||||||
"Num|hoofd|onbep|attr|stell|onverv__Degree=Pos|NumType=Card|PronType=Ind": {POS: NUM},
|
"Num|hoofd|onbep|attr|stell|onverv__Degree=Pos|NumType=Card|PronType=Ind": {
|
||||||
"Num|hoofd|onbep|attr|vergr|onverv__Degree=Cmp|NumType=Card|PronType=Ind": {POS: NUM},
|
POS: NUM
|
||||||
|
},
|
||||||
|
"Num|hoofd|onbep|attr|vergr|onverv__Degree=Cmp|NumType=Card|PronType=Ind": {
|
||||||
|
POS: NUM
|
||||||
|
},
|
||||||
"Num|rang|bep|attr|onverv__Definite=Def|NumType=Ord": {POS: NUM},
|
"Num|rang|bep|attr|onverv__Definite=Def|NumType=Ord": {POS: NUM},
|
||||||
"Num|rang|bep|zelfst|onverv__Definite=Def|NumType=Ord": {POS: NUM},
|
"Num|rang|bep|zelfst|onverv__Definite=Def|NumType=Ord": {POS: NUM},
|
||||||
"N|eigen|ev|gen__Case=Gen|Number=Sing": {POS: NOUN},
|
"N|eigen|ev|gen__Case=Gen|Number=Sing": {POS: NOUN},
|
||||||
|
@ -469,7 +505,9 @@ TAG_MAP = {
|
||||||
"Prep_N_Adv|voor_soort|ev|neut_pron|aanw__AdpType=Prep": {POS: ADP},
|
"Prep_N_Adv|voor_soort|ev|neut_pron|aanw__AdpType=Prep": {POS: ADP},
|
||||||
"Prep_N_Adv|voor_soort|ev|neut_pron|aanw__Number=Sing|PronType=Dem": {POS: ADP},
|
"Prep_N_Adv|voor_soort|ev|neut_pron|aanw__Number=Sing|PronType=Dem": {POS: ADP},
|
||||||
"Prep_N_Adv|voor_soort|ev|neut_pron|vrag__Number=Sing|PronType=Int": {POS: ADP},
|
"Prep_N_Adv|voor_soort|ev|neut_pron|vrag__Number=Sing|PronType=Int": {POS: ADP},
|
||||||
"Prep_N_Adv|voor_soort|mv|neut_deelv__Gender=Masc|Number=Sing|PronType=Tot": {POS: ADP},
|
"Prep_N_Adv|voor_soort|mv|neut_deelv__Gender=Masc|Number=Sing|PronType=Tot": {
|
||||||
|
POS: ADP
|
||||||
|
},
|
||||||
"Prep_N_Conj_N__Number=Sing": {POS: ADP},
|
"Prep_N_Conj_N__Number=Sing": {POS: ADP},
|
||||||
"Prep_N_Conj__AdpType=Prep": {POS: ADP},
|
"Prep_N_Conj__AdpType=Prep": {POS: ADP},
|
||||||
"Prep_N_Prep_N__Number=Sing": {POS: ADP},
|
"Prep_N_Prep_N__Number=Sing": {POS: ADP},
|
||||||
|
@ -489,7 +527,9 @@ TAG_MAP = {
|
||||||
"Prep_N|voor_soort|ev|neut__Number=Sing": {POS: ADP},
|
"Prep_N|voor_soort|ev|neut__Number=Sing": {POS: ADP},
|
||||||
"Prep_N|voor_soort|mv|neut__AdpType=Prep": {POS: ADP},
|
"Prep_N|voor_soort|mv|neut__AdpType=Prep": {POS: ADP},
|
||||||
"Prep_N|voor_soort|mv|neut__Number=Plur": {POS: ADP},
|
"Prep_N|voor_soort|mv|neut__Number=Plur": {POS: ADP},
|
||||||
"Prep_Prep_Adj|voor_voor_adv|stell|onverv__Gender=Masc|Number=Sing|PronType=Tot": {POS: ADP},
|
"Prep_Prep_Adj|voor_voor_adv|stell|onverv__Gender=Masc|Number=Sing|PronType=Tot": {
|
||||||
|
POS: ADP
|
||||||
|
},
|
||||||
"Prep_Prep_Adv__Degree=Pos": {POS: ADP},
|
"Prep_Prep_Adv__Degree=Pos": {POS: ADP},
|
||||||
"Prep_Pron_Adj__Degree=Cmp|Number=Sing|Person=3": {POS: ADP},
|
"Prep_Pron_Adj__Degree=Cmp|Number=Sing|Person=3": {POS: ADP},
|
||||||
"Prep_Pron_N_Adv__Number=Plur": {POS: ADP},
|
"Prep_Pron_N_Adv__Number=Plur": {POS: ADP},
|
||||||
|
@ -503,7 +543,9 @@ TAG_MAP = {
|
||||||
"Prep_Pron|voor_ref|3|evofmv__Number=Plur,Sing|Person=3": {POS: ADP},
|
"Prep_Pron|voor_ref|3|evofmv__Number=Plur,Sing|Person=3": {POS: ADP},
|
||||||
"Prep_Punc_N_Conj_N__AdpType=Prep": {POS: ADP},
|
"Prep_Punc_N_Conj_N__AdpType=Prep": {POS: ADP},
|
||||||
"Prep_V_N__Number=Sing|Tense=Pres|VerbForm=Part": {POS: ADP},
|
"Prep_V_N__Number=Sing|Tense=Pres|VerbForm=Part": {POS: ADP},
|
||||||
"Prep_V_Pron_Pron_Adv__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|PronType=Dem|Tense=Pres|VerbForm=Fin": {POS: ADP},
|
"Prep_V_Pron_Pron_Adv__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|PronType=Dem|Tense=Pres|VerbForm=Fin": {
|
||||||
|
POS: ADP
|
||||||
|
},
|
||||||
"Prep_V|voor_intrans|inf__VerbForm=Inf": {POS: ADP},
|
"Prep_V|voor_intrans|inf__VerbForm=Inf": {POS: ADP},
|
||||||
"Prep_V|voorinf_trans|inf__VerbForm=Inf": {POS: ADP},
|
"Prep_V|voorinf_trans|inf__VerbForm=Inf": {POS: ADP},
|
||||||
"Prep|achter__AdpType=Post": {POS: ADP},
|
"Prep|achter__AdpType=Post": {POS: ADP},
|
||||||
|
@ -511,17 +553,25 @@ TAG_MAP = {
|
||||||
"Prep|voor__AdpType=Prep": {POS: ADP},
|
"Prep|voor__AdpType=Prep": {POS: ADP},
|
||||||
"Prep|voorinf__AdpType=Prep|PartType=Inf": {POS: ADP},
|
"Prep|voorinf__AdpType=Prep|PartType=Inf": {POS: ADP},
|
||||||
"Pron_Adj_N_Punc_Art_Adj_N_Prep_Art_Adj_N__NumType=Card": {POS: PRON},
|
"Pron_Adj_N_Punc_Art_Adj_N_Prep_Art_Adj_N__NumType=Card": {POS: PRON},
|
||||||
"Pron_Adj__Case=Nom|Degree=Sup|Number=Sing|Person=2|Poss=Yes|PronType=Prs": {POS: PRON},
|
"Pron_Adj__Case=Nom|Degree=Sup|Number=Sing|Person=2|Poss=Yes|PronType=Prs": {
|
||||||
|
POS: PRON
|
||||||
|
},
|
||||||
"Pron_Adj__Degree=Cmp|PronType=Ind": {POS: PRON},
|
"Pron_Adj__Degree=Cmp|PronType=Ind": {POS: PRON},
|
||||||
"Pron_Adv|vrag|neut|attr_deelv__PronType=Int": {POS: PRON},
|
"Pron_Adv|vrag|neut|attr_deelv__PronType=Int": {POS: PRON},
|
||||||
"Pron_Art_N_N__Number=Plur|PronType=Ind": {POS: PRON},
|
"Pron_Art_N_N__Number=Plur|PronType=Ind": {POS: PRON},
|
||||||
"Pron_Art__Number=Sing|PronType=Int": {POS: PRON},
|
"Pron_Art__Number=Sing|PronType=Int": {POS: PRON},
|
||||||
"Pron_N_Adv__Number=Sing|PronType=Ind": {POS: PRON},
|
"Pron_N_Adv__Number=Sing|PronType=Ind": {POS: PRON},
|
||||||
"Pron_N_V_Adv_Num_Punc__Aspect=Imp|Definite=Def|Mood=Ind|Number=Sing|Person=3|PronType=Ind|Tense=Pres|VerbForm=Fin": {POS: PRON},
|
"Pron_N_V_Adv_Num_Punc__Aspect=Imp|Definite=Def|Mood=Ind|Number=Sing|Person=3|PronType=Ind|Tense=Pres|VerbForm=Fin": {
|
||||||
"Pron_N_V_Conj_N__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|PronType=Ind|Tense=Pres|VerbForm=Fin": {POS: PRON},
|
POS: PRON
|
||||||
|
},
|
||||||
|
"Pron_N_V_Conj_N__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|PronType=Ind|Tense=Pres|VerbForm=Fin": {
|
||||||
|
POS: PRON
|
||||||
|
},
|
||||||
"Pron_N__Case=Gen|Number=Sing|PronType=Ind": {POS: PRON},
|
"Pron_N__Case=Gen|Number=Sing|PronType=Ind": {POS: PRON},
|
||||||
"Pron_N__Number=Sing|PronType=Ind": {POS: PRON},
|
"Pron_N__Number=Sing|PronType=Ind": {POS: PRON},
|
||||||
"Pron_N|aanw|gen|attr_soort|mv|neut__Case=Gen|Number=Plur|PronType=Dem": {POS: PRON},
|
"Pron_N|aanw|gen|attr_soort|mv|neut__Case=Gen|Number=Plur|PronType=Dem": {
|
||||||
|
POS: PRON
|
||||||
|
},
|
||||||
"Pron_N|onbep|neut|attr_soort|ev|neut__Number=Sing|PronType=Ind": {POS: PRON},
|
"Pron_N|onbep|neut|attr_soort|ev|neut__Number=Sing|PronType=Ind": {POS: PRON},
|
||||||
"Pron_Prep_Art__Number=Sing|PronType=Int": {POS: PRON},
|
"Pron_Prep_Art__Number=Sing|PronType=Int": {POS: PRON},
|
||||||
"Pron_Prep_Art__Number=Sing|PronType=Rel": {POS: PRON},
|
"Pron_Prep_Art__Number=Sing|PronType=Rel": {POS: PRON},
|
||||||
|
@ -529,10 +579,16 @@ TAG_MAP = {
|
||||||
"Pron_Prep|betr|neut|zelfst_voor__PronType=Rel": {POS: PRON},
|
"Pron_Prep|betr|neut|zelfst_voor__PronType=Rel": {POS: PRON},
|
||||||
"Pron_Prep|onbep|neut|zelfst_voor__PronType=Ind": {POS: PRON},
|
"Pron_Prep|onbep|neut|zelfst_voor__PronType=Ind": {POS: PRON},
|
||||||
"Pron_Prep|vrag|neut|attr_voor__PronType=Int": {POS: PRON},
|
"Pron_Prep|vrag|neut|attr_voor__PronType=Int": {POS: PRON},
|
||||||
"Pron_Pron_V__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|PronType=Rel|Tense=Pres|VerbForm=Fin": {POS: PRON},
|
"Pron_Pron_V__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|PronType=Rel|Tense=Pres|VerbForm=Fin": {
|
||||||
|
POS: PRON
|
||||||
|
},
|
||||||
"Pron_Pron__Person=3|PronType=Prs|Reflex=Yes": {POS: PRON},
|
"Pron_Pron__Person=3|PronType=Prs|Reflex=Yes": {POS: PRON},
|
||||||
"Pron_V_V__Aspect=Imp|Mood=Ind|Person=3|PronType=Dem|Tense=Pres|VerbForm=Inf": {POS: PRON},
|
"Pron_V_V__Aspect=Imp|Mood=Ind|Person=3|PronType=Dem|Tense=Pres|VerbForm=Inf": {
|
||||||
"Pron_V__Case=Gen|Number=Sing|Person=3|Poss=Yes|PronType=Prs|VerbForm=Inf": {POS: PRON},
|
POS: PRON
|
||||||
|
},
|
||||||
|
"Pron_V__Case=Gen|Number=Sing|Person=3|Poss=Yes|PronType=Prs|VerbForm=Inf": {
|
||||||
|
POS: PRON
|
||||||
|
},
|
||||||
"Pron_V__Number=Plur|Person=1|Poss=Yes|PronType=Prs|VerbForm=Inf": {POS: PRON},
|
"Pron_V__Number=Plur|Person=1|Poss=Yes|PronType=Prs|VerbForm=Inf": {POS: PRON},
|
||||||
"Pron|aanw|dat|attr__Case=Dat|PronType=Dem": {POS: PRON},
|
"Pron|aanw|dat|attr__Case=Dat|PronType=Dem": {POS: PRON},
|
||||||
"Pron|aanw|gen|attr__Case=Gen|PronType=Dem": {POS: PRON},
|
"Pron|aanw|gen|attr__Case=Gen|PronType=Dem": {POS: PRON},
|
||||||
|
@ -547,27 +603,47 @@ TAG_MAP = {
|
||||||
"Pron|bez|1|mv|neut|attr__Number=Plur|Person=1|Poss=Yes|PronType=Prs": {POS: PRON},
|
"Pron|bez|1|mv|neut|attr__Number=Plur|Person=1|Poss=Yes|PronType=Prs": {POS: PRON},
|
||||||
"Pron|bez|2|ev|neut|attr__Number=Sing|Person=2|Poss=Yes|PronType=Prs": {POS: PRON},
|
"Pron|bez|2|ev|neut|attr__Number=Sing|Person=2|Poss=Yes|PronType=Prs": {POS: PRON},
|
||||||
"Pron|bez|2|mv|neut|attr__Number=Plur|Person=2|Poss=Yes|PronType=Prs": {POS: PRON},
|
"Pron|bez|2|mv|neut|attr__Number=Plur|Person=2|Poss=Yes|PronType=Prs": {POS: PRON},
|
||||||
"Pron|bez|3|ev|gen|attr__Case=Gen|Number=Sing|Person=3|Poss=Yes|PronType=Prs": {POS: PRON},
|
"Pron|bez|3|ev|gen|attr__Case=Gen|Number=Sing|Person=3|Poss=Yes|PronType=Prs": {
|
||||||
|
POS: PRON
|
||||||
|
},
|
||||||
"Pron|bez|3|ev|neut|attr__Number=Sing|Person=3|Poss=Yes|PronType=Prs": {POS: PRON},
|
"Pron|bez|3|ev|neut|attr__Number=Sing|Person=3|Poss=Yes|PronType=Prs": {POS: PRON},
|
||||||
"Pron|bez|3|ev|neut|zelfst__Number=Sing|Person=3|Poss=Yes|PronType=Prs": {POS: PRON},
|
"Pron|bez|3|ev|neut|zelfst__Number=Sing|Person=3|Poss=Yes|PronType=Prs": {
|
||||||
|
POS: PRON
|
||||||
|
},
|
||||||
"Pron|bez|3|mv|neut|attr__Number=Plur|Person=3|Poss=Yes|PronType=Prs": {POS: PRON},
|
"Pron|bez|3|mv|neut|attr__Number=Plur|Person=3|Poss=Yes|PronType=Prs": {POS: PRON},
|
||||||
"Pron|onbep|gen|attr__Case=Gen|PronType=Ind": {POS: PRON},
|
"Pron|onbep|gen|attr__Case=Gen|PronType=Ind": {POS: PRON},
|
||||||
"Pron|onbep|gen|zelfst__Case=Gen|PronType=Ind": {POS: PRON},
|
"Pron|onbep|gen|zelfst__Case=Gen|PronType=Ind": {POS: PRON},
|
||||||
"Pron|onbep|neut|attr__PronType=Ind": {POS: PRON},
|
"Pron|onbep|neut|attr__PronType=Ind": {POS: PRON},
|
||||||
"Pron|onbep|neut|zelfst__PronType=Ind": {POS: PRON},
|
"Pron|onbep|neut|zelfst__PronType=Ind": {POS: PRON},
|
||||||
"Pron|per|1|ev|datofacc__Case=Acc,Dat|Number=Sing|Person=1|PronType=Prs": {POS: PRON},
|
"Pron|per|1|ev|datofacc__Case=Acc,Dat|Number=Sing|Person=1|PronType=Prs": {
|
||||||
|
POS: PRON
|
||||||
|
},
|
||||||
"Pron|per|1|ev|nom__Case=Nom|Number=Sing|Person=1|PronType=Prs": {POS: PRON},
|
"Pron|per|1|ev|nom__Case=Nom|Number=Sing|Person=1|PronType=Prs": {POS: PRON},
|
||||||
"Pron|per|1|mv|datofacc__Case=Acc,Dat|Number=Plur|Person=1|PronType=Prs": {POS: PRON},
|
"Pron|per|1|mv|datofacc__Case=Acc,Dat|Number=Plur|Person=1|PronType=Prs": {
|
||||||
|
POS: PRON
|
||||||
|
},
|
||||||
"Pron|per|1|mv|nom__Case=Nom|Number=Plur|Person=1|PronType=Prs": {POS: PRON},
|
"Pron|per|1|mv|nom__Case=Nom|Number=Plur|Person=1|PronType=Prs": {POS: PRON},
|
||||||
"Pron|per|2|ev|datofacc__Case=Acc,Dat|Number=Sing|Person=2|PronType=Prs": {POS: PRON},
|
"Pron|per|2|ev|datofacc__Case=Acc,Dat|Number=Sing|Person=2|PronType=Prs": {
|
||||||
|
POS: PRON
|
||||||
|
},
|
||||||
"Pron|per|2|ev|nom__Case=Nom|Number=Sing|Person=2|PronType=Prs": {POS: PRON},
|
"Pron|per|2|ev|nom__Case=Nom|Number=Sing|Person=2|PronType=Prs": {POS: PRON},
|
||||||
"Pron|per|2|mv|datofacc__Case=Acc,Dat|Number=Plur|Person=2|PronType=Prs": {POS: PRON},
|
"Pron|per|2|mv|datofacc__Case=Acc,Dat|Number=Plur|Person=2|PronType=Prs": {
|
||||||
|
POS: PRON
|
||||||
|
},
|
||||||
"Pron|per|2|mv|nom__Case=Nom|Number=Plur|Person=2|PronType=Prs": {POS: PRON},
|
"Pron|per|2|mv|nom__Case=Nom|Number=Plur|Person=2|PronType=Prs": {POS: PRON},
|
||||||
"Pron|per|3|evofmv|datofacc__Case=Acc,Dat|Number=Plur,Sing|Person=3|PronType=Prs": {POS: PRON},
|
"Pron|per|3|evofmv|datofacc__Case=Acc,Dat|Number=Plur,Sing|Person=3|PronType=Prs": {
|
||||||
"Pron|per|3|evofmv|nom__Case=Nom|Number=Plur,Sing|Person=3|PronType=Prs": {POS: PRON},
|
POS: PRON
|
||||||
"Pron|per|3|ev|datofacc__Case=Acc,Dat|Number=Sing|Person=3|PronType=Prs": {POS: PRON},
|
},
|
||||||
|
"Pron|per|3|evofmv|nom__Case=Nom|Number=Plur,Sing|Person=3|PronType=Prs": {
|
||||||
|
POS: PRON
|
||||||
|
},
|
||||||
|
"Pron|per|3|ev|datofacc__Case=Acc,Dat|Number=Sing|Person=3|PronType=Prs": {
|
||||||
|
POS: PRON
|
||||||
|
},
|
||||||
"Pron|per|3|ev|nom__Case=Nom|Number=Sing|Person=3|PronType=Prs": {POS: PRON},
|
"Pron|per|3|ev|nom__Case=Nom|Number=Sing|Person=3|PronType=Prs": {POS: PRON},
|
||||||
"Pron|per|3|mv|datofacc__Case=Acc,Dat|Number=Plur|Person=3|PronType=Prs": {POS: PRON},
|
"Pron|per|3|mv|datofacc__Case=Acc,Dat|Number=Plur|Person=3|PronType=Prs": {
|
||||||
|
POS: PRON
|
||||||
|
},
|
||||||
"Pron|rec|gen__Case=Gen|PronType=Rcp": {POS: PRON},
|
"Pron|rec|gen__Case=Gen|PronType=Rcp": {POS: PRON},
|
||||||
"Pron|rec|neut__PronType=Rcp": {POS: PRON},
|
"Pron|rec|neut__PronType=Rcp": {POS: PRON},
|
||||||
"Pron|ref|1|ev__Number=Sing|Person=1|PronType=Prs|Reflex=Yes": {POS: PRON},
|
"Pron|ref|1|ev__Number=Sing|Person=1|PronType=Prs|Reflex=Yes": {POS: PRON},
|
||||||
|
@ -597,20 +673,34 @@ TAG_MAP = {
|
||||||
"Punc|vraag__PunctType=Qest": {POS: PUNCT},
|
"Punc|vraag__PunctType=Qest": {POS: PUNCT},
|
||||||
"V_Adv_Art_N_Prep_Pron_N__Degree=Pos|Number=Plur|Person=2|Subcat=Tran": {POS: VERB},
|
"V_Adv_Art_N_Prep_Pron_N__Degree=Pos|Number=Plur|Person=2|Subcat=Tran": {POS: VERB},
|
||||||
"V_Adv__Degree=Pos|Subcat=Tran": {POS: VERB},
|
"V_Adv__Degree=Pos|Subcat=Tran": {POS: VERB},
|
||||||
"V_Art_N_Num_N__Aspect=Imp|Definite=Def|Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|VerbType=Mod": {POS: VERB},
|
"V_Art_N_Num_N__Aspect=Imp|Definite=Def|Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|VerbType=Mod": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
"V_Art_N__Number=Sing|Subcat=Tran": {POS: VERB},
|
"V_Art_N__Number=Sing|Subcat=Tran": {POS: VERB},
|
||||||
"V_Conj_N_N__Number=Sing|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
|
"V_Conj_N_N__Number=Sing|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
|
||||||
"V_Conj_Pron__Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
|
"V_Conj_Pron__Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
|
||||||
"V_N_Conj_Adj_N_Prep_Art_N__Degree=Pos|Number=Sing|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
|
"V_N_Conj_Adj_N_Prep_Art_N__Degree=Pos|Number=Sing|Subcat=Tran|Tense=Past|VerbForm=Part": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
"V_N_N__Number=Sing|Subcat=Intr|Tense=Pres|VerbForm=Part": {POS: VERB},
|
"V_N_N__Number=Sing|Subcat=Intr|Tense=Pres|VerbForm=Part": {POS: VERB},
|
||||||
"V_N_N__Number=Sing|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
|
"V_N_N__Number=Sing|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
|
||||||
"V_N_V__Aspect=Imp|Mood=Ind|Number=Sing|Subcat=Intr|Tense=Pres|VerbForm=Inf": {POS: VERB},
|
"V_N_V__Aspect=Imp|Mood=Ind|Number=Sing|Subcat=Intr|Tense=Pres|VerbForm=Inf": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
"V_N__Number=Plur|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
|
"V_N__Number=Plur|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
|
||||||
"V_N|trans|imp_eigen|ev|neut__Number=Sing|Subcat=Tran": {POS: VERB},
|
"V_N|trans|imp_eigen|ev|neut__Number=Sing|Subcat=Tran": {POS: VERB},
|
||||||
"V_Prep|intrans|verldw|onverv_voor__Subcat=Intr|Tense=Past|VerbForm=Part": {POS: VERB},
|
"V_Prep|intrans|verldw|onverv_voor__Subcat=Intr|Tense=Past|VerbForm=Part": {
|
||||||
"V_Pron_Adv_Adv_Pron_V__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
POS: VERB
|
||||||
"V_Pron_Adv__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
},
|
||||||
"V_Pron_V__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
"V_Pron_Adv_Adv_Pron_V__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
|
"V_Pron_Adv__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
|
"V_Pron_V__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
"V_Pron__VerbType=Aux,Cop": {POS: VERB},
|
"V_Pron__VerbType=Aux,Cop": {POS: VERB},
|
||||||
"V_V|hulp|imp_intrans|inf__VerbForm=Inf|VerbType=Mod": {POS: VERB},
|
"V_V|hulp|imp_intrans|inf__VerbForm=Inf|VerbType=Mod": {POS: VERB},
|
||||||
"V|hulpofkopp|conj__Mood=Sub|VerbForm=Fin": {POS: VERB},
|
"V|hulpofkopp|conj__Mood=Sub|VerbForm=Fin": {POS: VERB},
|
||||||
|
@ -620,94 +710,220 @@ TAG_MAP = {
|
||||||
"V|hulpofkopp|inf__VerbForm=Inf": {POS: VERB},
|
"V|hulpofkopp|inf__VerbForm=Inf": {POS: VERB},
|
||||||
"V|hulpofkopp|inf__VerbForm=Inf|VerbType=Aux,Cop": {POS: VERB},
|
"V|hulpofkopp|inf__VerbForm=Inf|VerbType=Aux,Cop": {POS: VERB},
|
||||||
"V|hulpofkopp|inf|subst__VerbForm=Inf": {POS: VERB},
|
"V|hulpofkopp|inf|subst__VerbForm=Inf": {POS: VERB},
|
||||||
"V|hulpofkopp|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
"V|hulpofkopp|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin": {
|
||||||
"V|hulpofkopp|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin|VerbType=Aux,Cop": {POS: VERB},
|
POS: VERB
|
||||||
"V|hulpofkopp|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
},
|
||||||
"V|hulpofkopp|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin|VerbType=Aux,Cop": {POS: VERB},
|
"V|hulpofkopp|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin|VerbType=Aux,Cop": {
|
||||||
"V|hulpofkopp|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
POS: VERB
|
||||||
"V|hulpofkopp|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin|VerbType=Aux,Cop": {POS: VERB},
|
},
|
||||||
"V|hulpofkopp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
"V|hulpofkopp|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {
|
||||||
"V|hulpofkopp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|VerbType=Aux,Cop": {POS: VERB},
|
POS: VERB
|
||||||
"V|hulpofkopp|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin": {POS: VERB},
|
},
|
||||||
"V|hulpofkopp|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|VerbType=Aux,Cop": {POS: VERB},
|
"V|hulpofkopp|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin|VerbType=Aux,Cop": {
|
||||||
"V|hulpofkopp|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin": {POS: VERB},
|
POS: VERB
|
||||||
"V|hulpofkopp|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin|VerbType=Aux,Cop": {POS: VERB},
|
},
|
||||||
|
"V|hulpofkopp|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
|
"V|hulpofkopp|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin|VerbType=Aux,Cop": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
|
"V|hulpofkopp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
|
"V|hulpofkopp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|VerbType=Aux,Cop": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
|
"V|hulpofkopp|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
|
"V|hulpofkopp|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|VerbType=Aux,Cop": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
|
"V|hulpofkopp|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
|
"V|hulpofkopp|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin|VerbType=Aux,Cop": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
"V|hulpofkopp|tegdw|vervneut__Case=Nom|Tense=Pres|VerbForm=Part": {POS: VERB},
|
"V|hulpofkopp|tegdw|vervneut__Case=Nom|Tense=Pres|VerbForm=Part": {POS: VERB},
|
||||||
"V|hulpofkopp|tegdw|vervneut__Case=Nom|Tense=Pres|VerbForm=Part|VerbType=Aux,Cop": {POS: VERB},
|
"V|hulpofkopp|tegdw|vervneut__Case=Nom|Tense=Pres|VerbForm=Part|VerbType=Aux,Cop": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
"V|hulpofkopp|verldw|onverv__Tense=Past|VerbForm=Part": {POS: VERB},
|
"V|hulpofkopp|verldw|onverv__Tense=Past|VerbForm=Part": {POS: VERB},
|
||||||
"V|hulpofkopp|verldw|onverv__Tense=Past|VerbForm=Part|VerbType=Aux,Cop": {POS: VERB},
|
"V|hulpofkopp|verldw|onverv__Tense=Past|VerbForm=Part|VerbType=Aux,Cop": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
"V|hulp|conj__Mood=Sub|VerbForm=Fin|VerbType=Mod": {POS: VERB},
|
"V|hulp|conj__Mood=Sub|VerbForm=Fin|VerbType=Mod": {POS: VERB},
|
||||||
"V|hulp|inf__VerbForm=Inf": {POS: VERB},
|
"V|hulp|inf__VerbForm=Inf": {POS: VERB},
|
||||||
"V|hulp|inf__VerbForm=Inf|VerbType=Mod": {POS: VERB},
|
"V|hulp|inf__VerbForm=Inf|VerbType=Mod": {POS: VERB},
|
||||||
"V|hulp|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
"V|hulp|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin": {
|
||||||
"V|hulp|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin|VerbType=Mod": {POS: VERB},
|
POS: VERB
|
||||||
"V|hulp|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
},
|
||||||
"V|hulp|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin|VerbType=Mod": {POS: VERB},
|
"V|hulp|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin|VerbType=Mod": {
|
||||||
"V|hulp|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
POS: VERB
|
||||||
"V|hulp|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin|VerbType=Mod": {POS: VERB},
|
},
|
||||||
"V|hulp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
"V|hulp|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {
|
||||||
"V|hulp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|VerbType=Mod": {POS: VERB},
|
POS: VERB
|
||||||
"V|hulp|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin": {POS: VERB},
|
},
|
||||||
"V|hulp|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|VerbType=Mod": {POS: VERB},
|
"V|hulp|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin|VerbType=Mod": {
|
||||||
"V|hulp|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin": {POS: VERB},
|
POS: VERB
|
||||||
"V|hulp|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin|VerbType=Mod": {POS: VERB},
|
},
|
||||||
|
"V|hulp|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
|
"V|hulp|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin|VerbType=Mod": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
|
"V|hulp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
|
"V|hulp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|VerbType=Mod": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
|
"V|hulp|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
|
"V|hulp|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|VerbType=Mod": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
|
"V|hulp|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
|
"V|hulp|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin|VerbType=Mod": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
"V|hulp|verldw|onverv__Tense=Past|VerbForm=Part": {POS: VERB},
|
"V|hulp|verldw|onverv__Tense=Past|VerbForm=Part": {POS: VERB},
|
||||||
"V|hulp|verldw|onverv__Tense=Past|VerbForm=Part|VerbType=Mod": {POS: VERB},
|
"V|hulp|verldw|onverv__Tense=Past|VerbForm=Part|VerbType=Mod": {POS: VERB},
|
||||||
"V|intrans|conj__Mood=Sub|Subcat=Intr|VerbForm=Fin": {POS: VERB},
|
"V|intrans|conj__Mood=Sub|Subcat=Intr|VerbForm=Fin": {POS: VERB},
|
||||||
"V|intrans|imp__Mood=Imp|Subcat=Intr|VerbForm=Fin": {POS: VERB},
|
"V|intrans|imp__Mood=Imp|Subcat=Intr|VerbForm=Fin": {POS: VERB},
|
||||||
"V|intrans|inf__Subcat=Intr|VerbForm=Inf": {POS: VERB},
|
"V|intrans|inf__Subcat=Intr|VerbForm=Inf": {POS: VERB},
|
||||||
"V|intrans|inf|subst__Subcat=Intr|VerbForm=Inf": {POS: VERB},
|
"V|intrans|inf|subst__Subcat=Intr|VerbForm=Inf": {POS: VERB},
|
||||||
"V|intrans|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Subcat=Intr|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
"V|intrans|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Subcat=Intr|Tense=Pres|VerbForm=Fin": {
|
||||||
"V|intrans|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Subcat=Intr|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
POS: VERB
|
||||||
"V|intrans|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Subcat=Intr|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
},
|
||||||
"V|intrans|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Intr|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
"V|intrans|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Subcat=Intr|Tense=Pres|VerbForm=Fin": {
|
||||||
"V|intrans|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Subcat=Intr|Tense=Past|VerbForm=Fin": {POS: VERB},
|
POS: VERB
|
||||||
"V|intrans|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Subcat=Intr|Tense=Past|VerbForm=Fin": {POS: VERB},
|
},
|
||||||
|
"V|intrans|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Subcat=Intr|Tense=Pres|VerbForm=Fin": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
|
"V|intrans|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Intr|Tense=Pres|VerbForm=Fin": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
|
"V|intrans|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Subcat=Intr|Tense=Past|VerbForm=Fin": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
|
"V|intrans|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Subcat=Intr|Tense=Past|VerbForm=Fin": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
"V|intrans|tegdw|onverv__Subcat=Intr|Tense=Pres|VerbForm=Part": {POS: VERB},
|
"V|intrans|tegdw|onverv__Subcat=Intr|Tense=Pres|VerbForm=Part": {POS: VERB},
|
||||||
"V|intrans|tegdw|vervmv__Number=Plur|Subcat=Intr|Tense=Pres|VerbForm=Part": {POS: VERB},
|
"V|intrans|tegdw|vervmv__Number=Plur|Subcat=Intr|Tense=Pres|VerbForm=Part": {
|
||||||
"V|intrans|tegdw|vervneut__Case=Nom|Subcat=Intr|Tense=Pres|VerbForm=Part": {POS: VERB},
|
POS: VERB
|
||||||
"V|intrans|tegdw|vervvergr__Degree=Cmp|Subcat=Intr|Tense=Pres|VerbForm=Part": {POS: VERB},
|
},
|
||||||
|
"V|intrans|tegdw|vervneut__Case=Nom|Subcat=Intr|Tense=Pres|VerbForm=Part": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
|
"V|intrans|tegdw|vervvergr__Degree=Cmp|Subcat=Intr|Tense=Pres|VerbForm=Part": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
"V|intrans|verldw|onverv__Subcat=Intr|Tense=Past|VerbForm=Part": {POS: VERB},
|
"V|intrans|verldw|onverv__Subcat=Intr|Tense=Past|VerbForm=Part": {POS: VERB},
|
||||||
"V|intrans|verldw|vervmv__Number=Plur|Subcat=Intr|Tense=Past|VerbForm=Part": {POS: VERB},
|
"V|intrans|verldw|vervmv__Number=Plur|Subcat=Intr|Tense=Past|VerbForm=Part": {
|
||||||
"V|intrans|verldw|vervneut__Case=Nom|Subcat=Intr|Tense=Past|VerbForm=Part": {POS: VERB},
|
POS: VERB
|
||||||
|
},
|
||||||
|
"V|intrans|verldw|vervneut__Case=Nom|Subcat=Intr|Tense=Past|VerbForm=Part": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
"V|refl|imp__Mood=Imp|Reflex=Yes|VerbForm=Fin": {POS: VERB},
|
"V|refl|imp__Mood=Imp|Reflex=Yes|VerbForm=Fin": {POS: VERB},
|
||||||
"V|refl|inf__Reflex=Yes|VerbForm=Inf": {POS: VERB},
|
"V|refl|inf__Reflex=Yes|VerbForm=Inf": {POS: VERB},
|
||||||
"V|refl|inf|subst__Reflex=Yes|VerbForm=Inf": {POS: VERB},
|
"V|refl|inf|subst__Reflex=Yes|VerbForm=Inf": {POS: VERB},
|
||||||
"V|refl|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Reflex=Yes|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
"V|refl|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Reflex=Yes|Tense=Pres|VerbForm=Fin": {
|
||||||
"V|refl|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Reflex=Yes|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
POS: VERB
|
||||||
"V|refl|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Reflex=Yes|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
},
|
||||||
"V|refl|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Reflex=Yes|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
"V|refl|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Reflex=Yes|Tense=Pres|VerbForm=Fin": {
|
||||||
"V|refl|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Reflex=Yes|Tense=Past|VerbForm=Fin": {POS: VERB},
|
POS: VERB
|
||||||
"V|refl|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Reflex=Yes|Tense=Past|VerbForm=Fin": {POS: VERB},
|
},
|
||||||
|
"V|refl|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Reflex=Yes|Tense=Pres|VerbForm=Fin": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
|
"V|refl|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Reflex=Yes|Tense=Pres|VerbForm=Fin": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
|
"V|refl|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Reflex=Yes|Tense=Past|VerbForm=Fin": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
|
"V|refl|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Reflex=Yes|Tense=Past|VerbForm=Fin": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
"V|refl|tegdw|vervneut__Case=Nom|Reflex=Yes|Tense=Pres|VerbForm=Part": {POS: VERB},
|
"V|refl|tegdw|vervneut__Case=Nom|Reflex=Yes|Tense=Pres|VerbForm=Part": {POS: VERB},
|
||||||
"V|refl|verldw|onverv__Reflex=Yes|Tense=Past|VerbForm=Part": {POS: VERB},
|
"V|refl|verldw|onverv__Reflex=Yes|Tense=Past|VerbForm=Part": {POS: VERB},
|
||||||
"V|trans|conj__Mood=Sub|Subcat=Tran|VerbForm=Fin": {POS: VERB},
|
"V|trans|conj__Mood=Sub|Subcat=Tran|VerbForm=Fin": {POS: VERB},
|
||||||
"V|trans|imp__Mood=Imp|Subcat=Tran|VerbForm=Fin": {POS: VERB},
|
"V|trans|imp__Mood=Imp|Subcat=Tran|VerbForm=Fin": {POS: VERB},
|
||||||
"V|trans|inf__Subcat=Tran|VerbForm=Inf": {POS: VERB},
|
"V|trans|inf__Subcat=Tran|VerbForm=Inf": {POS: VERB},
|
||||||
"V|trans|inf|subst__Subcat=Tran|VerbForm=Inf": {POS: VERB},
|
"V|trans|inf|subst__Subcat=Tran|VerbForm=Inf": {POS: VERB},
|
||||||
"V|trans|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
"V|trans|ott|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
|
||||||
"V|trans|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
POS: VERB
|
||||||
"V|trans|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
},
|
||||||
"V|trans|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: VERB},
|
"V|trans|ott|1|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
|
||||||
"V|trans|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Subcat=Tran|Tense=Past|VerbForm=Fin": {POS: VERB},
|
POS: VERB
|
||||||
"V|trans|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Subcat=Tran|Tense=Past|VerbForm=Fin": {POS: VERB},
|
},
|
||||||
|
"V|trans|ott|2|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
|
"V|trans|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
|
"V|trans|ovt|1of2of3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Subcat=Tran|Tense=Past|VerbForm=Fin": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
|
"V|trans|ovt|1of2of3|mv__Aspect=Imp|Mood=Ind|Number=Plur|Subcat=Tran|Tense=Past|VerbForm=Fin": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
"V|trans|tegdw|onverv__Subcat=Tran|Tense=Pres|VerbForm=Part": {POS: VERB},
|
"V|trans|tegdw|onverv__Subcat=Tran|Tense=Pres|VerbForm=Part": {POS: VERB},
|
||||||
"V|trans|tegdw|vervneut__Case=Nom|Subcat=Tran|Tense=Pres|VerbForm=Part": {POS: VERB},
|
"V|trans|tegdw|vervneut__Case=Nom|Subcat=Tran|Tense=Pres|VerbForm=Part": {
|
||||||
|
POS: VERB
|
||||||
|
},
|
||||||
"V|trans|verldw|onverv__Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
|
"V|trans|verldw|onverv__Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
|
||||||
"V|trans|verldw|vervmv__Number=Plur|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
|
"V|trans|verldw|vervmv__Number=Plur|Subcat=Tran|Tense=Past|VerbForm=Part": {
|
||||||
"V|trans|verldw|vervneut__Case=Nom|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
|
POS: VERB
|
||||||
"V|trans|verldw|vervvergr__Degree=Cmp|Subcat=Tran|Tense=Past|VerbForm=Part": {POS: VERB},
|
},
|
||||||
"X__Aspect=Imp|Definite=Def|Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|VerbType=Mod": {POS: X},
|
"V|trans|verldw|vervneut__Case=Nom|Subcat=Tran|Tense=Past|VerbForm=Part": {
|
||||||
"X__Aspect=Imp|Definite=Def|Mood=Ind|Number=Sing|Person=3|PronType=Ind|Tense=Pres|VerbForm=Fin": {POS: X},
|
POS: VERB
|
||||||
"X__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: X},
|
},
|
||||||
"X__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Tense=Past|VerbForm=Part": {POS: X},
|
"V|trans|verldw|vervvergr__Degree=Cmp|Subcat=Tran|Tense=Past|VerbForm=Part": {
|
||||||
"X__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Inf": {POS: X},
|
POS: VERB
|
||||||
|
},
|
||||||
|
"X__Aspect=Imp|Definite=Def|Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|VerbType=Mod": {
|
||||||
|
POS: X
|
||||||
|
},
|
||||||
|
"X__Aspect=Imp|Definite=Def|Mood=Ind|Number=Sing|Person=3|PronType=Ind|Tense=Pres|VerbForm=Fin": {
|
||||||
|
POS: X
|
||||||
|
},
|
||||||
|
"X__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
|
||||||
|
POS: X
|
||||||
|
},
|
||||||
|
"X__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Tense=Past|VerbForm=Part": {
|
||||||
|
POS: X
|
||||||
|
},
|
||||||
|
"X__Aspect=Imp|Degree=Pos|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Inf": {
|
||||||
|
POS: X
|
||||||
|
},
|
||||||
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {POS: X},
|
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {POS: X},
|
||||||
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|PronType=Dem|Tense=Pres|VerbForm=Fin": {POS: X},
|
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|PronType=Dem|Tense=Pres|VerbForm=Fin": {
|
||||||
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|PronType=Rel|Tense=Pres|VerbForm=Fin": {POS: X},
|
POS: X
|
||||||
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: X},
|
},
|
||||||
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|PronType=Ind|Tense=Pres|VerbForm=Fin": {POS: X},
|
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|PronType=Rel|Tense=Pres|VerbForm=Fin": {
|
||||||
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Tran|Tense=Pres|VerbForm=Fin": {POS: X},
|
POS: X
|
||||||
|
},
|
||||||
|
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
|
||||||
|
POS: X
|
||||||
|
},
|
||||||
|
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|PronType=Ind|Tense=Pres|VerbForm=Fin": {
|
||||||
|
POS: X
|
||||||
|
},
|
||||||
|
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Tran|Tense=Pres|VerbForm=Fin": {
|
||||||
|
POS: X
|
||||||
|
},
|
||||||
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {POS: X},
|
"X__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {POS: X},
|
||||||
"X__Aspect=Imp|Mood=Ind|Number=Sing|Subcat=Intr|Tense=Pres|VerbForm=Inf": {POS: X},
|
"X__Aspect=Imp|Mood=Ind|Number=Sing|Subcat=Intr|Tense=Pres|VerbForm=Inf": {POS: X},
|
||||||
"X__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin": {POS: X},
|
"X__Aspect=Imp|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin": {POS: X},
|
||||||
|
@ -808,5 +1024,5 @@ TAG_MAP = {
|
||||||
"X__VerbForm=Inf|VerbType=Mod": {POS: X},
|
"X__VerbForm=Inf|VerbType=Mod": {POS: X},
|
||||||
"X__VerbType=Aux,Cop": {POS: X},
|
"X__VerbType=Aux,Cop": {POS: X},
|
||||||
"X___": {POS: X},
|
"X___": {POS: X},
|
||||||
"_SP": {POS: SPACE}
|
"_SP": {POS: SPACE},
|
||||||
}
|
}
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -5039,5 +5039,5 @@ TAG_MAP = {
|
||||||
"punc": {POS: PUNCT},
|
"punc": {POS: PUNCT},
|
||||||
"v-pcp|M|P": {POS: VERB},
|
"v-pcp|M|P": {POS: VERB},
|
||||||
"v-pcp|M|S": {POS: VERB},
|
"v-pcp|M|S": {POS: VERB},
|
||||||
"_SP": {POS: SPACE}
|
"_SP": {POS: SPACE},
|
||||||
}
|
}
|
||||||
|
|
|
@ -39,7 +39,9 @@ _infixes = (
|
||||||
+ LIST_ICONS
|
+ LIST_ICONS
|
||||||
+ [
|
+ [
|
||||||
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
|
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
|
||||||
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES),
|
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
|
||||||
|
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
|
||||||
|
),
|
||||||
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||||
r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
|
r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
|
||||||
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
|
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
|
||||||
|
|
|
@ -19,7 +19,6 @@ _abbrev_exc = [
|
||||||
{ORTH: "вс", LEMMA: "воскресенье", NORM: "воскресенье"},
|
{ORTH: "вс", LEMMA: "воскресенье", NORM: "воскресенье"},
|
||||||
{ORTH: "вскр", LEMMA: "воскресенье", NORM: "воскресенье"},
|
{ORTH: "вскр", LEMMA: "воскресенье", NORM: "воскресенье"},
|
||||||
{ORTH: "воскр", LEMMA: "воскресенье", NORM: "воскресенье"},
|
{ORTH: "воскр", LEMMA: "воскресенье", NORM: "воскресенье"},
|
||||||
|
|
||||||
# Months abbreviations
|
# Months abbreviations
|
||||||
{ORTH: "янв", LEMMA: "январь", NORM: "январь"},
|
{ORTH: "янв", LEMMA: "январь", NORM: "январь"},
|
||||||
{ORTH: "фев", LEMMA: "февраль", NORM: "февраль"},
|
{ORTH: "фев", LEMMA: "февраль", NORM: "февраль"},
|
||||||
|
@ -49,16 +48,18 @@ for abbrev_desc in _abbrev_exc:
|
||||||
abbrev = abbrev_desc[ORTH]
|
abbrev = abbrev_desc[ORTH]
|
||||||
for orth in (abbrev, abbrev.capitalize(), abbrev.upper()):
|
for orth in (abbrev, abbrev.capitalize(), abbrev.upper()):
|
||||||
_exc[orth] = [{ORTH: orth, LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}]
|
_exc[orth] = [{ORTH: orth, LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}]
|
||||||
_exc[orth + '.'] = [{ORTH: orth + '.', LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}]
|
_exc[orth + "."] = [
|
||||||
|
{ORTH: orth + ".", LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
_slang_exc = [
|
_slang_exc = [
|
||||||
{ORTH: '2к15', LEMMA: '2015', NORM: '2015'},
|
{ORTH: "2к15", LEMMA: "2015", NORM: "2015"},
|
||||||
{ORTH: '2к16', LEMMA: '2016', NORM: '2016'},
|
{ORTH: "2к16", LEMMA: "2016", NORM: "2016"},
|
||||||
{ORTH: '2к17', LEMMA: '2017', NORM: '2017'},
|
{ORTH: "2к17", LEMMA: "2017", NORM: "2017"},
|
||||||
{ORTH: '2к18', LEMMA: '2018', NORM: '2018'},
|
{ORTH: "2к18", LEMMA: "2018", NORM: "2018"},
|
||||||
{ORTH: '2к19', LEMMA: '2019', NORM: '2019'},
|
{ORTH: "2к19", LEMMA: "2019", NORM: "2019"},
|
||||||
{ORTH: '2к20', LEMMA: '2020', NORM: '2020'},
|
{ORTH: "2к20", LEMMA: "2020", NORM: "2020"},
|
||||||
]
|
]
|
||||||
|
|
||||||
for slang_desc in _slang_exc:
|
for slang_desc in _slang_exc:
|
||||||
|
|
|
@ -15,7 +15,7 @@ _infixes = (
|
||||||
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
|
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
|
||||||
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||||
r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=CONCAT_QUOTES),
|
r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=CONCAT_QUOTES),
|
||||||
r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=_hyphens_no_dash),
|
r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=_hyphens_no_dash),
|
||||||
r"(?<=[0-9])-(?=[0-9])",
|
r"(?<=[0-9])-(?=[0-9])",
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
|
@ -7,7 +7,6 @@ from __future__ import unicode_literals
|
||||||
# Entries should be added in the following format:
|
# Entries should be added in the following format:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
LOOKUP = {
|
LOOKUP = {
|
||||||
"آ": "آنا",
|
"آ": "آنا",
|
||||||
"آْباد": "آْباد",
|
"آْباد": "آْباد",
|
||||||
|
@ -29109,5 +29108,5 @@ LOOKUP = {
|
||||||
"ظالموں": "ظالم",
|
"ظالموں": "ظالم",
|
||||||
"ظلم": "ظلم",
|
"ظلم": "ظلم",
|
||||||
"ظلمو": "ظلم",
|
"ظلمو": "ظلم",
|
||||||
"ظلموں": "ظلم"
|
"ظلموں": "ظلم",
|
||||||
}
|
}
|
|
@ -16,5 +16,5 @@ sentences = [
|
||||||
"此外,中文还是联合国正式语文,并被上海合作组织等国际组织采用为官方语言。",
|
"此外,中文还是联合国正式语文,并被上海合作组织等国际组织采用为官方语言。",
|
||||||
"在中国大陆,汉语通称为“汉语”。",
|
"在中国大陆,汉语通称为“汉语”。",
|
||||||
"在联合国、台湾、香港及澳门,通称为“中文”。",
|
"在联合国、台湾、香港及澳门,通称为“中文”。",
|
||||||
"在新加坡及马来西亚,通称为“华语”。"
|
"在新加坡及马来西亚,通称为“华语”。",
|
||||||
]
|
]
|
||||||
|
|
|
@ -47,7 +47,7 @@ _single_num_words = [
|
||||||
"拾陆",
|
"拾陆",
|
||||||
"拾柒",
|
"拾柒",
|
||||||
"拾捌",
|
"拾捌",
|
||||||
"拾玖"
|
"拾玖",
|
||||||
]
|
]
|
||||||
|
|
||||||
_count_num_words = [
|
_count_num_words = [
|
||||||
|
@ -68,27 +68,16 @@ _count_num_words = [
|
||||||
"陆",
|
"陆",
|
||||||
"柒",
|
"柒",
|
||||||
"捌",
|
"捌",
|
||||||
"玖"
|
"玖",
|
||||||
]
|
]
|
||||||
|
|
||||||
_base_num_words = [
|
_base_num_words = ["十", "百", "千", "万", "亿", "兆", "拾", "佰", "仟"]
|
||||||
"十",
|
|
||||||
"百",
|
|
||||||
"千",
|
|
||||||
"万",
|
|
||||||
"亿",
|
|
||||||
"兆",
|
|
||||||
"拾",
|
|
||||||
"佰",
|
|
||||||
"仟"
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
if text.startswith(("+", "-", "±", "~")):
|
if text.startswith(("+", "-", "±", "~")):
|
||||||
text = text[1:]
|
text = text[1:]
|
||||||
text = text.replace(",", "").replace(
|
text = text.replace(",", "").replace(".", "").replace(",", "").replace("。", "")
|
||||||
".", "").replace(",", "").replace("。", "")
|
|
||||||
if text.isdigit():
|
if text.isdigit():
|
||||||
return True
|
return True
|
||||||
if text.count("/") == 1:
|
if text.count("/") == 1:
|
||||||
|
@ -97,10 +86,12 @@ def like_num(text):
|
||||||
return True
|
return True
|
||||||
if text in _single_num_words:
|
if text in _single_num_words:
|
||||||
return True
|
return True
|
||||||
|
# fmt: off
|
||||||
if re.match('^((' + '|'.join(_count_num_words) + '){1}'
|
if re.match('^((' + '|'.join(_count_num_words) + '){1}'
|
||||||
+ '(' + '|'.join(_base_num_words) + '){1})+'
|
+ '(' + '|'.join(_base_num_words) + '){1})+'
|
||||||
+ '(' + '|'.join(_count_num_words) + ')?$', text):
|
+ '(' + '|'.join(_count_num_words) + ')?$', text):
|
||||||
return True
|
return True
|
||||||
|
# fmt: on
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -430,6 +430,7 @@ class Language(object):
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#update
|
DOCS: https://spacy.io/api/language#update
|
||||||
"""
|
"""
|
||||||
|
expected_keys = ("words", "tags", "heads", "deps", "entities", "cats", "links")
|
||||||
if len(docs) != len(golds):
|
if len(docs) != len(golds):
|
||||||
raise IndexError(Errors.E009.format(n_docs=len(docs), n_golds=len(golds)))
|
raise IndexError(Errors.E009.format(n_docs=len(docs), n_golds=len(golds)))
|
||||||
if len(docs) == 0:
|
if len(docs) == 0:
|
||||||
|
@ -445,10 +446,10 @@ class Language(object):
|
||||||
if isinstance(doc, basestring_):
|
if isinstance(doc, basestring_):
|
||||||
doc = self.make_doc(doc)
|
doc = self.make_doc(doc)
|
||||||
if not isinstance(gold, GoldParse):
|
if not isinstance(gold, GoldParse):
|
||||||
expected_keys = ("words", "tags", "heads", "deps", "entities", "cats", "links")
|
unexpected = [k for k in gold if k not in expected_keys]
|
||||||
unexpected_keys = [k for k in gold if k not in expected_keys]
|
if unexpected:
|
||||||
if unexpected_keys:
|
err = Errors.E151.format(unexp=unexpected, exp=expected_keys)
|
||||||
raise ValueError(Errors.E151.format(unexpected_keys=unexpected_keys, expected_keys=expected_keys))
|
raise ValueError(err)
|
||||||
gold = GoldParse(doc, **gold)
|
gold = GoldParse(doc, **gold)
|
||||||
doc_objs.append(doc)
|
doc_objs.append(doc)
|
||||||
gold_objs.append(gold)
|
gold_objs.append(gold)
|
||||||
|
|
|
@ -5,10 +5,10 @@ from __future__ import unicode_literals
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text,lemma', [("aprox.", "aproximadament"),
|
@pytest.mark.parametrize(
|
||||||
("pàg.", "pàgina"),
|
"text,lemma",
|
||||||
("p.ex.", "per exemple")
|
[("aprox.", "aproximadament"), ("pàg.", "pàgina"), ("p.ex.", "per exemple")],
|
||||||
])
|
)
|
||||||
def test_ca_tokenizer_handles_abbr(ca_tokenizer, text, lemma):
|
def test_ca_tokenizer_handles_abbr(ca_tokenizer, text, lemma):
|
||||||
tokens = ca_tokenizer(text)
|
tokens = ca_tokenizer(text)
|
||||||
assert len(tokens) == 1
|
assert len(tokens) == 1
|
||||||
|
|
|
@ -21,21 +21,37 @@ def test_ca_tokenizer_handles_long_text(ca_tokenizer):
|
||||||
assert len(tokens) == 138
|
assert len(tokens) == 138
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text,length', [
|
@pytest.mark.parametrize(
|
||||||
|
"text,length",
|
||||||
|
[
|
||||||
("Perquè va anar-hi?", 6),
|
("Perquè va anar-hi?", 6),
|
||||||
("“Ah no?”", 5),
|
("“Ah no?”", 5),
|
||||||
("""Sí! "Anem", va contestar el Joan Carles""", 11),
|
("""Sí! "Anem", va contestar el Joan Carles""", 11),
|
||||||
("Van córrer aprox. 10km", 5),
|
("Van córrer aprox. 10km", 5),
|
||||||
("Llavors perqué...", 3)])
|
("Llavors perqué...", 3),
|
||||||
|
],
|
||||||
|
)
|
||||||
def test_ca_tokenizer_handles_cnts(ca_tokenizer, text, length):
|
def test_ca_tokenizer_handles_cnts(ca_tokenizer, text, length):
|
||||||
tokens = ca_tokenizer(text)
|
tokens = ca_tokenizer(text)
|
||||||
assert len(tokens) == length
|
assert len(tokens) == length
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text,match', [
|
@pytest.mark.parametrize(
|
||||||
('10', True), ('1', True), ('10,000', True), ('10,00', True),
|
"text,match",
|
||||||
('999.0', True), ('un', True), ('dos', True), ('bilió', True),
|
[
|
||||||
('gos', False), (',', False), ('1/2', True)])
|
("10", True),
|
||||||
|
("1", True),
|
||||||
|
("10,000", True),
|
||||||
|
("10,00", True),
|
||||||
|
("999.0", True),
|
||||||
|
("un", True),
|
||||||
|
("dos", True),
|
||||||
|
("bilió", True),
|
||||||
|
("gos", False),
|
||||||
|
(",", False),
|
||||||
|
("1/2", True),
|
||||||
|
],
|
||||||
|
)
|
||||||
def test_ca_lex_attrs_like_number(ca_tokenizer, text, match):
|
def test_ca_lex_attrs_like_number(ca_tokenizer, text, match):
|
||||||
tokens = ca_tokenizer(text)
|
tokens = ca_tokenizer(text)
|
||||||
assert len(tokens) == 1
|
assert len(tokens) == 1
|
||||||
|
|
|
@ -32,7 +32,7 @@ def test_de_tokenizer_norm_exceptions(de_tokenizer, text, norms):
|
||||||
assert [token.norm_ for token in tokens] == norms
|
assert [token.norm_ for token in tokens] == norms
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text,norm', [("daß", "dass")])
|
@pytest.mark.parametrize("text,norm", [("daß", "dass")])
|
||||||
def test_de_lex_attrs_norm_exceptions(de_tokenizer, text, norm):
|
def test_de_lex_attrs_norm_exceptions(de_tokenizer, text, norm):
|
||||||
tokens = de_tokenizer(text)
|
tokens = de_tokenizer(text)
|
||||||
assert tokens[0].norm_ == norm
|
assert tokens[0].norm_ == norm
|
||||||
|
|
|
@ -7,33 +7,33 @@ import pytest
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"text",
|
"text",
|
||||||
[
|
[
|
||||||
u"aujourd'hui",
|
"aujourd'hui",
|
||||||
u"Aujourd'hui",
|
"Aujourd'hui",
|
||||||
u"prud'hommes",
|
"prud'hommes",
|
||||||
u"prud’hommal",
|
"prud’hommal",
|
||||||
u"audio-numérique",
|
"audio-numérique",
|
||||||
u"Audio-numérique",
|
"Audio-numérique",
|
||||||
u"entr'amis",
|
"entr'amis",
|
||||||
u"entr'abat",
|
"entr'abat",
|
||||||
u"rentr'ouvertes",
|
"rentr'ouvertes",
|
||||||
u"grand'hamien",
|
"grand'hamien",
|
||||||
u"Châteauneuf-la-Forêt",
|
"Châteauneuf-la-Forêt",
|
||||||
u"Château-Guibert",
|
"Château-Guibert",
|
||||||
u"11-septembre",
|
"11-septembre",
|
||||||
u"11-Septembre",
|
"11-Septembre",
|
||||||
u"refox-trottâmes",
|
"refox-trottâmes",
|
||||||
# u"K-POP",
|
# u"K-POP",
|
||||||
# u"K-Pop",
|
# u"K-Pop",
|
||||||
# u"K-pop",
|
# u"K-pop",
|
||||||
u"z'yeutes",
|
"z'yeutes",
|
||||||
u"black-outeront",
|
"black-outeront",
|
||||||
u"états-unienne",
|
"états-unienne",
|
||||||
u"courtes-pattes",
|
"courtes-pattes",
|
||||||
u"court-pattes",
|
"court-pattes",
|
||||||
u"saut-de-ski",
|
"saut-de-ski",
|
||||||
u"Écourt-Saint-Quentin",
|
"Écourt-Saint-Quentin",
|
||||||
u"Bout-de-l'Îlien",
|
"Bout-de-l'Îlien",
|
||||||
u"pet-en-l'air",
|
"pet-en-l'air",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text):
|
def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text):
|
||||||
|
|
|
@ -3,13 +3,18 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
# fmt: off
|
||||||
@pytest.mark.parametrize("tokens,lemmas", [
|
TEST_CASES = [
|
||||||
(["Galime", "vadinti", "gerovės", "valstybe", ",", "turime", "išvystytą", "socialinę", "apsaugą", ",",
|
(["Galime", "vadinti", "gerovės", "valstybe", ",", "turime", "išvystytą", "socialinę", "apsaugą", ",",
|
||||||
"sveikatos", "apsaugą", "ir", "prieinamą", "švietimą", "."],
|
"sveikatos", "apsaugą", "ir", "prieinamą", "švietimą", "."],
|
||||||
["galėti", "vadintas", "gerovė", "valstybė", ",", "turėti", "išvystytas", "socialinis",
|
["galėti", "vadintas", "gerovė", "valstybė", ",", "turėti", "išvystytas", "socialinis",
|
||||||
"apsauga", ",", "sveikata", "apsauga", "ir", "prieinamas", "švietimas", "."]),
|
"apsauga", ",", "sveikata", "apsauga", "ir", "prieinamas", "švietimas", "."]),
|
||||||
(["taip", ",", "uoliai", "tyrinėjau", "ir", "pasirinkau", "geriausią", "variantą", "."],
|
(["taip", ",", "uoliai", "tyrinėjau", "ir", "pasirinkau", "geriausią", "variantą", "."],
|
||||||
["taip", ",", "uolus", "tyrinėti", "ir", "pasirinkti", "geras", "variantas", "."])])
|
["taip", ",", "uolus", "tyrinėti", "ir", "pasirinkti", "geras", "variantas", "."])
|
||||||
|
]
|
||||||
|
# fmt: on
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("tokens,lemmas", TEST_CASES)
|
||||||
def test_lt_lemmatizer(lt_lemmatizer, tokens, lemmas):
|
def test_lt_lemmatizer(lt_lemmatizer, tokens, lemmas):
|
||||||
assert lemmas == [lt_lemmatizer.lookup(token) for token in tokens]
|
assert lemmas == [lt_lemmatizer.lookup(token) for token in tokens]
|
||||||
|
|
|
@ -7,10 +7,21 @@ from __future__ import unicode_literals
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text,match', [
|
@pytest.mark.parametrize(
|
||||||
('10', True), ('1', True), ('10,000', True), ('10,00', True),
|
"text,match",
|
||||||
('jeden', True), ('dwa', True), ('milion', True),
|
[
|
||||||
('pies', False), (',', False), ('1/2', True)])
|
("10", True),
|
||||||
|
("1", True),
|
||||||
|
("10,000", True),
|
||||||
|
("10,00", True),
|
||||||
|
("jeden", True),
|
||||||
|
("dwa", True),
|
||||||
|
("milion", True),
|
||||||
|
("pies", False),
|
||||||
|
(",", False),
|
||||||
|
("1/2", True),
|
||||||
|
],
|
||||||
|
)
|
||||||
def test_lex_attrs_like_number(pl_tokenizer, text, match):
|
def test_lex_attrs_like_number(pl_tokenizer, text, match):
|
||||||
tokens = pl_tokenizer(text)
|
tokens = pl_tokenizer(text)
|
||||||
assert len(tokens) == 1
|
assert len(tokens) == 1
|
||||||
|
|
|
@ -4,9 +4,7 @@ from __future__ import unicode_literals
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize("text", ["ہےں۔", "کیا۔"])
|
||||||
"text", ['ہےں۔', 'کیا۔']
|
|
||||||
)
|
|
||||||
def test_contractions(ur_tokenizer, text):
|
def test_contractions(ur_tokenizer, text):
|
||||||
"""Test specific Urdu punctuation character"""
|
"""Test specific Urdu punctuation character"""
|
||||||
tokens = ur_tokenizer(text)
|
tokens = ur_tokenizer(text)
|
||||||
|
|
|
@ -134,12 +134,12 @@ def test_matcher_end_zero_plus(en_vocab):
|
||||||
def test_matcher_sets_return_correct_tokens(en_vocab):
|
def test_matcher_sets_return_correct_tokens(en_vocab):
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
patterns = [
|
patterns = [
|
||||||
[{'LOWER': {'IN': ["zero"]}}],
|
[{"LOWER": {"IN": ["zero"]}}],
|
||||||
[{'LOWER': {'IN': ["one"]}}],
|
[{"LOWER": {"IN": ["one"]}}],
|
||||||
[{'LOWER': {'IN': ["two"]}}],
|
[{"LOWER": {"IN": ["two"]}}],
|
||||||
]
|
]
|
||||||
matcher.add('TEST', None, *patterns)
|
matcher.add("TEST", None, *patterns)
|
||||||
doc = Doc(en_vocab, words="zero one two three".split())
|
doc = Doc(en_vocab, words="zero one two three".split())
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
texts = [Span(doc, s, e, label=L).text for L, s, e in matches]
|
texts = [Span(doc, s, e, label=L).text for L, s, e in matches]
|
||||||
assert texts == ['zero', 'one', 'two']
|
assert texts == ["zero", "one", "two"]
|
||||||
|
|
|
@ -52,7 +52,9 @@ def test_get_pipe(nlp, name):
|
||||||
assert nlp.get_pipe(name) == new_pipe
|
assert nlp.get_pipe(name) == new_pipe
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("name,replacement,not_callable", [("my_component", lambda doc: doc, {})])
|
@pytest.mark.parametrize(
|
||||||
|
"name,replacement,not_callable", [("my_component", lambda doc: doc, {})]
|
||||||
|
)
|
||||||
def test_replace_pipe(nlp, name, replacement, not_callable):
|
def test_replace_pipe(nlp, name, replacement, not_callable):
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
nlp.replace_pipe(name, new_pipe)
|
nlp.replace_pipe(name, new_pipe)
|
||||||
|
|
|
@ -358,7 +358,9 @@ def test_issue850_basic():
|
||||||
assert end == 4
|
assert end == 4
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skip(reason="French exception list is not enabled in the default tokenizer anymore")
|
@pytest.mark.skip(
|
||||||
|
reason="French exception list is not enabled in the default tokenizer anymore"
|
||||||
|
)
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"text", ["au-delàs", "pair-programmâmes", "terra-formées", "σ-compacts"]
|
"text", ["au-delàs", "pair-programmâmes", "terra-formées", "σ-compacts"]
|
||||||
)
|
)
|
||||||
|
|
|
@ -19,7 +19,7 @@ from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part
|
||||||
def test_issue1235():
|
def test_issue1235():
|
||||||
"""Test that g is not split of if preceded by a number and a letter"""
|
"""Test that g is not split of if preceded by a number and a letter"""
|
||||||
nlp = English()
|
nlp = English()
|
||||||
testwords = u'e2g 2g 52g'
|
testwords = "e2g 2g 52g"
|
||||||
doc = nlp(testwords)
|
doc = nlp(testwords)
|
||||||
assert len(doc) == 5
|
assert len(doc) == 5
|
||||||
assert doc[0].text == "e2g"
|
assert doc[0].text == "e2g"
|
||||||
|
|
|
@ -4,15 +4,7 @@ from __future__ import unicode_literals
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"])
|
||||||
"word",
|
|
||||||
[
|
|
||||||
"don't",
|
|
||||||
"don’t",
|
|
||||||
"I'd",
|
|
||||||
"I’d",
|
|
||||||
],
|
|
||||||
)
|
|
||||||
def test_issue3521(en_tokenizer, word):
|
def test_issue3521(en_tokenizer, word):
|
||||||
tok = en_tokenizer(word)[1]
|
tok = en_tokenizer(word)[1]
|
||||||
# 'not' and 'would' should be stopwords, also in their abbreviated forms
|
# 'not' and 'would' should be stopwords, also in their abbreviated forms
|
||||||
|
|
|
@ -9,7 +9,10 @@ import numpy as np
|
||||||
def test_issue3540(en_vocab):
|
def test_issue3540(en_vocab):
|
||||||
|
|
||||||
words = ["I", "live", "in", "NewYork", "right", "now"]
|
words = ["I", "live", "in", "NewYork", "right", "now"]
|
||||||
tensor = np.asarray([[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]], dtype="f")
|
tensor = np.asarray(
|
||||||
|
[[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
|
||||||
|
dtype="f",
|
||||||
|
)
|
||||||
doc = Doc(en_vocab, words=words)
|
doc = Doc(en_vocab, words=words)
|
||||||
doc.tensor = tensor
|
doc.tensor = tensor
|
||||||
|
|
||||||
|
@ -25,7 +28,7 @@ def test_issue3540(en_vocab):
|
||||||
with doc.retokenize() as retokenizer:
|
with doc.retokenize() as retokenizer:
|
||||||
heads = [(doc[3], 1), doc[2]]
|
heads = [(doc[3], 1), doc[2]]
|
||||||
attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
|
attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
|
||||||
retokenizer.split(doc[3], [u"New", u"York"], heads=heads, attrs=attrs)
|
retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
|
||||||
|
|
||||||
gold_text = ["I", "live", "in", "New", "York", "right", "now"]
|
gold_text = ["I", "live", "in", "New", "York", "right", "now"]
|
||||||
assert [token.text for token in doc] == gold_text
|
assert [token.text for token in doc] == gold_text
|
||||||
|
|
|
@ -35,7 +35,9 @@ def test_issue3962(doc):
|
||||||
doc2_json = doc2.to_json()
|
doc2_json = doc2.to_json()
|
||||||
assert doc2_json
|
assert doc2_json
|
||||||
|
|
||||||
assert doc2[0].head.text == "jests" # head set to itself, being the new artificial root
|
assert (
|
||||||
|
doc2[0].head.text == "jests"
|
||||||
|
) # head set to itself, being the new artificial root
|
||||||
assert doc2[0].dep_ == "dep"
|
assert doc2[0].dep_ == "dep"
|
||||||
assert doc2[1].head.text == "jests"
|
assert doc2[1].head.text == "jests"
|
||||||
assert doc2[1].dep_ == "prep"
|
assert doc2[1].dep_ == "prep"
|
||||||
|
@ -92,7 +94,9 @@ def test_issue3962_long(two_sent_doc):
|
||||||
doc2_json = doc2.to_json()
|
doc2_json = doc2.to_json()
|
||||||
assert doc2_json
|
assert doc2_json
|
||||||
|
|
||||||
assert doc2[0].head.text == "jests" # head set to itself, being the new artificial root (in sentence 1)
|
assert (
|
||||||
|
doc2[0].head.text == "jests"
|
||||||
|
) # head set to itself, being the new artificial root (in sentence 1)
|
||||||
assert doc2[0].dep_ == "ROOT"
|
assert doc2[0].dep_ == "ROOT"
|
||||||
assert doc2[1].head.text == "jests"
|
assert doc2[1].head.text == "jests"
|
||||||
assert doc2[1].dep_ == "prep"
|
assert doc2[1].dep_ == "prep"
|
||||||
|
@ -100,9 +104,13 @@ def test_issue3962_long(two_sent_doc):
|
||||||
assert doc2[2].dep_ == "pobj"
|
assert doc2[2].dep_ == "pobj"
|
||||||
assert doc2[3].head.text == "jests"
|
assert doc2[3].head.text == "jests"
|
||||||
assert doc2[3].dep_ == "punct"
|
assert doc2[3].dep_ == "punct"
|
||||||
assert doc2[4].head.text == "They" # head set to itself, being the new artificial root (in sentence 2)
|
assert (
|
||||||
|
doc2[4].head.text == "They"
|
||||||
|
) # head set to itself, being the new artificial root (in sentence 2)
|
||||||
assert doc2[4].dep_ == "dep"
|
assert doc2[4].dep_ == "dep"
|
||||||
assert doc2[4].head.text == "They" # head set to the new artificial head (in sentence 2)
|
assert (
|
||||||
|
doc2[4].head.text == "They"
|
||||||
|
) # head set to the new artificial head (in sentence 2)
|
||||||
assert doc2[4].dep_ == "dep"
|
assert doc2[4].dep_ == "dep"
|
||||||
|
|
||||||
# We should still have 2 sentences
|
# We should still have 2 sentences
|
||||||
|
|
|
@ -30,14 +30,18 @@ def test_serialize_kb_disk(en_vocab):
|
||||||
def _get_dummy_kb(vocab):
|
def _get_dummy_kb(vocab):
|
||||||
kb = KnowledgeBase(vocab=vocab, entity_vector_length=3)
|
kb = KnowledgeBase(vocab=vocab, entity_vector_length=3)
|
||||||
|
|
||||||
kb.add_entity(entity='Q53', freq=33, entity_vector=[0, 5, 3])
|
kb.add_entity(entity="Q53", freq=33, entity_vector=[0, 5, 3])
|
||||||
kb.add_entity(entity='Q17', freq=2, entity_vector=[7, 1, 0])
|
kb.add_entity(entity="Q17", freq=2, entity_vector=[7, 1, 0])
|
||||||
kb.add_entity(entity='Q007', freq=7, entity_vector=[0, 0, 7])
|
kb.add_entity(entity="Q007", freq=7, entity_vector=[0, 0, 7])
|
||||||
kb.add_entity(entity='Q44', freq=342, entity_vector=[4, 4, 4])
|
kb.add_entity(entity="Q44", freq=342, entity_vector=[4, 4, 4])
|
||||||
|
|
||||||
kb.add_alias(alias='double07', entities=['Q17', 'Q007'], probabilities=[0.1, 0.9])
|
kb.add_alias(alias="double07", entities=["Q17", "Q007"], probabilities=[0.1, 0.9])
|
||||||
kb.add_alias(alias='guy', entities=['Q53', 'Q007', 'Q17', 'Q44'], probabilities=[0.3, 0.3, 0.2, 0.1])
|
kb.add_alias(
|
||||||
kb.add_alias(alias='random', entities=['Q007'], probabilities=[1.0])
|
alias="guy",
|
||||||
|
entities=["Q53", "Q007", "Q17", "Q44"],
|
||||||
|
probabilities=[0.3, 0.3, 0.2, 0.1],
|
||||||
|
)
|
||||||
|
kb.add_alias(alias="random", entities=["Q007"], probabilities=[1.0])
|
||||||
|
|
||||||
return kb
|
return kb
|
||||||
|
|
||||||
|
@ -45,30 +49,30 @@ def _get_dummy_kb(vocab):
|
||||||
def _check_kb(kb):
|
def _check_kb(kb):
|
||||||
# check entities
|
# check entities
|
||||||
assert kb.get_size_entities() == 4
|
assert kb.get_size_entities() == 4
|
||||||
for entity_string in ['Q53', 'Q17', 'Q007', 'Q44']:
|
for entity_string in ["Q53", "Q17", "Q007", "Q44"]:
|
||||||
assert entity_string in kb.get_entity_strings()
|
assert entity_string in kb.get_entity_strings()
|
||||||
for entity_string in ['', 'Q0']:
|
for entity_string in ["", "Q0"]:
|
||||||
assert entity_string not in kb.get_entity_strings()
|
assert entity_string not in kb.get_entity_strings()
|
||||||
|
|
||||||
# check aliases
|
# check aliases
|
||||||
assert kb.get_size_aliases() == 3
|
assert kb.get_size_aliases() == 3
|
||||||
for alias_string in ['double07', 'guy', 'random']:
|
for alias_string in ["double07", "guy", "random"]:
|
||||||
assert alias_string in kb.get_alias_strings()
|
assert alias_string in kb.get_alias_strings()
|
||||||
for alias_string in ['nothingness', '', 'randomnoise']:
|
for alias_string in ["nothingness", "", "randomnoise"]:
|
||||||
assert alias_string not in kb.get_alias_strings()
|
assert alias_string not in kb.get_alias_strings()
|
||||||
|
|
||||||
# check candidates & probabilities
|
# check candidates & probabilities
|
||||||
candidates = sorted(kb.get_candidates('double07'), key=lambda x: x.entity_)
|
candidates = sorted(kb.get_candidates("double07"), key=lambda x: x.entity_)
|
||||||
assert len(candidates) == 2
|
assert len(candidates) == 2
|
||||||
|
|
||||||
assert candidates[0].entity_ == 'Q007'
|
assert candidates[0].entity_ == "Q007"
|
||||||
assert 6.999 < candidates[0].entity_freq < 7.01
|
assert 6.999 < candidates[0].entity_freq < 7.01
|
||||||
assert candidates[0].entity_vector == [0, 0, 7]
|
assert candidates[0].entity_vector == [0, 0, 7]
|
||||||
assert candidates[0].alias_ == 'double07'
|
assert candidates[0].alias_ == "double07"
|
||||||
assert 0.899 < candidates[0].prior_prob < 0.901
|
assert 0.899 < candidates[0].prior_prob < 0.901
|
||||||
|
|
||||||
assert candidates[1].entity_ == 'Q17'
|
assert candidates[1].entity_ == "Q17"
|
||||||
assert 1.99 < candidates[1].entity_freq < 2.01
|
assert 1.99 < candidates[1].entity_freq < 2.01
|
||||||
assert candidates[1].entity_vector == [7, 1, 0]
|
assert candidates[1].entity_vector == [7, 1, 0]
|
||||||
assert candidates[1].alias_ == 'double07'
|
assert candidates[1].alias_ == "double07"
|
||||||
assert 0.099 < candidates[1].prior_prob < 0.101
|
assert 0.099 < candidates[1].prior_prob < 0.101
|
||||||
|
|
Loading…
Reference in New Issue
Block a user