Remove corpus-specific morph rules

* Remove corpus-specific morph rules
* Add options similar to tag maps to provide them in the `train` and
`debug-data` CLIs
This commit is contained in:
Adriane Boyd 2020-07-15 19:44:18 +02:00
parent 2f981d5af1
commit d83e3c44c5
13 changed files with 16 additions and 5094 deletions

View File

@ -131,9 +131,16 @@ def debug_data(
tag_map = {} tag_map = {}
if tag_map_path is not None: if tag_map_path is not None:
tag_map = srsly.read_json(tag_map_path) tag_map = srsly.read_json(tag_map_path)
morph_rules_path = util.ensure_path(config["training"]["morph_rules"])
morph_rules = {}
if morph_rules_path is not None:
morph_rules = srsly.read_json(morph_rules_path)
# Update tag map with provided mapping # Update tag map with provided mapping
nlp.vocab.morphology.tag_map.update(tag_map) nlp.vocab.morphology.tag_map.update(tag_map)
# Load morph rules
nlp.vocab.morphology.load_morph_exceptions(morph_rules)
msg.divider("Data file validation") msg.divider("Data file validation")
# Create the gold corpus to be able to better analyze data # Create the gold corpus to be able to better analyze data

View File

@ -127,6 +127,9 @@ def train(
# Update tag map with provided mapping # Update tag map with provided mapping
nlp.vocab.morphology.tag_map.update(tag_map) nlp.vocab.morphology.tag_map.update(tag_map)
# Load morph rules
nlp.vocab.morphology.load_morph_exceptions(morph_rules)
# Create empty extra lexeme tables so the data from spacy-lookups-data # Create empty extra lexeme tables so the data from spacy-lookups-data
# isn't loaded if these features are accessed # isn't loaded if these features are accessed
if config["training"]["omit_extra_lookups"]: if config["training"]["omit_extra_lookups"]:
@ -482,6 +485,12 @@ def load_from_paths(config):
if not tag_map_path.exists(): if not tag_map_path.exists():
msg.fail("Can't find tag map path", tag_map_path, exits=1) msg.fail("Can't find tag map path", tag_map_path, exits=1)
tag_map = srsly.read_json(config["training"]["tag_map"]) tag_map = srsly.read_json(config["training"]["tag_map"])
# Optional morph rules: stay empty unless the config names a JSON file.
morph_rules = {}
morph_rules_path = util.ensure_path(config["training"]["morph_rules"])
if morph_rules_path is not None:
    if not morph_rules_path.exists():
        # Name the right setting in the error; the message used to be a
        # copy of the tag map one and pointed users at the wrong option.
        msg.fail("Can't find morph rules path", morph_rules_path, exits=1)
    # Read from the path that was just validated.
    morph_rules = srsly.read_json(morph_rules_path)
weights_data = None weights_data = None
init_tok2vec = util.ensure_path(config["training"]["init_tok2vec"]) init_tok2vec = util.ensure_path(config["training"]["init_tok2vec"])
if init_tok2vec is not None: if init_tok2vec is not None:

View File

@ -1,263 +0,0 @@
from ...symbols import LEMMA, PRON_LEMMA
def _pron(**feats):
    """Return pronoun attrs: the shared pronoun lemma followed by *feats*.

    Features are stored in the order in which they are passed.
    """
    return {LEMMA: PRON_LEMMA, **feats}


# Morphological exceptions for Bengali pronouns, keyed by fine-grained tag
# and then by surface form.
MORPH_RULES = {
    "PRP": {
        # NOTE(review): this key is an empty string, and the table listed it
        # twice with identical attrs (the second occurrence sat between "আপন"
        # and "নিজ"). Presumably these were two demonstratives whose
        # characters were lost; confirm against the upstream data.
        "": _pron(PronType="Dem"),
        "ওই": _pron(PronType="Dem"),
        "আমাকে": _pron(Number="Sing", Person="One", PronType="Prs", Case="Acc"),
        "কি": _pron(Number="Sing", Gender="Neut", PronType="Int", Case="Acc"),
        "সে": _pron(Number="Sing", Person="Three", PronType="Prs", Case="Nom"),
        "কিসে": _pron(Number="Sing", Gender="Neut", PronType="Int", Case="Acc"),
        "তাকে": _pron(Number="Sing", Person="Three", PronType="Prs", Case="Acc"),
        "স্বয়ং": _pron(Reflex="Yes", PronType="Ref"),
        "কোনগুলো": _pron(Number="Plur", Gender="Neut", PronType="Int", Case="Acc"),
        "তুমি": _pron(Number="Sing", Person="Two", PronType="Prs", Case="Nom"),
        "তুই": _pron(Number="Sing", Person="Two", PronType="Prs", Case="Nom"),
        "তাদেরকে": _pron(Number="Plur", Person="Three", PronType="Prs", Case="Acc"),
        # Person used to be "One " (trailing space), which matches no
        # feature value used anywhere else in this table.
        "আমরা": _pron(Number="Plur", Person="One", PronType="Prs", Case="Nom"),
        "যিনি": _pron(Number="Sing", PronType="Rel", Case="Nom"),
        "আমাদেরকে": _pron(Number="Plur", Person="One", PronType="Prs", Case="Acc"),
        "কোন": _pron(Number="Sing", PronType="Int", Case="Acc"),
        "কারা": _pron(Number="Plur", PronType="Int", Case="Acc"),
        "তোমাকে": _pron(Number="Sing", Person="Two", PronType="Prs", Case="Acc"),
        "তোকে": _pron(Number="Sing", Person="Two", PronType="Prs", Case="Acc"),
        "খোদ": _pron(Reflex="Yes", PronType="Ref"),
        "কে": _pron(Number="Sing", PronType="Int", Case="Acc"),
        "যারা": _pron(Number="Plur", PronType="Rel", Case="Nom"),
        "যে": _pron(Number="Sing", PronType="Rel", Case="Nom"),
        "তোমরা": _pron(Number="Plur", Person="Two", PronType="Prs", Case="Nom"),
        "তোরা": _pron(Number="Plur", Person="Two", PronType="Prs", Case="Nom"),
        "তোমাদেরকে": _pron(Number="Plur", Person="Two", PronType="Prs", Case="Acc"),
        "তোদেরকে": _pron(Number="Plur", Person="Two", PronType="Prs", Case="Acc"),
        "আপন": _pron(Reflex="Yes", PronType="Ref"),
        "নিজ": _pron(Reflex="Yes", PronType="Ref"),
        "কার": _pron(Number="Sing", PronType="Int", Case="Acc"),
        "যা": _pron(Number="Sing", Gender="Neut", PronType="Rel", Case="Nom"),
        "তারা": _pron(Number="Plur", Person="Three", PronType="Prs", Case="Nom"),
        "আমি": _pron(Number="Sing", Person="One", PronType="Prs", Case="Nom"),
    },
    "PRP$": {
        "আমার": _pron(
            Number="Sing", Person="One", PronType="Prs", Poss="Yes", Case="Nom"
        ),
        "মোর": _pron(
            Number="Sing", Person="One", PronType="Prs", Poss="Yes", Case="Nom"
        ),
        "মোদের": _pron(
            Number="Plur", Person="One", PronType="Prs", Poss="Yes", Case="Nom"
        ),
        "তার": _pron(
            Number="Sing", Person="Three", PronType="Prs", Poss="Yes", Case="Nom"
        ),
        # NOTE(review): the doubled vowel sign in this key looks like a typo
        # for "তাহার"; left unchanged because it alters which tokens match.
        "তাহাার": _pron(
            Number="Sing", Person="Three", PronType="Prs", Poss="Yes", Case="Nom"
        ),
        "তোমাদের": _pron(
            Number="Plur", Person="Two", PronType="Prs", Poss="Yes", Case="Nom"
        ),
        "আমাদের": _pron(
            Number="Plur", Person="One", PronType="Prs", Poss="Yes", Case="Nom"
        ),
        "তোমার": _pron(
            Number="Sing", Person="Two", PronType="Prs", Poss="Yes", Case="Nom"
        ),
        "তোর": _pron(
            Number="Sing", Person="Two", PronType="Prs", Poss="Yes", Case="Nom"
        ),
        "তাদের": _pron(
            Number="Plur", Person="Three", PronType="Prs", Poss="Yes", Case="Nom"
        ),
        "কাদের": _pron(Number="Plur", PronType="Int", Case="Acc"),
        "তোদের": _pron(
            Number="Plur", Person="Two", PronType="Prs", Poss="Yes", Case="Nom"
        ),
        "যাদের": _pron(Number="Plur", PronType="Int", Case="Acc"),
    },
}

View File

@ -2,7 +2,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .morph_rules import MORPH_RULES
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
@ -15,7 +14,6 @@ class DanishDefaults(Language.Defaults):
lex_attr_getters.update(LEX_ATTRS) lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "da" lex_attr_getters[LANG] = lambda text: "da"
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
morph_rules = MORPH_RULES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
stop_words = STOP_WORDS stop_words = STOP_WORDS

View File

@ -1,308 +0,0 @@
from ...symbols import LEMMA, PRON_LEMMA
# Source: Danish Universal Dependencies and http://fjern-uv.dk/pronom.php
# Note: The Danish Universal Dependencies specify Case=Acc for all instances
# of "den"/"det" even when the case is in fact "Nom". In the rules below, Case
# is left unspecified for "den" and "det".
def _prs(person, **feats):
    """Return attrs for a personal pronoun (PronType=Prs) of *person*.

    The result starts with the shared pronoun lemma, PronType and Person;
    any further features follow in the order in which they are passed.
    """
    return {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": person, **feats}


# Morphological exceptions for Danish, keyed by tag and then by surface form.
# The comment above each pronoun is the feature string the entry was derived
# from; not every feature in it is encoded in the attrs.
MORPH_RULES = {
    "PRON": {
        # Case=Nom|Gender=Com|Number=Sing|Person=1|PronType=Prs
        "jeg": _prs("One", Number="Sing", Case="Nom", Gender="Com"),
        # Case=Acc|Gender=Com|Number=Sing|Person=1|PronType=Prs
        "mig": _prs("One", Number="Sing", Case="Acc", Gender="Com"),
        # Gender=Com|Number=Sing|Number[psor]=Sing|Person=1|Poss=Yes|PronType=Prs
        "min": _prs("One", Number="Sing", Poss="Yes", Gender="Com"),
        # Gender=Neut|Number=Sing|Number[psor]=Sing|Person=1|Poss=Yes|PronType=Prs
        "mit": _prs("One", Number="Sing", Poss="Yes", Gender="Neut"),
        # Gender=Com|Number=Sing|Number[psor]=Plur|Person=1|Poss=Yes|PronType=Prs|Style=Form
        "vor": _prs("One", Number="Sing", Poss="Yes", Gender="Com"),
        # Gender=Neut|Number=Sing|Number[psor]=Plur|Person=1|Poss=Yes|PronType=Prs|Style=Form
        "vort": _prs("One", Number="Sing", Poss="Yes", Gender="Neut"),
        # Case=Nom|Gender=Com|Number=Sing|Person=2|PronType=Prs
        "du": _prs("Two", Number="Sing", Case="Nom", Gender="Com"),
        # Case=Acc|Gender=Com|Number=Sing|Person=2|PronType=Prs
        "dig": _prs("Two", Number="Sing", Case="Acc", Gender="Com"),
        # Gender=Com|Number=Sing|Number[psor]=Sing|Person=2|Poss=Yes|PronType=Prs
        "din": _prs("Two", Number="Sing", Poss="Yes", Gender="Com"),
        # Gender=Neut|Number=Sing|Number[psor]=Sing|Person=2|Poss=Yes|PronType=Prs
        "dit": _prs("Two", Number="Sing", Poss="Yes", Gender="Neut"),
        # Case=Nom|Gender=Com|Number=Sing|Person=3|PronType=Prs
        "han": _prs("Three", Number="Sing", Case="Nom", Gender="Com"),
        # Case=Nom|Gender=Com|Number=Sing|Person=3|PronType=Prs
        "hun": _prs("Three", Number="Sing", Case="Nom", Gender="Com"),
        # Case=Acc|Gender=Com|Number=Sing|Person=3|PronType=Prs
        # (Case is deliberately left unspecified for "den".)
        "den": _prs("Three", Number="Sing", Gender="Com"),
        # Case=Acc|Gender=Neut|Number=Sing|Person=3|PronType=Prs
        # (Case is deliberately left unspecified for "det".)
        "det": _prs("Three", Number="Sing", Gender="Neut"),
        # Case=Acc|Gender=Com|Number=Sing|Person=3|PronType=Prs
        "ham": _prs("Three", Number="Sing", Case="Acc", Gender="Com"),
        # Case=Acc|Gender=Com|Number=Sing|Person=3|PronType=Prs
        "hende": _prs("Three", Number="Sing", Case="Acc", Gender="Com"),
        # Gender=Com|Number=Sing|Number[psor]=Sing|Person=3|Poss=Yes|PronType=Prs|Reflex=Yes
        "sin": _prs("Three", Number="Sing", Poss="Yes", Gender="Com", Reflex="Yes"),
        # Gender=Neut|Number=Sing|Number[psor]=Sing|Person=3|Poss=Yes|PronType=Prs|Reflex=Yes
        "sit": _prs("Three", Number="Sing", Poss="Yes", Gender="Neut", Reflex="Yes"),
        # Case=Nom|Gender=Com|Number=Plur|Person=1|PronType=Prs
        "vi": _prs("One", Number="Plur", Case="Nom", Gender="Com"),
        # Case=Acc|Gender=Com|Number=Plur|Person=1|PronType=Prs
        "os": _prs("One", Number="Plur", Case="Acc", Gender="Com"),
        # Number=Plur|Number[psor]=Sing|Person=1|Poss=Yes|PronType=Prs
        "mine": _prs("One", Number="Plur", Poss="Yes"),
        # Number=Plur|Number[psor]=Plur|Person=1|Poss=Yes|PronType=Prs|Style=Form
        "vore": _prs("One", Number="Plur", Poss="Yes"),
        # Case=Nom|Gender=Com|Number=Plur|Person=2|PronType=Prs
        "I": _prs("Two", Number="Plur", Case="Nom", Gender="Com"),
        # Case=Acc|Gender=Com|Number=Plur|Person=2|PronType=Prs
        "jer": _prs("Two", Number="Plur", Case="Acc", Gender="Com"),
        # Number=Plur|Number[psor]=Sing|Person=2|Poss=Yes|PronType=Prs
        "dine": _prs("Two", Number="Plur", Poss="Yes"),
        # Case=Nom|Number=Plur|Person=3|PronType=Prs
        "de": _prs("Three", Number="Plur", Case="Nom"),
        # Case=Acc|Number=Plur|Person=3|PronType=Prs
        "dem": _prs("Three", Number="Plur", Case="Acc"),
        # Number=Plur|Number[psor]=Sing|Person=3|Poss=Yes|PronType=Prs|Reflex=Yes
        "sine": _prs("Three", Number="Plur", Poss="Yes", Reflex="Yes"),
        # Number[psor]=Plur|Person=1|Poss=Yes|PronType=Prs
        "vores": _prs("One", Poss="Yes"),
        # Case=Nom|Gender=Com|Person=2|Polite=Form|PronType=Prs
        "De": _prs("Two", Case="Nom", Gender="Com"),
        # Case=Acc|Gender=Com|Person=2|Polite=Form|PronType=Prs
        "Dem": _prs("Two", Case="Acc", Gender="Com"),
        # Person=2|Polite=Form|Poss=Yes|PronType=Prs
        "Deres": _prs("Two", Poss="Yes"),
        # Number[psor]=Plur|Person=2|Poss=Yes|PronType=Prs
        "jeres": _prs("Two", Poss="Yes"),
        # Case=Acc|Person=3|PronType=Prs|Reflex=Yes
        "sig": _prs("Three", Case="Acc", Reflex="Yes"),
        # Number[psor]=Sing|Person=3|Poss=Yes|PronType=Prs
        "hans": _prs("Three", Poss="Yes"),
        # Number[psor]=Sing|Person=3|Poss=Yes|PronType=Prs
        "hendes": _prs("Three", Poss="Yes"),
        # Number[psor]=Sing|Person=3|Poss=Yes|PronType=Prs
        "dens": _prs("Three", Poss="Yes"),
        # Number[psor]=Sing|Person=3|Poss=Yes|PronType=Prs
        "dets": _prs("Three", Poss="Yes"),
        # Number[psor]=Plur|Person=3|Poss=Yes|PronType=Prs
        "deres": _prs("Three", Poss="Yes"),
    },
    "VERB": {
        # Finite forms of "være", lemmatised explicitly.
        "er": {LEMMA: "være", "VerbForm": "Fin", "Tense": "Pres"},
        "var": {LEMMA: "være", "VerbForm": "Fin", "Tense": "Past"},
    },
}
# Register a title-cased variant of every form so capitalised tokens match
# too. Explicit entries win: an unconditional assignment let the title-cased
# third-person "deres" overwrite the polite second-person "Deres" entry.
for rules in MORPH_RULES.values():
    # Walk a snapshot, because the table grows while we add variants.
    for key, attrs in list(rules.items()):
        rules.setdefault(key.title(), attrs)

View File

@ -1,7 +1,6 @@
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .morph_rules import MORPH_RULES
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
@ -20,7 +19,6 @@ class EnglishDefaults(Language.Defaults):
lex_attr_getters[LANG] = _return_en lex_attr_getters[LANG] = _return_en
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS stop_words = STOP_WORDS
morph_rules = MORPH_RULES
syntax_iterators = SYNTAX_ITERATORS syntax_iterators = SYNTAX_ITERATORS
single_orth_variants = [ single_orth_variants = [
{"tags": ["NFP"], "variants": ["", "..."]}, {"tags": ["NFP"], "variants": ["", "..."]},

View File

@ -1,490 +0,0 @@
from ...symbols import LEMMA, PRON_LEMMA
# Several entries here look pretty suspicious. These will get the POS SCONJ
# given the tag IN, when an adpositional reading seems much more likely for
# a lot of these prepositions. I'm not sure what I was running in 04395ffa4
# when I did this? It doesn't seem right.
_subordinating_conjunctions = [
"that",
"if",
"as",
"because",
# "of",
# "for",
# "before",
# "in",
"while",
# "after",
"since",
"like",
# "with",
"so",
# "to",
# "by",
# "on",
# "about",
"than",
"whether",
"although",
# "from",
"though",
# "until",
"unless",
"once",
# "without",
# "at",
# "into",
"cause",
# "over",
"upon",
"till",
"whereas",
# "beyond",
"whilst",
"except",
"despite",
"wether",
# "then",
"but",
"becuse",
"whie",
# "below",
# "against",
"it",
"w/out",
# "toward",
"albeit",
"save",
"besides",
"becouse",
"coz",
"til",
"ask",
"i'd",
"out",
"near",
"seince",
# "towards",
"tho",
"sice",
"will",
]
# This seems kind of wrong too?
# _relative_pronouns = ["this", "that", "those", "these"]
def _prp(person, **feats):
    """Return attrs for a personal pronoun tagged PRP.

    Starts with the pronoun lemma, POS=PRON, PronType=Prs and Person; any
    further features follow in the order in which they are passed.
    """
    return {
        LEMMA: PRON_LEMMA,
        "POS": "PRON",
        "PronType": "Prs",
        "Person": person,
        **feats,
    }


def _poss(person, **feats):
    """Return attrs for a possessive pronoun tagged PRP$.

    Extra features (Number, Gender) sit between Person and PronType.
    """
    return {
        LEMMA: PRON_LEMMA,
        "Person": person,
        **feats,
        "PronType": "Prs",
        "Poss": "Yes",
    }


def _be_pres(**feats):
    """Return attrs for a finite present indicative form of "be".

    Extra features (Person) sit between VerbForm and Tense.
    """
    return {
        LEMMA: "be",
        "POS": "AUX",
        "VerbForm": "Fin",
        **feats,
        "Tense": "Pres",
        "Mood": "Ind",
    }


def _be_past(number):
    """Return attrs for a finite past form of "be" with the given Number."""
    return {
        LEMMA: "be",
        "POS": "AUX",
        "VerbForm": "Fin",
        "Tense": "Past",
        "Number": number,
    }


# Morphological exceptions for English, keyed by fine-grained tag and then by
# surface form.
MORPH_RULES = {
    # Disabled: "DT": {word: {"POS": "PRON"} for word in _relative_pronouns},
    "IN": {word: {"POS": "SCONJ"} for word in _subordinating_conjunctions},
    # Indefinite pronouns that carry the noun tag.
    "NN": {
        word: {"POS": "PRON"}
        for word in [
            "something",
            "anyone",
            "anything",
            "nothing",
            "someone",
            "everything",
            "everyone",
            "everybody",
            "nobody",
            "somebody",
            "anybody",
            "any1",
        ]
    },
    "PRP": {
        "I": _prp("One", Number="Sing", Case="Nom"),
        "me": _prp("One", Number="Sing", Case="Acc"),
        "you": _prp("Two"),
        "he": _prp("Three", Number="Sing", Gender="Masc", Case="Nom"),
        "him": _prp("Three", Number="Sing", Gender="Masc", Case="Acc"),
        "she": _prp("Three", Number="Sing", Gender="Fem", Case="Nom"),
        "her": _prp("Three", Number="Sing", Gender="Fem", Case="Acc"),
        "it": _prp("Three", Number="Sing", Gender="Neut"),
        "we": _prp("One", Number="Plur", Case="Nom"),
        "us": _prp("One", Number="Plur", Case="Acc"),
        "they": _prp("Three", Number="Plur", Case="Nom"),
        "them": _prp("Three", Number="Plur", Case="Acc"),
        "mine": _prp("One", Number="Sing", Poss="Yes", Reflex="Yes"),
        "his": _prp("Three", Number="Sing", Gender="Masc", Poss="Yes", Reflex="Yes"),
        "hers": _prp("Three", Number="Sing", Gender="Fem", Poss="Yes", Reflex="Yes"),
        "its": _prp("Three", Number="Sing", Gender="Neut", Poss="Yes", Reflex="Yes"),
        "ours": _prp("One", Number="Plur", Poss="Yes", Reflex="Yes"),
        "yours": _prp("Two", Number="Plur", Poss="Yes", Reflex="Yes"),
        "theirs": _prp("Three", Number="Plur", Poss="Yes", Reflex="Yes"),
        "myself": _prp("One", Number="Sing", Case="Acc", Reflex="Yes"),
        "yourself": _prp("Two", Case="Acc", Reflex="Yes"),
        "himself": _prp(
            "Three", Number="Sing", Case="Acc", Gender="Masc", Reflex="Yes"
        ),
        "herself": _prp(
            "Three", Number="Sing", Case="Acc", Gender="Fem", Reflex="Yes"
        ),
        "itself": _prp(
            "Three", Number="Sing", Case="Acc", Gender="Neut", Reflex="Yes"
        ),
        "themself": _prp("Three", Number="Sing", Case="Acc", Reflex="Yes"),
        "ourselves": _prp("One", Number="Plur", Case="Acc", Reflex="Yes"),
        "yourselves": _prp("Two", Case="Acc", Reflex="Yes"),
        "themselves": _prp("Three", Number="Plur", Case="Acc", Reflex="Yes"),
    },
    "PRP$": {
        "my": _poss("One", Number="Sing"),
        "your": _poss("Two"),
        "his": _poss("Three", Number="Sing", Gender="Masc"),
        "her": _poss("Three", Number="Sing", Gender="Fem"),
        "its": _poss("Three", Number="Sing", Gender="Neut"),
        "our": _poss("One", Number="Plur"),
        "their": _poss("Three", Number="Plur"),
    },
    # NOTE(review): "nt" appears twice in this list; the second occurrence
    # may have been a typographic-apostrophe variant of "n't" - confirm.
    "RB": {word: {"POS": "PART"} for word in ["not", "n't", "nt", "nt"]},
    "VB": {
        word: {"POS": "AUX"}
        for word in ["be", "have", "do", "get", "of", "am", "are", "'ve"]
    },
    "VBN": {"been": {LEMMA: "be", "POS": "AUX"}},
    "VBG": {"being": {LEMMA: "be", "POS": "AUX"}},
    "VBZ": {
        "am": _be_pres(Person="One"),
        "are": _be_pres(Person="Two"),
        "is": _be_pres(Person="Three"),
        "'re": _be_pres(Person="Two"),
        "'s": _be_pres(Person="Three"),
        "has": {LEMMA: "have", "POS": "AUX"},
        "does": {LEMMA: "do", "POS": "AUX"},
    },
    "VBP": {
        "are": _be_pres(),
        "'re": _be_pres(),
        "am": _be_pres(Person="One"),
        "do": {"POS": "AUX"},
        "have": {"POS": "AUX"},
        "'m": {"POS": "AUX", LEMMA: "be"},
        "'ve": {"POS": "AUX"},
        "'s": {"POS": "AUX"},
        "is": {"POS": "AUX"},
        "'d": {"POS": "AUX"},
    },
    "VBD": {
        "was": _be_past("Sing"),
        "were": _be_past("Plur"),
        "did": {LEMMA: "do", "POS": "AUX"},
        "had": {LEMMA: "have", "POS": "AUX"},
        "'d": {LEMMA: "have", "POS": "AUX"},
    },
}
# Register a title-cased variant of every form so capitalised tokens match
# too. setdefault keeps any entry that is already listed under the
# title-cased spelling (such as "I") instead of overwriting it, so an
# explicit capitalised rule can never be clobbered by a lowercase one.
for rules in MORPH_RULES.values():
    # Walk a snapshot, because the table grows while we add variants.
    for key, attrs in list(rules.items()):
        rules.setdefault(key.title(), attrs)

View File

@ -2,7 +2,6 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .morph_rules import MORPH_RULES
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS from ..norm_exceptions import BASE_NORMS
@ -31,7 +30,6 @@ class LithuanianDefaults(Language.Defaults):
del mod_base_exceptions["8)"] del mod_base_exceptions["8)"]
tokenizer_exceptions = update_exc(mod_base_exceptions, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(mod_base_exceptions, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS stop_words = STOP_WORDS
morph_rules = MORPH_RULES
class Lithuanian(Language): class Lithuanian(Language):

File diff suppressed because it is too large Load Diff

View File

@ -2,7 +2,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .morph_rules import MORPH_RULES
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
@ -23,7 +22,6 @@ class NorwegianDefaults(Language.Defaults):
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
stop_words = STOP_WORDS stop_words = STOP_WORDS
morph_rules = MORPH_RULES
syntax_iterators = SYNTAX_ITERATORS syntax_iterators = SYNTAX_ITERATORS

View File

@ -1,665 +0,0 @@
from ...symbols import LEMMA, PRON_LEMMA
# This dict includes all the PRON and DET tag combinations found in the
# dataset developed by Schibsted, Nasjonalbiblioteket and LTG (to be published
# autumn 2018) and the rarely used polite form.
MORPH_RULES = {
"PRON__Animacy=Anim|Case=Nom|Number=Sing|Person=1|PronType=Prs": {
"jeg": {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Person": "One",
"Number": "Sing",
"Case": "Nom",
}
},
"PRON__Animacy=Anim|Case=Nom|Number=Sing|Person=2|PronType=Prs": {
"du": {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Person": "Two",
"Number": "Sing",
"Case": "Nom",
},
# polite form, not sure about the tag
"De": {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Person": "Two",
"Number": "Sing",
"Case": "Nom",
"Polite": "Form",
},
},
"PRON__Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs": {
"hun": {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Person": "Three",
"Number": "Sing",
"Gender": "Fem",
"Case": "Nom",
}
},
"PRON__Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs": {
"han": {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Person": "Three",
"Number": "Sing",
"Gender": "Masc",
"Case": "Nom",
}
},
"PRON__Gender=Neut|Number=Sing|Person=3|PronType=Prs": {
"det": {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Person": "Three",
"Number": "Sing",
"Gender": "Neut",
},
"alt": {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Person": "Three",
"Number": "Sing",
"Gender": "Neut",
},
"intet": {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Person": "Three",
"Number": "Sing",
"Gender": "Neut",
},
"noe": {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Number": "Sing",
"Person": "Three",
"Gender": "Neut",
},
},
"PRON__Animacy=Anim|Case=Nom|Number=Plur|Person=1|PronType=Prs": {
"vi": {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Person": "One",
"Number": "Plur",
"Case": "Nom",
}
},
"PRON__Animacy=Anim|Case=Nom|Number=Plur|Person=2|PronType=Prs": {
"dere": {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Person": "Two",
"Number": "Plur",
"Case": "Nom",
}
},
"PRON__Case=Nom|Number=Plur|Person=3|PronType=Prs": {
"de": {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Person": "Three",
"Number": "Plur",
"Case": "Nom",
}
},
"PRON__Animacy=Anim|Case=Acc|Number=Sing|Person=1|PronType=Prs": {
"meg": {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Person": "One",
"Number": "Sing",
"Case": "Acc",
}
},
"PRON__Animacy=Anim|Case=Acc|Number=Sing|Person=2|PronType=Prs": {
"deg": {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Person": "Two",
"Number": "Sing",
"Case": "Acc",
},
# polite form, not sure about the tag
"Dem": {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Person": "Two",
"Number": "Sing",
"Case": "Acc",
"Polite": "Form",
},
},
"PRON__Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs": {
"henne": {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Person": "Three",
"Number": "Sing",
"Gender": "Fem",
"Case": "Acc",
}
},
"PRON__Animacy=Anim|Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs": {
"ham": {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Person": "Three",
"Number": "Sing",
"Gender": "Masc",
"Case": "Acc",
},
"han": {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Person": "Three",
"Number": "Sing",
"Gender": "Masc",
"Case": "Acc",
},
},
"PRON__Animacy=Anim|Case=Acc|Number=Plur|Person=1|PronType=Prs": {
"oss": {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Person": "One",
"Number": "Plur",
"Case": "Acc",
}
},
"PRON__Animacy=Anim|Case=Acc|Number=Plur|Person=2|PronType=Prs": {
"dere": {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Person": "Two",
"Number": "Plur",
"Case": "Acc",
}
},
"PRON__Case=Acc|Number=Plur|Person=3|PronType=Prs": {
"dem": {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Person": "Three",
"Number": "Plur",
"Case": "Acc",
}
},
"PRON__Case=Acc|Reflex=Yes": {
"seg": {
LEMMA: PRON_LEMMA,
"Person": "Three",
"Number": "Sing,Plur",
"Reflex": "Yes",
}
},
"PRON__Animacy=Anim|Case=Nom|Number=Sing|PronType=Prs": {
"man": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Number": "Sing", "Case": "Nom"}
},
"DET__Gender=Masc|Number=Sing|Poss=Yes": {
"min": {
LEMMA: "min",
"Person": "One",
"Number": "Sing",
"Poss": "Yes",
"Gender": "Masc",
},
"din": {
LEMMA: "din",
"Person": "Two",
"Number": "Sing",
"Poss": "Yes",
"Gender": "Masc",
},
"hennes": {
LEMMA: "hennes",
"Person": "Three",
"Number": "Sing",
"Poss": "Yes",
"Gender": "Masc",
},
"hans": {
LEMMA: "hans",
"Person": "Three",
"Number": "Sing",
"Poss": "Yes",
"Gender": "Masc",
},
"sin": {
LEMMA: "sin",
"Person": "Three",
"Number": "Sing",
"Poss": "Yes",
"Gender": "Masc",
"Reflex": "Yes",
},
"vår": {
LEMMA: "vår",
"Person": "One",
"Number": "Sing",
"Poss": "Yes",
"Gender": "Masc",
},
"deres": {
LEMMA: "deres",
"Person": "Two,Three",
"Number": "Sing",
"Poss": "Yes",
"Gender": "Masc",
},
# polite form, not sure about the tag
"Deres": {
LEMMA: "Deres",
"Person": "Three",
"Number": "Sing",
"Poss": "Yes",
"Gender": "Masc",
"Polite": "Form",
},
},
"DET__Gender=Fem|Number=Sing|Poss=Yes": {
"mi": {
LEMMA: "min",
"Person": "One",
"Number": "Sing",
"Poss": "Yes",
"Gender": "Fem",
},
"di": {
LEMMA: "din",
"Person": "Two",
"Number": "Sing",
"Poss": "Yes",
"Gender": "Fem",
},
"hennes": {
LEMMA: "hennes",
"Person": "Three",
"Number": "Sing",
"Poss": "Yes",
"Gender": "Fem",
},
"hans": {
LEMMA: "hans",
"Person": "Three",
"Number": "Sing",
"Poss": "Yes",
"Gender": "Fem",
},
"si": {
LEMMA: "sin",
"Person": "Three",
"Number": "Sing",
"Poss": "Yes",
"Gender": "Fem",
"Reflex": "Yes",
},
"vår": {
LEMMA: "vår",
"Person": "One",
"Number": "Sing",
"Poss": "Yes",
"Gender": "Fem",
},
"deres": {
LEMMA: "deres",
"Person": "Two,Three",
"Number": "Sing",
"Poss": "Yes",
"Gender": "Fem",
},
# polite form, not sure about the tag
"Deres": {
LEMMA: "Deres",
"Person": "Three",
"Number": "Sing",
"Poss": "Yes",
"Gender": "Fem",
"Polite": "Form",
},
},
"DET__Gender=Neut|Number=Sing|Poss=Yes": {
"mitt": {
LEMMA: "min",
"Person": "One",
"Number": "Sing",
"Poss": "Yes",
"Gender": "Neut",
},
"ditt": {
LEMMA: "din",
"Person": "Two",
"Number": "Sing",
"Poss": "Yes",
"Gender": "Neut",
},
"hennes": {
LEMMA: "hennes",
"Person": "Three",
"Number": "Sing",
"Poss": "Yes",
"Gender": "Neut",
},
"hans": {
LEMMA: "hans",
"Person": "Three",
"Number": "Sing",
"Poss": "Yes",
"Gender": "Neut",
},
"sitt": {
LEMMA: "sin",
"Person": "Three",
"Number": "Sing",
"Poss": "Yes",
"Gender": "Neut",
"Reflex": "Yes",
},
"vårt": {
LEMMA: "vår",
"Person": "One",
"Number": "Sing",
"Poss": "Yes",
"Gender": "Neut",
},
"deres": {
LEMMA: "deres",
"Person": "Two,Three",
"Number": "Sing",
"Poss": "Yes",
"Gender": "Neut",
},
# polite form, not sure about the tag
"Deres": {
LEMMA: "Deres",
"Person": "Three",
"Number": "Sing",
"Poss": "Yes",
"Gender": "Neut",
"Polite": "Form",
},
},
"DET__Number=Plur|Poss=Yes": {
"mine": {LEMMA: "min", "Person": "One", "Number": "Plur", "Poss": "Yes"},
"dine": {LEMMA: "din", "Person": "Two", "Number": "Plur", "Poss": "Yes"},
"hennes": {LEMMA: "hennes", "Person": "Three", "Number": "Plur", "Poss": "Yes"},
"hans": {LEMMA: "hans", "Person": "Three", "Number": "Plur", "Poss": "Yes"},
"sine": {
LEMMA: "sin",
"Person": "Three",
"Number": "Plur",
"Poss": "Yes",
"Reflex": "Yes",
},
"våre": {LEMMA: "vår", "Person": "One", "Number": "Plur", "Poss": "Yes"},
"deres": {
LEMMA: "deres",
"Person": "Two,Three",
"Number": "Plur",
"Poss": "Yes",
},
},
"PRON__Animacy=Anim|Number=Plur|PronType=Rcp": {
"hverandre": {LEMMA: PRON_LEMMA, "PronType": "Rcp", "Number": "Plur"}
},
"DET__Number=Plur|Poss=Yes|PronType=Rcp": {
"hverandres": {
LEMMA: "hverandres",
"PronType": "Rcp",
"Number": "Plur",
"Poss": "Yes",
}
},
"PRON___": {"som": {LEMMA: PRON_LEMMA}, "ikkenoe": {LEMMA: PRON_LEMMA}},
"PRON__PronType=Int": {"hva": {LEMMA: PRON_LEMMA, "PronType": "Int"}},
"PRON__Animacy=Anim|PronType=Int": {"hvem": {LEMMA: PRON_LEMMA, "PronType": "Int"}},
"PRON__Animacy=Anim|Poss=Yes|PronType=Int": {
"hvis": {LEMMA: PRON_LEMMA, "PronType": "Int", "Poss": "Yes"}
},
"PRON__Number=Plur|Person=3|PronType=Prs": {
"noen": {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Number": "Plur",
"Person": "Three",
},
"ingen": {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Number": "Plur",
"Person": "Three",
},
"alle": {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Number": "Plur",
"Person": "Three",
},
},
"PRON__Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs": {
"noen": {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Number": "Sing",
"Person": "Three",
"Gender": "Fem,Masc",
},
"den": {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Number": "Sing",
"Person": "Three",
"Gender": "Fem,Masc",
},
"ingen": {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Number": "Sing",
"Person": "Three",
"Gender": "Fem,Masc",
"Polarity": "Neg",
},
},
"PRON__Number=Sing": {"ingenting": {LEMMA: PRON_LEMMA, "Number": "Sing"}},
"PRON__Animacy=Anim|Number=Sing|PronType=Prs": {
"en": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Number": "Sing"}
},
"PRON__Animacy=Anim|Case=Gen,Nom|Number=Sing|PronType=Prs": {
"ens": {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Number": "Sing",
"Case": "Gen,Nom",
}
},
"PRON__Animacy=Anim|Case=Gen|Number=Sing|PronType=Prs": {
"ens": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Number": "Sing", "Case": "Gen"}
},
"DET__Case=Gen|Gender=Masc|Number=Sing": {
"ens": {LEMMA: "en", "Number": "Sing", "Case": "Gen"}
},
"DET__Gender=Masc|Number=Sing": {
"enhver": {LEMMA: "enhver", "Number": "Sing", "Gender": "Masc"},
"all": {LEMMA: "all", "Number": "Sing", "Gender": "Masc"},
"hver": {LEMMA: "hver", "Number": "Sing", "Gender": "Masc"},
"noen": {LEMMA: "noen", "Gender": "Masc", "Number": "Sing"},
"noe": {LEMMA: "noen", "Gender": "Masc", "Number": "Sing"},
"en": {LEMMA: "en", "Number": "Sing", "Gender": "Neut"},
"ingen": {LEMMA: "ingen", "Gender": "Masc", "Number": "Sing"},
},
"DET__Gender=Fem|Number=Sing": {
"enhver": {LEMMA: "enhver", "Number": "Sing", "Gender": "Fem"},
"all": {LEMMA: "all", "Number": "Sing", "Gender": "Fem"},
"hver": {LEMMA: "hver", "Number": "Sing", "Gender": "Fem"},
"noen": {LEMMA: "noen", "Gender": "Fem", "Number": "Sing"},
"noe": {LEMMA: "noen", "Gender": "Fem", "Number": "Sing"},
"ei": {LEMMA: "en", "Number": "Sing", "Gender": "Fem"},
},
"DET__Gender=Neut|Number=Sing": {
"ethvert": {LEMMA: "enhver", "Number": "Sing", "Gender": "Neut"},
"alt": {LEMMA: "all", "Number": "Sing", "Gender": "Neut"},
"hvert": {LEMMA: "hver", "Number": "Sing", "Gender": "Neut"},
"noe": {LEMMA: "noen", "Number": "Sing", "Gender": "Neut"},
"intet": {LEMMA: "ingen", "Gender": "Neut", "Number": "Sing"},
"et": {LEMMA: "en", "Number": "Sing", "Gender": "Neut"},
},
"DET__Gender=Neut|Number=Sing|PronType=Int": {
"hvilket": {
LEMMA: "hvilken",
"PronType": "Int",
"Number": "Sing",
"Gender": "Neut",
}
},
"DET__Gender=Fem|Number=Sing|PronType=Int": {
"hvilken": {
LEMMA: "hvilken",
"PronType": "Int",
"Number": "Sing",
"Gender": "Fem",
}
},
"DET__Gender=Masc|Number=Sing|PronType=Int": {
"hvilken": {
LEMMA: "hvilken",
"PronType": "Int",
"Number": "Sing",
"Gender": "Masc",
}
},
"DET__Number=Plur|PronType=Int": {
"hvilke": {LEMMA: "hvilken", "PronType": "Int", "Number": "Plur"}
},
"DET__Number=Plur": {
"alle": {LEMMA: "all", "Number": "Plur"},
"noen": {LEMMA: "noen", "Number": "Plur"},
"egne": {LEMMA: "egen", "Number": "Plur"},
"ingen": {LEMMA: "ingen", "Number": "Plur"},
},
"DET__Gender=Masc|Number=Sing|PronType=Dem": {
"den": {LEMMA: "den", "PronType": "Dem", "Number": "Sing", "Gender": "Masc"},
"slik": {LEMMA: "slik", "PronType": "Dem", "Number": "Sing", "Gender": "Masc"},
"denne": {
LEMMA: "denne",
"PronType": "Dem",
"Number": "Sing",
"Gender": "Masc",
},
},
"DET__Gender=Fem|Number=Sing|PronType=Dem": {
"den": {LEMMA: "den", "PronType": "Dem", "Number": "Sing", "Gender": "Fem"},
"slik": {LEMMA: "slik", "PronType": "Dem", "Number": "Sing", "Gender": "Fem"},
"denne": {LEMMA: "denne", "PronType": "Dem", "Number": "Sing", "Gender": "Fem"},
},
"DET__Gender=Neut|Number=Sing|PronType=Dem": {
"det": {LEMMA: "det", "PronType": "Dem", "Number": "Sing", "Gender": "Neut"},
"slikt": {LEMMA: "slik", "PronType": "Dem", "Number": "Sing", "Gender": "Neut"},
"dette": {
LEMMA: "dette",
"PronType": "Dem",
"Number": "Sing",
"Gender": "Neut",
},
},
"DET__Number=Plur|PronType=Dem": {
"disse": {LEMMA: "disse", "PronType": "Dem", "Number": "Plur"},
"andre": {LEMMA: "annen", "PronType": "Dem", "Number": "Plur"},
"de": {LEMMA: "de", "PronType": "Dem", "Number": "Plur"},
"slike": {LEMMA: "slik", "PronType": "Dem", "Number": "Plur"},
},
"DET__Definite=Ind|Gender=Masc|Number=Sing|PronType=Dem": {
"annen": {LEMMA: "annen", "PronType": "Dem", "Number": "Sing", "Gender": "Masc"}
},
"DET__Definite=Ind|Gender=Fem|Number=Sing|PronType=Dem": {
"annen": {LEMMA: "annen", "PronType": "Dem", "Number": "Sing", "Gender": "Fem"}
},
"DET__Definite=Ind|Gender=Neut|Number=Sing|PronType=Dem": {
"annet": {LEMMA: "annen", "PronType": "Dem", "Number": "Sing", "Gender": "Neut"}
},
"DET__Case=Gen|Definite=Ind|Gender=Masc|Number=Sing|PronType=Dem": {
"annens": {
LEMMA: "annnen",
"PronType": "Dem",
"Number": "Sing",
"Gender": "Masc",
"Case": "Gen",
}
},
"DET__Case=Gen|Number=Plur|PronType=Dem": {
"andres": {LEMMA: "annen", "PronType": "Dem", "Number": "Plur", "Case": "Gen"}
},
"DET__Case=Gen|Gender=Fem|Number=Sing|PronType=Dem": {
"dens": {
LEMMA: "den",
"PronType": "Dem",
"Number": "Sing",
"Gender": "Fem",
"Case": "Gen",
}
},
"DET__Case=Gen|Gender=Masc|Number=Sing|PronType=Dem": {
"hvis": {
LEMMA: "hvis",
"PronType": "Dem",
"Number": "Sing",
"Gender": "Masc",
"Case": "Gen",
},
"dens": {
LEMMA: "den",
"PronType": "Dem",
"Number": "Sing",
"Gender": "Masc",
"Case": "Gen",
},
},
"DET__Case=Gen|Gender=Neut|Number=Sing|PronType=Dem": {
"dets": {
LEMMA: "det",
"PronType": "Dem",
"Number": "Sing",
"Gender": "Neut",
"Case": "Gen",
}
},
"DET__Case=Gen|Number=Plur": {
"alles": {LEMMA: "all", "Number": "Plur", "Case": "Gen"}
},
"DET__Definite=Def|Number=Sing|PronType=Dem": {
"andre": {LEMMA: "annen", "Number": "Sing", "PronType": "Dem"}
},
"DET__Definite=Def|PronType=Dem": {
"samme": {LEMMA: "samme", "PronType": "Dem"},
"forrige": {LEMMA: "forrige", "PronType": "Dem"},
"neste": {LEMMA: "neste", "PronType": "Dem"},
},
"DET__Definite=Def": {"selve": {LEMMA: "selve"}, "selveste": {LEMMA: "selveste"}},
"DET___": {"selv": {LEMMA: "selv"}, "endel": {LEMMA: "endel"}},
"DET__Definite=Ind|Gender=Fem|Number=Sing": {
"egen": {LEMMA: "egen", "Gender": "Fem", "Number": "Sing"}
},
"DET__Definite=Ind|Gender=Masc|Number=Sing": {
"egen": {LEMMA: "egen", "Gender": "Masc", "Number": "Sing"}
},
"DET__Definite=Ind|Gender=Neut|Number=Sing": {
"eget": {LEMMA: "egen", "Gender": "Neut", "Number": "Sing"}
},
# same wordform and pos (verb), have to specify the exact features in order to not mix them up
"VERB__Mood=Ind|Tense=Pres|VerbForm=Fin": {
"": {LEMMA: "", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"}
},
"VERB__Mood=Ind|Tense=Past|VerbForm=Fin": {
"": {LEMMA: "se", "VerbForm": "Fin", "Tense": "Past", "Mood": "Ind"}
},
}
# Same post-processing step as the English morph_rules.py: for every tag's
# table, register each word form a second time under its str.title() spelling,
# pointing at the very same attribute dict as the original entry.
for tag, rules in MORPH_RULES.items():
    # Take a snapshot first, because the table is extended while we walk it.
    existing_entries = list(rules.items())
    for form, features in existing_entries:
        titled_form = form.title()
        rules[titled_form] = features

View File

@ -1,7 +1,6 @@
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .morph_rules import MORPH_RULES
# Punctuation stolen from Danish # Punctuation stolen from Danish
from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
@ -22,11 +21,9 @@ class SwedishDefaults(Language.Defaults):
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
) )
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
morph_rules = MORPH_RULES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
stop_words = STOP_WORDS stop_words = STOP_WORDS
morph_rules = MORPH_RULES
syntax_iterators = SYNTAX_ITERATORS syntax_iterators = SYNTAX_ITERATORS

View File

@ -1,285 +0,0 @@
from ...symbols import LEMMA, PRON_LEMMA
# Swedish morphology exceptions.
#
# Layout: fine-grained tag -> lower-cased word form -> attribute dict. The
# attribute dict holds morphological features as strings and, for the pronoun
# entries, a LEMMA key set to the shared PRON_LEMMA symbol.
#
# Used the table of pronouns at https://sv.wiktionary.org/wiki/deras
MORPH_RULES = {
    "PRP": {
        # --- Personal pronouns, singular ---
        "jag": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "One",
            "Number": "Sing",
            "Case": "Nom",
        },
        "mig": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "One",
            "Number": "Sing",
            "Case": "Acc",
        },
        "mej": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "One",
            "Number": "Sing",
            "Case": "Acc",
        },
        "du": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "Two",
            "Number": "Sing",
            "Case": "Nom",
        },
        "han": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Sing",
            "Gender": "Masc",
            "Case": "Nom",
        },
        "honom": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Sing",
            "Gender": "Masc",
            "Case": "Acc",
        },
        "hon": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Sing",
            "Gender": "Fem",
            "Case": "Nom",
        },
        "henne": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Sing",
            "Gender": "Fem",
            "Case": "Acc",
        },
        "det": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Sing",
            "Gender": "Neut",
        },
        # --- Personal pronouns, plural ---
        "vi": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "One",
            "Number": "Plur",
            "Case": "Nom",
        },
        "oss": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "One",
            "Number": "Plur",
            "Case": "Acc",
        },
        "ni": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "Two",
            "Number": "Plur",
            "Case": "Nom",
        },
        "er": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Number": "Plur"},
        "de": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Plur",
            "Case": "Nom",
        },
        "dom": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Plur",
            "Case": "Nom,Acc",
        },
        "dem": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Plur",
            "Case": "Acc",
        },
        # --- Possessive forms ---
        "min": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "One",
            "Number": "Sing",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
        "mitt": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "One",
            "Number": "Sing",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
        "mina": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "One",
            "Number": "Plur",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
        "din": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "Two",
            "Number": "Sing",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
        "ditt": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "Two",
            "Number": "Sing",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
        "dina": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "Two",
            "Number": "Plur",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
        # "hans" / "hennes" / "dess" are the possessives of the third-person
        # pronouns han / hon / det, so they carry Person=Three like those
        # entries and like "deras" below.
        "hans": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Sing,Plur",
            "Gender": "Masc",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
        "hennes": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Sing,Plur",
            "Gender": "Fem",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
        "dess": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Sing,Plur",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
        "vår": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "One",
            "Number": "Plur",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
        "våran": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "One",
            "Number": "Plur",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
        "vårt": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "One",
            "Number": "Plur",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
        "vårat": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "One",
            "Number": "Plur",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
        "våra": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "One",
            "Number": "Plur",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
        "eran": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "Two",
            "Number": "Plur",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
        "ert": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "Two",
            "Number": "Plur",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
        "erat": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "Two",
            "Number": "Plur",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
        "era": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "Two",
            "Number": "Plur",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
        "deras": {
            LEMMA: PRON_LEMMA,
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Plur",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
    },
    # --- Verb forms (no LEMMA override, features only) ---
    "VBZ": {
        "är": {
            "VerbForm": "Fin",
            "Person": "One,Two,Three",
            "Tense": "Pres",
            "Mood": "Ind",
        }
    },
    "VBP": {"är": {"VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"}},
    "VBD": {
        "var": {"VerbForm": "Fin", "Tense": "Past", "Number": "Sing"},
        "vart": {"VerbForm": "Fin", "Tense": "Past", "Number": "Plur"},
    },
}