From 64c732918a39907860d4107b9d25281152b32fe1 Mon Sep 17 00:00:00 2001 From: mollerhoj Date: Mon, 3 Jul 2017 15:49:09 +0200 Subject: [PATCH] Add Morph_rules. (TODO: Not working?) --- spacy/lang/da/__init__.py | 2 ++ spacy/lang/da/morph_rules.py | 41 ++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 spacy/lang/da/morph_rules.py diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py index 1dc4d4820..d83ad8048 100644 --- a/spacy/lang/da/__init__.py +++ b/spacy/lang/da/__init__.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS +from .morph_rules import MORPH_RULES from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS @@ -19,6 +20,7 @@ class DanishDefaults(Language.Defaults): lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + #morph_rules = dict(MORPH_RULES) stop_words = set(STOP_WORDS) diff --git a/spacy/lang/da/morph_rules.py b/spacy/lang/da/morph_rules.py new file mode 100644 index 000000000..b365bf871 --- /dev/null +++ b/spacy/lang/da/morph_rules.py @@ -0,0 +1,41 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...symbols import LEMMA +from ...deprecated import PRON_LEMMA + +MORPH_RULES = { + "PRON": { + "jeg": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Nom"}, + "mig": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Acc"}, + "du": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two"}, + "han": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Case": "Nom"}, + "ham": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Case": "Acc"}, + "hun": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Case": "Nom"}, + "hende": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Case": "Acc"}, + "den": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut"}, + "det": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut"}, + "vi": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Case": "Nom"}, + "os": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Case": "Acc"}, + "de": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Nom"}, + "dem": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Acc"}, + + "min": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Reflex": "Yes"}, + "din": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Number": "Sing", "Poss": "Yes", "Reflex": "Yes"}, + "hans": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Poss": "Yes", "Reflex": "Yes"}, + "hendes": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Poss": "Yes", "Reflex": "Yes"}, + "dens": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut", "Poss": "Yes", "Reflex": "Yes"}, + "dets": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut", "Poss": "Yes", "Reflex": "Yes"}, + "vores": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Poss": "Yes", "Reflex": "Yes"}, + "deres": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Poss": "Yes", "Reflex": "Yes"}, + }, + + "VERB": { + "er": {LEMMA: "være", "VerbForm": "Fin", "Tense": "Pres"}, + "var": {LEMMA: "være", "VerbForm": "Fin", "Tense": "Past"} + } +} + +for tag, rules in MORPH_RULES.items(): + for key, attrs in dict(rules).items(): + rules[key.title()] = attrs