From bc9557b21ffde564e39a98a0819fd6a63f47e11a Mon Sep 17 00:00:00 2001
From: Leif Uwe Vogelsang
Date: Thu, 23 Mar 2017 11:10:22 +0100
Subject: [PATCH] Norwegian language basics

---
 spacy/nb/__init__.py             |  26 +++++
 spacy/nb/language_data.py        |  28 +++++
 spacy/nb/morph_rules.py          |  70 ++++++++++++
 spacy/nb/stop_words.py           |  40 +++++++
 spacy/nb/tokenizer_exceptions.py | 175 +++++++++++++++++++++++++++++++
 5 files changed, 339 insertions(+)
 create mode 100644 spacy/nb/__init__.py
 create mode 100644 spacy/nb/language_data.py
 create mode 100644 spacy/nb/morph_rules.py
 create mode 100644 spacy/nb/stop_words.py
 create mode 100644 spacy/nb/tokenizer_exceptions.py

diff --git a/spacy/nb/__init__.py b/spacy/nb/__init__.py
new file mode 100644
index 000000000..6c1aab979
--- /dev/null
+++ b/spacy/nb/__init__.py
@@ -0,0 +1,26 @@
+# encoding: utf8
+from __future__ import unicode_literals, print_function
+
+from os import path
+
+from ..language import Language
+from ..attrs import LANG
+
+
+# Import language-specific data
+from .language_data import *
+
+
+# create Language subclass
+class NorwegianBokmal(Language):
+    lang = 'nb'  # ISO code
+
+    class Defaults(Language.Defaults):
+        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+        lex_attr_getters[LANG] = lambda text: 'nb'
+
+
+        # override defaults
+        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+        #tag_map = TAG_MAP
+        stop_words = STOP_WORDS
\ No newline at end of file
diff --git a/spacy/nb/language_data.py b/spacy/nb/language_data.py
new file mode 100644
index 000000000..9383f3a62
--- /dev/null
+++ b/spacy/nb/language_data.py
@@ -0,0 +1,28 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+
+# import base language data
+from .. import language_data as base
+
+
+# import util functions
+from ..language_data import update_exc, strings_to_exc, expand_exc
+
+# import language-specific data from files
+#from .tag_map import TAG_MAP
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
+from .morph_rules import MORPH_RULES
+
+TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
+#TAG_MAP = dict(TAG_MAP)
+STOP_WORDS = set(STOP_WORDS)
+
+# customize tokenizer exceptions
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
+update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", "’"))
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
+
+# export
+__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "MORPH_RULES"]
\ No newline at end of file
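The helpers imported in language_data.py do the heavy lifting: strings_to_exc wraps each plain string from ORTH_ONLY into a single-token exception, and update_exc merges the result into TOKENIZER_EXCEPTIONS alongside the hand-written entries. A quick check of the outcome, assuming the helpers behave as they do for the other spaCy 1.x languages (the [{ORTH: ...}] shape is inferred from those, not guaranteed by this patch):

    from spacy.nb.language_data import TOKENIZER_EXCEPTIONS
    from spacy.symbols import ORTH

    # every ORTH_ONLY string should now be a one-token exception,
    # in the same shape as the hand-written "jan."/"feb."/"jul." entries
    assert TOKENIZER_EXCEPTIONS["ca."] == [{ORTH: "ca."}]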
"Masc", "Case": "Acc"}, + "han": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Case": "Acc"}, + "hun": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Case": "Nom"}, + "henne": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Case": "Acc"}, + "den": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut"}, + "det": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut"}, + "seg": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Reflex": "Yes"}, + "vi": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Case": "Nom"}, + "oss": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Case": "Acc"}, + "dere": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Number": "Plur", "Case": "Nom"}, + "de": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Nom"}, + "dem": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Acc"}, + "seg": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Reflex": "Yes"}, + + "min": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Gender": "Masc"}, + "mi": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Gender": "Fem"}, + "mitt": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Gender": "Neu"}, + "mine": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Poss": "Yes"}, + "din": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Number": "Sing", "Poss": "Yes", "Gender": "Masc"}, + "di": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Number": "Sing", "Poss": "Yes", "Gender": "Fem"}, + "ditt": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Number": "Sing", "Poss": "Yes", "Gender": "Neu"}, + "dine": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Number": "Plur", "Poss": "Yes"}, + "hans": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Poss": "Yes", "Gender": "Masc"}, + "hennes": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Poss": "Yes", "Gender": "Fem"}, + "dens": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Poss": "Yes", "Gender": "Neu"}, + "dets": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Poss": "Yes", "Gender": "Neu"}, + "vår": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Poss": "Yes"}, + "vårt": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Poss": "Yes"}, + "våre": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Poss": "Yes", "Gender":"Neu"}, + "deres": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Poss": "Yes", "Gender":"Neu", "Reflex":"Yes"}, + "sin": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Gender":"Masc", "Reflex":"Yes"}, + "si": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Gender":"Fem", "Reflex":"Yes"}, + "sitt": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Gender":"Neu", "Reflex":"Yes"}, + "sine": {LEMMA: PRON_LEMMA, 
"PronType": "Prs", "Person": "One", "Number": "Plur", "Poss": "Yes", "Reflex":"Yes"}, + }, + + "VBZ": { + "er": {LEMMA: "be", "VerbForm": "Fin", "Person": "One", "Tense": "Pres", "Mood": "Ind"}, + "er": {LEMMA: "be", "VerbForm": "Fin", "Person": "Two", "Tense": "Pres", "Mood": "Ind"}, + "er": {LEMMA: "be", "VerbForm": "Fin", "Person": "Three", "Tense": "Pres", "Mood": "Ind"}, + }, + + "VBP": { + "er": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"} + }, + + "VBD": { + "var": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Sing"}, + "vært": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Plur"} + } +} \ No newline at end of file diff --git a/spacy/nb/stop_words.py b/spacy/nb/stop_words.py new file mode 100644 index 000000000..62d1a6028 --- /dev/null +++ b/spacy/nb/stop_words.py @@ -0,0 +1,40 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +STOP_WORDS = set("""alle at av + +bare begge ble blei bli blir blitt både båe + +da de deg dei deim deira deires dem den denne der dere deres det dette di din disse ditt du dykk dykkar då + +eg ein eit eitt eller elles en enn er et ett etter + +for fordi fra før + +ha hadde han hans har hennar henne hennes her hjå ho hoe honom hoss hossen hun hva hvem hver hvilke hvilken hvis hvor hvordan hvorfor + +i ikke ikkje ikkje ingen ingi inkje inn inni + +ja jeg + +kan kom korleis korso kun kunne kva kvar kvarhelst kven kvi kvifor + +man mange me med medan meg meget mellom men mi min mine mitt mot mykje + +ned no noe noen noka noko nokon nokor nokre nå når + +og også om opp oss over + +på + +samme seg selv si sia sidan siden sin sine sitt sjøl skal skulle slik so som som somme somt så sånn + +til + +um upp ut uten + +var vart varte ved vere verte vi vil ville vore vors vort vår være være vært + +å +""".split()) \ No newline at end of file diff --git a/spacy/nb/tokenizer_exceptions.py b/spacy/nb/tokenizer_exceptions.py new file mode 100644 index 000000000..44fc76532 --- /dev/null +++ b/spacy/nb/tokenizer_exceptions.py @@ -0,0 +1,175 @@ +# encoding: utf8 +# Norwegian bokmaål +from __future__ import unicode_literals + +from ..symbols import * +from ..language_data import PRON_LEMMA + + +TOKENIZER_EXCEPTIONS = { + "jan.": [ + {ORTH: "jan.", LEMMA: "januar"} + ], + + "feb.": [ + {ORTH: "feb.", LEMMA: "februar"} + ], + + "jul.": [ + {ORTH: "jul.", LEMMA: "juli"} + ] +} + + +ORTH_ONLY = ["adm.dir.", + "a.m.", + "Aq.", + "b.c.", + "bl.a.", + "bla.", + "bm.", + "bto.", + "ca.", + "cand.mag.", + "c.c.", + "co.", + "d.d.", + "dept.", + "d.m.", + "dr.philos.", + "dvs.", + "d.y.", + "E. 
coli", + "eg.", + "ekskl.", + "e.Kr.", + "el.", + "e.l.", + "et.", + "etg.", + "ev.", + "evt.", + "f.", + "f.eks.", + "fhv.", + "fk.", + "f.Kr.", + "f.o.m.", + "foreg.", + "fork.", + "fv.", + "fvt.", + "g.", + "gt.", + "gl.", + "gno.", + "gnr.", + "grl.", + "hhv.", + "hoh.", + "hr.", + "h.r.adv.", + "ifb.", + "ifm.", + "iht.", + "inkl.", + "istf.", + "jf.", + "jr.", + "jun.", + "kfr.", + "kgl.res.", + "kl.", + "komm.", + "kst.", + "lø.", + "ma.", + "mag.art.", + "m.a.o.", + "md.", + "mfl.", + "mill.", + "min.", + "m.m.", + "mnd.", + "moh.", + "Mr.", + "muh.", + "mv.", + "mva.", + "ndf.", + "no.", + "nov.", + "nr.", + "nto.", + "nyno.", + "n.å.", + "o.a.", + "off.", + "ofl.", + "okt.", + "o.l.", + "on.", + "op.", + "osv.", + "ovf.", + "p.", + "p.a.", + "Pb.", + "pga.", + "ph.d.", + "pkt.", + "p.m.", + "pr.", + "pst.", + "p.t.", + "red.anm.", + "ref.", + "res.", + "res.kap.", + "resp.", + "rv.", + "s.", + "s.d.", + "sen.", + "sep.", + "siviling.", + "sms.", + "spm.", + "sr.", + "sst.", + "st.", + "stip.", + "stk.", + "st.meld.", + "st.prp.", + "stud.", + "s.u.", + "sv.", + "sø.", + "s.å.", + "såk.", + "temp.", + "ti.", + "tils.", + "tilsv.", + "tl;dr", + "tlf.", + "to.", + "t.o.m.", + "ult.", + "utg.", + "v.", + "vedk.", + "vedr.", + "vg.", + "vgs.", + "vha.", + "vit.ass.", + "vn.", + "vol.", + "vs.", + "vsa.", + "årg.", + "årh." +] \ No newline at end of file