Merge pull request #807 from magnusburton/master

Added Swedish lemma rules and more verb contractions
This commit is contained in:
Ines Montani 2017-02-05 11:34:19 +01:00 committed by GitHub
commit 30a52d576b
2 changed files with 122 additions and 0 deletions

45
spacy/sv/lemma_rules.py Normal file
View File

@ -0,0 +1,45 @@
# encoding: utf8
from __future__ import unicode_literals
# Rewrite rules for Swedish, grouped under the keys "noun", "adj" and "punct".
# Every rule is a two-element list [old, new]. Presumably the lemmatizer
# replaces a trailing `old` with `new` to propose a candidate lemma -- confirm
# against the lemmatizer implementation that consumes this table.
LEMMA_RULES = {
    "noun": [
        ["t", ""],
        ["n", ""],
        ["na", ""],
        ["na", "e"],
        ["or", "a"],
        ["orna", "a"],
        ["et", ""],
        ["en", ""],
        ["en", "e"],
        ["er", ""],
        ["erna", ""],
        ["ar", "e"],
        ["ar", ""],
        ["lar", "el"],
        ["arna", "e"],
        ["arna", ""],
        ["larna", "el"]
    ],
    "adj": [
        ["are", ""],
        ["ast", ""],
        ["re", ""],
        ["st", ""],
        ["ägre", "åg"],
        ["ägst", "åg"],
        ["ängre", "ång"],
        ["ängst", "ång"],
        ["örre", "or"],
        ["örst", "or"],
    ],
    "punct": [
        # Curly double quotes (U+201C / U+201D) map to the ASCII double
        # quote, mirroring the curly single-quote rules below. The pattern
        # must never be empty: an empty string is a suffix of every token, so
        # such a rule would fire on everything.
        ["\u201c", "\""],
        ["\u201d", "\""],
        ["\u2018", "'"],
        ["\u2019", "'"]
    ]
}

View File

@ -4,6 +4,26 @@ from __future__ import unicode_literals
from ..symbols import * from ..symbols import *
from ..language_data import PRON_LEMMA from ..language_data import PRON_LEMMA
# Verbs
# Register contractions written as a verb form with a trailing "u", e.g.
# "kör" + "u". Each contraction is stored in EXC (defined outside this block;
# presumably the tokenizer-exception table -- verify) as two token dicts: a
# copy of the verb entry followed by a "u" token whose NORM is "du" and whose
# LEMMA is PRON_LEMMA. Both the spelling as listed and its title-cased
# variant are registered.
for verb_entry in (
    {ORTH: "driver"},
    {ORTH: "kör"},
    {ORTH: "hörr", LEMMA: "hör"},
    {ORTH: "fattar"},
    {ORTH: "hajar", LEMMA: "förstår"},
    {ORTH: "lever"},
    {ORTH: "serr", LEMMA: "ser"},
    {ORTH: "fixar"},
):
    # Same attributes as the listed entry, but with a title-cased surface form.
    titled_entry = dict(verb_entry)
    titled_entry[ORTH] = verb_entry[ORTH].title()

    for variant in (verb_entry, titled_entry):
        contraction = variant[ORTH] + "u"
        EXC[contraction] = [
            # Copy so the stored token dict is independent of the loop value.
            dict(variant),
            {ORTH: "u", LEMMA: PRON_LEMMA, NORM: "du"},
        ]
TOKENIZER_EXCEPTIONS = { TOKENIZER_EXCEPTIONS = {
"jan.": [ "jan.": [
@ -63,6 +83,63 @@ TOKENIZER_EXCEPTIONS = {
"sön.": [ "sön.": [
{ORTH: "sön.", LEMMA: "söndag"} {ORTH: "sön.", LEMMA: "söndag"}
], ],
"Jan.": [
{ORTH: "Jan.", LEMMA: "Januari"}
],
"Febr.": [
{ORTH: "Febr.", LEMMA: "Februari"}
],
"Feb.": [
{ORTH: "Feb.", LEMMA: "Februari"}
],
"Apr.": [
{ORTH: "Apr.", LEMMA: "April"}
],
"Jun.": [
{ORTH: "Jun.", LEMMA: "Juni"}
],
"Jul.": [
{ORTH: "Jul.", LEMMA: "Juli"}
],
"Aug.": [
{ORTH: "Aug.", LEMMA: "Augusti"}
],
"Sept.": [
{ORTH: "Sept.", LEMMA: "September"}
],
"Sep.": [
{ORTH: "Sep.", LEMMA: "September"}
],
"Okt.": [
{ORTH: "Okt.", LEMMA: "Oktober"}
],
"Nov.": [
{ORTH: "Nov.", LEMMA: "November"}
],
"Dec.": [
{ORTH: "Dec.", LEMMA: "December"}
],
"Mån.": [
{ORTH: "Mån.", LEMMA: "Måndag"}
],
"Tis.": [
{ORTH: "Tis.", LEMMA: "Tisdag"}
],
"Ons.": [
{ORTH: "Ons.", LEMMA: "Onsdag"}
],
"Tors.": [
{ORTH: "Tors.", LEMMA: "Torsdag"}
],
"Fre.": [
{ORTH: "Fre.", LEMMA: "Fredag"}
],
"Lör.": [
{ORTH: "Lör.", LEMMA: "Lördag"}
],
"Sön.": [
{ORTH: "Sön.", LEMMA: "Söndag"}
],
"sthlm": [ "sthlm": [
{ORTH: "sthlm", LEMMA: "Stockholm"} {ORTH: "sthlm", LEMMA: "Stockholm"}
], ],