Reorganize exceptions for English and German

Ines Montani 2016-12-08 13:58:32 +01:00
parent 66c7348cda
commit 311b30ab35
4 changed files with 96 additions and 30 deletions

spacy/de/__init__.py View File

@@ -5,6 +5,25 @@ from os import path
from ..language import Language
from ..attrs import LANG
from . import language_data
from ..util import update_exc
from ..language_data import EMOTICONS
from .language_data import ORTH_ONLY
from .language_data import strings_to_exc
from .language_data import get_time_exc
TOKENIZER_EXCEPTIONS = dict(language_data.TOKENIZER_EXCEPTIONS)
TOKENIZER_PREFIXES = tuple(language_data.TOKENIZER_PREFIXES)
TOKENIZER_SUFFIXES = tuple(language_data.TOKENIZER_SUFFIXES)
TOKENIZER_INFIXES = tuple(language_data.TOKENIZER_INFIXES)
TAG_MAP = dict(language_data.TAG_MAP)
STOP_WORDS = set(language_data.STOP_WORDS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(EMOTICONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 24 + 1)))
class German(Language):
@@ -15,13 +34,9 @@ class German(Language):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'de'
prefixes = tuple(language_data.TOKENIZER_PREFIXES)
suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
infixes = tuple(language_data.TOKENIZER_INFIXES)
tag_map = dict(language_data.TAG_MAP)
stop_words = set(language_data.STOP_WORDS)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
tag_map = TAG_MAP
stop_words = STOP_WORDS
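
Note: both language packages now build their exception tables at module level and merge additions through update_exc from spacy.util. The helper itself is not part of this diff; the sketch below is only an assumption about its behaviour, modelled on the inline merge loop this commit removes from the English language_data.py (last file below), which guarded against key collisions before updating.

def update_exc(base_exceptions, additions):
    # Hypothetical sketch, not the actual spacy.util implementation:
    # refuse to silently overwrite an existing exception, then merge.
    overlap = set(base_exceptions).intersection(set(additions))
    assert not overlap, overlap
    base_exceptions.update(additions)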

spacy/de/language_data.py View File

@@ -3,7 +3,21 @@ from __future__ import unicode_literals
import re
from ..symbols import *
from ..language_data import EMOTICONS
def strings_to_exc(orths):
return {orth: [{ORTH: orth}] for orth in orths}
def get_time_exc(hours):
exc = {}
for hour in hours:
# currently only supporting formats like "10h", not "10 Uhr"
exc["%dh" % hour] = [
{ORTH: hour},
{ORTH: "h", LEMMA: "Uhr"}
]
return exc
PRON_LEMMA = "-PRON-"
@@ -655,7 +669,7 @@ TOKENIZER_EXCEPTIONS = {
}
self_map = [
ORTH_ONLY = [
"''",
"\\\")",
"<space>",

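Note: as the comment in the German get_time_exc above says, only compact forms like "10h" are covered. Its return value, derived directly from the code in this diff, looks like this:

get_time_exc(range(1, 3))
# {"1h": [{ORTH: 1}, {ORTH: "h", LEMMA: "Uhr"}],
#  "2h": [{ORTH: 2}, {ORTH: "h", LEMMA: "Uhr"}]}
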
spacy/en/__init__.py View File

@@ -9,6 +9,25 @@ from ..lemmatizer import Lemmatizer
from ..vocab import Vocab
from ..tokenizer import Tokenizer
from ..attrs import LANG
from ..util import update_exc
from ..language_data import EMOTICONS
from .language_data import ORTH_ONLY
from .language_data import strings_to_exc
from .language_data import get_time_exc
TOKENIZER_EXCEPTIONS = dict(language_data.TOKENIZER_EXCEPTIONS)
TOKENIZER_PREFIXES = tuple(language_data.TOKENIZER_PREFIXES)
TOKENIZER_SUFFIXES = tuple(language_data.TOKENIZER_SUFFIXES)
TOKENIZER_INFIXES = tuple(language_data.TOKENIZER_INFIXES)
TAG_MAP = dict(language_data.TAG_MAP)
STOP_WORDS = set(language_data.STOP_WORDS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(EMOTICONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
class English(Language):
@@ -18,14 +37,9 @@ class English(Language):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'en'
tokenizer_exceptions = dict(language_data.TOKENIZER_EXCEPTIONS)
prefixes = tuple(language_data.TOKENIZER_PREFIXES)
suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
infixes = tuple(language_data.TOKENIZER_INFIXES)
tag_map = dict(language_data.TAG_MAP)
stop_words = set(language_data.STOP_WORDS)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
tag_map = TAG_MAP
stop_words = STOP_WORDS
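
Note: strings_to_exc wraps each string in the single-token exception format, which is how the shared EMOTICONS list and the language-specific ORTH_ONLY strings are folded into TOKENIZER_EXCEPTIONS above. The input strings here are purely illustrative:

strings_to_exc([":)", "a."])
# {":)": [{ORTH: ":)"}],
#  "a.": [{ORTH: "a."}]}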

spacy/en/language_data.py View File

@@ -3,7 +3,35 @@ from __future__ import unicode_literals
import re
from ..symbols import *
from ..language_data import EMOTICONS
def strings_to_exc(orths):
return {orth: [{ORTH: orth}] for orth in orths}
def get_time_exc(hours):
exc = {}
for hour in hours:
exc["%da.m." % hour] = [
{ORTH: hour},
{ORTH: "a.m."}
]
exc["%dp.m." % hour] = [
{ORTH: hour},
{ORTH: "p.m."}
]
exc["%dam" % hour] = [
{ORTH: hour},
{ORTH: "am", LEMMA: "a.m."}
]
exc["%dpm" % hour] = [
{ORTH: hour},
{ORTH: "pm", LEMMA: "p.m."}
]
return exc
PRON_LEMMA = "-PRON-"
@@ -2121,7 +2149,7 @@ TOKENIZER_EXCEPTIONS = {
}
self_map = [
ORTH_ONLY = [
"''",
"\")",
"a.",
@@ -2185,11 +2213,6 @@ self_map = [
"z."
]
for orths in [self_map, EMOTICONS]:
overlap = set(TOKENIZER_EXCEPTIONS.keys()).intersection(set(orths))
assert not overlap, overlap
TOKENIZER_EXCEPTIONS.update({orth: [{ORTH: orth}] for orth in orths})
TOKENIZER_PREFIXES = r'''
,
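
Note: the English get_time_exc defined earlier in this file emits four spellings per hour, normalising the short "am"/"pm" forms to "a.m."/"p.m." via LEMMA. For a single hour the result, derived directly from the code above, is:

get_time_exc([3])
# {"3a.m.": [{ORTH: 3}, {ORTH: "a.m."}],
#  "3p.m.": [{ORTH: 3}, {ORTH: "p.m."}],
#  "3am":   [{ORTH: 3}, {ORTH: "am", LEMMA: "a.m."}],
#  "3pm":   [{ORTH: 3}, {ORTH: "pm", LEMMA: "p.m."}]}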