Reorganize exceptions for English and German

Ines Montani 2016-12-08 13:58:32 +01:00
parent 66c7348cda
commit 311b30ab35
4 changed files with 96 additions and 30 deletions

View File

@@ -5,6 +5,25 @@ from os import path
 from ..language import Language
 from ..attrs import LANG
 from . import language_data
+from ..util import update_exc
+from ..language_data import EMOTICONS
+from .language_data import ORTH_ONLY
+from .language_data import strings_to_exc
+from .language_data import get_time_exc
+
+TOKENIZER_EXCEPTIONS = dict(language_data.TOKENIZER_EXCEPTIONS)
+TOKENIZER_PREFIXES = tuple(language_data.TOKENIZER_PREFIXES)
+TOKENIZER_SUFFIXES = tuple(language_data.TOKENIZER_SUFFIXES)
+TOKENIZER_INFIXES = tuple(language_data.TOKENIZER_INFIXES)
+TAG_MAP = dict(language_data.TAG_MAP)
+STOP_WORDS = set(language_data.STOP_WORDS)
+
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(EMOTICONS))
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
+update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 24 + 1)))
 class German(Language):
@@ -15,13 +34,9 @@ class German(Language):
         lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
         lex_attr_getters[LANG] = lambda text: 'de'
-        prefixes = tuple(language_data.TOKENIZER_PREFIXES)
-        suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
-        infixes = tuple(language_data.TOKENIZER_INFIXES)
-        tag_map = dict(language_data.TAG_MAP)
-        stop_words = set(language_data.STOP_WORDS)
+        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+        prefixes = TOKENIZER_PREFIXES
+        suffixes = TOKENIZER_SUFFIXES
+        infixes = TOKENIZER_INFIXES
+        tag_map = TAG_MAP
+        stop_words = STOP_WORDS
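
Note: a rough sketch of how the module-level merge above is expected to behave. The sample entries are made up, and the assumption that update_exc copies the additions into the base dict (like dict.update) is not taken from this diff; ORTH and LEMMA are the attribute IDs imported via ..symbols.

    # Hypothetical illustration only -- not part of the commit.
    base_exc = {"z.B.": [{ORTH: "z.B."}]}                # stand-in for language_data.TOKENIZER_EXCEPTIONS
    base_exc.update(strings_to_exc([":)", "<space>"]))   # one-token entries: {":)": [{ORTH: ":)"}], ...}
    base_exc.update(get_time_exc(range(1, 24 + 1)))      # adds "1h" ... "24h" entries
    # update_exc(TOKENIZER_EXCEPTIONS, ...) is assumed to perform the equivalent merge in place.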

View File

@@ -3,7 +3,21 @@ from __future__ import unicode_literals
 import re
 from ..symbols import *
+from ..language_data import EMOTICONS
+
+def strings_to_exc(orths):
+    return {orth: [{ORTH: orth}] for orth in orths}
+
+def get_time_exc(hours):
+    exc = {}
+    for hour in hours:
+        # currently only supporting formats like "10h", not "10 Uhr"
+        exc["%dh" % hour] = [
+            {ORTH: hour},
+            {ORTH: "h", LEMMA: "Uhr"}
+        ]
+    return exc
 PRON_LEMMA = "-PRON-"
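
To make the new helper concrete: for German it only expands the compact "Xh" form, so the generated exceptions look like this (reconstructed from the code above; not part of the diff itself):

    get_time_exc(range(1, 24 + 1))
    # ==> {
    #     "1h": [{ORTH: 1}, {ORTH: "h", LEMMA: "Uhr"}],
    #     "2h": [{ORTH: 2}, {ORTH: "h", LEMMA: "Uhr"}],
    #     ...
    #     "24h": [{ORTH: 24}, {ORTH: "h", LEMMA: "Uhr"}],
    # }

Each "10h"-style string is thus split into a number token and an "h" token lemmatised to "Uhr".
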
@@ -655,7 +669,7 @@ TOKENIZER_EXCEPTIONS = {
 }
-self_map = [
+ORTH_ONLY = [
     "''",
     "\\\")",
     "<space>",

View File

@@ -9,6 +9,25 @@ from ..lemmatizer import Lemmatizer
 from ..vocab import Vocab
 from ..tokenizer import Tokenizer
 from ..attrs import LANG
+from ..util import update_exc
+from ..language_data import EMOTICONS
+from .language_data import ORTH_ONLY
+from .language_data import strings_to_exc
+from .language_data import get_time_exc
+
+TOKENIZER_EXCEPTIONS = dict(language_data.TOKENIZER_EXCEPTIONS)
+TOKENIZER_PREFIXES = tuple(language_data.TOKENIZER_PREFIXES)
+TOKENIZER_SUFFIXES = tuple(language_data.TOKENIZER_SUFFIXES)
+TOKENIZER_INFIXES = tuple(language_data.TOKENIZER_INFIXES)
+TAG_MAP = dict(language_data.TAG_MAP)
+STOP_WORDS = set(language_data.STOP_WORDS)
+
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(EMOTICONS))
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
+update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
 class English(Language):
@@ -18,14 +37,9 @@ class English(Language):
         lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
         lex_attr_getters[LANG] = lambda text: 'en'
-        tokenizer_exceptions = dict(language_data.TOKENIZER_EXCEPTIONS)
-        prefixes = tuple(language_data.TOKENIZER_PREFIXES)
-        suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
-        infixes = tuple(language_data.TOKENIZER_INFIXES)
-        tag_map = dict(language_data.TAG_MAP)
-        stop_words = set(language_data.STOP_WORDS)
+        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+        prefixes = TOKENIZER_PREFIXES
+        suffixes = TOKENIZER_SUFFIXES
+        infixes = TOKENIZER_INFIXES
+        tag_map = TAG_MAP
+        stop_words = STOP_WORDS

View File

@@ -3,7 +3,35 @@ from __future__ import unicode_literals
 import re
 from ..symbols import *
+from ..language_data import EMOTICONS
+
+def strings_to_exc(orths):
+    return {orth: [{ORTH: orth}] for orth in orths}
+
+def get_time_exc(hours):
+    exc = {}
+    for hour in hours:
+        exc["%da.m." % hour] = [
+            {ORTH: hour},
+            {ORTH: "a.m."}
+        ]
+        exc["%dp.m." % hour] = [
+            {ORTH: hour},
+            {ORTH: "p.m."}
+        ]
+        exc["%dam" % hour] = [
+            {ORTH: hour},
+            {ORTH: "am", LEMMA: "a.m."}
+        ]
+        exc["%dpm" % hour] = [
+            {ORTH: hour},
+            {ORTH: "pm", LEMMA: "p.m."}
+        ]
+    return exc
 PRON_LEMMA = "-PRON-"
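
As in the German module, each hour is expanded into several entries, here covering both the punctuated and unpunctuated am/pm spellings. For hour 1 the generated mapping would look like this (reconstructed from the helper above; not part of the diff itself):

    get_time_exc(range(1, 12 + 1))
    # ==> {
    #     "1a.m.": [{ORTH: 1}, {ORTH: "a.m."}],
    #     "1p.m.": [{ORTH: 1}, {ORTH: "p.m."}],
    #     "1am": [{ORTH: 1}, {ORTH: "am", LEMMA: "a.m."}],
    #     "1pm": [{ORTH: 1}, {ORTH: "pm", LEMMA: "p.m."}],
    #     ...  # and likewise for 2 through 12
    # }
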
@@ -2121,7 +2149,7 @@ TOKENIZER_EXCEPTIONS = {
 }
-self_map = [
+ORTH_ONLY = [
     "''",
     "\")",
     "a.",
@@ -2185,11 +2213,6 @@ self_map = [
     "z."
 ]
-for orths in [self_map, EMOTICONS]:
-    overlap = set(TOKENIZER_EXCEPTIONS.keys()).intersection(set(orths))
-    assert not overlap, overlap
-    TOKENIZER_EXCEPTIONS.update({orth: [{ORTH: orth}] for orth in orths})
 TOKENIZER_PREFIXES = r'''
 ,