diff --git a/spacy/de/__init__.py b/spacy/de/__init__.py
index 4434c9a2e..d4f3e0e50 100644
--- a/spacy/de/__init__.py
+++ b/spacy/de/__init__.py
@@ -5,6 +5,25 @@ from os import path
 from ..language import Language
 from ..attrs import LANG
 from . import language_data
+from ..util import update_exc
+
+from ..language_data import EMOTICONS
+from .language_data import ORTH_ONLY
+from .language_data import strings_to_exc
+from .language_data import get_time_exc
+
+
+TOKENIZER_EXCEPTIONS = dict(language_data.TOKENIZER_EXCEPTIONS)
+TOKENIZER_PREFIXES = tuple(language_data.TOKENIZER_PREFIXES)
+TOKENIZER_SUFFIXES = tuple(language_data.TOKENIZER_SUFFIXES)
+TOKENIZER_INFIXES = tuple(language_data.TOKENIZER_INFIXES)
+TAG_MAP = dict(language_data.TAG_MAP)
+STOP_WORDS = set(language_data.STOP_WORDS)
+
+
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(EMOTICONS))
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
+update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 24 + 1)))


 class German(Language):
@@ -14,14 +33,10 @@ class German(Language):
         tokenizer_exceptions = dict(language_data.TOKENIZER_EXCEPTIONS)
         lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
         lex_attr_getters[LANG] = lambda text: 'de'
-
-        prefixes = tuple(language_data.TOKENIZER_PREFIXES)
-
-        suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
-
-        infixes = tuple(language_data.TOKENIZER_INFIXES)
-
-        tag_map = dict(language_data.TAG_MAP)
-
-        stop_words = set(language_data.STOP_WORDS)
+        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+        prefixes = TOKENIZER_PREFIXES
+        suffixes = TOKENIZER_SUFFIXES
+        infixes = TOKENIZER_INFIXES
+        tag_map = TAG_MAP
+        stop_words = STOP_WORDS

diff --git a/spacy/de/language_data.py b/spacy/de/language_data.py
index 8401eba1d..7a7c044b3 100644
--- a/spacy/de/language_data.py
+++ b/spacy/de/language_data.py
@@ -3,7 +3,21 @@ from __future__ import unicode_literals
 import re

 from ..symbols import *
-from ..language_data import EMOTICONS
+
+
+def strings_to_exc(orths):
+    return {orth: [{ORTH: orth}] for orth in orths}
+
+
+def get_time_exc(hours):
+    exc = {}
+    for hour in hours:
+        # currently only supporting formats like "10h", not "10 Uhr"
+        exc["%dh" % hour] = [
+            {ORTH: hour},
+            {ORTH: "h", LEMMA: "Uhr"}
+        ]
+    return exc


 PRON_LEMMA = "-PRON-"
@@ -655,7 +669,7 @@ TOKENIZER_EXCEPTIONS = {
 }


-self_map = [
+ORTH_ONLY = [
     "''",
     "\\\")",
     "",
diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py
index ade3e8e7a..db9088a88 100644
--- a/spacy/en/__init__.py
+++ b/spacy/en/__init__.py
@@ -9,6 +9,25 @@ from ..lemmatizer import Lemmatizer
 from ..vocab import Vocab
 from ..tokenizer import Tokenizer
 from ..attrs import LANG
+from ..util import update_exc
+
+from ..language_data import EMOTICONS
+from .language_data import ORTH_ONLY
+from .language_data import strings_to_exc
+from .language_data import get_time_exc
+
+
+TOKENIZER_EXCEPTIONS = dict(language_data.TOKENIZER_EXCEPTIONS)
+TOKENIZER_PREFIXES = tuple(language_data.TOKENIZER_PREFIXES)
+TOKENIZER_SUFFIXES = tuple(language_data.TOKENIZER_SUFFIXES)
+TOKENIZER_INFIXES = tuple(language_data.TOKENIZER_INFIXES)
+TAG_MAP = dict(language_data.TAG_MAP)
+STOP_WORDS = set(language_data.STOP_WORDS)
+
+
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(EMOTICONS))
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
+update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))


 class English(Language):
@@ -18,14 +37,9 @@ class English(Language):
         lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
         lex_attr_getters[LANG] = lambda text: 'en'

-        tokenizer_exceptions = dict(language_data.TOKENIZER_EXCEPTIONS)
-
-        prefixes = tuple(language_data.TOKENIZER_PREFIXES)
-
-        suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
-
-        infixes = tuple(language_data.TOKENIZER_INFIXES)
-
-        tag_map = dict(language_data.TAG_MAP)
-
-        stop_words = set(language_data.STOP_WORDS)
+        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+        prefixes = TOKENIZER_PREFIXES
+        suffixes = TOKENIZER_SUFFIXES
+        infixes = TOKENIZER_INFIXES
+        tag_map = TAG_MAP
+        stop_words = STOP_WORDS
diff --git a/spacy/en/language_data.py b/spacy/en/language_data.py
index 383ff96f4..72b00e5cd 100644
--- a/spacy/en/language_data.py
+++ b/spacy/en/language_data.py
@@ -3,7 +3,35 @@ from __future__ import unicode_literals
 import re

 from ..symbols import *
-from ..language_data import EMOTICONS
+
+
+def strings_to_exc(orths):
+    return {orth: [{ORTH: orth}] for orth in orths}
+
+
+def get_time_exc(hours):
+    exc = {}
+    for hour in hours:
+        exc["%da.m." % hour] = [
+            {ORTH: hour},
+            {ORTH: "a.m."}
+        ]
+
+        exc["%dp.m." % hour] = [
+            {ORTH: hour},
+            {ORTH: "p.m."}
+        ]
+
+        exc["%dam" % hour] = [
+            {ORTH: hour},
+            {ORTH: "am", LEMMA: "a.m."}
+        ]
+
+        exc["%dpm" % hour] = [
+            {ORTH: hour},
+            {ORTH: "pm", LEMMA: "p.m."}
+        ]
+    return exc


 PRON_LEMMA = "-PRON-"
@@ -2121,7 +2149,7 @@ TOKENIZER_EXCEPTIONS = {
 }


-self_map = [
+ORTH_ONLY = [
     "''",
     "\")",
     "a.",
@@ -2185,11 +2213,6 @@ self_map = [
     "z."
 ]

-for orths in [self_map, EMOTICONS]:
-    overlap = set(TOKENIZER_EXCEPTIONS.keys()).intersection(set(orths))
-    assert not overlap, overlap
-    TOKENIZER_EXCEPTIONS.update({orth: [{ORTH: orth}] for orth in orths})
-

 TOKENIZER_PREFIXES = r'''
 ,
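
Illustrative sketch (not part of the patch): how the new helpers are meant to compose at import time. The update_exc stand-in below is modeled on the inline overlap check this patch removes from en/language_data.py; the real spacy.util.update_exc may differ. The ORTH/LEMMA strings stand in for the spacy.symbols attribute IDs, and get_time_exc here keeps only the "am"/"pm" forms from the English helper.

    # Stand-ins for the spacy.symbols attribute IDs (assumption for this sketch).
    ORTH, LEMMA = "orth", "lemma"

    def strings_to_exc(orths):
        # One single-token exception per string, keyed by its surface form.
        return {orth: [{ORTH: orth}] for orth in orths}

    def get_time_exc(hours):
        # English-style "1am" .. "12pm" forms, split into hour + suffix tokens.
        exc = {}
        for hour in hours:
            exc["%dam" % hour] = [{ORTH: hour}, {ORTH: "am", LEMMA: "a.m."}]
            exc["%dpm" % hour] = [{ORTH: hour}, {ORTH: "pm", LEMMA: "p.m."}]
        return exc

    def update_exc(base, additions):
        # Minimal stand-in: reject overlapping keys, then merge (mirrors the removed loop).
        overlap = set(base).intersection(additions)
        assert not overlap, overlap
        base.update(additions)

    TOKENIZER_EXCEPTIONS = {}
    update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc([":)", "a."]))
    update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
    print(TOKENIZER_EXCEPTIONS["3pm"])
    # [{'orth': 3}, {'orth': 'pm', 'lemma': 'p.m.'}]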