Tidy up language data

This commit is contained in:
ines 2017-10-11 02:22:49 +02:00
parent 73bca3d382
commit 0c2343d73a
19 changed files with 18 additions and 35 deletions

View File

@ -16,12 +16,10 @@ from ...util import update_exc
class BengaliDefaults(Language.Defaults): class BengaliDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'bn' lex_attr_getters[LANG] = lambda text: 'bn'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = TAG_MAP tag_map = dict(TAG_MAP)
stop_words = STOP_WORDS stop_words = set(STOP_WORDS)
lemma_rules = LEMMA_RULES lemma_rules = dict(LEMMA_RULES)
prefixes = tuple(TOKENIZER_PREFIXES) prefixes = tuple(TOKENIZER_PREFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES) suffixes = tuple(TOKENIZER_SUFFIXES)
infixes = tuple(TOKENIZER_INFIXES) infixes = tuple(TOKENIZER_INFIXES)

View File

@ -15,7 +15,6 @@ class DanishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'da' lex_attr_getters[LANG] = lambda text: 'da'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS) stop_words = set(STOP_WORDS)

View File

@ -22,7 +22,6 @@ class GermanDefaults(Language.Defaults):
lex_attr_getters[LANG] = lambda text: 'de' lex_attr_getters[LANG] = lambda text: 'de'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
NORM_EXCEPTIONS, BASE_NORMS) NORM_EXCEPTIONS, BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
infixes = tuple(TOKENIZER_INFIXES) infixes = tuple(TOKENIZER_INFIXES)
tag_map = dict(TAG_MAP) tag_map = dict(TAG_MAP)

View File

@ -23,7 +23,6 @@ class EnglishDefaults(Language.Defaults):
lex_attr_getters[LANG] = lambda text: 'en' lex_attr_getters[LANG] = lambda text: 'en'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
BASE_NORMS, NORM_EXCEPTIONS) BASE_NORMS, NORM_EXCEPTIONS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = dict(TAG_MAP) tag_map = dict(TAG_MAP)
stop_words = set(STOP_WORDS) stop_words = set(STOP_WORDS)

View File

@ -19,7 +19,6 @@ class SpanishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'es' lex_attr_getters[LANG] = lambda text: 'es'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = dict(TAG_MAP) tag_map = dict(TAG_MAP)
stop_words = set(STOP_WORDS) stop_words = set(STOP_WORDS)

View File

@ -15,7 +15,6 @@ class FinnishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'fi' lex_attr_getters[LANG] = lambda text: 'fi'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS) stop_words = set(STOP_WORDS)

View File

@ -21,7 +21,6 @@ class FrenchDefaults(Language.Defaults):
lex_attr_getters.update(LEX_ATTRS) lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: 'fr' lex_attr_getters[LANG] = lambda text: 'fr'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS) stop_words = set(STOP_WORDS)
infixes = tuple(TOKENIZER_INFIXES) infixes = tuple(TOKENIZER_INFIXES)

View File

@ -12,7 +12,6 @@ from ...util import update_exc
class HebrewDefaults(Language.Defaults): class HebrewDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'he' lex_attr_getters[LANG] = lambda text: 'he'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = set(STOP_WORDS) stop_words = set(STOP_WORDS)

View File

@ -18,7 +18,6 @@ class HungarianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'hu' lex_attr_getters[LANG] = lambda text: 'hu'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS) stop_words = set(STOP_WORDS)
prefixes = tuple(TOKENIZER_PREFIXES) prefixes = tuple(TOKENIZER_PREFIXES)

View File

@ -19,9 +19,7 @@ from ...util import update_exc
class IndonesianDefaults(Language.Defaults): class IndonesianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'id' lex_attr_getters[LANG] = lambda text: 'id'
lex_attr_getters.update(LEX_ATTRS) lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS) stop_words = set(STOP_WORDS)
prefixes = tuple(TOKENIZER_PREFIXES) prefixes = tuple(TOKENIZER_PREFIXES)

View File

@ -16,8 +16,7 @@ _num_words = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
'sembilanbelas', 'duapuluh', 'seratus', 'seribu', 'sejuta', 'sembilanbelas', 'duapuluh', 'seratus', 'seribu', 'sejuta',
'ribu', 'rb', 'juta', 'jt', 'miliar', 'biliun', 'triliun', 'ribu', 'rb', 'juta', 'jt', 'miliar', 'biliun', 'triliun',
'kuadriliun', 'kuintiliun', 'sekstiliun', 'septiliun', 'oktiliun', 'kuadriliun', 'kuintiliun', 'sekstiliun', 'septiliun', 'oktiliun',
'noniliun', 'desiliun', 'noniliun', 'desiliun']
]
def like_num(text): def like_num(text):

View File

@ -16,7 +16,6 @@ class ItalianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'it' lex_attr_getters[LANG] = lambda text: 'it'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = set(STOP_WORDS) stop_words = set(STOP_WORDS)

View File

@ -16,7 +16,6 @@ class NorwegianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'nb' lex_attr_getters[LANG] = lambda text: 'nb'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS) stop_words = set(STOP_WORDS)

View File

@ -16,7 +16,6 @@ class DutchDefaults(Language.Defaults):
lex_attr_getters.update(LEX_ATTRS) lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: 'nl' lex_attr_getters[LANG] = lambda text: 'nl'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = set(STOP_WORDS) stop_words = set(STOP_WORDS)

View File

@ -15,7 +15,6 @@ class PolishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'pl' lex_attr_getters[LANG] = lambda text: 'pl'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS) stop_words = set(STOP_WORDS)

View File

@ -19,7 +19,6 @@ class PortugueseDefaults(Language.Defaults):
lex_attr_getters[LANG] = lambda text: 'pt' lex_attr_getters[LANG] = lambda text: 'pt'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
lex_attr_getters.update(LEX_ATTRS) lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS) stop_words = set(STOP_WORDS)

View File

@ -18,7 +18,6 @@ class SwedishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'sv' lex_attr_getters[LANG] = lambda text: 'sv'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS) stop_words = set(STOP_WORDS)

View File

@ -12,24 +12,27 @@ from ...language import Language
from ...attrs import LANG, NORM from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups from ...util import update_exc, add_lookups
class ThaiDefaults(Language.Defaults): class ThaiDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'th' lex_attr_getters[LANG] = lambda text: 'th'
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS)
tag_map = dict(TAG_MAP) tag_map = dict(TAG_MAP)
stop_words = set(STOP_WORDS) stop_words = set(STOP_WORDS)
class Thai(Language): class Thai(Language):
lang = 'th' lang = 'th'
Defaults = ThaiDefaults Defaults = ThaiDefaults
def make_doc(self, text):
try: def make_doc(self, text):
from pythainlp.tokenize import word_tokenize try:
except ImportError: from pythainlp.tokenize import word_tokenize
raise ImportError("The Thai tokenizer requires the PyThaiNLP library: " except ImportError:
"https://github.com/wannaphongcom/pythainlp/") raise ImportError("The Thai tokenizer requires the PyThaiNLP library: "
words = [x for x in list(word_tokenize(text,"newmm"))] "https://github.com/wannaphongcom/pythainlp/")
return Doc(self.vocab, words=words, spaces=[False]*len(words)) words = [x for x in list(word_tokenize(text,"newmm"))]
return Doc(self.vocab, words=words, spaces=[False]*len(words))
__all__ = ['Thai'] __all__ = ['Thai']

View File

@ -13,7 +13,6 @@ class MultiLanguageDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'xx' lex_attr_getters[LANG] = lambda text: 'xx'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)