From 0c2343d73abc5410ddc51816e48e92c56d3c548c Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 11 Oct 2017 02:22:49 +0200 Subject: [PATCH] Tidy up language data --- spacy/lang/bn/__init__.py | 8 +++----- spacy/lang/da/__init__.py | 1 - spacy/lang/de/__init__.py | 1 - spacy/lang/en/__init__.py | 1 - spacy/lang/es/__init__.py | 1 - spacy/lang/fi/__init__.py | 1 - spacy/lang/fr/__init__.py | 1 - spacy/lang/he/__init__.py | 1 - spacy/lang/hu/__init__.py | 1 - spacy/lang/id/__init__.py | 2 -- spacy/lang/id/lex_attrs.py | 3 +-- spacy/lang/it/__init__.py | 1 - spacy/lang/nb/__init__.py | 1 - spacy/lang/nl/__init__.py | 1 - spacy/lang/pl/__init__.py | 1 - spacy/lang/pt/__init__.py | 1 - spacy/lang/sv/__init__.py | 1 - spacy/lang/th/__init__.py | 25 ++++++++++++++----------- spacy/lang/xx/__init__.py | 1 - 19 files changed, 18 insertions(+), 35 deletions(-) diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py index c2cf12f12..1a76123ea 100644 --- a/spacy/lang/bn/__init__.py +++ b/spacy/lang/bn/__init__.py @@ -16,12 +16,10 @@ from ...util import update_exc class BengaliDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'bn' - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - tag_map = TAG_MAP - stop_words = STOP_WORDS - lemma_rules = LEMMA_RULES - + tag_map = dict(TAG_MAP) + stop_words = set(STOP_WORDS) + lemma_rules = dict(LEMMA_RULES) prefixes = tuple(TOKENIZER_PREFIXES) suffixes = tuple(TOKENIZER_SUFFIXES) infixes = tuple(TOKENIZER_INFIXES) diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py index 99babdc2c..b255a04b9 100644 --- a/spacy/lang/da/__init__.py +++ b/spacy/lang/da/__init__.py @@ -15,7 +15,6 @@ class DanishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'da' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = set(STOP_WORDS) diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py index 1c64541e6..0ff707a06 100644 --- a/spacy/lang/de/__init__.py +++ b/spacy/lang/de/__init__.py @@ -22,7 +22,6 @@ class GermanDefaults(Language.Defaults): lex_attr_getters[LANG] = lambda text: 'de' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], NORM_EXCEPTIONS, BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) infixes = tuple(TOKENIZER_INFIXES) tag_map = dict(TAG_MAP) diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index ec14fecd0..79d383b90 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -23,7 +23,6 @@ class EnglishDefaults(Language.Defaults): lex_attr_getters[LANG] = lambda text: 'en' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tag_map = dict(TAG_MAP) stop_words = set(STOP_WORDS) diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py index 1e7f55be8..e64b88fad 100644 --- a/spacy/lang/es/__init__.py +++ b/spacy/lang/es/__init__.py @@ -19,7 +19,6 @@ class SpanishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'es' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tag_map = dict(TAG_MAP) stop_words = set(STOP_WORDS) diff --git a/spacy/lang/fi/__init__.py b/spacy/lang/fi/__init__.py index 931ad5341..2eb40851b 100644 --- a/spacy/lang/fi/__init__.py +++ b/spacy/lang/fi/__init__.py @@ -15,7 +15,6 @@ class FinnishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'fi' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = set(STOP_WORDS) diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py index 06dcf2d45..e2123c28f 100644 --- a/spacy/lang/fr/__init__.py +++ b/spacy/lang/fr/__init__.py @@ -21,7 +21,6 @@ class FrenchDefaults(Language.Defaults): lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = lambda text: 'fr' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = set(STOP_WORDS) infixes = tuple(TOKENIZER_INFIXES) diff --git a/spacy/lang/he/__init__.py b/spacy/lang/he/__init__.py index a15dc9a05..b815b3273 100644 --- a/spacy/lang/he/__init__.py +++ b/spacy/lang/he/__init__.py @@ -12,7 +12,6 @@ from ...util import update_exc class HebrewDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'he' - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) stop_words = set(STOP_WORDS) diff --git a/spacy/lang/hu/__init__.py b/spacy/lang/hu/__init__.py index 0fe6a9f5c..9b6b63a81 100644 --- a/spacy/lang/hu/__init__.py +++ b/spacy/lang/hu/__init__.py @@ -18,7 +18,6 @@ class HungarianDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'hu' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = set(STOP_WORDS) prefixes = tuple(TOKENIZER_PREFIXES) diff --git a/spacy/lang/id/__init__.py b/spacy/lang/id/__init__.py index e0cfa941d..b4d020427 100644 --- a/spacy/lang/id/__init__.py +++ b/spacy/lang/id/__init__.py @@ -19,9 +19,7 @@ from ...util import update_exc class IndonesianDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'id' - lex_attr_getters.update(LEX_ATTRS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = set(STOP_WORDS) prefixes = tuple(TOKENIZER_PREFIXES) diff --git a/spacy/lang/id/lex_attrs.py b/spacy/lang/id/lex_attrs.py index f6acd8508..fb6a31f99 100644 --- a/spacy/lang/id/lex_attrs.py +++ b/spacy/lang/id/lex_attrs.py @@ -16,8 +16,7 @@ _num_words = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'sembilanbelas', 'duapuluh', 'seratus', 'seribu', 'sejuta', 'ribu', 'rb', 'juta', 'jt', 'miliar', 'biliun', 'triliun', 'kuadriliun', 'kuintiliun', 'sekstiliun', 'septiliun', 'oktiliun', - 'noniliun', 'desiliun', - ] + 'noniliun', 'desiliun'] def like_num(text): diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py index 7cc717cb3..f6506038c 100644 --- a/spacy/lang/it/__init__.py +++ b/spacy/lang/it/__init__.py @@ -16,7 +16,6 @@ class ItalianDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'it' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) stop_words = set(STOP_WORDS) diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py index c1b4af263..8804f7424 100644 --- a/spacy/lang/nb/__init__.py +++ b/spacy/lang/nb/__init__.py @@ -16,7 +16,6 @@ class NorwegianDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'nb' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = set(STOP_WORDS) diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py index 98df8d487..29cbb4617 100644 --- a/spacy/lang/nl/__init__.py +++ b/spacy/lang/nl/__init__.py @@ -16,7 +16,6 @@ class DutchDefaults(Language.Defaults): lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = lambda text: 'nl' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) stop_words = set(STOP_WORDS) diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py index 38a240598..22e103246 100644 --- a/spacy/lang/pl/__init__.py +++ b/spacy/lang/pl/__init__.py @@ -15,7 +15,6 @@ class PolishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'pl' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = set(STOP_WORDS) diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py index 67539034d..0baae7e7a 100644 --- a/spacy/lang/pt/__init__.py +++ b/spacy/lang/pt/__init__.py @@ -19,7 +19,6 @@ class PortugueseDefaults(Language.Defaults): lex_attr_getters[LANG] = lambda text: 'pt' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) lex_attr_getters.update(LEX_ATTRS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = set(STOP_WORDS) diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index 2d3a640c5..b21333fac 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -18,7 +18,6 @@ class SwedishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'sv' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = set(STOP_WORDS) diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py index b6bdb658f..e640fc4ef 100644 --- a/spacy/lang/th/__init__.py +++ b/spacy/lang/th/__init__.py @@ -12,24 +12,27 @@ from ...language import Language from ...attrs import LANG, NORM from ...util import update_exc, add_lookups + class ThaiDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'th' - tokenizer_exceptions = TOKENIZER_EXCEPTIONS + tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS) tag_map = dict(TAG_MAP) stop_words = set(STOP_WORDS) class Thai(Language): - lang = 'th' - Defaults = ThaiDefaults - def make_doc(self, text): - try: - from pythainlp.tokenize import word_tokenize - except ImportError: - raise ImportError("The Thai tokenizer requires the PyThaiNLP library: " - "https://github.com/wannaphongcom/pythainlp/") - words = [x for x in list(word_tokenize(text,"newmm"))] - return Doc(self.vocab, words=words, spaces=[False]*len(words)) + lang = 'th' + Defaults = ThaiDefaults + + def make_doc(self, text): + try: + from pythainlp.tokenize import word_tokenize + except ImportError: + raise ImportError("The Thai tokenizer requires the PyThaiNLP library: " + "https://github.com/wannaphongcom/pythainlp/") + words = [x for x in list(word_tokenize(text,"newmm"))] + return Doc(self.vocab, words=words, spaces=[False]*len(words)) + __all__ = ['Thai'] diff --git a/spacy/lang/xx/__init__.py b/spacy/lang/xx/__init__.py index dc63ee33f..017f55ecc 100644 --- a/spacy/lang/xx/__init__.py +++ b/spacy/lang/xx/__init__.py @@ -13,7 +13,6 @@ class MultiLanguageDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'xx' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)