diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py
index cb748085b..c2cf12f12 100644
--- a/spacy/lang/bn/__init__.py
+++ b/spacy/lang/bn/__init__.py
@@ -13,21 +13,23 @@ from ...attrs import LANG
 from ...util import update_exc
 
 
+class BengaliDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'bn'
+
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tag_map = TAG_MAP
+    stop_words = STOP_WORDS
+    lemma_rules = LEMMA_RULES
+
+    prefixes = tuple(TOKENIZER_PREFIXES)
+    suffixes = tuple(TOKENIZER_SUFFIXES)
+    infixes = tuple(TOKENIZER_INFIXES)
+
+
 class Bengali(Language):
     lang = 'bn'
-
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'bn'
-
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-        tag_map = TAG_MAP
-        stop_words = STOP_WORDS
-        lemma_rules = LEMMA_RULES
-
-        prefixes = tuple(TOKENIZER_PREFIXES)
-        suffixes = tuple(TOKENIZER_SUFFIXES)
-        infixes = tuple(TOKENIZER_INFIXES)
+    Defaults = BengaliDefaults
 
 
 __all__ = ['Bengali']
diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py
index 9efe21fb5..b9e90dc0d 100644
--- a/spacy/lang/da/__init__.py
+++ b/spacy/lang/da/__init__.py
@@ -10,15 +10,17 @@ from ...attrs import LANG
 from ...util import update_exc
 
 
+class DanishDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'da'
+
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+
+
 class Danish(Language):
     lang = 'da'
-
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'da'
-
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
+    Defaults = DanishDefaults
 
 
 __all__ = ['Danish']
diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py
index 7a44b7485..fa957a6f5 100644
--- a/spacy/lang/de/__init__.py
+++ b/spacy/lang/de/__init__.py
@@ -14,21 +14,23 @@ from ...attrs import LANG
 from ...util import update_exc
 
 
+class GermanDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'de'
+
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tag_map = dict(TAG_MAP)
+    stop_words = set(STOP_WORDS)
+    syntax_iterators = dict(SYNTAX_ITERATORS)
+
+    @classmethod
+    def create_lemmatizer(cls, nlp=None):
+        return Lemmatizer(LOOKUP)
+
+
 class German(Language):
     lang = 'de'
-
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'de'
-
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-        tag_map = dict(TAG_MAP)
-        stop_words = set(STOP_WORDS)
-        syntax_iterators = dict(SYNTAX_ITERATORS)
-
-        @classmethod
-        def create_lemmatizer(cls, nlp=None):
-            return Lemmatizer(LOOKUP)
+    Defaults = GermanDefaults
 
 
 __all__ = ['German']
diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py
index 2d5314991..7e1da789b 100644
--- a/spacy/lang/en/__init__.py
+++ b/spacy/lang/en/__init__.py
@@ -15,22 +15,24 @@ from ...attrs import LANG
 from ...util import update_exc
 
 
+class EnglishDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'en'
+    lex_attr_getters.update(LEX_ATTRS)
+
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tag_map = dict(TAG_MAP)
+    stop_words = set(STOP_WORDS)
+    morph_rules = dict(MORPH_RULES)
+    lemma_rules = dict(LEMMA_RULES)
+    lemma_index = dict(LEMMA_INDEX)
+    lemma_exc = dict(LEMMA_EXC)
+    syntax_iterators = dict(SYNTAX_ITERATORS)
+
+
 class English(Language):
     lang = 'en'
-
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'en'
-        lex_attr_getters.update(LEX_ATTRS)
-
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-        tag_map = dict(TAG_MAP)
-        stop_words = set(STOP_WORDS)
-        morph_rules = dict(MORPH_RULES)
-        lemma_rules = dict(LEMMA_RULES)
-        lemma_index = dict(LEMMA_INDEX)
-        lemma_exc = dict(LEMMA_EXC)
-        sytax_iterators = dict(SYNTAX_ITERATORS)
+    Defaults = EnglishDefaults
 
 
 __all__ = ['English']
diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py
index f5735d460..8291b2dd0 100644
--- a/spacy/lang/es/__init__.py
+++ b/spacy/lang/es/__init__.py
@@ -28,7 +28,7 @@ class SpanishDefaults(Language.Defaults):
 
 class Spanish(Language):
     lang = 'es'
-    Defaults = SpanishDefaults
+    Defaults = SpanishDefaults
 
 
 __all__ = ['Spanish']
diff --git a/spacy/lang/fi/__init__.py b/spacy/lang/fi/__init__.py
index 8cb6ad8ab..7010acd48 100644
--- a/spacy/lang/fi/__init__.py
+++ b/spacy/lang/fi/__init__.py
@@ -10,15 +10,17 @@ from ...attrs import LANG
 from ...util import update_exc
 
 
+class FinnishDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'fi'
+
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+
+
 class Finnish(Language):
     lang = 'fi'
-
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'fi'
-
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
+    Defaults = FinnishDefaults
 
 
 __all__ = ['Finnish']
diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py
index a8a18a601..f9a01f223 100644
--- a/spacy/lang/fr/__init__.py
+++ b/spacy/lang/fr/__init__.py
@@ -13,22 +13,24 @@ from ...attrs import LANG
 from ...util import update_exc
 
 
+class FrenchDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'fr'
+
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+    infixes = tuple(TOKENIZER_INFIXES)
+    suffixes = tuple(TOKENIZER_SUFFIXES)
+    token_match = TOKEN_MATCH
+
+    @classmethod
+    def create_lemmatizer(cls, nlp=None):
+        return Lemmatizer(LOOKUP)
+
+
 class French(Language):
     lang = 'fr'
-
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'fr'
-
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
-        infixes = tuple(TOKENIZER_INFIXES)
-        suffixes = tuple(TOKENIZER_SUFFIXES)
-        token_match = TOKEN_MATCH
-
-        @classmethod
-        def create_lemmatizer(cls, nlp=None):
-            return Lemmatizer(LOOKUP)
+    Defaults = FrenchDefaults
 
 
 __all__ = ['French']
diff --git a/spacy/lang/he/__init__.py b/spacy/lang/he/__init__.py
index 4ed1f30d0..a15dc9a05 100644
--- a/spacy/lang/he/__init__.py
+++ b/spacy/lang/he/__init__.py
@@ -9,15 +9,17 @@ from ...attrs import LANG
 from ...util import update_exc
 
 
+class HebrewDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'he'
+
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+
+
 class Hebrew(Language):
     lang = 'he'
-
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'he'
-
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
+    Defaults = HebrewDefaults
 
 
 __all__ = ['Hebrew']
diff --git a/spacy/lang/hu/__init__.py b/spacy/lang/hu/__init__.py
index 7233239da..70b4ae5cc 100644
--- a/spacy/lang/hu/__init__.py
+++ b/spacy/lang/hu/__init__.py
@@ -13,23 +13,25 @@ from ...attrs import LANG
 from ...util import update_exc
 
 
+class HungarianDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'hu'
+
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+    prefixes = tuple(TOKENIZER_PREFIXES)
+    suffixes = tuple(TOKENIZER_SUFFIXES)
+    infixes = tuple(TOKENIZER_INFIXES)
+    token_match = TOKEN_MATCH
+
+    @classmethod
+    def create_lemmatizer(cls, nlp=None):
+        return Lemmatizer(LOOKUP)
+
+
 class Hungarian(Language):
     lang = 'hu'
-
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'hu'
-
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
-        prefixes = tuple(TOKENIZER_PREFIXES)
-        suffixes = tuple(TOKENIZER_SUFFIXES)
-        infixes = tuple(TOKENIZER_INFIXES)
-        token_match = TOKEN_MATCH
-
-        @classmethod
-        def create_lemmatizer(cls, nlp=None):
-            return Lemmatizer(LOOKUP)
+    Defaults = HungarianDefaults
 
 
 __all__ = ['Hungarian']
diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py
index 93f7f7764..573a8df16 100644
--- a/spacy/lang/it/__init__.py
+++ b/spacy/lang/it/__init__.py
@@ -11,19 +11,21 @@ from ...attrs import LANG
 from ...util import update_exc
 
 
+class ItalianDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'it'
+
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+
+    @classmethod
+    def create_lemmatizer(cls, nlp=None):
+        return Lemmatizer(LOOKUP)
+
+
 class Italian(Language):
     lang = 'it'
-
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'it'
-
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
-
-        @classmethod
-        def create_lemmatizer(cls, nlp=None):
-            return Lemmatizer(LOOKUP)
+    Defaults = ItalianDefaults
 
 
 __all__ = ['Italian']
diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py
index 20832bfe3..cb2baf148 100644
--- a/spacy/lang/nb/__init__.py
+++ b/spacy/lang/nb/__init__.py
@@ -11,15 +11,17 @@ from ...attrs import LANG
 from ...util import update_exc
 
 
+class NorwegianDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'nb'
+
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+
+
 class Norwegian(Language):
     lang = 'nb'
-
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'nb'
-
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
+    Defaults = NorwegianDefaults
 
 
 __all__ = ['Norwegian']
diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py
index 254849ad0..d6430d0b3 100644
--- a/spacy/lang/nl/__init__.py
+++ b/spacy/lang/nl/__init__.py
@@ -9,16 +9,17 @@ from ...attrs import LANG
 from ...util import update_exc
 
 
+class DutchDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'nl'
+
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+
 class Dutch(Language):
     lang = 'nl'
-
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'nl'
-
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
+    Defaults = DutchDefaults
 
 
 __all__ = ['Dutch']
diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py
index 9fad81899..535120874 100644
--- a/spacy/lang/pl/__init__.py
+++ b/spacy/lang/pl/__init__.py
@@ -9,15 +9,17 @@ from ...attrs import LANG
 from ...util import update_exc
 
 
+class PolishDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'pl'
+
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+
+
 class Polish(Language):
     lang = 'pl'
-
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'pl'
-
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
+    Defaults = PolishDefaults
 
 
 __all__ = ['Polish']
diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py
index 314d05184..df6b76c7a 100644
--- a/spacy/lang/pt/__init__.py
+++ b/spacy/lang/pt/__init__.py
@@ -13,20 +13,22 @@ from ...attrs import LANG
 from ...util import update_exc
 
 
+class PortugueseDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'pt'
+    lex_attr_getters.update(LEX_ATTRS)
+
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+
+    @classmethod
+    def create_lemmatizer(cls, nlp=None):
+        return Lemmatizer(LOOKUP)
+
+
 class Portuguese(Language):
     lang = 'pt'
-
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'pt'
-        lex_attr_getters.update(LEX_ATTRS)
-
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
-
-        @classmethod
-        def create_lemmatizer(cls, nlp=None):
-            return Lemmatizer(LOOKUP)
+    Defaults = PortugueseDefaults
 
 
 __all__ = ['Portuguese']
diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py
index b16e1befc..b309643f7 100644
--- a/spacy/lang/sv/__init__.py
+++ b/spacy/lang/sv/__init__.py
@@ -13,19 +13,21 @@ from ...attrs import LANG
 from ...util import update_exc
 
 
+class SwedishDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'sv'
+
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+
+    @classmethod
+    def create_lemmatizer(cls, nlp=None):
+        return Lemmatizer(LOOKUP)
+
+
 class Swedish(Language):
     lang = 'sv'
-
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'sv'
-
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
-
-        @classmethod
-        def create_lemmatizer(cls, nlp=None):
-            return Lemmatizer(LOOKUP)
+    Defaults = SwedishDefaults
 
 
 __all__ = ['Swedish']
diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade
index 2d90028f0..ed602f8fa 100644
--- a/website/docs/usage/adding-languages.jade
+++ b/website/docs/usage/adding-languages.jade
@@ -56,20 +56,22 @@ p
     from ...attrs import LANG
     from ...util import update_exc
 
+    # create Defaults class in the module scope (necessary for pickling!)
+    class XxxxxDefaults(Language.Defaults):
+        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+        lex_attr_getters[LANG] = lambda text: 'xx' # language ISO code
+
+        # optional: replace flags with custom functions, e.g. like_num()
+        lex_attr_getters.update(LEX_ATTRS)
+
+        # merge base exceptions and custom tokenizer exceptions
+        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+        stop_words = set(STOP_WORDS)
+
+    # create actual Language class
     class Xxxxx(Language):
         lang = 'xx' # language ISO code
-
-        # override defaults
-        class Defaults(Language.Defaults):
-            lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-            lex_attr_getters[LANG] = lambda text: 'xx' # language ISO code
-
-            # optional: replace flags with custom functions, e.g. like_num()
-            lex_attr_getters.update(LEX_ATTRS)
-
-            # merge base exceptions and custom tokenizer exceptions
-            tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-            stop_words = set(STOP_WORDS)
+        Defaults = XxxxxDefaults # override defaults
 
     # set default export – this allows the language class to be lazy-loaded
     __all__ = ['Xxxxx']
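For reviewers wondering why the `Defaults` classes have to move: `pickle` serializes a class by reference, recording its defining module and name so the unpickler can re-import it. On Python 2, which spaCy still supports, a class defined inside another class body only carries the bare name `Defaults`, which can't be found at module level, so any object holding a reference to it (such as an `nlp` instance) can't be pickled. A minimal sketch of the difference — class names here are illustrative, not part of this diff:

```python
import pickle


class Nesting(object):
    # old layout: Defaults defined inside the Language subclass body
    class Defaults(object):
        pass


class ModuleDefaults(object):
    # new layout: the Defaults class lives at module scope
    pass


# A module-scope class round-trips: pickle stores "module.ModuleDefaults"
# and the unpickler re-imports it by that name.
assert pickle.loads(pickle.dumps(ModuleDefaults)) is ModuleDefaults

# The nested class pickles on Python 3 (pickle follows the dotted
# __qualname__ "Nesting.Defaults"), but raises PicklingError on
# Python 2, where pickle looks up the bare name "Defaults" at module
# level and doesn't find it.
try:
    pickle.loads(pickle.dumps(Nesting.Defaults))
    print('picklable (Python 3 resolves Nesting.Defaults)')
except pickle.PicklingError as err:
    print('not picklable (Python 2):', err)
```

Hoisting each `Defaults` class is all that's needed: class attributes and the `create_lemmatizer` classmethods behave identically at module scope, so the refactor changes nothing but picklability.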