From 0c2343d73abc5410ddc51816e48e92c56d3c548c Mon Sep 17 00:00:00 2001
From: ines
Date: Wed, 11 Oct 2017 02:22:49 +0200
Subject: [PATCH 01/11] Tidy up language data

---
 spacy/lang/bn/__init__.py  |  8 +++-----
 spacy/lang/da/__init__.py  |  1 -
 spacy/lang/de/__init__.py  |  1 -
 spacy/lang/en/__init__.py  |  1 -
 spacy/lang/es/__init__.py  |  1 -
 spacy/lang/fi/__init__.py  |  1 -
 spacy/lang/fr/__init__.py  |  1 -
 spacy/lang/he/__init__.py  |  1 -
 spacy/lang/hu/__init__.py  |  1 -
 spacy/lang/id/__init__.py  |  2 --
 spacy/lang/id/lex_attrs.py |  3 +--
 spacy/lang/it/__init__.py  |  1 -
 spacy/lang/nb/__init__.py  |  1 -
 spacy/lang/nl/__init__.py  |  1 -
 spacy/lang/pl/__init__.py  |  1 -
 spacy/lang/pt/__init__.py  |  1 -
 spacy/lang/sv/__init__.py  |  1 -
 spacy/lang/th/__init__.py  | 25 ++++++++++++++-----------
 spacy/lang/xx/__init__.py  |  1 -
 19 files changed, 18 insertions(+), 35 deletions(-)

diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py
index c2cf12f12..1a76123ea 100644
--- a/spacy/lang/bn/__init__.py
+++ b/spacy/lang/bn/__init__.py
@@ -16,12 +16,10 @@ from ...util import update_exc
 class BengaliDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'bn'
-
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    tag_map = TAG_MAP
-    stop_words = STOP_WORDS
-    lemma_rules = LEMMA_RULES
-
+    tag_map = dict(TAG_MAP)
+    stop_words = set(STOP_WORDS)
+    lemma_rules = dict(LEMMA_RULES)
     prefixes = tuple(TOKENIZER_PREFIXES)
     suffixes = tuple(TOKENIZER_SUFFIXES)
     infixes = tuple(TOKENIZER_INFIXES)
diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py
index 99babdc2c..b255a04b9 100644
--- a/spacy/lang/da/__init__.py
+++ b/spacy/lang/da/__init__.py
@@ -15,7 +15,6 @@ class DanishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'da'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
-
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = set(STOP_WORDS)

diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py
index 1c64541e6..0ff707a06 100644
--- a/spacy/lang/de/__init__.py
+++ b/spacy/lang/de/__init__.py
@@ -22,7 +22,6 @@ class GermanDefaults(Language.Defaults):
     lex_attr_getters[LANG] = lambda text: 'de'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
                                          NORM_EXCEPTIONS, BASE_NORMS)
-
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     infixes = tuple(TOKENIZER_INFIXES)
     tag_map = dict(TAG_MAP)
diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py
index ec14fecd0..79d383b90 100644
--- a/spacy/lang/en/__init__.py
+++ b/spacy/lang/en/__init__.py
@@ -23,7 +23,6 @@ class EnglishDefaults(Language.Defaults):
     lex_attr_getters[LANG] = lambda text: 'en'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
                                          BASE_NORMS, NORM_EXCEPTIONS)
-
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     tag_map = dict(TAG_MAP)
     stop_words = set(STOP_WORDS)
diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py
index 1e7f55be8..e64b88fad 100644
--- a/spacy/lang/es/__init__.py
+++ b/spacy/lang/es/__init__.py
@@ -19,7 +19,6 @@ class SpanishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'es'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
-
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     tag_map = dict(TAG_MAP)
     stop_words = set(STOP_WORDS)
diff --git a/spacy/lang/fi/__init__.py b/spacy/lang/fi/__init__.py
index 931ad5341..2eb40851b 100644
--- a/spacy/lang/fi/__init__.py
+++ b/spacy/lang/fi/__init__.py
@@ -15,7 +15,6 @@ class FinnishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'fi'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
-
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = set(STOP_WORDS)

diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py
index 06dcf2d45..e2123c28f 100644
--- a/spacy/lang/fr/__init__.py
+++ b/spacy/lang/fr/__init__.py
@@ -21,7 +21,6 @@ class FrenchDefaults(Language.Defaults):
     lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: 'fr'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
-
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = set(STOP_WORDS)
     infixes = tuple(TOKENIZER_INFIXES)
diff --git a/spacy/lang/he/__init__.py b/spacy/lang/he/__init__.py
index a15dc9a05..b815b3273 100644
--- a/spacy/lang/he/__init__.py
+++ b/spacy/lang/he/__init__.py
@@ -12,7 +12,6 @@ from ...util import update_exc
 class HebrewDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'he'
-
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
     stop_words = set(STOP_WORDS)

diff --git a/spacy/lang/hu/__init__.py b/spacy/lang/hu/__init__.py
index 0fe6a9f5c..9b6b63a81 100644
--- a/spacy/lang/hu/__init__.py
+++ b/spacy/lang/hu/__init__.py
@@ -18,7 +18,6 @@ class HungarianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'hu'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
-
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = set(STOP_WORDS)
     prefixes = tuple(TOKENIZER_PREFIXES)
diff --git a/spacy/lang/id/__init__.py b/spacy/lang/id/__init__.py
index e0cfa941d..b4d020427 100644
--- a/spacy/lang/id/__init__.py
+++ b/spacy/lang/id/__init__.py
@@ -19,9 +19,7 @@ from ...util import update_exc
 class IndonesianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'id'
-
     lex_attr_getters.update(LEX_ATTRS)
-
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = set(STOP_WORDS)
     prefixes = tuple(TOKENIZER_PREFIXES)
diff --git a/spacy/lang/id/lex_attrs.py b/spacy/lang/id/lex_attrs.py
index f6acd8508..fb6a31f99 100644
--- a/spacy/lang/id/lex_attrs.py
+++ b/spacy/lang/id/lex_attrs.py
@@ -16,8 +16,7 @@ _num_words = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
               'sembilanbelas', 'duapuluh', 'seratus', 'seribu', 'sejuta',
               'ribu', 'rb', 'juta', 'jt', 'miliar', 'biliun', 'triliun',
               'kuadriliun', 'kuintiliun', 'sekstiliun', 'septiliun', 'oktiliun',
-              'noniliun', 'desiliun',
-              ]
+              'noniliun', 'desiliun']


 def like_num(text):
diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py
index 7cc717cb3..f6506038c 100644
--- a/spacy/lang/it/__init__.py
+++ b/spacy/lang/it/__init__.py
@@ -16,7 +16,6 @@ class ItalianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'it'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
-
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
     stop_words = set(STOP_WORDS)

diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py
index c1b4af263..8804f7424 100644
--- a/spacy/lang/nb/__init__.py
+++ b/spacy/lang/nb/__init__.py
@@ -16,7 +16,6 @@ class NorwegianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'nb'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
-
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = set(STOP_WORDS)

diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py
index 98df8d487..29cbb4617 100644
--- a/spacy/lang/nl/__init__.py
+++ b/spacy/lang/nl/__init__.py
@@ -16,7 +16,6 @@ class DutchDefaults(Language.Defaults):
     lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: 'nl'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
-
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
     stop_words = set(STOP_WORDS)

diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py
index 38a240598..22e103246 100644
--- a/spacy/lang/pl/__init__.py
+++ b/spacy/lang/pl/__init__.py
@@ -15,7 +15,6 @@ class PolishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'pl'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
-
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = set(STOP_WORDS)

diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py
index 67539034d..0baae7e7a 100644
--- a/spacy/lang/pt/__init__.py
+++ b/spacy/lang/pt/__init__.py
@@ -19,7 +19,6 @@ class PortugueseDefaults(Language.Defaults):
     lex_attr_getters[LANG] = lambda text: 'pt'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     lex_attr_getters.update(LEX_ATTRS)
-
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = set(STOP_WORDS)

diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py
index 2d3a640c5..b21333fac 100644
--- a/spacy/lang/sv/__init__.py
+++ b/spacy/lang/sv/__init__.py
@@ -18,7 +18,6 @@ class SwedishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'sv'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
-
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = set(STOP_WORDS)

diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py
index b6bdb658f..e640fc4ef 100644
--- a/spacy/lang/th/__init__.py
+++ b/spacy/lang/th/__init__.py
@@ -12,24 +12,27 @@ from ...language import Language
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

+
 class ThaiDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'th'
-    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS)
     tag_map = dict(TAG_MAP)
     stop_words = set(STOP_WORDS)


 class Thai(Language):
-	lang = 'th'
-	Defaults = ThaiDefaults
-	def make_doc(self, text):
-		try:
-			from pythainlp.tokenize import word_tokenize
-		except ImportError:
-			raise ImportError("The Thai tokenizer requires the PyThaiNLP library: "
-				"https://github.com/wannaphongcom/pythainlp/")
-		words = [x for x in list(word_tokenize(text,"newmm"))]
-		return Doc(self.vocab, words=words, spaces=[False]*len(words))
+    lang = 'th'
+    Defaults = ThaiDefaults
+
+    def make_doc(self, text):
+        try:
+            from pythainlp.tokenize import word_tokenize
+        except ImportError:
+            raise ImportError("The Thai tokenizer requires the PyThaiNLP library: "
+                              "https://github.com/wannaphongcom/pythainlp/")
+        words = [x for x in list(word_tokenize(text,"newmm"))]
+        return Doc(self.vocab, words=words, spaces=[False]*len(words))
+

 __all__ = ['Thai']
diff --git a/spacy/lang/xx/__init__.py b/spacy/lang/xx/__init__.py
index dc63ee33f..017f55ecc 100644
--- a/spacy/lang/xx/__init__.py
+++ b/spacy/lang/xx/__init__.py
@@ -13,7 +13,6 @@ class MultiLanguageDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'xx'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
-
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)


From 417d45f5d062078e1895f4521e868c5bece91a54 Mon Sep 17 00:00:00 2001
From: ines
Date: Wed, 11 Oct 2017 02:24:58 +0200
Subject: [PATCH 02/11] Add lemmatizer data as variable on language data

Don't create the lookup lemmatizer within the Language class; just pass
in the data, so it can be set on Token creation.
---
 spacy/lang/de/__init__.py | 6 +-----
 spacy/lang/en/__init__.py | 3 ++-
 spacy/lang/es/__init__.py | 6 +-----
 spacy/lang/fr/__init__.py | 6 +-----
 spacy/lang/hu/__init__.py | 6 +-----
 spacy/lang/id/__init__.py | 6 +-----
 spacy/lang/it/__init__.py | 6 +-----
 spacy/lang/pt/__init__.py | 6 +-----
 spacy/lang/sv/__init__.py | 7 ++-----
 9 files changed, 11 insertions(+), 41 deletions(-)

diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py
index 0ff707a06..e56bab844 100644
--- a/spacy/lang/de/__init__.py
+++ b/spacy/lang/de/__init__.py
@@ -12,7 +12,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@@ -27,10 +26,7 @@ class GermanDefaults(Language.Defaults):
     tag_map = dict(TAG_MAP)
     stop_words = set(STOP_WORDS)
     syntax_iterators = dict(SYNTAX_ITERATORS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    lemma_lookup = dict(LOOKUP)


 class German(Language):
diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py
index 79d383b90..fffac6467 100644
--- a/spacy/lang/en/__init__.py
+++ b/spacy/lang/en/__init__.py
@@ -7,7 +7,7 @@ from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .morph_rules import MORPH_RULES
-from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
+from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC, LOOKUP
 from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
@@ -30,6 +30,7 @@ class EnglishDefaults(Language.Defaults):
     lemma_rules = dict(LEMMA_RULES)
     lemma_index = dict(LEMMA_INDEX)
     lemma_exc = dict(LEMMA_EXC)
+    lemma_lookup = dict(LOOKUP)
     syntax_iterators = dict(SYNTAX_ITERATORS)

diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py
index e64b88fad..4246a0703 100644
--- a/spacy/lang/es/__init__.py
+++ b/spacy/lang/es/__init__.py
@@ -10,7 +10,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@@ -23,10 +22,7 @@ class SpanishDefaults(Language.Defaults):
     tag_map = dict(TAG_MAP)
     stop_words = set(STOP_WORDS)
     sytax_iterators = dict(SYNTAX_ITERATORS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    lemma_lookup = dict(LOOKUP)


 class Spanish(Language):
diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py
index e2123c28f..0f2a60e3e 100644
--- a/spacy/lang/fr/__init__.py
+++ b/spacy/lang/fr/__init__.py
@@ -11,7 +11,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@@ -27,10 +26,7 @@ class FrenchDefaults(Language.Defaults):
     suffixes = tuple(TOKENIZER_SUFFIXES)
     token_match = TOKEN_MATCH
     syntax_iterators = dict(SYNTAX_ITERATORS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    lemma_lookup = dict(LOOKUP)


 class French(Language):
diff --git a/spacy/lang/hu/__init__.py b/spacy/lang/hu/__init__.py
index 9b6b63a81..fd039a8eb 100644
--- a/spacy/lang/hu/__init__.py
+++ b/spacy/lang/hu/__init__.py
@@ -9,7 +9,6 @@ from .lemmatizer import LOOKUP
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@@ -24,10 +23,7 @@ class HungarianDefaults(Language.Defaults):
     suffixes = tuple(TOKENIZER_SUFFIXES)
     infixes = tuple(TOKENIZER_INFIXES)
     token_match = TOKEN_MATCH
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    lemma_lookup = dict(LOOKUP)


 class Hungarian(Language):
diff --git a/spacy/lang/id/__init__.py b/spacy/lang/id/__init__.py
index b4d020427..29fe86a01 100644
--- a/spacy/lang/id/__init__.py
+++ b/spacy/lang/id/__init__.py
@@ -11,7 +11,6 @@ from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG
 from ...util import update_exc

@@ -26,10 +25,7 @@ class IndonesianDefaults(Language.Defaults):
     suffixes = tuple(TOKENIZER_SUFFIXES)
     infixes = tuple(TOKENIZER_INFIXES)
     syntax_iterators = dict(SYNTAX_ITERATORS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    lemma_lookup = dict(LOOKUP)


 class Indonesian(Language):
diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py
index f6506038c..c19cb6d39 100644
--- a/spacy/lang/it/__init__.py
+++ b/spacy/lang/it/__init__.py
@@ -7,7 +7,6 @@ from .lemmatizer import LOOKUP
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@@ -18,10 +17,7 @@ class ItalianDefaults(Language.Defaults):
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
     stop_words = set(STOP_WORDS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    lemma_lookup = dict(LOOKUP)


 class Italian(Language):
diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py
index 0baae7e7a..6366a25c1 100644
--- a/spacy/lang/pt/__init__.py
+++ b/spacy/lang/pt/__init__.py
@@ -9,7 +9,6 @@ from .lemmatizer import LOOKUP
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@@ -21,10 +20,7 @@ class PortugueseDefaults(Language.Defaults):
     lex_attr_getters.update(LEX_ATTRS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = set(STOP_WORDS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    lemma_lookup = dict(LOOKUP)


 class Portuguese(Language):
diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py
index b21333fac..27da9024e 100644
--- a/spacy/lang/sv/__init__.py
+++ b/spacy/lang/sv/__init__.py
@@ -9,7 +9,6 @@ from .lemmatizer import LEMMA_RULES, LOOKUP
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@@ -20,10 +19,8 @@ class SwedishDefaults(Language.Defaults):
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = set(STOP_WORDS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    lemma_rules = dict(LEMMA_RULES)
+    lemma_lookup = dict(LOOKUP)


 class Swedish(Language):

From 820bf850752962714a378b20de12ddbefe69f3e8 Mon Sep 17 00:00:00 2001
From: ines
Date: Wed, 11 Oct 2017 02:25:13 +0200
Subject: [PATCH 03/11] Move LookupLemmatizer to spacy.lemmatizer

---
 spacy/lemmatizer.py       | 15 +++++++++++++++
 spacy/lemmatizerlookup.py | 19 -------------------
 2 files changed, 15 insertions(+), 19 deletions(-)
 delete mode 100644 spacy/lemmatizerlookup.py

diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index 312c8db72..700c7b8ea 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -100,3 +100,18 @@ def lemmatize(string, index, exceptions, rules):
     if not forms:
         forms.append(string)
     return set(forms)
+
+
+class LookupLemmatizer(Lemmatizer):
+    @classmethod
+    def load(cls, path, lookup):
+        return cls(lookup or {})
+
+    def __init__(self, lookup):
+        self.lookup = lookup
+
+    def __call__(self, string, univ_pos, morphology=None):
+        try:
+            return set([self.lookup[string]])
+        except:
+            return set([string])
diff --git a/spacy/lemmatizerlookup.py b/spacy/lemmatizerlookup.py
deleted file mode 100644
index 0c0c693c1..000000000
--- a/spacy/lemmatizerlookup.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from .lemmatizer import Lemmatizer
-
-
-class Lemmatizer(Lemmatizer):
-    @classmethod
-    def load(cls, path, lookup):
-        return cls(lookup or {})
-
-    def __init__(self, lookup):
-        self.lookup = lookup
-
-    def __call__(self, string, univ_pos, morphology=None):
-        try:
-            return set([self.lookup[string]])
-        except:
-            return set([string])
\ No newline at end of file

From 3814a161e639829df99fe6f36913fced2c3d4e93 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 11 Oct 2017 08:41:03 +0200
Subject: [PATCH 04/11] Avoid clobbering preset lemmas

---
 spacy/morphology.pyx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 5a4399698..da9246cb6 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -68,7 +68,8 @@ cdef class Morphology:

     cdef int assign_untagged(self, TokenC* token) except -1:
         '''Set morphological attributes on a token without a POS tag.'''
-        token.lemma = self.lemmatize(0, token.lex.orth, {})
+        if token.lemma == 0:
+            token.lemma = self.lemmatize(0, token.lex.orth, {})

     cdef int assign_tag(self, TokenC* token, tag) except -1:
         if isinstance(tag, basestring):

From 9fd471372a7e804fdd5402a6095404f71b947ed0 Mon Sep 17 00:00:00 2001
From: ines
Date: Wed, 11 Oct 2017 13:25:51 +0200
Subject: [PATCH 05/11] Add lookup lemmatizer to lemmatizer as lookup() method
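
The table is passed to Lemmatizer via the new `lookup` keyword argument
and queried with the lookup() method. Rough usage sketch (the example
table is made up):

    from spacy.lemmatizer import Lemmatizer

    lemmatizer = Lemmatizer(lookup={'dogs': 'dog', 'mice': 'mouse'})
    assert lemmatizer.lookup('dogs') == 'dog'
    # strings missing from the table are returned unchanged
    assert lemmatizer.lookup('cat') == 'cat'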
---
 spacy/lemmatizer.py | 29 ++++++++++--------------------
 1 file changed, 10 insertions(+), 19 deletions(-)

diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index 6c0fb6356..1fb83a727 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -10,10 +10,11 @@ class Lemmatizer(object):
     def load(cls, path, index=None, exc=None, rules=None):
         return cls(index or {}, exc or {}, rules or {})

-    def __init__(self, index, exceptions, rules):
-        self.index = index
-        self.exc = exceptions
-        self.rules = rules
+    def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
+        self.index = index if index is not None else {}
+        self.exc = exceptions if exceptions is not None else {}
+        self.rules = rules if rules is not None else {}
+        self.lookup_table = lookup if lookup is not None else {}

     def __call__(self, string, univ_pos, morphology=None):
         if univ_pos == NOUN:
@@ -79,6 +80,11 @@ class Lemmatizer(object):
     def punct(self, string, morphology=None):
         return self(string, 'punct', morphology)

+    def lookup(self, string):
+        if string in self.lookup_table:
+            return self.lookup_table[string]
+        return string
+

 def lemmatize(string, index, exceptions, rules):
     string = string.lower()
@@ -102,18 +108,3 @@ def lemmatize(string, index, exceptions, rules):
     if not forms:
         forms.append(string)
     return set(forms)
-
-
-class LookupLemmatizer(Lemmatizer):
-    @classmethod
-    def load(cls, path, lookup):
-        return cls(lookup or {})
-
-    def __init__(self, lookup):
-        self.lookup = lookup
-
-    def __call__(self, string, univ_pos, morphology=None):
-        try:
-            return set([self.lookup[string]])
-        except:
-            return set([string])

From 9620c1a640a52f1e560c122e2c737a16d00d5c2f Mon Sep 17 00:00:00 2001
From: ines
Date: Wed, 11 Oct 2017 13:26:05 +0200
Subject: [PATCH 06/11] Add lemma_lookup to Language defaults

---
 spacy/language.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/spacy/language.py b/spacy/language.py
index d40aee3ca..86292f4ff 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -33,7 +33,8 @@ from . import about
 class BaseDefaults(object):
     @classmethod
     def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(cls.lemma_index, cls.lemma_exc, cls.lemma_rules)
+        return Lemmatizer(cls.lemma_index, cls.lemma_exc, cls.lemma_rules,
+                          cls.lemma_lookup)

     @classmethod
     def create_vocab(cls, nlp=None):
@@ -77,6 +78,7 @@ class BaseDefaults(object):
     lemma_rules = {}
     lemma_exc = {}
     lemma_index = {}
+    lemma_lookup = {}
     morph_rules = {}
     lex_attr_getters = LEX_ATTRS
     syntax_iterators = {}

From 6dd14dc3427167045c63b9cba8f24bcde87cd765 Mon Sep 17 00:00:00 2001
From: ines
Date: Wed, 11 Oct 2017 13:27:10 +0200
Subject: [PATCH 07/11] Add lookup lemmas to tokens without POS tags
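
Sketch of the observable effect, using the German lookup data (assumes
a bare pipeline without a tagger, so no POS tags are ever set):

    from spacy.lang.de import German

    nlp = German()                        # tokenizer only
    doc = nlp('schließt')
    assert doc[0].lemma_ == 'schließen'   # filled in from lemma_lookup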
---
 spacy/morphology.pyx | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index b8dbb83ba..4a1a0aa54 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -67,9 +67,13 @@ cdef class Morphology:
                                                        self.exc), None, None)

     cdef int assign_untagged(self, TokenC* token) except -1:
-        '''Set morphological attributes on a token without a POS tag.'''
+        """Set morphological attributes on a token without a POS tag. Uses
+        the lemmatizer's lookup() method, which looks up the string in the
+        table provided by the language data as lemma_lookup (if available)."""
         if token.lemma == 0:
-            token.lemma = self.lemmatize(0, token.lex.orth, {})
+            orth_str = self.strings[token.lex.orth]
+            lemma = self.lemmatizer.lookup(orth_str)
+            token.lemma = self.strings.add(lemma)

     cdef int assign_tag(self, TokenC* token, tag) except -1:
         if isinstance(tag, basestring):

From 15fe0fd82d0d38b584db6f1ca4ad674252c2cf0d Mon Sep 17 00:00:00 2001
From: ines
Date: Wed, 11 Oct 2017 13:27:18 +0200
Subject: [PATCH 08/11] Fix tests

---
 spacy/tests/doc/test_creation.py        | 4 ++--
 spacy/tests/regression/test_issue589.py | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/spacy/tests/doc/test_creation.py b/spacy/tests/doc/test_creation.py
index edadbf086..c14fdfbe9 100644
--- a/spacy/tests/doc/test_creation.py
+++ b/spacy/tests/doc/test_creation.py
@@ -4,12 +4,12 @@ import pytest

 from ...vocab import Vocab
 from ...tokens.doc import Doc
-from ...lemmatizerlookup import Lemmatizer
+from ...lemmatizer import Lemmatizer


 @pytest.fixture
 def lemmatizer():
-    return Lemmatizer({'dogs': 'dog', 'boxen': 'box', 'mice': 'mouse'})
+    return Lemmatizer(lookup={'dogs': 'dog', 'boxen': 'box', 'mice': 'mouse'})


 @pytest.fixture
diff --git a/spacy/tests/regression/test_issue589.py b/spacy/tests/regression/test_issue589.py
index 27363739d..96ea4be61 100644
--- a/spacy/tests/regression/test_issue589.py
+++ b/spacy/tests/regression/test_issue589.py
@@ -7,6 +7,7 @@ from ..util import get_doc
 import pytest


+@pytest.mark.xfail
 def test_issue589():
     vocab = Vocab()
     vocab.strings.set_frozen(True)

From 453c47ca24c7e8d3cd71afc3fe2ef4b501c25e27 Mon Sep 17 00:00:00 2001
From: ines
Date: Wed, 11 Oct 2017 13:27:26 +0200
Subject: [PATCH 09/11] Add German lemmatizer tests

---
 spacy/tests/lang/de/test_lemma.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 spacy/tests/lang/de/test_lemma.py

diff --git a/spacy/tests/lang/de/test_lemma.py b/spacy/tests/lang/de/test_lemma.py
new file mode 100644
index 000000000..39b3b0313
--- /dev/null
+++ b/spacy/tests/lang/de/test_lemma.py
@@ -0,0 +1,13 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize('string,lemma', [('Abgehängten', 'Abgehängte'),
+                                          ('engagierte', 'engagieren'),
+                                          ('schließt', 'schließen'),
+                                          ('vorgebenden', 'vorgebend')])
+def test_lemmatizer_lookup_assigns(de_tokenizer, string, lemma):
+    tokens = de_tokenizer(string)
+    assert tokens[0].lemma_ == lemma

From eac9e99086b2c2b58ee57c0c3b621ac90116ac47 Mon Sep 17 00:00:00 2001
From: ines
Date: Wed, 11 Oct 2017 14:21:15 +0200
Subject: [PATCH 10/11] Update docs on adding lemmatization to languages

---
 .../_adding-languages/_language-data.jade | 21 ++++-----------------
 1 file changed, 4 insertions(+), 17 deletions(-)

diff --git a/website/usage/_adding-languages/_language-data.jade b/website/usage/_adding-languages/_language-data.jade
index 81a6d638e..dc86b7a03 100644
--- a/website/usage/_adding-languages/_language-data.jade
+++ b/website/usage/_adding-languages/_language-data.jade
@@ -456,24 +456,11 @@ p
     }

 p
-    |  To add a lookup lemmatizer to your language, import the #[code LOOKUP]
-    |  table and #[code Lemmatizer], and create a new classmethod:
+    |  To provide a lookup lemmatizer for your language, import the lookup table
+    |  and add it to the #[code Language] class as #[code lemma_lookup]:

-+code("__init__py (excerpt)").
-    # other imports here, plus lookup table and lookup lemmatizer
-    from .lemmatizer import LOOKUP
-    from ...lemmatizerlookup import Lemmatizer
-
-    class Xxxxx(Language):
-        lang = 'xx'
-
-        class Defaults(Language.Defaults):
-            # other language defaults here
-
-            @classmethod
-            def create_lemmatizer(cls, nlp=None):
-                return Lemmatizer(LOOKUP)
++code.
+    lemma_lookup = dict(LOOKUP)

 +h(3, "tag-map") Tag map

From 8ce6f96180ab37f7f4ec0676868b0d8b3ae18787 Mon Sep 17 00:00:00 2001
From: ines
Date: Wed, 11 Oct 2017 15:34:55 +0200
Subject: [PATCH 11/11] Don't make copies of language data components
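
The dict()/set()/tuple() calls built a fresh copy of every table at
import time; assigning the object directly lets all consumers share one
table per language. Toy illustration of the difference (generic names,
not spaCy's):

    STOP_WORDS = {'a', 'the'}
    copied = set(STOP_WORDS)   # snapshot: later edits aren't visible
    shared = STOP_WORDS        # same object: edits are visible
    STOP_WORDS.add('an')
    assert 'an' in shared and 'an' not in copied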
---
 spacy/lang/bn/__init__.py | 12 ++++++------
 spacy/lang/da/__init__.py |  2 +-
 spacy/lang/de/__init__.py | 10 +++++-----
 spacy/lang/en/__init__.py | 16 ++++++++--------
 spacy/lang/es/__init__.py |  8 ++++----
 spacy/lang/fi/__init__.py |  2 +-
 spacy/lang/fr/__init__.py | 10 +++++-----
 spacy/lang/he/__init__.py |  2 +-
 spacy/lang/hu/__init__.py | 10 +++++-----
 spacy/lang/id/__init__.py | 12 ++++++------
 spacy/lang/it/__init__.py |  4 ++--
 spacy/lang/nb/__init__.py |  2 +-
 spacy/lang/nl/__init__.py |  2 +-
 spacy/lang/pl/__init__.py |  2 +-
 spacy/lang/pt/__init__.py |  4 ++--
 spacy/lang/sv/__init__.py |  6 +++---
 spacy/lang/th/__init__.py |  4 ++--
 17 files changed, 54 insertions(+), 54 deletions(-)

diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py
index 1a76123ea..ff560afae 100644
--- a/spacy/lang/bn/__init__.py
+++ b/spacy/lang/bn/__init__.py
@@ -17,12 +17,12 @@ class BengaliDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'bn'
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    tag_map = dict(TAG_MAP)
-    stop_words = set(STOP_WORDS)
-    lemma_rules = dict(LEMMA_RULES)
-    prefixes = tuple(TOKENIZER_PREFIXES)
-    suffixes = tuple(TOKENIZER_SUFFIXES)
-    infixes = tuple(TOKENIZER_INFIXES)
+    tag_map = TAG_MAP
+    stop_words = STOP_WORDS
+    lemma_rules = LEMMA_RULES
+    prefixes = TOKENIZER_PREFIXES
+    suffixes = TOKENIZER_SUFFIXES
+    infixes = TOKENIZER_INFIXES
diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py
index b255a04b9..86e47c00d 100644
--- a/spacy/lang/da/__init__.py
+++ b/spacy/lang/da/__init__.py
@@ -16,7 +16,7 @@ class DanishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'da'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
+    stop_words = STOP_WORDS


 class Danish(Language):
diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py
index e56bab844..e8e7a12db 100644
--- a/spacy/lang/de/__init__.py
+++ b/spacy/lang/de/__init__.py
@@ -22,11 +22,11 @@ class GermanDefaults(Language.Defaults):
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
                                          NORM_EXCEPTIONS, BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    infixes = tuple(TOKENIZER_INFIXES)
-    tag_map = dict(TAG_MAP)
-    stop_words = set(STOP_WORDS)
-    syntax_iterators = dict(SYNTAX_ITERATORS)
-    lemma_lookup = dict(LOOKUP)
+    infixes = TOKENIZER_INFIXES
+    tag_map = TAG_MAP
+    stop_words = STOP_WORDS
+    syntax_iterators = SYNTAX_ITERATORS
+    lemma_lookup = LOOKUP
diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py
index fffac6467..63fd9c2b4 100644
--- a/spacy/lang/en/__init__.py
+++ b/spacy/lang/en/__init__.py
@@ -24,14 +24,14 @@ class EnglishDefaults(Language.Defaults):
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
                                          BASE_NORMS, NORM_EXCEPTIONS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    tag_map = dict(TAG_MAP)
-    stop_words = set(STOP_WORDS)
-    morph_rules = dict(MORPH_RULES)
-    lemma_rules = dict(LEMMA_RULES)
-    lemma_index = dict(LEMMA_INDEX)
-    lemma_exc = dict(LEMMA_EXC)
-    lemma_lookup = dict(LOOKUP)
-    syntax_iterators = dict(SYNTAX_ITERATORS)
+    tag_map = TAG_MAP
+    stop_words = STOP_WORDS
+    morph_rules = MORPH_RULES
+    lemma_rules = LEMMA_RULES
+    lemma_index = LEMMA_INDEX
+    lemma_exc = LEMMA_EXC
+    lemma_lookup = LOOKUP
+    syntax_iterators = SYNTAX_ITERATORS
diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py
index 4246a0703..661f0bbec 100644
--- a/spacy/lang/es/__init__.py
+++ b/spacy/lang/es/__init__.py
@@ -19,10 +19,10 @@ class SpanishDefaults(Language.Defaults):
     lex_attr_getters[LANG] = lambda text: 'es'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    tag_map = dict(TAG_MAP)
-    stop_words = set(STOP_WORDS)
-    sytax_iterators = dict(SYNTAX_ITERATORS)
-    lemma_lookup = dict(LOOKUP)
+    tag_map = TAG_MAP
+    stop_words = STOP_WORDS
+    sytax_iterators = SYNTAX_ITERATORS
+    lemma_lookup = LOOKUP
diff --git a/spacy/lang/fi/__init__.py b/spacy/lang/fi/__init__.py
index 2eb40851b..7f74495c5 100644
--- a/spacy/lang/fi/__init__.py
+++ b/spacy/lang/fi/__init__.py
@@ -16,7 +16,7 @@ class FinnishDefaults(Language.Defaults):
     lex_attr_getters[LANG] = lambda text: 'fi'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
+    stop_words = STOP_WORDS


 class Finnish(Language):
diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py
index 0f2a60e3e..42acd0736 100644
--- a/spacy/lang/fr/__init__.py
+++ b/spacy/lang/fr/__init__.py
@@ -21,12 +21,12 @@ class FrenchDefaults(Language.Defaults):
     lex_attr_getters[LANG] = lambda text: 'fr'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
-    infixes = tuple(TOKENIZER_INFIXES)
-    suffixes = tuple(TOKENIZER_SUFFIXES)
+    stop_words = STOP_WORDS
+    infixes = TOKENIZER_INFIXES
+    suffixes = TOKENIZER_SUFFIXES
     token_match = TOKEN_MATCH
-    syntax_iterators = dict(SYNTAX_ITERATORS)
-    lemma_lookup = dict(LOOKUP)
+    syntax_iterators = SYNTAX_ITERATORS
+    lemma_lookup = LOOKUP
diff --git a/spacy/lang/he/__init__.py b/spacy/lang/he/__init__.py
index b815b3273..807794fee 100644
--- a/spacy/lang/he/__init__.py
+++ b/spacy/lang/he/__init__.py
@@ -13,7 +13,7 @@ class HebrewDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'he'
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
+    stop_words = STOP_WORDS


 class Hebrew(Language):
diff --git a/spacy/lang/hu/__init__.py b/spacy/lang/hu/__init__.py
index fd039a8eb..35b047900 100644
--- a/spacy/lang/hu/__init__.py
+++ b/spacy/lang/hu/__init__.py
@@ -18,12 +18,12 @@ class HungarianDefaults(Language.Defaults):
     lex_attr_getters[LANG] = lambda text: 'hu'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
-    prefixes = tuple(TOKENIZER_PREFIXES)
-    suffixes = tuple(TOKENIZER_SUFFIXES)
-    infixes = tuple(TOKENIZER_INFIXES)
+    stop_words = STOP_WORDS
+    prefixes = TOKENIZER_PREFIXES
+    suffixes = TOKENIZER_SUFFIXES
+    infixes = TOKENIZER_INFIXES
     token_match = TOKEN_MATCH
-    lemma_lookup = dict(LOOKUP)
+    lemma_lookup = LOOKUP
diff --git a/spacy/lang/id/__init__.py b/spacy/lang/id/__init__.py
index 29fe86a01..2f21e73cf 100644
--- a/spacy/lang/id/__init__.py
+++ b/spacy/lang/id/__init__.py
@@ -20,12 +20,12 @@ class IndonesianDefaults(Language.Defaults):
     lex_attr_getters[LANG] = lambda text: 'id'
     lex_attr_getters.update(LEX_ATTRS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
-    prefixes = tuple(TOKENIZER_PREFIXES)
-    suffixes = tuple(TOKENIZER_SUFFIXES)
-    infixes = tuple(TOKENIZER_INFIXES)
-    syntax_iterators = dict(SYNTAX_ITERATORS)
-    lemma_lookup = dict(LOOKUP)
+    stop_words = STOP_WORDS
+    prefixes = TOKENIZER_PREFIXES
+    suffixes = TOKENIZER_SUFFIXES
+    infixes = TOKENIZER_INFIXES
+    syntax_iterators = SYNTAX_ITERATORS
+    lemma_lookup = LOOKUP
diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py
index c19cb6d39..6bc47ce92 100644
--- a/spacy/lang/it/__init__.py
+++ b/spacy/lang/it/__init__.py
@@ -16,8 +16,8 @@ class ItalianDefaults(Language.Defaults):
     lex_attr_getters[LANG] = lambda text: 'it'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
-    lemma_lookup = dict(LOOKUP)
+    stop_words = STOP_WORDS
+    lemma_lookup = LOOKUP
diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py
index 8804f7424..4250e6809 100644
--- a/spacy/lang/nb/__init__.py
+++ b/spacy/lang/nb/__init__.py
@@ -17,7 +17,7 @@ class NorwegianDefaults(Language.Defaults):
     lex_attr_getters[LANG] = lambda text: 'nb'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
+    stop_words = STOP_WORDS


 class Norwegian(Language):
diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py
index 29cbb4617..13786a7bc 100644
--- a/spacy/lang/nl/__init__.py
+++ b/spacy/lang/nl/__init__.py
@@ -17,7 +17,7 @@ class DutchDefaults(Language.Defaults):
     lex_attr_getters[LANG] = lambda text: 'nl'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
+    stop_words = STOP_WORDS


 class Dutch(Language):
diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py
index 22e103246..80011f9d8 100644
--- a/spacy/lang/pl/__init__.py
+++ b/spacy/lang/pl/__init__.py
@@ -16,7 +16,7 @@ class PolishDefaults(Language.Defaults):
     lex_attr_getters[LANG] = lambda text: 'pl'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
+    stop_words = STOP_WORDS


 class Polish(Language):
diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py
index 6366a25c1..2a8323597 100644
--- a/spacy/lang/pt/__init__.py
+++ b/spacy/lang/pt/__init__.py
@@ -19,8 +19,8 @@ class PortugueseDefaults(Language.Defaults):
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     lex_attr_getters.update(LEX_ATTRS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
-    lemma_lookup = dict(LOOKUP)
+    stop_words = STOP_WORDS
+    lemma_lookup = LOOKUP
diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py
index 27da9024e..224c105d7 100644
--- a/spacy/lang/sv/__init__.py
+++ b/spacy/lang/sv/__init__.py
@@ -18,9 +18,9 @@ class SwedishDefaults(Language.Defaults):
     lex_attr_getters[LANG] = lambda text: 'sv'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
-    lemma_rules = dict(LEMMA_RULES)
-    lemma_lookup = dict(LOOKUP)
+    stop_words = STOP_WORDS
+    lemma_rules = LEMMA_RULES
+    lemma_lookup = LOOKUP
diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py
index e640fc4ef..bedec46c8 100644
--- a/spacy/lang/th/__init__.py
+++ b/spacy/lang/th/__init__.py
@@ -17,8 +17,8 @@ class ThaiDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'th'
     tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS)
-    tag_map = dict(TAG_MAP)
-    stop_words = set(STOP_WORDS)
+    tag_map = TAG_MAP
+    stop_words = STOP_WORDS


 class Thai(Language):