Add lemmatizer data as variable on language data

Don't create the lookup lemmatizer within the Language class; instead, just
pass in the data so it can be set on Token creation
ines 2017-10-11 02:24:58 +02:00
parent 0c2343d73a
commit 417d45f5d0
9 changed files with 11 additions and 41 deletions
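
The same pattern change is applied across all nine language files below: the per-language Defaults class stops building a lookup Lemmatizer itself and instead exposes the raw table as a class variable. A minimal sketch of the before/after shape, using stub stand-ins for spaCy's Language and Lemmatizer classes (only the structure of the change is taken from the diff; the stubs and the table entry are hypothetical):

# Stub stand-ins, not the real spaCy classes.
LOOKUP = {"ging": "gehen"}  # hypothetical lookup-table entry


class Lemmatizer(object):
    def __init__(self, lookup):
        self.lookup = lookup


class Language(object):
    class Defaults(object):
        pass


# Before: every Defaults class constructed its own lookup lemmatizer.
class GermanDefaultsBefore(Language.Defaults):
    @classmethod
    def create_lemmatizer(cls, nlp=None):
        return Lemmatizer(LOOKUP)


# After: the Defaults class only carries the data, so whatever creates
# tokens can read lemma_lookup directly and apply it at Token creation.
class GermanDefaultsAfter(Language.Defaults):
    lemma_lookup = dict(LOOKUP)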

spacy/lang/de/__init__.py

@@ -12,7 +12,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@@ -27,10 +26,7 @@ class GermanDefaults(Language.Defaults):
     tag_map = dict(TAG_MAP)
     stop_words = set(STOP_WORDS)
     syntax_iterators = dict(SYNTAX_ITERATORS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    lemma_lookup = dict(LOOKUP)


 class German(Language):

spacy/lang/en/__init__.py

@@ -7,7 +7,7 @@ from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .morph_rules import MORPH_RULES
-from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
+from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC, LOOKUP
 from .syntax_iterators import SYNTAX_ITERATORS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
@@ -30,6 +30,7 @@ class EnglishDefaults(Language.Defaults):
     lemma_rules = dict(LEMMA_RULES)
     lemma_index = dict(LEMMA_INDEX)
     lemma_exc = dict(LEMMA_EXC)
+    lemma_lookup = dict(LOOKUP)
     syntax_iterators = dict(SYNTAX_ITERATORS)

spacy/lang/es/__init__.py

@@ -10,7 +10,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@@ -23,10 +22,7 @@ class SpanishDefaults(Language.Defaults):
     tag_map = dict(TAG_MAP)
     stop_words = set(STOP_WORDS)
     sytax_iterators = dict(SYNTAX_ITERATORS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    lemma_lookup = dict(LOOKUP)


 class Spanish(Language):

spacy/lang/fr/__init__.py

@@ -11,7 +11,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@@ -27,10 +26,7 @@ class FrenchDefaults(Language.Defaults):
     suffixes = tuple(TOKENIZER_SUFFIXES)
     token_match = TOKEN_MATCH
     syntax_iterators = dict(SYNTAX_ITERATORS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    lemma_lookup = dict(LOOKUP)


 class French(Language):

spacy/lang/hu/__init__.py

@@ -9,7 +9,6 @@ from .lemmatizer import LOOKUP
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@@ -24,10 +23,7 @@ class HungarianDefaults(Language.Defaults):
     suffixes = tuple(TOKENIZER_SUFFIXES)
     infixes = tuple(TOKENIZER_INFIXES)
     token_match = TOKEN_MATCH
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    lemma_lookup = dict(LOOKUP)


 class Hungarian(Language):

spacy/lang/id/__init__.py

@@ -11,7 +11,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG
 from ...util import update_exc

@@ -26,10 +25,7 @@ class IndonesianDefaults(Language.Defaults):
     suffixes = tuple(TOKENIZER_SUFFIXES)
     infixes = tuple(TOKENIZER_INFIXES)
     syntax_iterators = dict(SYNTAX_ITERATORS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    lemma_lookup = dict(LOOKUP)


 class Indonesian(Language):

spacy/lang/it/__init__.py

@@ -7,7 +7,6 @@ from .lemmatizer import LOOKUP
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@@ -18,10 +17,7 @@ class ItalianDefaults(Language.Defaults):
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
     stop_words = set(STOP_WORDS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    lemma_lookup = dict(LOOKUP)


 class Italian(Language):

spacy/lang/pt/__init__.py

@@ -9,7 +9,6 @@ from .lemmatizer import LOOKUP
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@@ -21,10 +20,7 @@ class PortugueseDefaults(Language.Defaults):
     lex_attr_getters.update(LEX_ATTRS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = set(STOP_WORDS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    lemma_lookup = dict(LOOKUP)


 class Portuguese(Language):

spacy/lang/sv/__init__.py

@@ -9,7 +9,6 @@ from .lemmatizer import LEMMA_RULES, LOOKUP
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@@ -20,10 +19,8 @@ class SwedishDefaults(Language.Defaults):
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = set(STOP_WORDS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    lemma_rules = dict(LEMMA_RULES)
+    lemma_lookup = dict(LOOKUP)


 class Swedish(Language):
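
For reference, a lookup lemmatizer of the kind these LOOKUP tables feed is just a dictionary lookup with a fall-through to the surface form. A minimal sketch (the table entries and function name here are hypothetical, not taken from spaCy's data or API):

# Sketch of lookup lemmatization; entries are made up for illustration.
LOOKUP = {"ging": "gehen", "Häuser": "Haus"}


def lookup_lemma(string, table=LOOKUP):
    # Use the table entry if present, else fall back to the surface form.
    return table.get(string, string)


print(lookup_lemma("ging"))   # -> gehen
print(lookup_lemma("spaCy"))  # -> spaCy (no entry, falls through)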