mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Add lemmatizer data as variable on language data
Don't create lookup lemmatizer within Language class and just pass in the data so it can be set on Token creation
This commit is contained in:
parent
0c2343d73a
commit
417d45f5d0
|
@ -12,7 +12,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ..norm_exceptions import BASE_NORMS
|
from ..norm_exceptions import BASE_NORMS
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...lemmatizerlookup import Lemmatizer
|
|
||||||
from ...attrs import LANG, NORM
|
from ...attrs import LANG, NORM
|
||||||
from ...util import update_exc, add_lookups
|
from ...util import update_exc, add_lookups
|
||||||
|
|
||||||
|
@ -27,10 +26,7 @@ class GermanDefaults(Language.Defaults):
|
||||||
tag_map = dict(TAG_MAP)
|
tag_map = dict(TAG_MAP)
|
||||||
stop_words = set(STOP_WORDS)
|
stop_words = set(STOP_WORDS)
|
||||||
syntax_iterators = dict(SYNTAX_ITERATORS)
|
syntax_iterators = dict(SYNTAX_ITERATORS)
|
||||||
|
lemma_lookup = dict(LOOKUP)
|
||||||
@classmethod
|
|
||||||
def create_lemmatizer(cls, nlp=None):
|
|
||||||
return Lemmatizer(LOOKUP)
|
|
||||||
|
|
||||||
|
|
||||||
class German(Language):
|
class German(Language):
|
||||||
|
|
|
@ -7,7 +7,7 @@ from .tag_map import TAG_MAP
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .morph_rules import MORPH_RULES
|
from .morph_rules import MORPH_RULES
|
||||||
from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
|
from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC, LOOKUP
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
|
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
@ -30,6 +30,7 @@ class EnglishDefaults(Language.Defaults):
|
||||||
lemma_rules = dict(LEMMA_RULES)
|
lemma_rules = dict(LEMMA_RULES)
|
||||||
lemma_index = dict(LEMMA_INDEX)
|
lemma_index = dict(LEMMA_INDEX)
|
||||||
lemma_exc = dict(LEMMA_EXC)
|
lemma_exc = dict(LEMMA_EXC)
|
||||||
|
lemma_lookup = dict(LOOKUP)
|
||||||
syntax_iterators = dict(SYNTAX_ITERATORS)
|
syntax_iterators = dict(SYNTAX_ITERATORS)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -10,7 +10,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ..norm_exceptions import BASE_NORMS
|
from ..norm_exceptions import BASE_NORMS
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...lemmatizerlookup import Lemmatizer
|
|
||||||
from ...attrs import LANG, NORM
|
from ...attrs import LANG, NORM
|
||||||
from ...util import update_exc, add_lookups
|
from ...util import update_exc, add_lookups
|
||||||
|
|
||||||
|
@ -23,10 +22,7 @@ class SpanishDefaults(Language.Defaults):
|
||||||
tag_map = dict(TAG_MAP)
|
tag_map = dict(TAG_MAP)
|
||||||
stop_words = set(STOP_WORDS)
|
stop_words = set(STOP_WORDS)
|
||||||
sytax_iterators = dict(SYNTAX_ITERATORS)
|
sytax_iterators = dict(SYNTAX_ITERATORS)
|
||||||
|
lemma_lookup = dict(LOOKUP)
|
||||||
@classmethod
|
|
||||||
def create_lemmatizer(cls, nlp=None):
|
|
||||||
return Lemmatizer(LOOKUP)
|
|
||||||
|
|
||||||
|
|
||||||
class Spanish(Language):
|
class Spanish(Language):
|
||||||
|
|
|
@ -11,7 +11,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ..norm_exceptions import BASE_NORMS
|
from ..norm_exceptions import BASE_NORMS
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...lemmatizerlookup import Lemmatizer
|
|
||||||
from ...attrs import LANG, NORM
|
from ...attrs import LANG, NORM
|
||||||
from ...util import update_exc, add_lookups
|
from ...util import update_exc, add_lookups
|
||||||
|
|
||||||
|
@ -27,10 +26,7 @@ class FrenchDefaults(Language.Defaults):
|
||||||
suffixes = tuple(TOKENIZER_SUFFIXES)
|
suffixes = tuple(TOKENIZER_SUFFIXES)
|
||||||
token_match = TOKEN_MATCH
|
token_match = TOKEN_MATCH
|
||||||
syntax_iterators = dict(SYNTAX_ITERATORS)
|
syntax_iterators = dict(SYNTAX_ITERATORS)
|
||||||
|
lemma_lookup = dict(LOOKUP)
|
||||||
@classmethod
|
|
||||||
def create_lemmatizer(cls, nlp=None):
|
|
||||||
return Lemmatizer(LOOKUP)
|
|
||||||
|
|
||||||
|
|
||||||
class French(Language):
|
class French(Language):
|
||||||
|
|
|
@ -9,7 +9,6 @@ from .lemmatizer import LOOKUP
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ..norm_exceptions import BASE_NORMS
|
from ..norm_exceptions import BASE_NORMS
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...lemmatizerlookup import Lemmatizer
|
|
||||||
from ...attrs import LANG, NORM
|
from ...attrs import LANG, NORM
|
||||||
from ...util import update_exc, add_lookups
|
from ...util import update_exc, add_lookups
|
||||||
|
|
||||||
|
@ -24,10 +23,7 @@ class HungarianDefaults(Language.Defaults):
|
||||||
suffixes = tuple(TOKENIZER_SUFFIXES)
|
suffixes = tuple(TOKENIZER_SUFFIXES)
|
||||||
infixes = tuple(TOKENIZER_INFIXES)
|
infixes = tuple(TOKENIZER_INFIXES)
|
||||||
token_match = TOKEN_MATCH
|
token_match = TOKEN_MATCH
|
||||||
|
lemma_lookup = dict(LOOKUP)
|
||||||
@classmethod
|
|
||||||
def create_lemmatizer(cls, nlp=None):
|
|
||||||
return Lemmatizer(LOOKUP)
|
|
||||||
|
|
||||||
|
|
||||||
class Hungarian(Language):
|
class Hungarian(Language):
|
||||||
|
|
|
@ -11,7 +11,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
|
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...lemmatizerlookup import Lemmatizer
|
|
||||||
from ...attrs import LANG
|
from ...attrs import LANG
|
||||||
from ...util import update_exc
|
from ...util import update_exc
|
||||||
|
|
||||||
|
@ -26,10 +25,7 @@ class IndonesianDefaults(Language.Defaults):
|
||||||
suffixes = tuple(TOKENIZER_SUFFIXES)
|
suffixes = tuple(TOKENIZER_SUFFIXES)
|
||||||
infixes = tuple(TOKENIZER_INFIXES)
|
infixes = tuple(TOKENIZER_INFIXES)
|
||||||
syntax_iterators = dict(SYNTAX_ITERATORS)
|
syntax_iterators = dict(SYNTAX_ITERATORS)
|
||||||
|
lemma_lookup = dict(LOOKUP)
|
||||||
@classmethod
|
|
||||||
def create_lemmatizer(cls, nlp=None):
|
|
||||||
return Lemmatizer(LOOKUP)
|
|
||||||
|
|
||||||
|
|
||||||
class Indonesian(Language):
|
class Indonesian(Language):
|
||||||
|
|
|
@ -7,7 +7,6 @@ from .lemmatizer import LOOKUP
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ..norm_exceptions import BASE_NORMS
|
from ..norm_exceptions import BASE_NORMS
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...lemmatizerlookup import Lemmatizer
|
|
||||||
from ...attrs import LANG, NORM
|
from ...attrs import LANG, NORM
|
||||||
from ...util import update_exc, add_lookups
|
from ...util import update_exc, add_lookups
|
||||||
|
|
||||||
|
@ -18,10 +17,7 @@ class ItalianDefaults(Language.Defaults):
|
||||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||||
stop_words = set(STOP_WORDS)
|
stop_words = set(STOP_WORDS)
|
||||||
|
lemma_lookup = dict(LOOKUP)
|
||||||
@classmethod
|
|
||||||
def create_lemmatizer(cls, nlp=None):
|
|
||||||
return Lemmatizer(LOOKUP)
|
|
||||||
|
|
||||||
|
|
||||||
class Italian(Language):
|
class Italian(Language):
|
||||||
|
|
|
@ -9,7 +9,6 @@ from .lemmatizer import LOOKUP
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ..norm_exceptions import BASE_NORMS
|
from ..norm_exceptions import BASE_NORMS
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...lemmatizerlookup import Lemmatizer
|
|
||||||
from ...attrs import LANG, NORM
|
from ...attrs import LANG, NORM
|
||||||
from ...util import update_exc, add_lookups
|
from ...util import update_exc, add_lookups
|
||||||
|
|
||||||
|
@ -21,10 +20,7 @@ class PortugueseDefaults(Language.Defaults):
|
||||||
lex_attr_getters.update(LEX_ATTRS)
|
lex_attr_getters.update(LEX_ATTRS)
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
stop_words = set(STOP_WORDS)
|
stop_words = set(STOP_WORDS)
|
||||||
|
lemma_lookup = dict(LOOKUP)
|
||||||
@classmethod
|
|
||||||
def create_lemmatizer(cls, nlp=None):
|
|
||||||
return Lemmatizer(LOOKUP)
|
|
||||||
|
|
||||||
|
|
||||||
class Portuguese(Language):
|
class Portuguese(Language):
|
||||||
|
|
|
@ -9,7 +9,6 @@ from .lemmatizer import LEMMA_RULES, LOOKUP
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ..norm_exceptions import BASE_NORMS
|
from ..norm_exceptions import BASE_NORMS
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...lemmatizerlookup import Lemmatizer
|
|
||||||
from ...attrs import LANG, NORM
|
from ...attrs import LANG, NORM
|
||||||
from ...util import update_exc, add_lookups
|
from ...util import update_exc, add_lookups
|
||||||
|
|
||||||
|
@ -20,10 +19,8 @@ class SwedishDefaults(Language.Defaults):
|
||||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
stop_words = set(STOP_WORDS)
|
stop_words = set(STOP_WORDS)
|
||||||
|
lemma_rules = dict(LEMMA_RULES)
|
||||||
@classmethod
|
lemma_lookup = dict(LOOKUP)
|
||||||
def create_lemmatizer(cls, nlp=None):
|
|
||||||
return Lemmatizer(LOOKUP)
|
|
||||||
|
|
||||||
|
|
||||||
class Swedish(Language):
|
class Swedish(Language):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user