spaCy/spacy/lang/fr/__init__.py
ines 417d45f5d0 Add lemmatizer data as variable on language data
Don't create lookup lemmatizer within Language class and just pass in
the data so it can be set on Token creation
2017-10-11 02:24:58 +02:00

38 lines
1.2 KiB
Python

# coding: utf8
from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import LOOKUP
from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class FrenchDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: 'fr'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
infixes = tuple(TOKENIZER_INFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
token_match = TOKEN_MATCH
syntax_iterators = dict(SYNTAX_ITERATORS)
lemma_lookup = dict(LOOKUP)
class French(Language):
lang = 'fr'
Defaults = FrenchDefaults
__all__ = ['French']