mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-13 01:50:33 +03:00
Remove corpus-specific tag maps from the language data for languages without custom tokenizers. For languages with custom word segmenters that also provide tags (Japanese and Korean), the tag maps for the custom tokenizers are kept as the default. The default tag maps for languages without custom tokenizers are now the default tag map from `lang/tag_map.py`, UPOS -> UPOS.
26 lines
617 B
Python
26 lines
617 B
Python
from .stop_words import STOP_WORDS
|
|
from .lex_attrs import LEX_ATTRS
|
|
from .punctuation import TOKENIZER_SUFFIXES
|
|
|
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
|
from ...language import Language
|
|
from ...attrs import LANG
|
|
|
|
|
|
class BasqueDefaults(Language.Defaults):
    """Language-data defaults for Basque ("eu").

    Overrides the shared spaCy defaults with the Basque lexical
    attributes, stop words, and suffix punctuation rules.
    """

    # Start from the shared attribute getters, layer the Basque-specific
    # getters on top, and pin LANG to the ISO 639-1 code for Basque.
    lex_attr_getters = {
        **Language.Defaults.lex_attr_getters,
        **LEX_ATTRS,
        LANG: lambda text: "eu",
    }
    # No Basque-specific tokenizer exceptions: reuse the shared base set.
    tokenizer_exceptions = BASE_EXCEPTIONS
    stop_words = STOP_WORDS
    suffixes = TOKENIZER_SUFFIXES
|
|
|
|
|
|
class Basque(Language):
    """Basque language pipeline class, identified by the code "eu"."""

    Defaults = BasqueDefaults
    lang = "eu"
|
|
|
|
|
|
__all__ = ["Basque"]
|