spaCy/spacy/lang/pl/__init__.py
Adriane Boyd 2f981d5af1 Remove corpus-specific tag maps
Remove corpus-specific tag maps from the language data for languages
without custom tokenizers. For languages with custom word segmenters
that also provide tags (Japanese and Korean), the tag maps for the
custom tokenizers are kept as the default.

The default tag maps for languages without custom tokenizers are now the
default tag map from `lang/tag_map/py`, UPOS -> UPOS.
2020-07-15 15:58:29 +02:00

44 lines
1.3 KiB
Python

from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import PolishLemmatizer
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import add_lookups
from ...lookups import Lookups
class PolishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "pl"
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
mod_base_exceptions = {
exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
}
tokenizer_exceptions = mod_base_exceptions
stop_words = STOP_WORDS
prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
@classmethod
def create_lemmatizer(cls, nlp=None, lookups=None):
if lookups is None:
lookups = Lookups()
return PolishLemmatizer(lookups)
class Polish(Language):
lang = "pl"
Defaults = PolishDefaults
__all__ = ["Polish"]