mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-03 05:04:09 +03:00
2f981d5af1
Remove corpus-specific tag maps from the language data for languages without custom tokenizers. For languages with custom word segmenters that also provide tags (Japanese and Korean), the tag maps for the custom tokenizers are kept as the default. The default tag maps for languages without custom tokenizers are now the default tag map from `lang/tag_map.py`, UPOS -> UPOS.
44 lines
1.3 KiB
Python
44 lines
1.3 KiB
Python
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
|
from .punctuation import TOKENIZER_SUFFIXES
|
|
from .stop_words import STOP_WORDS
|
|
from .lex_attrs import LEX_ATTRS
|
|
from .lemmatizer import PolishLemmatizer
|
|
|
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
|
from ..norm_exceptions import BASE_NORMS
|
|
from ...language import Language
|
|
from ...attrs import LANG, NORM
|
|
from ...util import add_lookups
|
|
from ...lookups import Lookups
|
|
|
|
|
|
class PolishDefaults(Language.Defaults):
    """Default language data for Polish.

    Bundles the lexical attribute getters, tokenizer exceptions, stop words,
    punctuation rules and the lemmatizer factory used by `Polish`.
    """

    # Work on a copy of the shared getters so the mapping on
    # Language.Defaults is left untouched, then layer the Polish ones on top.
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters.update(
        {
            LANG: lambda text: "pl",
            NORM: add_lookups(
                Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
            ),
        }
    )

    # Keep only the shared base exceptions that do not end in a period.
    mod_base_exceptions = {
        key: value
        for key, value in BASE_EXCEPTIONS.items()
        if not key.endswith(".")
    }
    tokenizer_exceptions = mod_base_exceptions

    # Punctuation rules for the tokenizer.
    prefixes = TOKENIZER_PREFIXES
    suffixes = TOKENIZER_SUFFIXES
    infixes = TOKENIZER_INFIXES

    stop_words = STOP_WORDS

    @classmethod
    def create_lemmatizer(cls, nlp=None, lookups=None):
        """Create the lemmatizer for Polish.

        nlp: Unused by this implementation.
        lookups: Lookups object handed to the lemmatizer. If None, a new
            empty `Lookups()` is created.
        RETURNS (PolishLemmatizer): The lemmatizer built from the lookups.
        """
        return PolishLemmatizer(Lookups() if lookups is None else lookups)
|
|
|
|
|
|
class Polish(Language):
    """Polish language class: language code "pl" with `PolishDefaults`."""

    Defaults = PolishDefaults
    lang = "pl"
|
|
|
|
|
|
# Public names of this module: only the Polish language class is exported.
__all__ = ["Polish"]
|