spaCy/spacy/lang/de/__init__.py
Adriane Boyd 2f981d5af1 Remove corpus-specific tag maps
Remove corpus-specific tag maps from the language data for languages
without custom tokenizers. For languages with custom word segmenters
that also provide tags (Japanese and Korean), the tag maps for the
custom tokenizers are kept as the default.

The default tag map for languages without custom tokenizers is now the
generic tag map from `lang/tag_map.py`, UPOS -> UPOS.
2020-07-15 15:58:29 +02:00
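The generic map the message refers to simply sends each Universal POS tag to itself. As a rough sketch of what such an identity mapping looks like (a few illustrative entries only, not the full table in `lang/tag_map.py`):

from spacy.symbols import POS, ADJ, ADV, NOUN, VERB

# UPOS -> UPOS identity mapping: each coarse-grained tag string
# maps straight to the corresponding POS symbol, with nothing
# corpus-specific attached.
TAG_MAP = {
    "ADJ": {POS: ADJ},
    "ADV": {POS: ADV},
    "NOUN": {POS: NOUN},
    "VERB": {POS: VERB},
}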

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc


class GermanDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "de"
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    prefixes = TOKENIZER_PREFIXES
    suffixes = TOKENIZER_SUFFIXES
    infixes = TOKENIZER_INFIXES
    stop_words = STOP_WORDS
    syntax_iterators = SYNTAX_ITERATORS
    single_orth_variants = [
        {"tags": ["$("], "variants": ["…", "..."]},
        {"tags": ["$("], "variants": ["-", "–", "—", "--", "---", "——"]},
    ]
    paired_orth_variants = [
        {
            "tags": ["$("],
            "variants": [("'", "'"), (",", "'"), ("‚", "‘"), ("›", "‹"), ("‹", "›")],
        },
        {
            "tags": ["$("],
            "variants": [("``", "''"), ('"', '"'), ("„", "“"), ("»", "«"), ("«", "»")],
        },
    ]


class German(Language):
    lang = "de"
    Defaults = GermanDefaults


__all__ = ["German"]
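
For reference, a minimal usage sketch of these defaults (the sample sentence is an arbitrary example): creating a blank `German` pipeline picks up the tokenizer settings, exceptions, and stop words defined above.

from spacy.lang.de import German

# Blank German pipeline built from GermanDefaults.
nlp = German()
doc = nlp("Das ist ein Satz.")  # "This is a sentence."
print([token.text for token in doc])
# ['Das', 'ist', 'ein', 'Satz', '.']
assert doc[1].is_stop  # "ist" comes from STOP_WORDS

The orth variants, by contrast, are not applied at runtime: they drive spaCy's training-time orthographic augmentation, which swaps equivalent punctuation (e.g. „…“ for »…«, both tagged `$(`) so a model does not learn to tie the tag to a single quoting convention.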