Update language data

This commit is contained in:
Ines Montani 2021-01-27 13:29:22 +11:00
parent 230e651ad6
commit e3f8be9a94
6 changed files with 7 additions and 31 deletions

View File

@@ -1,25 +1,14 @@
# coding: utf8
from __future__ import unicode_literals
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...attrs import LANG
from ...language import Language
from ...util import update_exc
class KyrgyzDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "ky"
lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
infixes = tuple(TOKENIZER_INFIXES)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS

View File

@@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.ky.examples import sentences

View File

@@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from ...attrs import LIKE_NUM
_num_words = [

View File

@@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from ..char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, HYPHENS
from ..char_classes import LIST_ELLIPSES, LIST_ICONS

View File

@@ -1,8 +1,5 @@
# encoding: utf8
from __future__ import unicode_literals
STOP_WORDS = set(
"""
"""
ага адам айтты айтымында айтып ал алар
алардын алган алуу алып анда андан аны
анын ар

View File

@@ -1,7 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, NORM
from ...util import update_exc
_exc = {}
@@ -52,4 +51,4 @@ for exc_data in [ # "etc." abbreviations
exc_data[LEMMA] = exc_data[NORM]
_exc[exc_data[ORTH]] = [exc_data]
TOKENIZER_EXCEPTIONS = _exc
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)