Update language data

This commit is contained in:
Ines Montani 2021-01-27 13:29:22 +11:00
parent 230e651ad6
commit e3f8be9a94
6 changed files with 7 additions and 31 deletions

View File

@@ -1,25 +1,14 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...attrs import LANG
 from ...language import Language
-from ...util import update_exc
 
 
 class KyrgyzDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "ky"
-    lex_attr_getters.update(LEX_ATTRS)
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    infixes = tuple(TOKENIZER_INFIXES)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    infixes = TOKENIZER_INFIXES
+    lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS

View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 """
 Example sentences to test spaCy and its language models.
 
 >>> from spacy.lang.ky.examples import sentences

View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM
 
 _num_words = [

View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ..char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, HYPHENS
 from ..char_classes import LIST_ELLIPSES, LIST_ICONS

View File

@@ -1,8 +1,5 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
 STOP_WORDS = set(
     """
 ага адам айтты айтымында айтып ал алар
 алардын алган алуу алып анда андан аны
 анын ар

View File

@@ -1,7 +1,6 @@
-# coding: utf8
-from __future__ import unicode_literals
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH, LEMMA, NORM
+from ...util import update_exc
 
 _exc = {}
@@ -52,4 +51,4 @@ for exc_data in [  # "etc." abbreviations
     exc_data[LEMMA] = exc_data[NORM]
     _exc[exc_data[ORTH]] = [exc_data]
 
-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)