mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 01:16:28 +03:00
Add support for vocab.writing_system property (#3390)
* Add xfail test for vocab.writing_system
* Add vocab.writing_system property
* Set Language.Defaults.writing_system
* Set default writing system
* Remove xfail on test_vocab_writing_system
This commit is contained in:
parent
05ef0a5abb
commit
39a4741e26
|
@ -27,6 +27,7 @@ class PersianDefaults(Language.Defaults):
|
|||
stop_words = STOP_WORDS
|
||||
tag_map = TAG_MAP
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
|
||||
|
||||
|
||||
class Persian(Language):
|
||||
|
|
|
@ -14,7 +14,7 @@ class HebrewDefaults(Language.Defaults):
|
|||
lex_attr_getters[LANG] = lambda text: "he"
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
|
||||
|
||||
class Hebrew(Language):
|
||||
lang = "he"
|
||||
|
|
|
@ -94,6 +94,7 @@ class JapaneseDefaults(Language.Defaults):
|
|||
lex_attr_getters[LANG] = lambda _text: "ja"
|
||||
stop_words = STOP_WORDS
|
||||
tag_map = TAG_MAP
|
||||
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
|
||||
|
||||
@classmethod
|
||||
def create_tokenizer(cls, nlp=None):
|
||||
|
|
|
@ -14,7 +14,7 @@ class ChineseDefaults(Language.Defaults):
|
|||
use_jieba = True
|
||||
tokenizer_exceptions = BASE_EXCEPTIONS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
|
||||
|
||||
class Chinese(Language):
|
||||
lang = "zh"
|
||||
|
|
|
@ -94,6 +94,7 @@ class BaseDefaults(object):
|
|||
morph_rules = {}
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
syntax_iterators = {}
|
||||
writing_system = {"direction": "ltr", "has_case": True, "has_letters": True}
|
||||
|
||||
|
||||
class Language(object):
|
||||
|
|
|
@ -45,3 +45,8 @@ def test_vocab_api_contains(en_vocab, text):
|
|||
_ = en_vocab[text] # noqa: F841
|
||||
assert text in en_vocab
|
||||
assert "LKsdjvlsakdvlaksdvlkasjdvljasdlkfvm" not in en_vocab
|
||||
|
||||
|
||||
def test_vocab_writing_system(en_vocab):
    """Check the writing-system metadata exposed by the English vocab.

    en_vocab: The English vocab fixture; only its `writing_system`
        property is read here.
    """
    writing_system = en_vocab.writing_system
    assert writing_system["direction"] == "ltr"
    # Identity check rather than `== True`: the flag has to be the bool
    # True itself, not merely a truthy value that compares equal to it.
    assert writing_system["has_case"] is True
|
||||
|
|
|
@ -38,6 +38,18 @@ def set_env_log(value):
|
|||
_PRINT_ENV = value
|
||||
|
||||
|
||||
def lang_class_is_loaded(lang):
    """Check whether a Language class is already loaded. Language classes are
    loaded lazily, to avoid expensive setup code associated with the language
    data.

    lang (unicode): Two-letter language code, e.g. 'en'.
    RETURNS (bool): Whether a Language class has been loaded.
    """
    # LANGUAGES is only read here, never rebound, so no `global` statement
    # is required for the lookup.
    return lang in LANGUAGES
|
||||
|
||||
|
||||
def get_lang_class(lang):
|
||||
"""Import and load a Language class.
|
||||
|
||||
|
|
|
@ -67,6 +67,17 @@ cdef class Vocab:
|
|||
langfunc = self.lex_attr_getters.get(LANG, None)
|
||||
return langfunc("_") if langfunc else ""
|
||||
|
||||
property writing_system:
    """Information about the language's writing system, as a dict.

    The Language class is looked up via the vocab.lang property. When
    that Language class has not been loaded, the result is an empty dict.
    """
    def __get__(self):
        if util.lang_class_is_loaded(self.lang):
            defaults = util.get_lang_class(self.lang).Defaults
            # Hand back a shallow copy instead of the Defaults attribute
            # itself.
            return dict(defaults.writing_system)
        return {}
|
||||
|
||||
def __len__(self):
|
||||
"""The current number of lexemes stored.
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user