mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Add support for vocab.writing_system property (#3390)
* Add xfail test for vocab.writing_system
* Add vocab.writing_system property
* Set Language.Defaults.writing_system
* Set default writing system
* Remove xfail on test_vocab_writing_system
This commit is contained in:
parent
05ef0a5abb
commit
39a4741e26
|
@ -27,6 +27,7 @@ class PersianDefaults(Language.Defaults):
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
tag_map = TAG_MAP
|
tag_map = TAG_MAP
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
|
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
|
||||||
|
|
||||||
|
|
||||||
class Persian(Language):
|
class Persian(Language):
|
||||||
|
|
|
@ -14,7 +14,7 @@ class HebrewDefaults(Language.Defaults):
|
||||||
lex_attr_getters[LANG] = lambda text: "he"
|
lex_attr_getters[LANG] = lambda text: "he"
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
|
||||||
|
|
||||||
class Hebrew(Language):
|
class Hebrew(Language):
|
||||||
lang = "he"
|
lang = "he"
|
||||||
|
|
|
@ -94,6 +94,7 @@ class JapaneseDefaults(Language.Defaults):
|
||||||
lex_attr_getters[LANG] = lambda _text: "ja"
|
lex_attr_getters[LANG] = lambda _text: "ja"
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
tag_map = TAG_MAP
|
tag_map = TAG_MAP
|
||||||
|
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def create_tokenizer(cls, nlp=None):
|
def create_tokenizer(cls, nlp=None):
|
||||||
|
|
|
@ -14,7 +14,7 @@ class ChineseDefaults(Language.Defaults):
|
||||||
use_jieba = True
|
use_jieba = True
|
||||||
tokenizer_exceptions = BASE_EXCEPTIONS
|
tokenizer_exceptions = BASE_EXCEPTIONS
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
|
||||||
|
|
||||||
class Chinese(Language):
|
class Chinese(Language):
|
||||||
lang = "zh"
|
lang = "zh"
|
||||||
|
|
|
@ -94,6 +94,7 @@ class BaseDefaults(object):
|
||||||
morph_rules = {}
|
morph_rules = {}
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
syntax_iterators = {}
|
syntax_iterators = {}
|
||||||
|
writing_system = {"direction": "ltr", "has_case": True, "has_letters": True}
|
||||||
|
|
||||||
|
|
||||||
class Language(object):
|
class Language(object):
|
||||||
|
|
|
@ -45,3 +45,8 @@ def test_vocab_api_contains(en_vocab, text):
|
||||||
_ = en_vocab[text] # noqa: F841
|
_ = en_vocab[text] # noqa: F841
|
||||||
assert text in en_vocab
|
assert text in en_vocab
|
||||||
assert "LKsdjvlsakdvlaksdvlkasjdvljasdlkfvm" not in en_vocab
|
assert "LKsdjvlsakdvlaksdvlkasjdvljasdlkfvm" not in en_vocab
|
||||||
|
|
||||||
|
|
||||||
|
def test_vocab_writing_system(en_vocab):
    # The vocab exposes its writing system as a dict; for the English vocab
    # the text direction is left-to-right and the script has case.
    assert en_vocab.writing_system["direction"] == "ltr"
    # Identity check: the flag must be the boolean True, not merely a value
    # that compares equal to it (e.g. 1).
    assert en_vocab.writing_system["has_case"] is True
|
||||||
|
|
|
@ -38,6 +38,18 @@ def set_env_log(value):
|
||||||
_PRINT_ENV = value
|
_PRINT_ENV = value
|
||||||
|
|
||||||
|
|
||||||
|
def lang_class_is_loaded(lang):
    """Check whether a Language class is already loaded. Language classes are
    loaded lazily, to avoid expensive setup code associated with the language
    data.

    lang (unicode): Two-letter language code, e.g. 'en'.
    RETURNS (bool): Whether a Language class has been loaded.
    """
    # LANGUAGES is only read here, never rebound, so no `global` declaration
    # is required for the lookup.
    return lang in LANGUAGES
|
||||||
|
|
||||||
|
|
||||||
def get_lang_class(lang):
|
def get_lang_class(lang):
|
||||||
"""Import and load a Language class.
|
"""Import and load a Language class.
|
||||||
|
|
||||||
|
|
|
@ -67,6 +67,17 @@ cdef class Vocab:
|
||||||
langfunc = self.lex_attr_getters.get(LANG, None)
|
langfunc = self.lex_attr_getters.get(LANG, None)
|
||||||
return langfunc("_") if langfunc else ""
|
return langfunc("_") if langfunc else ""
|
||||||
|
|
||||||
|
property writing_system:
    """Information about the writing system of the vocab's language, as a
    dict. The Language class is looked up via the vocab.lang property. If
    that Language class has not been loaded, the returned dict is empty.
    """
    def __get__(self):
        if util.lang_class_is_loaded(self.lang):
            defaults = util.get_lang_class(self.lang).Defaults
            # dict() makes a shallow copy, so mutating the result leaves the
            # class-level defaults untouched.
            return dict(defaults.writing_system)
        return {}
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
"""The current number of lexemes stored.
|
"""The current number of lexemes stored.
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user