Add support for vocab.writing_system property (#3390)

* Add xfail test for vocab.writing_system

* Add vocab.writing_system property

* Set Language.Defaults.writing_system

* Set default writing system

* Remove xfail on test_vocab_writing_system
This commit is contained in:
Matthew Honnibal 2019-03-11 15:23:20 +01:00 committed by Ines Montani
parent 05ef0a5abb
commit 39a4741e26
8 changed files with 33 additions and 2 deletions

View File

@ -27,6 +27,7 @@ class PersianDefaults(Language.Defaults):
stop_words = STOP_WORDS stop_words = STOP_WORDS
tag_map = TAG_MAP tag_map = TAG_MAP
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
class Persian(Language): class Persian(Language):

View File

@ -14,7 +14,7 @@ class HebrewDefaults(Language.Defaults):
lex_attr_getters[LANG] = lambda text: "he" lex_attr_getters[LANG] = lambda text: "he"
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = STOP_WORDS stop_words = STOP_WORDS
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
class Hebrew(Language): class Hebrew(Language):
lang = "he" lang = "he"

View File

@ -94,6 +94,7 @@ class JapaneseDefaults(Language.Defaults):
lex_attr_getters[LANG] = lambda _text: "ja" lex_attr_getters[LANG] = lambda _text: "ja"
stop_words = STOP_WORDS stop_words = STOP_WORDS
tag_map = TAG_MAP tag_map = TAG_MAP
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
@classmethod @classmethod
def create_tokenizer(cls, nlp=None): def create_tokenizer(cls, nlp=None):

View File

@ -14,7 +14,7 @@ class ChineseDefaults(Language.Defaults):
use_jieba = True use_jieba = True
tokenizer_exceptions = BASE_EXCEPTIONS tokenizer_exceptions = BASE_EXCEPTIONS
stop_words = STOP_WORDS stop_words = STOP_WORDS
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
class Chinese(Language): class Chinese(Language):
lang = "zh" lang = "zh"

View File

@ -94,6 +94,7 @@ class BaseDefaults(object):
morph_rules = {} morph_rules = {}
lex_attr_getters = LEX_ATTRS lex_attr_getters = LEX_ATTRS
syntax_iterators = {} syntax_iterators = {}
writing_system = {"direction": "ltr", "has_case": True, "has_letters": True}
class Language(object): class Language(object):

View File

@ -45,3 +45,8 @@ def test_vocab_api_contains(en_vocab, text):
_ = en_vocab[text] # noqa: F841 _ = en_vocab[text] # noqa: F841
assert text in en_vocab assert text in en_vocab
assert "LKsdjvlsakdvlaksdvlkasjdvljasdlkfvm" not in en_vocab assert "LKsdjvlsakdvlaksdvlkasjdvljasdlkfvm" not in en_vocab
def test_vocab_writing_system(en_vocab):
    """Check the writing-system metadata exposed by the `en_vocab` fixture.

    The vocab is expected to report a left-to-right script that has case.
    """
    writing_system = en_vocab.writing_system
    assert writing_system["direction"] == "ltr"
    # Identity check rather than `== True` (PEP 8 / E712): this also rejects
    # truthy non-bool values such as 1.
    assert writing_system["has_case"] is True

View File

@ -38,6 +38,18 @@ def set_env_log(value):
_PRINT_ENV = value _PRINT_ENV = value
def lang_class_is_loaded(lang):
    """Report whether the Language class for a language code is loaded.

    Language classes are loaded lazily, so that the expensive setup code
    tied to the language data only runs on demand. This function performs
    nothing but a membership test on LANGUAGES.

    lang (unicode): Two-letter language code, e.g. 'en'.
    RETURNS (bool): True if `lang` is present in LANGUAGES, else False.
    """
    # Reading a module-level name needs no `global` declaration.
    return lang in LANGUAGES
def get_lang_class(lang): def get_lang_class(lang):
"""Import and load a Language class. """Import and load a Language class.

View File

@ -67,6 +67,17 @@ cdef class Vocab:
langfunc = self.lex_attr_getters.get(LANG, None) langfunc = self.lex_attr_getters.get(LANG, None)
return langfunc("_") if langfunc else "" return langfunc("_") if langfunc else ""
property writing_system:
    """Information about the writing system of the vocab's language.

    The Language class is looked up through the vocab.lang property and a
    copy of its Defaults.writing_system dict is returned. If that Language
    class has not been loaded, an empty dict is returned instead.
    """
    def __get__(self):
        if util.lang_class_is_loaded(self.lang):
            defaults = util.get_lang_class(self.lang).Defaults
            # Shallow copy, so the class-level dict itself is not handed out.
            return dict(defaults.writing_system)
        return {}
def __len__(self): def __len__(self):
"""The current number of lexemes stored. """The current number of lexemes stored.