From 39a4741e264d75599508f1a4d4f4fa797c05c263 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 11 Mar 2019 15:23:20 +0100 Subject: [PATCH] Add support for vocab.writing_system property (#3390) * Add xfail test for vocab.writing_system * Add vocab.writing_system property * Set Language.Defaults.writing_system * Set default writing system * Remove xfail on test_vocab_writing_system --- spacy/lang/fa/__init__.py | 1 + spacy/lang/he/__init__.py | 2 +- spacy/lang/ja/__init__.py | 1 + spacy/lang/zh/__init__.py | 2 +- spacy/language.py | 1 + spacy/tests/vocab_vectors/test_vocab_api.py | 5 +++++ spacy/util.py | 12 ++++++++++++ spacy/vocab.pyx | 11 +++++++++++ 8 files changed, 33 insertions(+), 2 deletions(-) diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py index 8756c3ff9..4041ec635 100644 --- a/spacy/lang/fa/__init__.py +++ b/spacy/lang/fa/__init__.py @@ -27,6 +27,7 @@ class PersianDefaults(Language.Defaults): stop_words = STOP_WORDS tag_map = TAG_MAP suffixes = TOKENIZER_SUFFIXES + writing_system = {"direction": "rtl", "has_case": False, "has_letters": True} class Persian(Language): diff --git a/spacy/lang/he/__init__.py b/spacy/lang/he/__init__.py index c7ba4ebf8..0ad65a0b4 100644 --- a/spacy/lang/he/__init__.py +++ b/spacy/lang/he/__init__.py @@ -14,7 +14,7 @@ class HebrewDefaults(Language.Defaults): lex_attr_getters[LANG] = lambda text: "he" tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) stop_words = STOP_WORDS - + writing_system = {"direction": "rtl", "has_case": False, "has_letters": True} class Hebrew(Language): lang = "he" diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index daea9b8d6..e35967409 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -94,6 +94,7 @@ class JapaneseDefaults(Language.Defaults): lex_attr_getters[LANG] = lambda _text: "ja" stop_words = STOP_WORDS tag_map = TAG_MAP + writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} @classmethod def create_tokenizer(cls, nlp=None): diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 04a7d1508..708e446ba 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -14,7 +14,7 @@ class ChineseDefaults(Language.Defaults): use_jieba = True tokenizer_exceptions = BASE_EXCEPTIONS stop_words = STOP_WORDS - + writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} class Chinese(Language): lang = "zh" diff --git a/spacy/language.py b/spacy/language.py index 44a819132..e97b74a77 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -94,6 +94,7 @@ class BaseDefaults(object): morph_rules = {} lex_attr_getters = LEX_ATTRS syntax_iterators = {} + writing_system = {"direction": "ltr", "has_case": True, "has_letters": True} class Language(object): diff --git a/spacy/tests/vocab_vectors/test_vocab_api.py b/spacy/tests/vocab_vectors/test_vocab_api.py index 8c826e8c3..59a911830 100644 --- a/spacy/tests/vocab_vectors/test_vocab_api.py +++ b/spacy/tests/vocab_vectors/test_vocab_api.py @@ -45,3 +45,8 @@ def test_vocab_api_contains(en_vocab, text): _ = en_vocab[text] # noqa: F841 assert text in en_vocab assert "LKsdjvlsakdvlaksdvlkasjdvljasdlkfvm" not in en_vocab + + +def test_vocab_writing_system(en_vocab): + assert en_vocab.writing_system["direction"] == "ltr" + assert en_vocab.writing_system["has_case"] == True diff --git a/spacy/util.py b/spacy/util.py index 0066b196d..137d466d5 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -38,6 +38,18 @@ def set_env_log(value): _PRINT_ENV = value +def lang_class_is_loaded(lang): + """Check whether a Language class is already loaded. Language classes are + loaded lazily, to avoid expensive setup code associated with the language + data. + + lang (unicode): Two-letter language code, e.g. 'en'. + RETURNS (bool): Whether a Language class has been loaded. + """ + global LANGUAGES + return lang in LANGUAGES + + def get_lang_class(lang): """Import and load a Language class. diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 0923f977a..90e7dca34 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -67,6 +67,17 @@ cdef class Vocab: langfunc = self.lex_attr_getters.get(LANG, None) return langfunc("_") if langfunc else "" + property writing_system: + """A dict with information about the language's writing system. To get + the data, we use the vocab.lang property to fetch the Language class. + If the Language class is not loaded, an empty dict is returned. + """ + def __get__(self): + if not util.lang_class_is_loaded(self.lang): + return {} + lang_class = util.get_lang_class(self.lang) + return dict(lang_class.Defaults.writing_system) + def __len__(self): """The current number of lexemes stored.