From d24bca62f6d4c4af87bf8b904be0af17382ae673 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Wed, 8 Jan 2020 16:50:19 +0100
Subject: [PATCH] Add CJK to character classes (#4884)

* Add CJK character class as uncased

* Incorporate Chinese URL test case

Un-xfail Chinese URL test instance
---
 spacy/lang/char_classes.py         | 12 ++++++++++++
 spacy/tests/tokenizer/test_urls.py |  2 +-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py
index 2c8823867..bd0f7e437 100644
--- a/spacy/lang/char_classes.py
+++ b/spacy/lang/char_classes.py
@@ -17,6 +17,17 @@ _tamil = r"\u0B80-\u0BFF"
 
 _telugu = r"\u0C00-\u0C7F"
 
+# from the final table in: https://en.wikipedia.org/wiki/CJK_Unified_Ideographs
+_cjk = (
+    r"\u4E00-\u62FF\u6300-\u77FF\u7800-\u8CFF\u8D00-\u9FFF\u3400-\u4DBF"
+    r"\U00020000-\U000215FF\U00021600-\U000230FF\U00023100-\U000245FF"
+    r"\U00024600-\U000260FF\U00026100-\U000275FF\U00027600-\U000290FF"
+    r"\U00029100-\U0002A6DF\U0002A700-\U0002B73F\U0002B740-\U0002B81F"
+    r"\U0002B820-\U0002CEAF\U0002CEB0-\U0002EBEF\u2E80-\u2EFF\u2F00-\u2FDF"
+    r"\u2FF0-\u2FFF\u3000-\u303F\u31C0-\u31EF\u3200-\u32FF\u3300-\u33FF"
+    r"\uF900-\uFAFF\uFE30-\uFE4F\U0001F200-\U0001F2FF\U0002F800-\U0002FA1F"
+)
+
 # Latin standard
 _latin_u_standard = r"A-Z"
 _latin_l_standard = r"a-z"
@@ -215,6 +226,7 @@ _uncased = (
     + _tamil
     + _telugu
     + _hangul
+    + _cjk
 )
 
 ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _uncased)
diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py
index ef99484ee..58e9d73f3 100644
--- a/spacy/tests/tokenizer/test_urls.py
+++ b/spacy/tests/tokenizer/test_urls.py
@@ -67,7 +67,7 @@ URLS_SHOULD_MATCH = [
     "http://✪df.ws/123",
     "http://➡.ws/䨹",
     "http://مثال.إختبار",
-    pytest.param("http://例子.测试", marks=pytest.mark.xfail()),
+    "http://例子.测试",
     "http://उदाहरण.परीक्षा",
 ]
 