Add CJK to character classes (#4884)

* Add CJK character class as uncased

* Incorporate Chinese URL test case

Un-xfail Chinese URL test instance
This commit is contained in:
adrianeboyd 2020-01-08 16:50:19 +01:00 committed by Ines Montani
parent b216ff43c9
commit d24bca62f6
2 changed files with 13 additions and 1 deletions

View File

@ -17,6 +17,17 @@ _tamil = r"\u0B80-\u0BFF"
_telugu = r"\u0C00-\u0C7F" _telugu = r"\u0C00-\u0C7F"
# CJK character ranges, taken from the final table in:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs
# The value is a bare sequence of "start-end" ranges (no surrounding
# brackets), so it can be concatenated with the other script ranges before
# being wrapped into a regex character class.
# Block names in the comments below follow the Unicode block charts.
_cjk = "".join(
    (
        # CJK Unified Ideographs (U+4E00-U+9FFF, listed as four sub-ranges)
        r"\u4E00-\u62FF",
        r"\u6300-\u77FF",
        r"\u7800-\u8CFF",
        r"\u8D00-\u9FFF",
        # CJK Unified Ideographs Extension A
        r"\u3400-\u4DBF",
        # CJK Unified Ideographs Extension B (listed as eight sub-ranges)
        r"\U00020000-\U000215FF",
        r"\U00021600-\U000230FF",
        r"\U00023100-\U000245FF",
        r"\U00024600-\U000260FF",
        r"\U00026100-\U000275FF",
        r"\U00027600-\U000290FF",
        r"\U00029100-\U0002A6DF",
        # CJK Unified Ideographs Extensions C, D, E and F
        r"\U0002A700-\U0002B73F",
        r"\U0002B740-\U0002B81F",
        r"\U0002B820-\U0002CEAF",
        r"\U0002CEB0-\U0002EBEF",
        # Radicals, description characters, symbols and punctuation, strokes
        r"\u2E80-\u2EFF",
        r"\u2F00-\u2FDF",
        r"\u2FF0-\u2FFF",
        r"\u3000-\u303F",
        r"\u31C0-\u31EF",
        # Enclosed letters/months and compatibility blocks
        r"\u3200-\u32FF",
        r"\u3300-\u33FF",
        r"\uF900-\uFAFF",
        r"\uFE30-\uFE4F",
        # Supplementary-plane enclosed and compatibility ideographs
        r"\U0001F200-\U0001F2FF",
        r"\U0002F800-\U0002FA1F",
    )
)
# Latin standard # Latin standard
_latin_u_standard = r"A-Z" _latin_u_standard = r"A-Z"
_latin_l_standard = r"a-z" _latin_l_standard = r"a-z"
@ -215,6 +226,7 @@ _uncased = (
+ _tamil + _tamil
+ _telugu + _telugu
+ _hangul + _hangul
+ _cjk
) )
ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _uncased) ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _uncased)

View File

@ -67,7 +67,7 @@ URLS_SHOULD_MATCH = [
"http://✪df.ws/123", "http://✪df.ws/123",
"http://➡.ws/䨹", "http://➡.ws/䨹",
"http://مثال.إختبار", "http://مثال.إختبار",
pytest.param("http://例子.测试", marks=pytest.mark.xfail()), "http://例子.测试",
"http://उदाहरण.परीक्षा", "http://उदाहरण.परीक्षा",
] ]