mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 00:46:28 +03:00
Add CJK to character classes (#4884)
* Add CJK character class as uncased
* Incorporate Chinese URL test case: un-xfail the Chinese URL test instance
This commit is contained in:
parent
b216ff43c9
commit
d24bca62f6
|
@ -17,6 +17,17 @@ _tamil = r"\u0B80-\u0BFF"
|
|||
|
||||
_telugu = r"\u0C00-\u0C7F"
|
||||
|
||||
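# CJK code point ranges, taken from the final table in:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs
# NOTE(review): \u3000-\u303F (CJK Symbols and Punctuation) is included and
# reaches ALPHA via _uncased - confirm punctuation is meant to count as alpha.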
|
||||
_cjk = (
|
||||
r"\u4E00-\u62FF\u6300-\u77FF\u7800-\u8CFF\u8D00-\u9FFF\u3400-\u4DBF"
|
||||
r"\U00020000-\U000215FF\U00021600-\U000230FF\U00023100-\U000245FF"
|
||||
r"\U00024600-\U000260FF\U00026100-\U000275FF\U00027600-\U000290FF"
|
||||
r"\U00029100-\U0002A6DF\U0002A700-\U0002B73F\U0002B740-\U0002B81F"
|
||||
r"\U0002B820-\U0002CEAF\U0002CEB0-\U0002EBEF\u2E80-\u2EFF\u2F00-\u2FDF"
|
||||
r"\u2FF0-\u2FFF\u3000-\u303F\u31C0-\u31EF\u3200-\u32FF\u3300-\u33FF"
|
||||
r"\uF900-\uFAFF\uFE30-\uFE4F\U0001F200-\U0001F2FF\U0002F800-\U0002FA1F"
|
||||
)
|
||||
|
||||
# Latin standard
|
||||
_latin_u_standard = r"A-Z"
|
||||
_latin_l_standard = r"a-z"
|
||||
|
@ -215,6 +226,7 @@ _uncased = (
|
|||
+ _tamil
|
||||
+ _telugu
|
||||
+ _hangul
|
||||
+ _cjk
|
||||
)
|
||||
|
||||
ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _uncased)
|
||||
|
|
|
@ -67,7 +67,7 @@ URLS_SHOULD_MATCH = [
|
|||
"http://✪df.ws/123",
|
||||
"http://➡.ws/䨹",
|
||||
"http://مثال.إختبار",
|
||||
pytest.param("http://例子.测试", marks=pytest.mark.xfail()),
|
||||
"http://例子.测试",
|
||||
"http://उदाहरण.परीक्षा",
|
||||
]
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user