From d24bca62f6d4c4af87bf8b904be0af17382ae673 Mon Sep 17 00:00:00 2001
From: adrianeboyd <adrianeboyd@gmail.com>
Date: Wed, 8 Jan 2020 16:50:19 +0100
Subject: [PATCH] Add CJK to character classes (#4884)

* Add CJK character class as uncased

* Incorporate Chinese URL test case

Un-xfail Chinese URL test instance
---
 spacy/lang/char_classes.py         | 12 ++++++++++++
 spacy/tests/tokenizer/test_urls.py |  2 +-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py
index 2c8823867..bd0f7e437 100644
--- a/spacy/lang/char_classes.py
+++ b/spacy/lang/char_classes.py
@@ -17,6 +17,17 @@ _tamil = r"\u0B80-\u0BFF"
 
 _telugu = r"\u0C00-\u0C7F"
 
+# CJK code point ranges, from the final table in: https://en.wikipedia.org/wiki/CJK_Unified_Ideographs
+_cjk = (
+    r"\u4E00-\u62FF\u6300-\u77FF\u7800-\u8CFF\u8D00-\u9FFF\u3400-\u4DBF"
+    r"\U00020000-\U000215FF\U00021600-\U000230FF\U00023100-\U000245FF"
+    r"\U00024600-\U000260FF\U00026100-\U000275FF\U00027600-\U000290FF"
+    r"\U00029100-\U0002A6DF\U0002A700-\U0002B73F\U0002B740-\U0002B81F"
+    r"\U0002B820-\U0002CEAF\U0002CEB0-\U0002EBEF\u2E80-\u2EFF\u2F00-\u2FDF"
+    r"\u2FF0-\u2FFF\u3000-\u303F\u31C0-\u31EF\u3200-\u32FF\u3300-\u33FF"
+    r"\uF900-\uFAFF\uFE30-\uFE4F\U0001F200-\U0001F2FF\U0002F800-\U0002FA1F"
+)
+
 # Latin standard
 _latin_u_standard = r"A-Z"
 _latin_l_standard = r"a-z"
@@ -215,6 +226,7 @@ _uncased = (
     + _tamil
     + _telugu
     + _hangul
+    + _cjk
 )
 
 ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _uncased)
diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py
index ef99484ee..58e9d73f3 100644
--- a/spacy/tests/tokenizer/test_urls.py
+++ b/spacy/tests/tokenizer/test_urls.py
@@ -67,7 +67,7 @@ URLS_SHOULD_MATCH = [
     "http://✪df.ws/123",
     "http://➡.ws/䨹",
     "http://مثال.إختبار",
-    pytest.param("http://例子.测试", marks=pytest.mark.xfail()),
+    "http://例子.测试",
     "http://उदाहरण.परीक्षा",
 ]