From 09aed581405588947697cdaa2474fd7705197c86 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Sat, 14 Oct 2017 12:52:59 +0200
Subject: [PATCH] Port over changes from #1333 and add comments

---
 spacy/lang/char_classes.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py
index b10481411..89774b17d 100644
--- a/spacy/lang/char_classes.py
+++ b/spacy/lang/char_classes.py
@@ -29,11 +29,19 @@ _units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm
           'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb '
           'TB T G M K %')
 _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
+
+# These expressions cover Unicode variants of punctuation and quotes, including
+# characters used in Chinese (see #1333, #1340, #1351). spaCy's base tokenizer
+# should handle them by default unless there are cross-language conflicts.
 _punct = r'… …… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ？ ！ ， 、 ； ： ～ ·'
-_quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‚ , „ » «'
+_quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‘‘ ’’ ‚ , „ » « 「 」 『 』 （ ） 〔 〕 【 】 《 》 〈 〉'
 _hyphens = '- – — -- --- —— ~'
+
+# Unicode general category So ("Symbol, other"): dingbats, emoji and the like.
+# Details: https://www.compart.com/en/unicode/category/So
 _other_symbols = r'[\p{So}]'
 
+
 UNITS = merge_chars(_units)
 CURRENCY = merge_chars(_currency)
 QUOTES = merge_chars(_quotes)