From e85e1d571b834d35922a816e1886cfc74cdf50d8 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 14 Oct 2017 14:59:23 +0200 Subject: [PATCH] Update base punctuation --- spacy/lang/char_classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 89774b17d..7ec631c92 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -33,7 +33,7 @@ _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$' # These expressions contain various unicode variations, including characters # used in Chinese (see #1333, #1340, #1351) – unless there are cross-language # conflicts, spaCy's base tokenizer should handle all of those by default -_punct = r'… …… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ ·' +_punct = r'… …… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · ।' _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‘‘ ’’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉' _hyphens = '- – — -- --- —— ~'