Update base punctuation

This commit is contained in:
ines 2017-10-14 14:59:23 +02:00
parent 9d6c8eaa49
commit e85e1d571b

View File

@ -33,7 +33,7 @@ _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
# These expressions contain various unicode variations, including characters
# used in Chinese (see #1333, #1340, #1351). Unless there are cross-language
# conflicts, spaCy's base tokenizer should handle all of those by default.
# Punctuation marks, written as one space-separated raw string. Several
# entries (e.g. ! ? ( ) [ ] { } *) carry a backslash escape, and the list
# includes the ideographic full stop and the middle dot.
# Presumably split on spaces and turned into regex alternatives by code
# outside this view -- confirm against the consumer.
# NOTE(review): this assignment and the one right below read identically in
# this view; they look like the before/after pair of a diff hunk whose +/-
# markers were dropped. Taken literally, the second assignment simply rebinds
# _punct to the same value.
_punct = r'… …… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ·'
_punct = r'… …… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ·'
# Quotation marks, space-separated: ASCII quotes, typographic quotes,
# backtick forms, guillemets and CJK bracket-style quotes. Because this is a
# raw string, the backslash in each \' stays part of the value.
_quotes = r'\' \'\' " ” “ `` ` ´ , „ » « 「 」 『 』 【 】 《 》 〈 〉'
# Hyphen- and dash-like sequences plus the tilde, space-separated. This is a
# plain (non-raw) literal; it contains no backslashes.
_hyphens = '- — -- --- —— ~'