mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 01:16:28 +03:00
Port over changes from #1333 and add comments
This commit is contained in:
parent
a5da683578
commit
09aed58140
|
@@ -29,11 +29,19 @@ _units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm
|
|||
'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb '
|
||||
'TB T G M K %')
|
||||
_currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
|
||||
|
||||
# These expressions contain various unicode variations, including characters
|
||||
# used in Chinese (see #1333, #1340, #1351) – unless there are cross-language
|
||||
# conflicts, spaCy's base tokenizer should handle all of those by default
|
||||
_punct = r'… …… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ ·'
|
||||
_quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‚ , „ » «'
|
||||
_quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‘‘ ’’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉'
|
||||
_hyphens = '- – — -- --- —— ~'
|
||||
|
||||
# Various symbols like dingbats, but also emoji
|
||||
# Details: https://www.compart.com/en/unicode/category/So
|
||||
_other_symbols = r'[\p{So}]'
|
||||
|
||||
|
||||
UNITS = merge_chars(_units)
|
||||
CURRENCY = merge_chars(_currency)
|
||||
QUOTES = merge_chars(_quotes)
|
||||
|
|
Loading…
Reference in New Issue
Block a user