Add Persian characters and symbols

Add Persian characters and the following Persian symbols:
- ٪ used instead of %
- ؟ used instead of ?
- ﷼ used instead of $
- ، used instead of ,
- ؛ used instead of ;
This commit is contained in:
Ali Zarezade 2018-01-23 13:20:36 +03:30 committed by GitHub
parent 7e6dc283db
commit 2bda582135
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -15,12 +15,13 @@ _hebrew = r'[\p{L}&&\p{Hebrew}]'
_latin_lower = r'[\p{Ll}&&\p{Latin}]'
_latin_upper = r'[\p{Lu}&&\p{Latin}]'
_latin = r'[[\p{Ll}||\p{Lu}]&&\p{Latin}]'
_persian = r'[\p{L}&&\p{Arabic}]'
_russian_lower = r'[ёа-я]'
_russian_upper = r'[ЁА-Я]'
_upper = [_latin_upper, _russian_upper]
_lower = [_latin_lower, _russian_lower]
_uncased = [_bengali, _hebrew]
_uncased = [_bengali, _hebrew, _persian]
ALPHA = merge_char_classes(_upper + _lower + _uncased)
ALPHA_LOWER = merge_char_classes(_lower + _uncased)
@@ -29,14 +30,14 @@ ALPHA_UPPER = merge_char_classes(_upper + _uncased)
_units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft '
'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb '
'TB T G M K % км км² км³ м м² м³ дм дм² дм³ см см² см³ мм мм² мм³ нм '
'TB T G M K % ٪ км км² км³ м м² м³ дм дм² дм³ см см² см³ мм мм² мм³ нм '
'кг г мг м/с км/ч кПа Па мбар Кб КБ кб Мб МБ мб Гб ГБ гб Тб ТБ тб')
_currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$ ₽'
_currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$ ₽ ﷼'
# These expressions contain various unicode variations, including characters
# used in Chinese (see #1333, #1340, #1351) unless there are cross-language
# conflicts, spaCy's base tokenizer should handle all of those by default
_punct = r'… …… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 · ।'
_punct = r'… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 · । ، ؛'
_quotes = r'\' \'\' " ” “ `` ` ´ , „ » « 「 」 『 』 【 】 《 》 〈 〉'
_hyphens = '- — -- --- —— ~'