spaCy/spacy/lang/char_classes.py

# coding: utf8
from __future__ import unicode_literals

import regex as re


# Make the `regex` module use its VERSION1 behaviour by default. The character
# classes defined in this module use nested sets and the `&&` / `||` set
# operators, which the `regex` documentation lists as VERSION1-only features.
re.DEFAULT_VERSION = re.VERSION1
def merge_char_classes(classes):
    """Combine several character classes into one class matching any of them.

    classes: Iterable of character class patterns, e.g. '[a-z]'.
    Returns a single set pattern that joins the given classes with the `||`
    (set union) operator, e.g. '[[a-z]||[0-9]]'.
    """
    return '[{}]'.format('||'.join(classes))


def split_chars(char):
    """Split a space-separated string of entries into a list.

    Leading and trailing whitespace is ignored. Empty entries, which would
    result from repeated spaces, are dropped so that an empty pattern can
    never end up in the result.

    char: String of entries separated by spaces.
    Returns the list of non-empty entries in their original order.
    """
    return [entry for entry in char.strip().split(' ') if entry]


def merge_chars(char):
    """Turn a space-separated string of entries into a regex alternation.

    The entries are joined with `|` in their original order. Empty entries
    are dropped (see split_chars): an empty alternative would match the empty
    string and silently change what the resulting expression accepts.

    char: String of entries separated by spaces.
    Returns the alternation string, e.g. 'a|b|c'.
    """
    return '|'.join(split_chars(char))


# Per-script letter classes. `&&` intersects a Unicode general category
# (\p{L} = letter, \p{Ll} = lowercase letter, \p{Lu} = uppercase letter) with
# a Unicode script property, so each class only matches letters of one script.
_bengali = r'[\p{L}&&\p{Bengali}]'
_hebrew = r'[\p{L}&&\p{Hebrew}]'
_latin_lower = r'[\p{Ll}&&\p{Latin}]'
_latin_upper = r'[\p{Lu}&&\p{Latin}]'
# Lower- and uppercase Latin letters combined. Not referenced by the groupings
# further down in this module.
_latin = r'[[\p{Ll}||\p{Lu}]&&\p{Latin}]'
# Despite the name, this matches every letter of the Arabic script.
_persian = r'[\p{L}&&\p{Arabic}]'
# `ё` / `Ё` are listed explicitly because they lie outside the contiguous
# `а-я` / `А-Я` code point ranges.
_russian_lower = r'[ёа-я]'
_russian_upper = r'[ЁА-Я]'

# Group the script classes by case. Classes that are not split by case are
# kept in their own list so they can be added to every export below.
_upper = [_latin_upper, _russian_upper]
_lower = [_latin_lower, _russian_lower]
_uncased = [_bengali, _hebrew, _persian]

# Exported character classes, each a single set built with merge_char_classes.
# ALPHA_LOWER and ALPHA_UPPER both include the classes from `_uncased`.
ALPHA = merge_char_classes(_upper + _lower + _uncased)
ALPHA_LOWER = merge_char_classes(_lower + _uncased)
ALPHA_UPPER = merge_char_classes(_upper + _uncased)


# Space-separated unit abbreviations in Latin and Cyrillic spelling. Every
# continued literal ends with a space so that implicit string concatenation
# does not fuse the last entry of one line with the first entry of the next.
_units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft '
          'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb '
          'TB T G M K % км км² км³ м м² м³ дм дм² дм³ см см² см³ мм мм² мм³ нм '
          'кг г мг м/с км/ч кПа Па мбар Кб КБ кб Мб МБ мб Гб ГБ гб Тб ТБ тб')
# Space-separated currency symbols, written as regex fragments (hence the
# escaped `$`).
_currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$ ₽ ﷼'

# These expressions contain various unicode variations, including characters
# used in Chinese (see #1333, #1340, #1351) – unless there are cross-language
# conflicts, spaCy's base tokenizer should handle all of those by default
#
# Each string is a space-separated list of regex fragments; characters with a
# special meaning in regular expressions are backslash-escaped.
# NOTE(review): merge_chars keeps the entry order, so in the merged
# alternation a shorter entry (e.g. `…` or `-`) is tried before a longer entry
# that starts with it (e.g. `……` or `--`) – confirm this order is intended.
_punct = r'… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ？ ！ ， 、 ； ： ～ · । ، ؛ ٪'
# NOTE(review): the plain comma after `‚` also appears in `_punct` – confirm
# it is meant to be listed as a quote character as well.
_quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‘‘ ’’ ‚ , „ » « 「 」 『 』 （ ） 〔 〕 【 】 《 》 〈 〉'
_hyphens = '- – — -- --- —— ~'

# Various symbols like dingbats, but also emoji
# Details: https://www.compart.com/en/unicode/category/So
# `\p{So}` is the Unicode general category "Symbol, other"; unlike the strings
# above, this is a single character class rather than a space-separated list.
_other_symbols = r'[\p{So}]'


# Exports as single pattern strings: merge_chars turns each space-separated
# string into one `|`-separated alternation.
UNITS = merge_chars(_units)
CURRENCY = merge_chars(_currency)
QUOTES = merge_chars(_quotes)
PUNCT = merge_chars(_punct)
HYPHENS = merge_chars(_hyphens)
# Already a complete character class, so it is exported unchanged.
ICONS = _other_symbols

# The same data exported as lists with one pattern per entry (split_chars).
LIST_UNITS = split_chars(_units)
LIST_CURRENCY = split_chars(_currency)
LIST_QUOTES = split_chars(_quotes)
LIST_PUNCT = split_chars(_punct)
LIST_HYPHENS = split_chars(_hyphens)
# Two or more consecutive periods, or the single ellipsis character.
LIST_ELLIPSES = [r'\.\.+', '…']
# The symbol character class wrapped in a one-element list, matching the shape
# of the other LIST_* exports.
LIST_ICONS = [_other_symbols]
-												Add char classes to global language data

											
										
										
											2017-05-09 00:59:33 +03:00
+								# coding: utf8
 								from __future__ import unicode_literals
 								import regex as re
 								re.DEFAULT_VERSION = re.VERSION1
 								merge_char_classes = lambda classes: '[{}]'.format('||'.join(classes))
 								split_chars = lambda char: list(char.strip().split(' '))
 								merge_chars = lambda char: char.strip().replace(' ', '|')
 								_bengali = r'[\p{L}&&\p{Bengali}]'
 								_hebrew = r'[\p{L}&&\p{Hebrew}]'
 								_latin_lower = r'[\p{Ll}&&\p{Latin}]'
 								_latin_upper = r'[\p{Lu}&&\p{Latin}]'
 								_latin = r'[[\p{Ll}||\p{Lu}]&&\p{Latin}]'
-												Add Persian character and symbols 

Add Persian characters and the following:
- ٪ used instead of %
- ؟ used instead of ?
- ﷼ used instead of $
- ، used instead of ,
- ؛ used instead of ;
											
										
										
											2018-01-23 12:50:36 +03:00
+								_persian = r'[\p{L}&&\p{Arabic}]'
-												Fixed tokenizer: added char classes; added first lemmatizer and
tokenizer tests

											
										
										
											2017-11-21 22:23:59 +03:00
+								_russian_lower = r'[ёа-я]'
 								_russian_upper = r'[ЁА-Я]'
-												Add char classes to global language data

											
										
										
											2017-05-09 00:59:33 +03:00
-												Fixed tokenizer: added char classes; added first lemmatizer and
tokenizer tests

											
										
										
											2017-11-21 22:23:59 +03:00
+								_upper = [_latin_upper, _russian_upper]
 								_lower = [_latin_lower, _russian_lower]
-												Add Persian character and symbols 

Add Persian characters and the following:
- ٪ used instead of %
- ؟ used instead of ?
- ﷼ used instead of $
- ، used instead of ,
- ؛ used instead of ;
											
										
										
											2018-01-23 12:50:36 +03:00
+								_uncased = [_bengali, _hebrew, _persian]
-												Add char classes to global language data

											
										
										
											2017-05-09 00:59:33 +03:00
 								ALPHA = merge_char_classes(_upper + _lower + _uncased)
 								ALPHA_LOWER = merge_char_classes(_lower + _uncased)
 								ALPHA_UPPER = merge_char_classes(_upper + _uncased)
 								_units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft '
 								          'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb '
-												add ٪ as punctuation
											
										
										
											2018-01-23 17:41:33 +03:00
+								          'TB T G M K % км км² км³ м м² м³ дм дм² дм³ см см² см³ мм мм² мм³ нм '
-												Fixed tokenizer: added char classes; added first lemmatizer and
tokenizer tests

											
										
										
											2017-11-21 22:23:59 +03:00
+								          'кг г мг м/с км/ч кПа Па мбар Кб КБ кб Мб МБ мб Гб ГБ гб Тб ТБ тб')
-												Add Persian character and symbols 

Add Persian characters and the following:
- ٪ used instead of %
- ؟ used instead of ?
- ﷼ used instead of $
- ، used instead of ,
- ؛ used instead of ;
											
										
										
											2018-01-23 12:50:36 +03:00
+								_currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$ ₽ ﷼'
-												Port over changes from #1333 and add comments

											
										
										
											2017-10-14 13:52:59 +03:00
 								# These expressions contain various unicode variations, including characters
 								# used in Chinese (see #1333, #1340, #1351) – unless there are cross-language
 								# conflicts, spaCy's base tokenizer should handle all of those by default
-												add ٪ as punctuation
											
										
										
											2018-01-23 17:41:33 +03:00
+								_punct = r'… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ？ ！ ， 、 ； ： ～ · । ، ؛ ٪'
-												Port over changes from #1333 and add comments

											
										
										
											2017-10-14 13:52:59 +03:00
+								_quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‘‘ ’’ ‚ , „ » « 「 」 『 』 （ ） 〔 〕 【 】 《 》 〈 〉'
-												Port over changes from #1340

											
										
										
											2017-09-26 17:38:08 +03:00
+								_hyphens = '- – — -- --- —— ~'
-												Port over changes from #1333 and add comments

											
										
										
											2017-10-14 13:52:59 +03:00
 								# Various symbols like dingbats, but also emoji
 								# Details: https://www.compart.com/en/unicode/category/So
-												Add symbols class to punctuation rules to handle emoji (see #1088)

Currently doesn't work for Hungarian, because of conflicts with the
custom punctuation rules. Also doesn't take multi-character emoji like
👩🏽‍💻 into account.

											
										
										
											2017-05-27 18:57:10 +03:00
+								_other_symbols = r'[\p{So}]'
-												Add char classes to global language data

											
										
										
											2017-05-09 00:59:33 +03:00
-												Port over changes from #1333 and add comments

											
										
										
											2017-10-14 13:52:59 +03:00
-												Add char classes to global language data

											
										
										
											2017-05-09 00:59:33 +03:00
+								UNITS = merge_chars(_units)
 								CURRENCY = merge_chars(_currency)
 								QUOTES = merge_chars(_quotes)
 								PUNCT = merge_chars(_punct)
 								HYPHENS = merge_chars(_hyphens)
-												Add symbols class to punctuation rules to handle emoji (see #1088)

Currently doesn't work for Hungarian, because of conflicts with the
custom punctuation rules. Also doesn't take multi-character emoji like
👩🏽‍💻 into account.

											
										
										
											2017-05-27 18:57:10 +03:00
+								ICONS = _other_symbols
-												Add char classes to global language data

											
										
										
											2017-05-09 00:59:33 +03:00
 								LIST_UNITS = split_chars(_units)
 								LIST_CURRENCY = split_chars(_currency)
 								LIST_QUOTES = split_chars(_quotes)
 								LIST_PUNCT = split_chars(_punct)
 								LIST_HYPHENS = split_chars(_hyphens)
 								LIST_ELLIPSES = [r'\.\.+', '…']
-												Add symbols class to punctuation rules to handle emoji (see #1088)

Currently doesn't work for Hungarian, because of conflicts with the
custom punctuation rules. Also doesn't take multi-character emoji like
👩🏽‍💻 into account.

											
										
										
											2017-05-27 18:57:10 +03:00
+								LIST_ICONS = [_other_symbols]