diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py
index dc4950eed..892de6374 100644
--- a/spacy/lang/char_classes.py
+++ b/spacy/lang/char_classes.py
@@ -32,9 +32,11 @@ _tatar_lower = r"әөүҗңһ"
 _tatar_upper = r"ӘӨҮҖҢҺ"
 _greek_lower = r"α-ωάέίόώήύ"
 _greek_upper = r"Α-ΩΆΈΊΌΏΉΎ"
+_ukrainian_lower = r"а-щюяіїєґ"
+_ukrainian_upper = r"А-ЩЮЯІЇЄҐ"
 
-_upper = _latin_upper + _russian_upper + _tatar_upper + _greek_upper
-_lower = _latin_lower + _russian_lower + _tatar_lower + _greek_lower
+_upper = _latin_upper + _russian_upper + _tatar_upper + _greek_upper + _ukrainian_upper
+_lower = _latin_lower + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower
 _uncased = _bengali + _hebrew + _persian + _sinhala
 
 ALPHA = group_chars(_upper + _lower + _uncased)
@@ -48,7 +50,7 @@ _units = (
     "кг г мг м/с км/ч кПа Па мбар Кб КБ кб Мб МБ мб Гб ГБ гб Тб ТБ тб"
     "كم كم² كم³ م م² م³ سم سم² سم³ مم مم² مم³ كم غرام جرام جم كغ ملغ كوب اكواب"
 )
-_currency = r"\$ £ € ¥ ฿ US\$ C\$ A\$ ₽ ﷼"
+_currency = r"\$ £ € ¥ ฿ US\$ C\$ A\$ ₽ ﷼ ₴"
 
 # These expressions contain various unicode variations, including characters
 # used in Chinese (see #1333, #1340, #1351) – unless there are cross-language