From 815f8d13dd0cfe034201b2c35452012a7adb1e03 Mon Sep 17 00:00:00 2001 From: yash Date: Thu, 11 Jul 2019 15:00:51 +0530 Subject: [PATCH] Fix default punctuation rules for Hindi text (#3625 explosion) --- spacy/lang/char_classes.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index cb2e817d5..fb320b2ff 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -9,6 +9,8 @@ _bengali = r"\u0980-\u09FF" _hebrew = r"\u0591-\u05F4\uFB1D-\uFB4F" +_hindi = r"\u0900-\u097F" + # Latin standard _latin_u_standard = r"A-Z" _latin_l_standard = r"a-z" @@ -193,7 +195,7 @@ _ukrainian = r"а-щюяіїєґА-ЩЮЯІЇЄҐ" _upper = LATIN_UPPER + _russian_upper + _tatar_upper + _greek_upper + _ukrainian_upper _lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower -_uncased = _bengali + _hebrew + _persian + _sinhala +_uncased = _bengali + _hebrew + _persian + _sinhala + _hindi ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _uncased) ALPHA_LOWER = group_chars(_lower + _uncased)