From e85e1d571b834d35922a816e1886cfc74cdf50d8 Mon Sep 17 00:00:00 2001
From: ines
Date: Sat, 14 Oct 2017 14:59:23 +0200
Subject: [PATCH 1/3] Update base punctuation

---
 spacy/lang/char_classes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py
index 89774b17d..7ec631c92 100644
--- a/spacy/lang/char_classes.py
+++ b/spacy/lang/char_classes.py
@@ -33,7 +33,7 @@ _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
 # These expressions contain various unicode variations, including characters
 # used in Chinese (see #1333, #1340, #1351) – unless there are cross-language
 # conflicts, spaCy's base tokenizer should handle all of those by default
-_punct = r'… …… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ ·'
+_punct = r'… …… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · ।'
 _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‘‘ ’’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉'
 _hyphens = '- – — -- --- —— ~'
 

From 266e7180a747c78ac4a123c12ef6c1fc3e0286c5 Mon Sep 17 00:00:00 2001
From: ines
Date: Sat, 14 Oct 2017 14:59:52 +0200
Subject: [PATCH 2/3] Add Language class, stop words and basic stemmer that sets NORM

---
 spacy/lang/hi/__init__.py   |  24 +++++
 spacy/lang/hi/lex_attrs.py  |  48 ++++++++++
 spacy/lang/hi/stop_words.py | 177 ++++++++++++++++++++++++++++++++++++
 3 files changed, 249 insertions(+)
 create mode 100644 spacy/lang/hi/__init__.py
 create mode 100644 spacy/lang/hi/lex_attrs.py
 create mode 100644 spacy/lang/hi/stop_words.py

diff --git a/spacy/lang/hi/__init__.py b/spacy/lang/hi/__init__.py
new file mode 100644
index 000000000..0503b5b7f
--- /dev/null
+++ b/spacy/lang/hi/__init__.py
@@ -0,0 +1,24 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS
+
+from ..norm_exceptions import BASE_NORMS
+from ...language import Language
+from ...attrs import LANG
+
+
+class HindiDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters.update(LEX_ATTRS)
+    lex_attr_getters[LANG] = lambda text: 'hi'
+    stop_words = STOP_WORDS
+
+
+class Hindi(Language):
+    lang = 'hi'
+    Defaults = HindiDefaults
+
+
+__all__ = ['Hindi']
diff --git a/spacy/lang/hi/lex_attrs.py b/spacy/lang/hi/lex_attrs.py
new file mode 100644
index 000000000..8886e26c3
--- /dev/null
+++ b/spacy/lang/hi/lex_attrs.py
@@ -0,0 +1,48 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ..norm_exceptions import BASE_NORMS
+from ...attrs import NORM
+from ...util import add_lookups
+
+
+# Suffixes to strip when stemming, grouped by length in characters. norm()
+# takes a group's length from its first entry, so keep lengths uniform.
+_stem_suffixes = [
+    ["ो","े","ू","ु","ी","ि","ा"],
+    ["कर","ाओ","िए","ाई","ाए","ने","नी","ना","ते","ीं","ती","ता","ाँ","ां","ों","ें"],
+    ["ाकर","ाइए","ाईं","ाया","ेगी","ेगा","ोगी","ोगे","ाने","ाना","ाते","ाती","ाता","तीं","ाओं","ाएं","ुओं","ुएं","ुआं"],
+    ["ाएगी","ाएगा","ाओगी","ाओगे","एंगी","ेंगी","एंगे","ेंगे","ूंगी","ूंगा","ातीं","नाओं","नाएं","ताओं","ताएं","ियाँ","ियों","ियां"],
+    ["ाएंगी","ाएंगे","ाऊंगी","ाऊंगा","ाइयाँ","ाइयों","ाइयां"]
+]
+
+
+def norm(string):
+    """Return the norm of a token text.
+
+    Base exceptions (BASE_NORMS) take priority. Otherwise the longest suffix
+    in _stem_suffixes that the string ends with, and that is shorter than
+    the string, is stripped; strings with no such suffix are unchanged.
+    """
+    # normalise base exceptions, e.g. punctuation or currency symbols
+    if string in BASE_NORMS:
+        return BASE_NORMS[string]
+    # set stem word as norm, if available, adapted from:
+    # http://computing.open.ac.uk/Sites/EACLSouthAsia/Papers/p6-Ramanathan.pdf
+    # http://research.variancia.com/hindi_stemmer/
+    # https://github.com/taranjeet/hindi-tokenizer/blob/master/HindiTokenizer.py#L142
+    for suffix_group in reversed(_stem_suffixes):
+        length = len(suffix_group[0])
+        # groups run from longest to shortest: a string too short for this
+        # group may still end in a shorter suffix, so skip rather than stop
+        if len(string) <= length:
+            continue
+        for suffix in suffix_group:
+            if string.endswith(suffix):
+                return string[:-length]
+    return string
+
+
+LEX_ATTRS = {
+    NORM: norm
+}
diff --git a/spacy/lang/hi/stop_words.py b/spacy/lang/hi/stop_words.py
new file mode 100644
index 000000000..2ff27c015
--- /dev/null
+++ b/spacy/lang/hi/stop_words.py
@@ -0,0 +1,177 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+# Source: https://github.com/taranjeet/hindi-tokenizer/blob/master/stopwords.txt
+
+STOP_WORDS = set("""
+अत
+अपना
+अपनी
+अपने
+अभी
+अंदर
+आदि
+आप
+इत्यादि
+इन
+इनका
+इन्हीं
+इन्हें
+इन्हों
+इस
+इसका
+इसकी
+इसके
+इसमें
+इसी
+इसे
+उन
+उनका
+उनकी
+उनके
+उनको
+उन्हीं
+उन्हें
+उन्हों
+उस
+उसके
+उसी
+उसे
+एक
+एवं
+एस
+ऐसे
+और
+कई
+कर
+करता
+करते
+करना
+करने
+करें
+कहते
+कहा
+का
+काफ़ी
+कि
+कितना
+किन्हें
+किन्हों
+किया
+किर
+किस
+किसी
+किसे
+की
+कुछ
+कुल
+के
+को
+कोई
+कौन
+कौनसा
+गया
+घर
+जब
+जहाँ
+जा
+जितना
+जिन
+जिन्हें
+जिन्हों
+जिस
+जिसे
+जीधर
+जैसा
+जैसे
+जो
+तक
+तब
+तरह
+तिन
+तिन्हें
+तिन्हों
+तिस
+तिसे
+तो
+था
+थी
+थे
+दबारा
+दिया
+दुसरा
+दूसरे
+दो
+द्वारा
+न
+नके
+नहीं
+ना
+निहायत
+नीचे
+ने
+पर
+पहले
+पूरा
+पे
+फिर
+बनी
+बही
+बहुत
+बाद
+बाला
+बिलकुल
+भी
+भीतर
+मगर
+मानो
+मे
+में
+यदि
+यह
+यहाँ
+यही
+या
+यिह
+ये
+रखें
+रहा
+रहे
+ऱ्वासा
+लिए
+लिये
+लेकिन
+व
+वग़ैरह
+वर्ग
+वह
+वहाँ
+वहीं
+वाले
+वुह
+वे
+सकता
+सकते
+सबसे
+सभी
+साथ
+साबुत
+साभ
+सारा
+से
+सो
+संग
+ही
+हुआ
+हुई
+हुए
+है
+हैं
+हो
+होता
+होती
+होते
+होना
+होने
+""".split())

From c0aceb9fbecfa0c62b3d3624b627a79e9984c040 Mon Sep 17 00:00:00 2001
From: ines
Date: Sat, 14 Oct 2017 15:16:41 +0200
Subject: [PATCH 3/3] Add Hindi to supported languages

---
 website/models/_data.json | 1 +
 1 file changed, 1 insertion(+)

diff --git a/website/models/_data.json b/website/models/_data.json
index f7ba16c9f..ff65d44ef 100644
--- a/website/models/_data.json
+++ b/website/models/_data.json
@@ -83,6 +83,7 @@
         "ru": "Russian",
         "he": "Hebrew",
         "bn": "Bengali",
+        "hi": "Hindi",
         "id": "Indonesian",
         "th": "Thai",
         "zh": "Chinese",