Merge pull request #1425 from explosion/feature/hindi-tokenizer

💫 Basic Hindi tokenization support
This commit is contained in:
Ines Montani 2017-10-18 13:34:52 +02:00 committed by GitHub
commit f0d577e460
5 changed files with 241 additions and 1 deletions

View File

@ -33,7 +33,7 @@ _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
# These expressions contain various unicode variations, including characters
# used in Chinese (see #1333, #1340, #1351). Unless there are cross-language
# conflicts, spaCy's base tokenizer should handle all of those by default.
_punct = r'… …… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ·'
_punct = r'… …… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ·'
_quotes = r'\' \'\' " ” “ `` ` ´ , „ » « 「 」 『 』 【 】 《 》 〈 〉'
_hyphens = '- — -- --- —— ~'

24
spacy/lang/hi/__init__.py Normal file
View File

@ -0,0 +1,24 @@
# coding: utf8
from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG
class HindiDefaults(Language.Defaults):
    """Language-data defaults for Hindi."""
    # Start from the shared base defaults, layer the Hindi lexical
    # attribute getters on top, then pin the language ID to 'hi'.
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters.update({LANG: lambda text: 'hi'})
    stop_words = STOP_WORDS


class Hindi(Language):
    """The Hindi language class."""
    lang = 'hi'
    Defaults = HindiDefaults


__all__ = ['Hindi']

View File

@ -0,0 +1,38 @@
# coding: utf8
from __future__ import unicode_literals
from ..norm_exceptions import BASE_NORMS
from ...attrs import NORM
from ...util import add_lookups
# Suffix groups ordered by suffix length (shortest first); tried longest-first
# below so the most specific suffix wins.
# NOTE(review): several entries render as empty strings or bare matras here —
# the Devanagari appears garbled by extraction; verify against the upstream
# source before relying on the exact strings.
_stem_suffixes = [
    ["","","","","","ि",""],
    ["कर","ाओ","िए","ाई","ाए","ने","नी","ना","ते","ीं","ती","ता","ाँ","ां","ों","ें"],
    ["ाकर","ाइए","ाईं","ाया","ेगी","ेगा","ोगी","ोगे","ाने","ाना","ाते","ाती","ाता","तीं","ाओं","ाएं","ुओं","ुएं","ुआं"],
    ["ाएगी","ाएगा","ाओगी","ाओगे","एंगी","ेंगी","एंगे","ेंगे","ूंगी","ूंगा","ातीं","नाओं","नाएं","ताओं","ताएं","ियाँ","ियों","ियां"],
    ["ाएंगी","ाएंगे","ाऊंगी","ाऊंगा","ाइयाँ","ाइयों","ाइयां"]
]


def norm(string):
    """Return the normalised form of `string`.

    Base exceptions (punctuation, currency symbols, ...) are mapped through
    BASE_NORMS; otherwise a lightweight Hindi stemmer strips the longest
    matching suffix. Adapted from:
    http://computing.open.ac.uk/Sites/EACLSouthAsia/Papers/p6-Ramanathan.pdf
    http://research.variancia.com/hindi_stemmer/
    https://github.com/taranjeet/hindi-tokenizer/blob/master/HindiTokenizer.py#L142
    """
    # normalise base exceptions, e.g. punctuation or currency symbols
    if string in BASE_NORMS:
        return BASE_NORMS[string]
    # Try suffix groups from longest suffixes to shortest. The original code
    # broke out of the whole loop when the string was too short for the
    # current group (skipping all shorter groups) and used len(group[0]) as
    # the strip length for every suffix in the group — wrong for groups with
    # mixed-length entries, and catastrophic for empty entries, since
    # endswith('') is always True and string[:-0] == ''. Use each suffix's
    # own length, skip empty suffixes, and require a non-empty stem.
    for suffix_group in reversed(_stem_suffixes):
        for suffix in suffix_group:
            if suffix and len(string) > len(suffix) and string.endswith(suffix):
                return string[:-len(suffix)]
    return string


LEX_ATTRS = {
    NORM: norm
}

177
spacy/lang/hi/stop_words.py Normal file
View File

@ -0,0 +1,177 @@
# coding: utf8
from __future__ import unicode_literals
# Source: https://github.com/taranjeet/hindi-tokenizer/blob/master/stopwords.txt
STOP_WORDS = set("""
अत
अपन
अपन
अपन
अभ
दर
आदि
आप
इति
इन
इनक
इन
इन
इन
इस
इसक
इसक
इसक
इसम
इस
इस
उन
उनक
उनक
उनक
उनक
उन
उन
उन
उस
उसक
उस
उस
एक
एव
एस
ऐस
और
कई
कर
करत
करत
करन
करन
कर
कहत
कह
ि
ितन
ि
ि
ि
ि
ि
ि
ि
नस
गय
घर
जब
जह
ितन
ि
ि
ि
ि
ि
धर
तक
तब
तरह
ि
ि
ि
ि
ि
दब
ि
सर
सर
नक
नह
ियत
पर
पहल
ि
बन
बह
बह
िलक
तर
मगर
यदि
यह
यह
यह
ि
रख
रह
रह
ि
ि
ि
वग़रह
वर
वह
वह
वह
सकत
सकत
सबस
सभ
""".split())

View File

@ -82,6 +82,7 @@
"pl": "Polish",
"he": "Hebrew",
"bn": "Bengali",
"hi": "Hindi",
"id": "Indonesian",
"th": "Thai",
"zh": "Chinese",