Merge pull request #1425 from explosion/feature/hindi-tokenizer

💫 Basic Hindi tokenization support
2025-07-07 21:33:13 +03:00 · 2017-10-18 13:34:52 +02:00 · 2017-10-18 13:34:52 +02:00 · f0d577e460
commit f0d577e460
parent 394633efce c0aceb9fbe
5 changed files with 241 additions and 1 deletions
--- a/spacy/lang/char_classes.py
+++ b/spacy/lang/char_classes.py
@ -33,7 +33,7 @@ _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
 # These expressions contain various unicode variations, including characters
 # used in Chinese (see #1333, #1340, #1351) – unless there are cross-language
 # conflicts, spaCy's base tokenizer should handle all of those by default
-_punct = r'… …… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ？ ！ ， 、 ； ： ～ ·'
+_punct = r'… …… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ？ ！ ， 、 ； ： ～ · ।'
 _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‘‘ ’’ ‚ , „ » « 「 」 『 』 （ ） 〔 〕 【 】 《 》 〈 〉'
 _hyphens = '- – — -- --- —— ~'
--- a/spacy/lang/hi/__init__.py
+++ b/spacy/lang/hi/__init__.py
@ -0,0 +1,24 @@
 # coding: utf8
 from __future__ import unicode_literals
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...attrs import LANG
 class HindiDefaults(Language.Defaults):
     """Defaults for Hindi: stop words and lexical attribute getters."""
     stop_words = STOP_WORDS
     # Start from a copy of the shared getters (dict() makes a new mapping),
     # then layer the Hindi-specific attribute functions on top of it.
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters.update(LEX_ATTRS)
     # Every lexeme created with these defaults reports 'hi' as its language.
     lex_attr_getters[LANG] = lambda text: 'hi'
 class Hindi(Language):
     """Language subclass for Hindi, using the code 'hi' and HindiDefaults."""
     Defaults = HindiDefaults
     lang = 'hi'
 # Names exported by a star-import of this module.
 __all__ = ['Hindi']
--- a/spacy/lang/hi/lex_attrs.py
+++ b/spacy/lang/hi/lex_attrs.py
@ -0,0 +1,38 @@
 # coding: utf8
 from __future__ import unicode_literals
 from ..norm_exceptions import BASE_NORMS
 from ...attrs import NORM
 from ...util import add_lookups
 # Suffixes that norm() strips when stemming, grouped by suffix length and
 # ordered from the shortest group to the longest.
 _stem_suffixes = [
     # one character
     ["ो", "े", "ू", "ु", "ी", "ि", "ा"],
     # two characters
     ["कर", "ाओ", "िए", "ाई", "ाए", "ने", "नी", "ना", "ते", "ीं", "ती", "ता",
      "ाँ", "ां", "ों", "ें"],
     # three characters
     ["ाकर", "ाइए", "ाईं", "ाया", "ेगी", "ेगा", "ोगी", "ोगे", "ाने", "ाना",
      "ाते", "ाती", "ाता", "तीं", "ाओं", "ाएं", "ुओं", "ुएं", "ुआं"],
     # four characters
     ["ाएगी", "ाएगा", "ाओगी", "ाओगे", "एंगी", "ेंगी", "एंगे", "ेंगे", "ूंगी",
      "ूंगा", "ातीं", "नाओं", "नाएं", "ताओं", "ताएं", "ियाँ", "ियों", "ियां"],
     # five characters
     ["ाएंगी", "ाएंगे", "ाऊंगी", "ाऊंगा", "ाइयाँ", "ाइयों", "ाइयां"],
 ]
 def norm(string):
     """Return the normalised form of a Hindi token.

     Base exceptions found in BASE_NORMS take priority. Otherwise the longest
     suffix from _stem_suffixes that the token ends with is stripped, provided
     the token is strictly longer than that suffix, and the remaining stem is
     returned. A token with no matching suffix is returned unchanged.

     string (unicode): The token text.
     RETURNS (unicode): The normalised text.
     """
     # normalise base exceptions, e.g. punctuation or currency symbols
     if string in BASE_NORMS:
         return BASE_NORMS[string]
     # set stem word as norm, if available, adapted from:
     # http://computing.open.ac.uk/Sites/EACLSouthAsia/Papers/p6-Ramanathan.pdf
     # http://research.variancia.com/hindi_stemmer/
     # https://github.com/taranjeet/hindi-tokenizer/blob/master/HindiTokenizer.py#L142
     for suffix_group in reversed(_stem_suffixes):
         # All suffixes within one group share the same length.
         length = len(suffix_group[0])
         # Groups are tried longest-first. A token too short for this group
         # may still end in a shorter suffix, so skip the group rather than
         # abandoning the search (a `break` here left every token of five
         # characters or fewer unstemmed).
         if len(string) <= length:
             continue
         if string.endswith(tuple(suffix_group)):
             return string[:-length]
     return string
 # Hindi-specific lexical attribute getters: NORM is computed by norm().
 LEX_ATTRS = {NORM: norm}
--- a/spacy/lang/hi/stop_words.py
+++ b/spacy/lang/hi/stop_words.py
@ -0,0 +1,177 @@
 # coding: utf8
 from __future__ import unicode_literals
 # Source: https://github.com/taranjeet/hindi-tokenizer/blob/master/stopwords.txt
 # Hindi stop words. The whitespace-separated list below is split into
 # individual words and stored as a set, so duplicates collapse and
 # membership tests are constant-time.
 STOP_WORDS = set("""
 अत
 अपना
 अपनी
 अपने
 अभी
 अंदर
 आदि
 आप
 इत्यादि
 इन
 इनका
 इन्हीं
 इन्हें
 इन्हों
 इस
 इसका
 इसकी
 इसके
 इसमें
 इसी
 इसे
 उन
 उनका
 उनकी
 उनके
 उनको
 उन्हीं
 उन्हें
 उन्हों
 उस
 उसके
 उसी
 उसे
 एक
 एवं
 एस
 ऐसे
 और
 कई
 कर
 करता
 करते
 करना
 करने
 करें
 कहते
 कहा
 का
 काफ़ी
 कि
 कितना
 किन्हें
 किन्हों
 किया
 किर
 किस
 किसी
 किसे
 की
 कुछ
 कुल
 के
 को
 कोई
 कौन
 कौनसा
 गया
 घर
 जब
 जहाँ
 जा
 जितना
 जिन
 जिन्हें
 जिन्हों
 जिस
 जिसे
 जीधर
 जैसा
 जैसे
 जो
 तक
 तब
 तरह
 तिन
 तिन्हें
 तिन्हों
 तिस
 तिसे
 तो
 था
 थी
 थे
 दबारा
 दिया
 दुसरा
 दूसरे
 दो
 द्वारा
 न
 नके
 नहीं
 ना
 निहायत
 नीचे
 ने
 पर
 पहले
 पूरा
 पे
 फिर
 बनी
 बही
 बहुत
 बाद
 बाला
 बिलकुल
 भी
 भीतर
 मगर
 मानो
 मे
 में
 यदि
 यह
 यहाँ
 यही
 या
 यिह
 ये
 रखें
 रहा
 रहे
 ऱ्वासा
 लिए
 लिये
 लेकिन
 व
 वग़ैरह
 वर्ग
 वह
 वहाँ
 वहीं
 वाले
 वुह
 वे
 सकता
 सकते
 सबसे
 सभी
 साथ
 साबुत
 साभ
 सारा
 से
 सो
 संग
 ही
 हुआ
 हुई
 हुए
 है
 हैं
 हो
 होता
 होती
 होते
 होना
 होने
 """.split())
--- a/website/models/_data.json
+++ b/website/models/_data.json
@ -82,6 +82,7 @@
        "pl": "Polish",
        "he": "Hebrew",
        "bn": "Bengali",
        "hi": "Hindi",
        "id": "Indonesian",
        "th": "Thai",
        "zh": "Chinese",