From 266e7180a747c78ac4a123c12ef6c1fc3e0286c5 Mon Sep 17 00:00:00 2001
From: ines
Date: Sat, 14 Oct 2017 14:59:52 +0200
Subject: [PATCH] Add Language class, stop words and basic stemmer that sets NORM

---
 spacy/lang/hi/__init__.py   |  27 ++++++
 spacy/lang/hi/lex_attrs.py  |  49 ++++++++++
 spacy/lang/hi/stop_words.py | 177 ++++++++++++++++++++++++++++++++++++
 3 files changed, 253 insertions(+)
 create mode 100644 spacy/lang/hi/__init__.py
 create mode 100644 spacy/lang/hi/lex_attrs.py
 create mode 100644 spacy/lang/hi/stop_words.py

diff --git a/spacy/lang/hi/__init__.py b/spacy/lang/hi/__init__.py
new file mode 100644
index 000000000..0503b5b7f
--- /dev/null
+++ b/spacy/lang/hi/__init__.py
@@ -0,0 +1,27 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS
+
+from ..norm_exceptions import BASE_NORMS
+from ...language import Language
+from ...attrs import LANG
+
+
+class HindiDefaults(Language.Defaults):
+    """Language defaults for Hindi (lexical attributes and stop words)."""
+    # copy the base getters so updating them does not mutate the shared dict
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters.update(LEX_ATTRS)
+    lex_attr_getters[LANG] = lambda text: 'hi'
+    stop_words = STOP_WORDS
+
+
+class Hindi(Language):
+    """Language subclass for Hindi, identified by the code 'hi'."""
+    lang = 'hi'
+    Defaults = HindiDefaults
+
+
+__all__ = ['Hindi']
diff --git a/spacy/lang/hi/lex_attrs.py b/spacy/lang/hi/lex_attrs.py
new file mode 100644
index 000000000..8886e26c3
--- /dev/null
+++ b/spacy/lang/hi/lex_attrs.py
@@ -0,0 +1,49 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ..norm_exceptions import BASE_NORMS
+from ...attrs import NORM
+from ...util import add_lookups
+
+
+# Suffixes to strip, one group per suffix length, shortest first. All
+# entries in a group must share one length: norm() strips a match using
+# the length of the group's first entry.
+_stem_suffixes = [
+    ["ो","े","ू","ु","ी","ि","ा"],
+    ["कर","ाओ","िए","ाई","ाए","ने","नी","ना","ते","ीं","ती","ता","ाँ","ां","ों","ें"],
+    ["ाकर","ाइए","ाईं","ाया","ेगी","ेगा","ोगी","ोगे","ाने","ाना","ाते","ाती","ाता","तीं","ाओं","ाएं","ुओं","ुएं","ुआं"],
+    ["ाएगी","ाएगा","ाओगी","ाओगे","एंगी","ेंगी","एंगे","ेंगे","ूंगी","ूंगा","ातीं","नाओं","नाएं","ताओं","ताएं","ियाँ","ियों","ियां"],
+    ["ाएंगी","ाएंगे","ाऊंगी","ाऊंगा","ाइयाँ","ाइयों","ाइयां"]
+]
+
+
+def norm(string):
+    """Return the norm of `string`.
+
+    The BASE_NORMS replacement is used if one exists. Otherwise the longest
+    matching suffix from _stem_suffixes is stripped; if no suffix matches,
+    the text is returned unchanged.
+    """
+    # normalise base exceptions, e.g. punctuation or currency symbols
+    if string in BASE_NORMS:
+        return BASE_NORMS[string]
+    # set stem word as norm, if available, adapted from:
+    # http://computing.open.ac.uk/Sites/EACLSouthAsia/Papers/p6-Ramanathan.pdf
+    # http://research.variancia.com/hindi_stemmer/
+    # https://github.com/taranjeet/hindi-tokenizer/blob/master/HindiTokenizer.py#L142
+    for suffix_group in reversed(_stem_suffixes):
+        length = len(suffix_group[0])
+        # too short for this group: skip it but still try the shorter
+        # suffixes (a break here would leave short words unstemmed)
+        if len(string) <= length:
+            continue
+        for suffix in suffix_group:
+            if string.endswith(suffix):
+                return string[:-length]
+    return string
+
+
+LEX_ATTRS = {
+    NORM: norm
+}
diff --git a/spacy/lang/hi/stop_words.py b/spacy/lang/hi/stop_words.py
new file mode 100644
index 000000000..2ff27c015
--- /dev/null
+++ b/spacy/lang/hi/stop_words.py
@@ -0,0 +1,177 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+# Source: https://github.com/taranjeet/hindi-tokenizer/blob/master/stopwords.txt
+
+STOP_WORDS = set("""
+अत
+अपना
+अपनी
+अपने
+अभी
+अंदर
+आदि
+आप
+इत्यादि
+इन
+इनका
+इन्हीं
+इन्हें
+इन्हों
+इस
+इसका
+इसकी
+इसके
+इसमें
+इसी
+इसे
+उन
+उनका
+उनकी
+उनके
+उनको
+उन्हीं
+उन्हें
+उन्हों
+उस
+उसके
+उसी
+उसे
+एक
+एवं
+एस
+ऐसे
+और
+कई
+कर
+करता
+करते
+करना
+करने
+करें
+कहते
+कहा
+का
+काफ़ी
+कि
+कितना
+किन्हें
+किन्हों
+किया
+किर
+किस
+किसी
+किसे
+की
+कुछ
+कुल
+के
+को
+कोई
+कौन
+कौनसा
+गया
+घर
+जब
+जहाँ
+जा
+जितना
+जिन
+जिन्हें
+जिन्हों
+जिस
+जिसे
+जीधर
+जैसा
+जैसे
+जो
+तक
+तब
+तरह
+तिन
+तिन्हें
+तिन्हों
+तिस
+तिसे
+तो
+था
+थी
+थे
+दबारा
+दिया
+दुसरा
+दूसरे
+दो
+द्वारा
+न
+नके
+नहीं
+ना
+निहायत
+नीचे
+ने
+पर
+पहले
+पूरा
+पे
+फिर
+बनी
+बही
+बहुत
+बाद
+बाला
+बिलकुल
+भी
+भीतर
+मगर
+मानो
+मे
+में
+यदि
+यह
+यहाँ
+यही
+या
+यिह
+ये
+रखें
+रहा
+रहे
+ऱ्वासा
+लिए
+लिये
+लेकिन
+व
+वग़ैरह
+वर्ग
+वह
+वहाँ
+वहीं
+वाले
+वुह
+वे
+सकता
+सकते
+सबसे
+सभी
+साथ
+साबुत
+साभ
+सारा
+से
+सो
+संग
+ही
+हुआ
+हुई
+हुए
+है
+हैं
+हो
+होता
+होती
+होते
+होना
+होने
+""".split())