Add Language class, stop words and basic stemmer that sets NORM

This commit is contained in:
ines 2017-10-14 14:59:52 +02:00
parent e85e1d571b
commit 266e7180a7
3 changed files with 239 additions and 0 deletions

24
spacy/lang/hi/__init__.py Normal file
View File

@ -0,0 +1,24 @@
# coding: utf8
from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG
class HindiDefaults(Language.Defaults):
    """Default settings for Hindi: stop words and lexical attribute getters."""

    stop_words = STOP_WORDS

    # Work on a copy so the getter table inherited from Language.Defaults
    # is not mutated for other languages.
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    # Layer the Hindi-specific getters on top of the inherited ones.
    lex_attr_getters.update(LEX_ATTRS)
    # Every lexeme of this language reports the language code 'hi'.
    lex_attr_getters.update({LANG: lambda text: 'hi'})
class Hindi(Language):
    """Language subclass for Hindi (language code 'hi')."""

    Defaults = HindiDefaults
    lang = 'hi'


# Only the language class is part of this module's public API.
__all__ = ['Hindi']

38
spacy/lang/hi/lex_attrs.py Normal file
View File

@ -0,0 +1,38 @@
# coding: utf8
from __future__ import unicode_literals
from ..norm_exceptions import BASE_NORMS
from ...attrs import NORM
from ...util import add_lookups
# Suffixes stripped by the stemmer in norm(), grouped by length in code
# points: group i holds suffixes of exactly i + 1 characters. norm() relies
# on every suffix within a group having the same length, and on the groups
# being ordered from shortest to longest.
_stem_suffixes = [
    # 1 character: dependent vowel signs (matras). No entry may be empty,
    # since an empty suffix would match every token.
    ["ो", "े", "ू", "ु", "ी", "ि", "ा"],
    # 2 characters
    ["कर", "ाओ", "िए", "ाई", "ाए", "ने", "नी", "ना", "ते", "ीं", "ती", "ता", "ाँ", "ां", "ों", "ें"],
    # 3 characters
    ["ाकर", "ाइए", "ाईं", "ाया", "ेगी", "ेगा", "ोगी", "ोगे", "ाने", "ाना", "ाते", "ाती", "ाता", "तीं", "ाओं", "ाएं", "ुओं", "ुएं", "ुआं"],
    # 4 characters
    ["ाएगी", "ाएगा", "ाओगी", "ाओगे", "एंगी", "ेंगी", "एंगे", "ेंगे", "ूंगी", "ूंगा", "ातीं", "नाओं", "नाएं", "ताओं", "ताएं", "ियाँ", "ियों", "ियां"],
    # 5 characters
    ["ाएंगी", "ाएंगे", "ाऊंगी", "ाऊंगा", "ाइयाँ", "ाइयों", "ाइयां"],
]
def norm(string):
    """Return the normalised form (NORM) of a token's text.

    Tokens listed in BASE_NORMS are mapped to their registered norm.
    Otherwise the longest suffix from _stem_suffixes that the token ends
    with is stripped, provided at least one character of stem remains.
    Tokens with no matching suffix are returned unchanged.

    string (unicode): The token text.
    RETURNS (unicode): The norm to store for the token.
    """
    # normalise base exceptions, e.g. punctuation or currency symbols
    if string in BASE_NORMS:
        return BASE_NORMS[string]
    # set stem word as norm, if available, adapted from:
    # http://computing.open.ac.uk/Sites/EACLSouthAsia/Papers/p6-Ramanathan.pdf
    # http://research.variancia.com/hindi_stemmer/
    # https://github.com/taranjeet/hindi-tokenizer/blob/master/HindiTokenizer.py#L142
    #
    # The table is ordered from shortest to longest suffixes, so walk it in
    # reverse to prefer the longest match.
    for suffix_group in reversed(_stem_suffixes):
        for suffix in suffix_group:
            length = len(suffix)
            # Skip empty entries (they would match everything and wipe the
            # token) and suffixes that would leave no stem. Keep going
            # rather than stopping: a shorter suffix may still apply.
            if not length or len(string) <= length:
                continue
            if string.endswith(suffix):
                return string[:-length]
    return string
# Lexical attribute getters exported by this module: the NORM attribute is
# computed by the suffix-stripping norm() function.
LEX_ATTRS = {NORM: norm}

177
spacy/lang/hi/stop_words.py Normal file
View File

@ -0,0 +1,177 @@
# coding: utf8
from __future__ import unicode_literals
# Source: https://github.com/taranjeet/hindi-tokenizer/blob/master/stopwords.txt
# Whitespace-separated stop word entries; repeated entries collapse once the
# text is split and loaded into the set below.
# NOTE(review): several entries look truncated (e.g. a lone vowel sign, or
# words missing their vowel signs) -- verify against the upstream
# stopwords.txt before relying on this list.
_STOP_WORDS_TEXT = """
अत
अपन
अपन
अपन
अभ
दर
आदि
आप
इति
इन
इनक
इन
इन
इन
इस
इसक
इसक
इसक
इसम
इस
इस
उन
उनक
उनक
उनक
उनक
उन
उन
उन
उस
उसक
उस
उस
एक
एव
एस
ऐस
और
कई
कर
करत
करत
करन
करन
कर
कहत
कह
ि
ितन
ि
ि
ि
ि
ि
ि
ि
नस
गय
घर
जब
जह
ितन
ि
ि
ि
ि
ि
धर
तक
तब
तरह
ि
ि
ि
ि
ि
दब
ि
सर
सर
नक
नह
ियत
पर
पहल
ि
बन
बह
बह
िलक
तर
मगर
यदि
यह
यह
यह
ि
रख
रह
रह
ि
ि
ि
वग़रह
वर
वह
वह
वह
सकत
सकत
सबस
सभ
"""

STOP_WORDS = set(_STOP_WORDS_TEXT.split())