# coding: utf8
from __future__ import unicode_literals

from ..norm_exceptions import BASE_NORMS
from ...attrs import NORM
from ...util import add_lookups

# Hindi suffixes grouped by length (one to five characters); within each
# group, every suffix has the same number of characters
_stem_suffixes = [
    ["ो", "े", "ू", "ु", "ी", "ि", "ा"],
    ["कर", "ाओ", "िए", "ाई", "ाए", "ने", "नी", "ना", "ते", "ीं", "ती", "ता", "ाँ", "ां", "ों", "ें"],
    ["ाकर", "ाइए", "ाईं", "ाया", "ेगी", "ेगा", "ोगी", "ोगे", "ाने", "ाना", "ाते", "ाती", "ाता", "तीं", "ाओं", "ाएं", "ुओं", "ुएं", "ुआं"],
    ["ाएगी", "ाएगा", "ाओगी", "ाओगे", "एंगी", "ेंगी", "एंगे", "ेंगे", "ूंगी", "ूंगा", "ातीं", "नाओं", "नाएं", "ताओं", "ताएं", "ियाँ", "ियों", "ियां"],
    ["ाएंगी", "ाएंगे", "ाऊंगी", "ाऊंगा", "ाइयाँ", "ाइयों", "ाइयां"],
]


def norm(string):
    # normalise base exceptions, e.g. punctuation or currency symbols
    if string in BASE_NORMS:
        return BASE_NORMS[string]
    # set stem word as norm, if available, adapted from:
    # http://computing.open.ac.uk/Sites/EACLSouthAsia/Papers/p6-Ramanathan.pdf
    # http://research.variancia.com/hindi_stemmer/
    # https://github.com/taranjeet/hindi-tokenizer/blob/master/HindiTokenizer.py#L142
    # try the longest suffix group first; since all suffixes in a group share
    # the same length, the group's first entry gives the number of characters
    # to strip
    for suffix_group in reversed(_stem_suffixes):
        length = len(suffix_group[0])
        # stop once the string is no longer than the suffixes themselves
        # (stripping would leave nothing); as written, this also means words
        # of five characters or fewer are returned unchanged
        if len(string) <= length:
            break
        for suffix in suffix_group:
            if string.endswith(suffix):
                return string[:-length]
    return string


LEX_ATTRS = {
    NORM: norm
}
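

# Illustrative usage (not part of the original module): the relative imports
# suggest this file lives at spacy/lang/hi/lex_attrs.py, so it cannot be run
# standalone, but inside a spaCy source tree norm() behaves as sketched below.
# The example words are hypothetical, chosen to exercise the suffix groups:
#
#     from spacy.lang.hi.lex_attrs import norm
#
#     # "लड़कियां" (8 characters) ends with the 4-character suffix "ियां"
#     # from the fourth group, so that suffix is stripped:
#     norm("लड़कियां")  # -> "लड़क"
#
#     # "करता" (4 characters) hits the length guard at the 5-character
#     # group and the loop breaks, so it is returned unchanged:
#     norm("करता")  # -> "करता"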