mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
39 lines
1.7 KiB
Python
39 lines
1.7 KiB
Python
# coding: utf8
|
|
from __future__ import unicode_literals
|
|
|
|
from ..norm_exceptions import BASE_NORMS
|
|
from ...attrs import NORM
|
|
from ...util import add_lookups
|
|
|
|
|
|
# Hindi suffixes that norm() strips from the end of a token, grouped by
# length: the group at index i holds suffixes that are i + 1 characters long.
# norm() takes the length of a group's first entry as the number of
# characters to cut, so every suffix within a group must have the same length.
_stem_suffixes = [
    # 1 character
    ["ो", "े", "ू", "ु", "ी", "ि", "ा"],
    # 2 characters
    [
        "कर", "ाओ", "िए", "ाई", "ाए", "ने", "नी", "ना",
        "ते", "ीं", "ती", "ता", "ाँ", "ां", "ों", "ें",
    ],
    # 3 characters
    [
        "ाकर", "ाइए", "ाईं", "ाया", "ेगी", "ेगा", "ोगी", "ोगे", "ाने", "ाना",
        "ाते", "ाती", "ाता", "तीं", "ाओं", "ाएं", "ुओं", "ुएं", "ुआं",
    ],
    # 4 characters
    [
        "ाएगी", "ाएगा", "ाओगी", "ाओगे", "एंगी", "ेंगी", "एंगे", "ेंगे", "ूंगी",
        "ूंगा", "ातीं", "नाओं", "नाएं", "ताओं", "ताएं", "ियाँ", "ियों", "ियां",
    ],
    # 5 characters
    ["ाएंगी", "ाएंगे", "ाऊंगी", "ाऊंगा", "ाइयाँ", "ाइयों", "ाइयां"],
]
|
|
|
|
|
|
def norm(string):
    """Return the norm for a Hindi token.

    Lookup order:
    1. If the token is a key of BASE_NORMS, return the mapped value.
    2. Otherwise strip the longest suffix from _stem_suffixes that the token
       ends with, provided the token is strictly longer than the suffix, so
       the returned stem is never empty.
    3. Otherwise return the token unchanged.

    string: The token text to normalise.
    RETURNS: The normalised text.
    """
    # normalise base exceptions, e.g. punctuation or currency symbols
    if string in BASE_NORMS:
        return BASE_NORMS[string]
    # set stem word as norm, if available, adapted from:
    # http://computing.open.ac.uk/Sites/EACLSouthAsia/Papers/p6-Ramanathan.pdf
    # http://research.variancia.com/hindi_stemmer/
    # https://github.com/taranjeet/hindi-tokenizer/blob/master/HindiTokenizer.py#L142
    for suffix_group in reversed(_stem_suffixes):
        # every suffix in a group has the same length as its first entry
        length = len(suffix_group[0])
        # Groups are visited longest-first. A token that is too short for
        # this group can still end with a suffix from a shorter group, so
        # skip just this group instead of abandoning the whole search
        # (a `break` here would leave every short token unstemmed).
        if len(string) <= length:
            continue
        if string.endswith(tuple(suffix_group)):
            return string[:-length]
    return string
|
|
|
|
|
|
# Maps the NORM attribute to the norm() function defined in this module.
LEX_ATTRS = {NORM: norm}
|