mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 18:56:36 +03:00
6eebfc7bf4
I have added numbers to the Hindi lex_attrs.py file according to the Indian numbering system (https://en.wikipedia.org/wiki/Indian_numbering_system), and here are their English translations: 'शून्य' => zero 'एक' => one 'दो' => two 'तीन' => three 'चार' => four 'पांच' => five 'छह' => six 'सात' => seven 'आठ' => eight 'नौ' => nine 'दस' => ten 'ग्यारह' => eleven 'बारह' => twelve 'तेरह' => thirteen 'चौदह' => fourteen 'पंद्रह' => fifteen 'सोलह' => sixteen 'सत्रह' => seventeen 'अठारह' => eighteen 'उन्नीस' => nineteen 'बीस' => twenty 'तीस' => thirty 'चालीस' => forty 'पचास' => fifty 'साठ' => sixty 'सत्तर' => seventy 'अस्सी' => eighty 'नब्बे' => ninety 'सौ' => hundred 'हज़ार' => thousand 'लाख' => hundred thousand 'करोड़' => ten million 'अरब' => billion 'खरब' => hundred billion <!--- Provide a general summary of your changes in the title. --> ## Description <!--- Use this section to describe your changes. If your changes required testing, include information about the testing environment and the tests you ran. If your test fixes a bug reported in an issue, don't forget to include the issue number. If your PR is still a work in progress, that's totally fine – just include a note to let us know. --> ### Types of change <!-- What type of change does your PR cover? Is it a bug fix, an enhancement or new feature, or a change to the documentation? --> ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
59 lines
2.8 KiB
Python
59 lines
2.8 KiB
Python
# coding: utf8
|
|
from __future__ import unicode_literals
|
|
|
|
from ..norm_exceptions import BASE_NORMS
|
|
from ...attrs import NORM
|
|
from ...attrs import LIKE_NUM
|
|
from ...util import add_lookups
|
|
|
|
# Suffixes stripped by norm() when stemming a token. Each inner list holds
# suffixes of one length (in code points), ordered from the 1-character group
# up to the 5-character group. norm() takes the length of a group's first
# entry as the length of every entry in that group, so all suffixes within a
# group must have the same length.
_stem_suffixes = [
    ["ो", "े", "ू", "ु", "ी", "ि", "ा"],
    [
        "कर", "ाओ", "िए", "ाई", "ाए", "ने", "नी", "ना",
        "ते", "ीं", "ती", "ता", "ाँ", "ां", "ों", "ें",
    ],
    [
        "ाकर", "ाइए", "ाईं", "ाया", "ेगी", "ेगा", "ोगी", "ोगे", "ाने", "ाना",
        "ाते", "ाती", "ाता", "तीं", "ाओं", "ाएं", "ुओं", "ुएं", "ुआं",
    ],
    [
        "ाएगी", "ाएगा", "ाओगी", "ाओगे", "एंगी", "ेंगी", "एंगे", "ेंगे", "ूंगी",
        "ूंगा", "ातीं", "नाओं", "नाएं", "ताओं", "ताएं", "ियाँ", "ियों", "ियां",
    ],
    ["ाएंगी", "ाएंगे", "ाऊंगी", "ाऊंगा", "ाइयाँ", "ाइयों", "ाइयां"],
]
|
|
|
|
# Hindi number words recognised by like_num(), following the Indian
# numbering system.
# Reference 1: https://en.wikipedia.org/wiki/Indian_numbering_system
# Reference 2: https://blogs.transparent.com/hindi/hindi-numbers-1-100/
_num_words = [
    # zero to ten
    'शून्य', 'एक', 'दो', 'तीन', 'चार', 'पांच', 'छह', 'सात', 'आठ', 'नौ', 'दस',
    # eleven to nineteen
    'ग्यारह', 'बारह', 'तेरह', 'चौदह', 'पंद्रह', 'सोलह', 'सत्रह', 'अठारह', 'उन्नीस',
    # tens: twenty to ninety
    'बीस', 'तीस', 'चालीस', 'पचास', 'साठ', 'सत्तर', 'अस्सी', 'नब्बे',
    # hundred, thousand
    'सौ', 'हज़ार',
    # hundred thousand, ten million, billion, hundred billion
    'लाख', 'करोड़', 'अरब', 'खरब',
]
|
|
|
|
def norm(string):
    """Return the norm for a Hindi token.

    A token found in ``BASE_NORMS`` is replaced by the value stored there.
    Otherwise the token is stemmed: the suffix groups in ``_stem_suffixes``
    are tried from the longest suffixes to the shortest, and the first
    matching suffix is cut off. A suffix is only removed when the token is
    strictly longer than it, so a token is never reduced to an empty string.

    string (unicode): The token text.
    RETURNS (unicode): The base norm, the stemmed token, or the unchanged
        token if neither applies.
    """
    # normalise base exceptions, e.g. punctuation or currency symbols
    if string in BASE_NORMS:
        return BASE_NORMS[string]
    # set stem word as norm, if available, adapted from:
    # http://computing.open.ac.uk/Sites/EACLSouthAsia/Papers/p6-Ramanathan.pdf
    # http://research.variancia.com/hindi_stemmer/
    # https://github.com/taranjeet/hindi-tokenizer/blob/master/HindiTokenizer.py#L142
    for suffix_group in reversed(_stem_suffixes):
        # Every suffix in a group has the same length as the first one.
        length = len(suffix_group[0])
        if len(string) <= length:
            # The token is too short for this group, but a shorter suffix
            # from one of the remaining groups may still apply, so keep
            # going instead of giving up on stemming altogether.
            continue
        if string.endswith(tuple(suffix_group)):
            return string[:-length]
    return string
|
|
|
|
def like_num(text):
    """Check whether a token looks like a number.

    Commas and periods are removed before checking. The token counts as
    number-like if the remaining text consists only of digits, is a simple
    fraction of two digit strings separated by a single slash, or is one of
    the number words in ``_num_words``.

    text (unicode): The token text.
    RETURNS (bool): True if the token resembles a number, False otherwise.
    """
    # Drop thousands separators and decimal points, e.g. "1,000.50".
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
    # Simple fractions such as "3/4": exactly one slash, digits on both sides.
    parts = text.split('/')
    if len(parts) == 2 and parts[0].isdigit() and parts[1].isdigit():
        return True
    return text.lower() in _num_words
|
|
|
|
|
|
# Maps the NORM and LIKE_NUM attributes to the functions in this module that
# compute them for a token string.
LEX_ATTRS = {
    NORM: norm,
    LIKE_NUM: like_num,
}
|