spaCy/spacy/lang/hi/lex_attrs.py

59 lines
2.8 KiB
Python
Raw Normal View History

# coding: utf8
from __future__ import unicode_literals
from ..norm_exceptions import BASE_NORMS
from ...attrs import NORM
Added numbers to ../lang/hi/lex_attrs.py (#2629) I have added numbers to the Hindi lex_attrs.py file according to the Indian numbering system (https://en.wikipedia.org/wiki/Indian_numbering_system), and here are their English translations: 'शून्य' => zero 'एक' => one 'दो' => two 'तीन' => three 'चार' => four 'पांच' => five 'छह' => six 'सात' => seven 'आठ' => eight 'नौ' => nine 'दस' => ten 'ग्यारह' => eleven 'बारह' => twelve 'तेरह' => thirteen 'चौदह' => fourteen 'पंद्रह' => fifteen 'सोलह' => sixteen 'सत्रह' => seventeen 'अठारह' => eighteen 'उन्नीस' => nineteen 'बीस' => twenty 'तीस' => thirty 'चालीस' => forty 'पचास' => fifty 'साठ' => sixty 'सत्तर' => seventy 'अस्सी' => eighty 'नब्बे' => ninety 'सौ' => hundred 'हज़ार' => thousand 'लाख' => hundred thousand 'करोड़' => ten million 'अरब' => billion 'खरब' => hundred billion <!--- Provide a general summary of your changes in the title. --> ## Description <!--- Use this section to describe your changes. If your changes required testing, include information about the testing environment and the tests you ran. If your test fixes a bug reported in an issue, don't forget to include the issue number. If your PR is still a work in progress, that's totally fine – just include a note to let us know. --> ### Types of change <!-- What type of change does your PR cover? Is it a bug fix, an enhancement or new feature, or a change to the documentation? --> ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-08-08 17:06:11 +03:00
from ...attrs import LIKE_NUM
from ...util import add_lookups
# Hindi suffixes stripped by `norm` when stemming. Each inner list holds
# suffixes of one length (1 to 5 code points), ordered from the shortest group
# to the longest; `norm` relies on every suffix in a group having the same
# length as the group's first entry.
_stem_suffixes = [
    ["ो", "े", "ू", "ु", "ी", "ि", "ा"],
    ["कर", "ाओ", "िए", "ाई", "ाए", "ने", "नी", "ना", "ते", "ीं", "ती", "ता", "ाँ", "ां", "ों", "ें"],
    ["ाकर", "ाइए", "ाईं", "ाया", "ेगी", "ेगा", "ोगी", "ोगे", "ाने", "ाना", "ाते", "ाती", "ाता", "तीं", "ाओं", "ाएं", "ुओं", "ुएं", "ुआं"],
    ["ाएगी", "ाएगा", "ाओगी", "ाओगे", "एंगी", "ेंगी", "एंगे", "ेंगे", "ूंगी", "ूंगा", "ातीं", "नाओं", "नाएं", "ताओं", "ताएं", "ियाँ", "ियों", "ियां"],
    ["ाएंगी", "ाएंगे", "ाऊंगी", "ाऊंगा", "ाइयाँ", "ाइयों", "ाइयां"],
]
Added numbers to ../lang/hi/lex_attrs.py (#2629) I have added numbers to the Hindi lex_attrs.py file according to the Indian numbering system (https://en.wikipedia.org/wiki/Indian_numbering_system), and here are their English translations: 'शून्य' => zero 'एक' => one 'दो' => two 'तीन' => three 'चार' => four 'पांच' => five 'छह' => six 'सात' => seven 'आठ' => eight 'नौ' => nine 'दस' => ten 'ग्यारह' => eleven 'बारह' => twelve 'तेरह' => thirteen 'चौदह' => fourteen 'पंद्रह' => fifteen 'सोलह' => sixteen 'सत्रह' => seventeen 'अठारह' => eighteen 'उन्नीस' => nineteen 'बीस' => twenty 'तीस' => thirty 'चालीस' => forty 'पचास' => fifty 'साठ' => sixty 'सत्तर' => seventy 'अस्सी' => eighty 'नब्बे' => ninety 'सौ' => hundred 'हज़ार' => thousand 'लाख' => hundred thousand 'करोड़' => ten million 'अरब' => billion 'खरब' => hundred billion <!--- Provide a general summary of your changes in the title. --> ## Description <!--- Use this section to describe your changes. If your changes required testing, include information about the testing environment and the tests you ran. If your test fixes a bug reported in an issue, don't forget to include the issue number. If your PR is still a work in progress, that's totally fine – just include a note to let us know. --> ### Types of change <!-- What type of change does your PR cover? Is it a bug fix, an enhancement or new feature, or a change to the documentation? --> ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-08-08 17:06:11 +03:00
# reference 1: https://en.wikipedia.org/wiki/Indian_numbering_system
# reference 2: https://blogs.transparent.com/hindi/hindi-numbers-1-100/
# Hindi number words: zero to nineteen, the tens from twenty to ninety, and
# the magnitudes hundred, thousand, lakh (10^5), crore (10^7), arab (10^9)
# and kharab (10^11).
_num_words = [
    'शून्य', 'एक', 'दो', 'तीन', 'चार', 'पांच', 'छह', 'सात', 'आठ', 'नौ', 'दस',
    'ग्यारह', 'बारह', 'तेरह', 'चौदह', 'पंद्रह', 'सोलह', 'सत्रह', 'अठारह', 'उन्नीस',
    'बीस', 'तीस', 'चालीस', 'पचास', 'साठ', 'सत्तर', 'अस्सी', 'नब्बे', 'सौ', 'हज़ार',
    'लाख', 'करोड़', 'अरब', 'खरब',
]
def norm(string):
    """Return the normalised form of *string*.

    A string listed in ``BASE_NORMS`` is replaced by its mapped value.
    Otherwise the longest matching suffix from ``_stem_suffixes`` is stripped
    and the remaining stem is returned. If nothing matches, *string* is
    returned unchanged.
    """
    # normalise base exceptions, e.g. punctuation or currency symbols
    if string in BASE_NORMS:
        return BASE_NORMS[string]
    # set stem word as norm, if available, adapted from:
    # http://computing.open.ac.uk/Sites/EACLSouthAsia/Papers/p6-Ramanathan.pdf
    # http://research.variancia.com/hindi_stemmer/
    # https://github.com/taranjeet/hindi-tokenizer/blob/master/HindiTokenizer.py#L142
    # Groups are visited in reverse so the longest suffixes are tried first.
    for suffix_group in reversed(_stem_suffixes):
        # Every suffix in a group is taken to share the first entry's length.
        length = len(suffix_group[0])
        if len(string) <= length:
            # Too short for this group only: a shorter suffix may still apply,
            # so keep scanning instead of giving up on the whole word.
            continue
        if string.endswith(tuple(suffix_group)):
            return string[:-length]
    return string
Added numbers to ../lang/hi/lex_attrs.py (#2629) I have added numbers to the Hindi lex_attrs.py file according to the Indian numbering system (https://en.wikipedia.org/wiki/Indian_numbering_system), and here are their English translations: 'शून्य' => zero 'एक' => one 'दो' => two 'तीन' => three 'चार' => four 'पांच' => five 'छह' => six 'सात' => seven 'आठ' => eight 'नौ' => nine 'दस' => ten 'ग्यारह' => eleven 'बारह' => twelve 'तेरह' => thirteen 'चौदह' => fourteen 'पंद्रह' => fifteen 'सोलह' => sixteen 'सत्रह' => seventeen 'अठारह' => eighteen 'उन्नीस' => nineteen 'बीस' => twenty 'तीस' => thirty 'चालीस' => forty 'पचास' => fifty 'साठ' => sixty 'सत्तर' => seventy 'अस्सी' => eighty 'नब्बे' => ninety 'सौ' => hundred 'हज़ार' => thousand 'लाख' => hundred thousand 'करोड़' => ten million 'अरब' => billion 'खरब' => hundred billion <!--- Provide a general summary of your changes in the title. --> ## Description <!--- Use this section to describe your changes. If your changes required testing, include information about the testing environment and the tests you ran. If your test fixes a bug reported in an issue, don't forget to include the issue number. If your PR is still a work in progress, that's totally fine – just include a note to let us know. --> ### Types of change <!-- What type of change does your PR cover? Is it a bug fix, an enhancement or new feature, or a change to the documentation? --> ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-08-08 17:06:11 +03:00
def like_num(text):
    """Return True if *text* looks like a number, otherwise False.

    Commas and full stops are removed first. What remains counts as a number
    when it is all digits, a fraction with digits on both sides of a single
    slash, or (lower-cased) an entry of ``_num_words``.
    """
    stripped = text.replace(',', '').replace('.', '')
    if stripped.isdigit():
        return True
    # Exactly one slash yields exactly two pieces; both must be digit runs.
    pieces = stripped.split('/')
    if len(pieces) == 2 and all(piece.isdigit() for piece in pieces):
        return True
    return stripped.lower() in _num_words
# Maps the attribute IDs imported from `...attrs` to the functions in this
# module that compute them.
LEX_ATTRS = {
    NORM: norm,
    LIKE_NUM: like_num,
}