2017-03-12 15:07:28 +03:00
|
|
|
|
# coding: utf8
|
2017-03-04 09:20:08 +03:00
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
|
|
|
|
|
# Source: উচ্চতর বাংলা ব্যাকরণ ও রচনা - অধ্যাপক নিরঞ্জন অধিকারী ও অধ্যাপক ড. সফিউদ্দিন আহমদ
# (Romanized: "Uchchatara Bangla Byakaran o Rachana" - Prof. Niranjan Adhikari and Prof. Dr. Safiuddin Ahmad)
|
|
|
|
|
|
|
|
|
|
# Rewrite rules, grouped by a coarse part-of-speech key.
#
# Each rule is a two-item list ``[old, new]``. For "noun" most rules map a
# Bengali ending to the empty string; the digit rules map a Bengali digit to
# its ASCII digit. For "punct" typographic quotes map to their ASCII form.
# NOTE(review): presumably the consumer replaces a matching word ending
# ``old`` with ``new`` and tries rules in list order - confirm against the
# lemmatizer before reordering anything here.
LEMMA_RULES = {
    "noun": [
        # Determiner / classifier endings.
        ["টা", ""],
        ["টি", ""],
        ["খান", ""],
        ["খানা", ""],
        ["খানি", ""],
        ["গাছা", ""],
        ["গাছি", ""],
        # Escaped on purpose: the third letter is the decomposed pair
        # U+09A1 (DDA) + U+09BC (nukta), not the precomposed U+09DC.
        ["\u099b\u09a1\u09bc\u09be", ""],  # ছড়া

        # Case endings.
        ["কে", ""],
        ["ে", ""],
        ["তে", ""],

        # Genitive and plural endings.
        ["র", ""],
        ["রা", ""],
        ["রে", ""],
        ["ের", ""],  # এর
        ["েরা", ""],  # এরা
        ["দের", ""],
        ["দেরকে", ""],
        ["গুলা", ""],
        ["গুলো", ""],
        ["গুলি", ""],

        # Collective / group-forming endings.
        ["কুল", ""],
        ["গণ", ""],
        ["দল", ""],
        ["পাল", ""],
        ["পুঞ্জ", ""],
        ["মণ্ডলী", ""],
        ["মালা", ""],
        ["রাজি", ""],
        ["বৃন্দ", ""],
        ["বর্গ", ""],
        ["শ্রেণী", ""],
        ["শ্রেনি", ""],
        ["রাশি", ""],
        ["সকল", ""],
        ["মহল", ""],
        ["াবলি", ""],  # আবলি

        # Bengali digit representations (U+09E6..U+09EF -> ASCII 0..9).
        ["০", "0"],
        ["১", "1"],
        ["২", "2"],
        ["৩", "3"],
        ["৪", "4"],
        ["৫", "5"],
        ["৬", "6"],
        ["৭", "7"],
        ["৮", "8"],
        ["৯", "9"],
    ],

    "punct": [
        # Typographic quotes are escaped so the opening and closing forms
        # stay distinguishable in any editor or encoding.
        ["\u201c", "\""],  # left double quotation mark
        ["\u201d", "\""],  # right double quotation mark
        ["\u2018", "'"],  # left single quotation mark
        ["\u2019", "'"],  # right single quotation mark
    ],
}
|