2017-03-12 15:07:28 +03:00
|
|
|
|
# coding: utf8
|
2017-03-04 09:20:08 +03:00
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
|
|
|
|
|
# Source: উচ্চতর বাংলা ব্যাকরণ ও রচনা ("Higher Bengali Grammar and Composition") - অধ্যাপক নিরঞ্জন অধিকারী ও অধ্যাপক ড. সফিউদ্দিন আহমদ (Prof. Niranjan Adhikari and Prof. Dr. Safiuddin Ahmad)
|
|
|
|
|
|
|
|
|
|
# Rewrite rules for Bengali text, keyed by a category name ("noun", "punct").
#
# Each rule is a two-item list ``[old, new]``. Presumably the consumer replaces
# a trailing ``old`` with ``new`` when lemmatizing a token of that category --
# confirm against the lemmatizer that reads this table. Rule order is kept
# exactly as authored because a consumer may apply the first matching rule.
LEMMA_RULES = {
    "noun": [
        # Appear to be article/classifier endings; all are stripped.
        ["টা", ""],
        ["টি", ""],
        ["খান", ""],
        ["খানা", ""],
        ["খানি", ""],
        ["গাছা", ""],
        ["গাছি", ""],
        ["ছড়া", ""],

        # Appear to be case endings; all are stripped.
        ["কে", ""],
        ["ে", ""],
        ["তে", ""],

        # Appear to be genitive/plural endings; all are stripped.
        ["র", ""],
        ["রা", ""],
        ["রে", ""],
        ["ের", ""],  # dependent-vowel spelling of "এর"
        ["েরা", ""],  # dependent-vowel spelling of "এরা"
        ["দের", ""],
        ["দেরকে", ""],
        ["গুলা", ""],
        ["গুলো", ""],
        ["গুলি", ""],

        # Appear to be collective/plural markers; all are stripped.
        ["কুল", ""],
        ["গণ", ""],
        ["দল", ""],
        ["পাল", ""],
        ["পুঞ্জ", ""],
        ["মণ্ডলী", ""],
        ["মালা", ""],
        ["রাজি", ""],
        ["বৃন্দ", ""],
        ["বর্গ", ""],
        ["শ্রেণী", ""],
        ["শ্রেনি", ""],
        ["রাশি", ""],
        ["সকল", ""],
        ["মহল", ""],
        ["াবলি", ""],  # dependent-vowel spelling of "আবলি"

        # Bengali digit representations (U+09E6..U+09EF) mapped to ASCII digits.
        ["০", "0"],
        ["১", "1"],
        ["২", "2"],
        ["৩", "3"],
        ["৪", "4"],
        ["৫", "5"],
        ["৬", "6"],
        ["৭", "7"],
        ["৮", "8"],
        ["৯", "9"],
    ],

    # Typographic quotes are folded to their ASCII equivalents. Escapes are
    # used for all four so the literals survive a mis-decoded round trip.
    "punct": [
        ["\u201c", "\""],  # left double quotation mark
        ["\u201d", "\""],  # right double quotation mark
        ["\u2018", "'"],  # left single quotation mark
        ["\u2019", "'"],  # right single quotation mark
    ],
}
|