spaCy/spacy/lang/bn/lemmatizer.py

69 lines
1.7 KiB
Python
Raw Normal View History

2017-03-12 15:07:28 +03:00
# coding: utf8
from __future__ import unicode_literals
# Source: উচ্চতর বাংলা ব্যাকরণ ও রচনা - অধ্যাপক নিরঞ্জন অধিকারী ও অধ্যাপক ড. সফিউদ্দিন আহমদ
# (English gloss: "Higher Bengali Grammar and Composition", by Prof. Niranjan Adhikari and Prof. Dr. Safiuddin Ahmed)
# Rewrite rules for Bengali, grouped by category key ("noun", "punct").
# Each rule is a two-element list ``[old, new]``; rule order is kept exactly
# as in the original data.
# NOTE(review): presumably consumed as [suffix, replacement] pairs by a
# rule-based lemmatizer -- confirm against the caller.
#
# The Bengali literals were previously stored as mojibake (the UTF-8 bytes of
# the Bengali text decoded with a single-byte Latin code page), so no rule
# could match real Bengali input.  They are restored to proper Unicode here.
LEMMA_RULES = {
    "noun": [
        # Article / classifier-like endings, rewritten to the empty string.
        ["টা", ""],
        ["টি", ""],
        ["খান", ""],
        ["খানা", ""],
        ["খানি", ""],
        ["গাছা", ""],
        ["গাছি", ""],
        ["ছড়া", ""],
        # Case-like endings.
        ["কে", ""],
        ["ে", ""],
        ["তে", ""],
        ["র", ""],
        ["রা", ""],
        ["রে", ""],
        ["ের", ""],  # dependent-vowel spelling of "এর"
        ["েরা", ""],  # dependent-vowel spelling of "এরা"
        ["দের", ""],
        ["দেরকে", ""],
        # Plural / collective endings.
        ["গুলা", ""],
        ["গুলো", ""],
        ["গুলি", ""],
        ["কুল", ""],
        ["গণ", ""],
        ["দল", ""],
        ["পাল", ""],
        ["পুঞ্জ", ""],
        ["মণ্ডলী", ""],
        ["মালা", ""],
        ["রাজি", ""],
        ["বৃন্দ", ""],
        ["বর্গ", ""],
        ["শ্রেণী", ""],
        ["শ্রেনি", ""],
        ["রাশি", ""],
        ["সকল", ""],
        ["মহল", ""],
        ["াবলি", ""],  # dependent-vowel spelling of "আবলি"
        # Bengali digit representations, mapped to their ASCII digits.
        ["০", "0"],
        ["১", "1"],
        ["২", "2"],
        ["৩", "3"],
        ["৪", "4"],
        ["৫", "5"],
        ["৬", "6"],
        ["৭", "7"],
        ["৮", "8"],
        ["৯", "9"],
    ],
    "punct": [
        # Curly double and single quotes are mapped to their ASCII forms.
        ["“", "\""],
        ["”", "\""],
        ["\u2018", "'"],
        ["\u2019", "'"],
    ],
}