spaCy/spacy/lang/bn/lemmatizer.py

69 lines
1.7 KiB
Python
Raw Normal View History

2017-03-12 15:07:28 +03:00
# coding: utf8
from __future__ import unicode_literals
# Source: উচ্চতর বাংলা ব্যাকরণ ও রচনা - অধ্যাপক নিরঞ্জন অধিকারী ও অধ্যাপক ড. সফিউদ্দিন আহমদ
# (English: "Higher Bengali Grammar and Composition" by Prof. Niranjan Adhikari and Prof. Dr. Safiuddin Ahmed)
# Lemma rewrite rules for Bengali, grouped by category ("noun", "punct").
# Every rule is a two-item list [old, new]: `old` is the text to look for and
# `new` is what it is replaced with (an empty string simply removes it).
# NOTE(review): presumably the lemmatizer applies the "noun" rules as suffix
# rewrites on a token; the consumer is not part of this file - confirm there.
#
# All Bengali literals must stay real Bengali code points (U+0980-U+09FF).
# Mis-decoded (mojibake) literals would never match actual Bengali text.
LEMMA_RULES = {
    "noun": [
        # Determiner / classifier suffixes
        ["টা", ""],
        ["টি", ""],
        ["খান", ""],
        ["খানা", ""],
        ["খানি", ""],
        ["গাছা", ""],
        ["গাছি", ""],
        # "chhara": CHA + DA + NUKTA + AA. Written with escapes so the
        # decomposed DA + NUKTA spelling cannot be silently recomposed.
        ["\u099b\u09a1\u09bc\u09be", ""],

        # Case endings
        ["কে", ""],
        ["ে", ""],
        ["তে", ""],

        # Genitive and plural endings
        ["র", ""],
        ["রা", ""],
        ["রে", ""],
        ["ের", ""],  # "-er" (এর) written with the dependent vowel sign
        ["েরা", ""],  # "-era" (এরা) written with the dependent vowel sign
        ["দের", ""],
        ["দেরকে", ""],
        ["গুলা", ""],
        ["গুলো", ""],
        ["গুলি", ""],

        # Collective (plural-forming) suffixes
        ["কুল", ""],
        ["গণ", ""],
        ["দল", ""],
        ["পাল", ""],
        ["পুঞ্জ", ""],
        ["মণ্ডলী", ""],
        ["মালা", ""],
        ["রাজি", ""],
        ["বৃন্দ", ""],
        ["বর্গ", ""],
        ["শ্রেণী", ""],
        ["শ্রেনি", ""],
        ["রাশি", ""],
        ["সকল", ""],
        ["মহল", ""],
        ["াবলি", ""],  # "-aboli" (আবলি) written with the dependent vowel sign

        # Bengali digit representations (U+09E6-U+09EF) -> ASCII digits
        ["০", "0"],
        ["১", "1"],
        ["২", "2"],
        ["৩", "3"],
        ["৪", "4"],
        ["৫", "5"],
        ["৬", "6"],
        ["৭", "7"],
        ["৮", "8"],
        ["৯", "9"],
    ],
    "punct": [
        # Typographic quotes -> plain ASCII quotes. Escapes are used for all
        # four so the literals survive editors and re-encoding unchanged.
        ["\u201c", "\""],  # left double quotation mark
        ["\u201d", "\""],  # right double quotation mark
        ["\u2018", "'"],  # left single quotation mark
        ["\u2019", "'"],  # right single quotation mark
    ],
}