mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Merge pull request #872 from banglakit/bn-improvements
[Bengali] basic tag map, morph, lemma rules and exceptions
This commit is contained in:
commit
bb959692f5
|
@ -15,7 +15,9 @@ class Bengali(Language):
|
||||||
lex_attr_getters[LANG] = lambda text: 'bn'
|
lex_attr_getters[LANG] = lambda text: 'bn'
|
||||||
|
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
|
tag_map = TAG_MAP
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
lemma_rules = LEMMA_RULES
|
||||||
|
|
||||||
prefixes = tuple(TOKENIZER_PREFIXES)
|
prefixes = tuple(TOKENIZER_PREFIXES)
|
||||||
suffixes = tuple(TOKENIZER_SUFFIXES)
|
suffixes = tuple(TOKENIZER_SUFFIXES)
|
||||||
|
|
|
@ -4,15 +4,24 @@ from __future__ import unicode_literals
|
||||||
from spacy.language_data import strings_to_exc, update_exc
|
from spacy.language_data import strings_to_exc, update_exc
|
||||||
from .punctuation import *
|
from .punctuation import *
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
|
from .tag_map import TAG_MAP as TAG_MAP_BN
|
||||||
|
from .morph_rules import MORPH_RULES
|
||||||
|
from .lemma_rules import LEMMA_RULES
|
||||||
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS as TOKENIZER_EXCEPTIONS_BN
|
||||||
from .. import language_data as base
|
from .. import language_data as base
|
||||||
|
|
||||||
STOP_WORDS = set(STOP_WORDS)
|
STOP_WORDS = set(STOP_WORDS)
|
||||||
|
|
||||||
|
# Build the Bengali tag map on a copy of the shared base map. Binding
# TAG_MAP directly to base.TAG_MAP and then calling update() would add the
# Bengali entries to the base dict itself, changing it for every other
# module that reads base.TAG_MAP.
TAG_MAP = dict(base.TAG_MAP)
TAG_MAP.update(TAG_MAP_BN)
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
|
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
|
||||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
|
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
|
||||||
|
TOKENIZER_EXCEPTIONS.update(TOKENIZER_EXCEPTIONS_BN)
|
||||||
|
|
||||||
TOKENIZER_PREFIXES = TOKENIZER_PREFIXES
|
TOKENIZER_PREFIXES = TOKENIZER_PREFIXES
|
||||||
TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES
|
TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES
|
||||||
TOKENIZER_INFIXES = TOKENIZER_INFIXES
|
TOKENIZER_INFIXES = TOKENIZER_INFIXES
|
||||||
|
|
||||||
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
|
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "TAG_MAP", "MORPH_RULES", "LEMMA_RULES",
|
||||||
|
"TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
|
||||||
|
|
68
spacy/bn/lemma_rules.py
Normal file
68
spacy/bn/lemma_rules.py
Normal file
|
@ -0,0 +1,68 @@
|
||||||
|
# encoding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
# Source: উচ্চতর বাংলা ব্যাকরণ ও রচনা - অধ্যাপক নিরঞ্জন অধিকারী ও অধ্যাপক ড. সফিউদ্দিন আহমদ
|
||||||
|
|
||||||
|
# Lemma rewrite rules grouped by part-of-speech key. Every rule is a
# two-item list of strings; presumably [ending, replacement] as consumed by
# the lemmatizer -- confirm against the caller.
LEMMA_RULES = {
    "noun": [
        # Each noun ending listed here maps to the empty string.
        [ending, ""]
        for ending in (
            "টা", "টি", "খান", "খানা", "খানি", "গাছা", "গাছি", "ছড়া",
            "কে", "ে", "তে",
            "র", "রা", "রে",
            "ের",  # এর
            "েরা",  # এরা
            "দের", "দেরকে", "গুলা", "গুলো", "গুলি",
            "কুল", "গণ", "দল", "পাল", "পুঞ্জ", "মণ্ডলী", "মালা", "রাজি",
            "বৃন্দ", "বর্গ", "শ্রেণী", "শ্রেনি", "রাশি", "সকল", "মহল",
            "াবলি",  # আবলি
        )
    ] + [
        # Bengali digit representations, each paired with its ASCII digit.
        [bengali_digit, ascii_digit]
        for bengali_digit, ascii_digit in zip("০১২৩৪৫৬৭৮৯", "0123456789")
    ],

    "punct": [
        # Curly quotation marks map to their straight counterparts.
        [curly, straight]
        for curly, straight in (
            ("“", "\""),
            ("”", "\""),
            ("\u2018", "'"),
            ("\u2019", "'"),
        )
    ],
}
|
55
spacy/bn/morph_rules.py
Normal file
55
spacy/bn/morph_rules.py
Normal file
|
@ -0,0 +1,55 @@
|
||||||
|
# encoding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ..language_data import PRON_LEMMA
|
||||||
|
from ..symbols import *
|
||||||
|
|
||||||
|
# Morphology overrides for Bengali pronouns, laid out as
# tag -> surface form -> attribute dict. Every entry sets LEMMA to
# PRON_LEMMA; the remaining string keys are feature names with their values.
MORPH_RULES = {
    # Pronouns: demonstrative, personal, interrogative, relative, reflexive.
    "PRP": {
        'ঐ': {LEMMA: PRON_LEMMA, 'PronType': 'Dem'},
        'আমাকে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Case': 'Acc'},
        'কি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'},
        'সে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Nom'},
        'কিসে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'},
        'কাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'},
        'তাকে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Acc'},
        'স্বয়ং': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'},
        'কোনগুলো': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'},
        'তুমি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'},
        'তাদেরকে': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Acc'},
        # 'Person' was previously 'One ' (trailing space), which matched no
        # other Person value used in this table.
        'আমরা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One', 'PronType': 'Prs', 'Case': 'Nom'},
        'যিনি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Rel', 'Case': 'Nom'},
        'আমাদেরকে': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One', 'PronType': 'Prs', 'Case': 'Acc'},
        'কোন': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Int', 'Case': 'Acc'},
        'কারা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'},
        'তোমাকে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'},
        'খোদ': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'},
        'কে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Int', 'Case': 'Acc'},
        'যারা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Rel', 'Case': 'Nom'},
        'যে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Rel', 'Case': 'Nom'},
        'তোমরা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'},
        'তোমাদেরকে': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'},
        'আপন': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'},
        'এ': {LEMMA: PRON_LEMMA, 'PronType': 'Dem'},
        'নিজ': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'},
        'কার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Int', 'Case': 'Acc'},
        'যা': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Gender': 'Neut', 'PronType': 'Rel', 'Case': 'Nom'},
        'তারা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Nom'},
        'আমি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Case': 'Nom'}
    },
    # Possessive pronouns: every entry carries 'Poss': 'Yes'.
    "PRP$": {

        'আমার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes',
                 'Case': 'Nom'},
        'তার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Poss': 'Yes',
                'Case': 'Nom'},
        'তোমাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes',
                    'Case': 'Nom'},
        'আমাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes',
                   'Case': 'Nom'},
        'তোমার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes',
                  'Case': 'Nom'},
        'তাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Poss': 'Yes',
                  'Case': 'Nom'},
    },
}
|
58
spacy/bn/tag_map.py
Normal file
58
spacy/bn/tag_map.py
Normal file
|
@ -0,0 +1,58 @@
|
||||||
|
# encoding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ..symbols import *
|
||||||
|
|
||||||
|
|
||||||
|
# Bengali tag map: each fine-grained tag string maps to a dict holding the
# coarse POS symbol plus optional feature key/value pairs. POS and the
# part-of-speech symbols (PUNCT, SYM, ADJ, ...) come from the star import of
# ..symbols at the top of this file.
TAG_MAP = {
    # Punctuation marks and symbols, keyed by the literal tag string.
    ".": {POS: PUNCT, "PunctType": "peri"},
    ",": {POS: PUNCT, "PunctType": "comm"},
    "-LRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "ini"},
    "-RRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "fin"},
    "``": {POS: PUNCT, "PunctType": "quot", "PunctSide": "ini"},
    "\"\"": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
    "''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
    ":": {POS: PUNCT},
    "৳": {POS: SYM, "Other": {"SymType": "currency"}},
    "#": {POS: SYM, "Other": {"SymType": "numbersign"}},
    # Word-class tags.
    "AFX": {POS: ADJ, "Hyph": "yes"},
    "CC": {POS: CONJ, "ConjType": "coor"},
    "CD": {POS: NUM, "NumType": "card"},
    "DT": {POS: DET},
    "EX": {POS: ADV, "AdvType": "ex"},
    "FW": {POS: X, "Foreign": "yes"},
    "HYPH": {POS: PUNCT, "PunctType": "dash"},
    "IN": {POS: ADP},
    "JJ": {POS: ADJ, "Degree": "pos"},
    "JJR": {POS: ADJ, "Degree": "comp"},
    "JJS": {POS: ADJ, "Degree": "sup"},
    "LS": {POS: PUNCT, "NumType": "ord"},
    "MD": {POS: VERB, "VerbType": "mod"},
    # NOTE(review): POS is an empty string here rather than a symbol -- confirm
    # this is intended.
    "NIL": {POS: ""},
    "NN": {POS: NOUN, "Number": "sing"},
    "NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"},
    "NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"},
    "NNS": {POS: NOUN, "Number": "plur"},
    "PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"},
    "POS": {POS: PART, "Poss": "yes"},
    "PRP": {POS: PRON, "PronType": "prs"},
    "PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"},
    "RB": {POS: ADV, "Degree": "pos"},
    "RBR": {POS: ADV, "Degree": "comp"},
    "RBS": {POS: ADV, "Degree": "sup"},
    "RP": {POS: PART},
    "SYM": {POS: SYM},
    "TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"},
    "UH": {POS: INTJ},
    # Verb tags.
    "VB": {POS: VERB, "VerbForm": "inf"},
    "VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"},
    "VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"},
    "VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"},
    "VBP": {POS: VERB, "VerbForm": "fin", "Tense": "pres"},
    # "Person" is the integer 3 here, unlike the string values used elsewhere.
    "VBZ": {POS: VERB, "VerbForm": "fin", "Tense": "pres", "Number": "sing", "Person": 3},
    # Tags whose PronType is "int|rel".
    "WDT": {POS: ADJ, "PronType": "int|rel"},
    "WP": {POS: NOUN, "PronType": "int|rel"},
    "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
    "WRB": {POS: ADV, "PronType": "int|rel"},
    # Whitespace.
    "SP": {POS: SPACE},
}
|
36
spacy/bn/tokenizer_exceptions.py
Normal file
36
spacy/bn/tokenizer_exceptions.py
Normal file
|
@ -0,0 +1,36 @@
|
||||||
|
# coding=utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ..symbols import *
|
||||||
|
|
||||||
|
# Bengali tokenizer exceptions: exact string -> list of token attribute
# dicts. Starts empty and is filled from ABBREVIATIONS below.
TOKENIZER_EXCEPTIONS = {}

# Abbreviations kept as a single token, each carrying its expanded form as
# LEMMA. Every key maps to exactly one token dict whose ORTH equals the key.
ABBREVIATIONS = {
    "ডঃ": [
        {ORTH: "ডঃ", LEMMA: "ডক্টর"},
    ],
    "ডাঃ": [
        {ORTH: "ডাঃ", LEMMA: "ডাক্তার"},
    ],
    "ড.": [
        {ORTH: "ড.", LEMMA: "ডক্টর"},
    ],
    "ডা.": [
        {ORTH: "ডা.", LEMMA: "ডাক্তার"},
    ],
    "মোঃ": [
        {ORTH: "মোঃ", LEMMA: "মোহাম্মদ"},
    ],
    "মো.": [
        {ORTH: "মো.", LEMMA: "মোহাম্মদ"},
    ],
    "সে.": [
        {ORTH: "সে.", LEMMA: "সেলসিয়াস"},
    ],
    # The two spellings of this abbreviation (with and without the trailing
    # dot) are alternative surface forms, not consecutive sub-tokens, so each
    # gets its own key. Previously both dicts sat under the undotted key, so
    # their ORTH values did not join back to that key and the dotted spelling
    # had no entry of its own.
    "কি.মি": [
        {ORTH: "কি.মি", LEMMA: "কিলোমিটার"},
    ],
    "কি.মি.": [
        {ORTH: "কি.মি.", LEMMA: "কিলোমিটার"},
    ],
}

TOKENIZER_EXCEPTIONS.update(ABBREVIATIONS)
|
Loading…
Reference in New Issue
Block a user