mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 01:16:28 +03:00
Merge pull request #872 from banglakit/bn-improvements
[Bengali] basic tag map, morph, lemma rules and exceptions
This commit is contained in:
commit
bb959692f5
|
@ -15,7 +15,9 @@ class Bengali(Language):
|
|||
lex_attr_getters[LANG] = lambda text: 'bn'
|
||||
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
tag_map = TAG_MAP
|
||||
stop_words = STOP_WORDS
|
||||
lemma_rules = LEMMA_RULES
|
||||
|
||||
prefixes = tuple(TOKENIZER_PREFIXES)
|
||||
suffixes = tuple(TOKENIZER_SUFFIXES)
|
||||
|
|
|
@ -4,15 +4,24 @@ from __future__ import unicode_literals
|
|||
from spacy.language_data import strings_to_exc, update_exc
|
||||
from .punctuation import *
|
||||
from .stop_words import STOP_WORDS
|
||||
from .tag_map import TAG_MAP as TAG_MAP_BN
|
||||
from .morph_rules import MORPH_RULES
|
||||
from .lemma_rules import LEMMA_RULES
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS as TOKENIZER_EXCEPTIONS_BN
|
||||
from .. import language_data as base
|
||||
|
||||
STOP_WORDS = set(STOP_WORDS)
|
||||
|
||||
TAG_MAP = base.TAG_MAP
|
||||
TAG_MAP.update(TAG_MAP_BN)
|
||||
|
||||
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
|
||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
|
||||
TOKENIZER_EXCEPTIONS.update(TOKENIZER_EXCEPTIONS_BN)
|
||||
|
||||
TOKENIZER_PREFIXES = TOKENIZER_PREFIXES
|
||||
TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES
|
||||
TOKENIZER_INFIXES = TOKENIZER_INFIXES
|
||||
|
||||
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
|
||||
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "TAG_MAP", "MORPH_RULES", "LEMMA_RULES",
|
||||
"TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
|
||||
|
|
68
spacy/bn/lemma_rules.py
Normal file
68
spacy/bn/lemma_rules.py
Normal file
|
@ -0,0 +1,68 @@
|
|||
# encoding: utf8
from __future__ import unicode_literals


# Suffix-stripping lemmatization rules for Bengali.
# Source: উচ্চতর বাংলা ব্যাকরণ ও রচনা - অধ্যাপক নিরঞ্জন অধিকারী ও অধ্যাপক ড. সফিউদ্দিন আহমদ

# Noun suffixes stripped to recover the lemma. Order is preserved from the
# source grammar because the lemmatizer tries rules in sequence; the blank-line
# grouping below mirrors the original grouping (grouping semantics per the
# source grammar — presumably classifier, case, possessive/plural, and
# collective endings).
_NOUN_SUFFIXES = [
    "টা", "টি", "খান", "খানা", "খানি", "গাছা", "গাছি", "ছড়া",

    "কে", "ে", "তে",

    # "ের" = এর, "েরা" = এরা
    "র", "রা", "রে", "ের", "েরা", "দের", "দেরকে", "গুলা", "গুলো", "গুলি",

    # "াবলি" = আবলি
    "কুল", "গণ", "দল", "পাল", "পুঞ্জ", "মণ্ডলী", "মালা", "রাজি", "বৃন্দ",
    "বর্গ", "শ্রেণী", "শ্রেনি", "রাশি", "সকল", "মহল", "াবলি",
]

# Bengali digits normalized to their ASCII equivalents.
_DIGITS = [
    ("০", "0"), ("১", "1"), ("২", "2"), ("৩", "3"), ("৪", "4"),
    ("৫", "5"), ("৬", "6"), ("৭", "7"), ("৮", "8"), ("৯", "9"),
]

LEMMA_RULES = {
    "noun": (
        [[suffix, ""] for suffix in _NOUN_SUFFIXES]
        + [[bengali, ascii_digit] for bengali, ascii_digit in _DIGITS]
    ),
    # Curly quotation marks normalized to their ASCII equivalents.
    "punct": [
        ["“", "\""],
        ["”", "\""],
        ["‘", "'"],
        ["’", "'"],
    ],
}
|
55
spacy/bn/morph_rules.py
Normal file
55
spacy/bn/morph_rules.py
Normal file
|
@ -0,0 +1,55 @@
|
|||
# encoding: utf8
from __future__ import unicode_literals

from ..language_data import PRON_LEMMA
from ..symbols import *


# Morphological features for Bengali pronouns, keyed by fine-grained tag.
# Feature names and values (Number, Person, PronType, Case, Poss, Reflex)
# follow the Universal Dependencies conventions used elsewhere in the
# project's language data.
MORPH_RULES = {
    # Personal, demonstrative, interrogative, relative and reflexive pronouns.
    "PRP": {
        'ঐ': {LEMMA: PRON_LEMMA, 'PronType': 'Dem'},
        'আমাকে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Case': 'Acc'},
        'কি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'},
        'সে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Nom'},
        'কিসে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'},
        'কাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'},
        'তাকে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Acc'},
        'স্বয়ং': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'},
        'কোনগুলো': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'},
        'তুমি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'},
        'তাদেরকে': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Acc'},
        # FIX: 'Person' was 'One ' (trailing space), inconsistent with every
        # other entry and never matching the feature value 'One'.
        'আমরা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One', 'PronType': 'Prs', 'Case': 'Nom'},
        'যিনি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Rel', 'Case': 'Nom'},
        'আমাদেরকে': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One', 'PronType': 'Prs', 'Case': 'Acc'},
        'কোন': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Int', 'Case': 'Acc'},
        'কারা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'},
        'তোমাকে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'},
        'খোদ': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'},
        'কে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Int', 'Case': 'Acc'},
        'যারা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Rel', 'Case': 'Nom'},
        'যে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Rel', 'Case': 'Nom'},
        'তোমরা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'},
        'তোমাদেরকে': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'},
        'আপন': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'},
        'এ': {LEMMA: PRON_LEMMA, 'PronType': 'Dem'},
        'নিজ': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'},
        'কার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Int', 'Case': 'Acc'},
        'যা': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Gender': 'Neut', 'PronType': 'Rel', 'Case': 'Nom'},
        'তারা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Nom'},
        'আমি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Case': 'Nom'}
    },
    # Possessive pronouns.
    # NOTE(review): these are tagged Case=Nom; UD would usually suggest Gen
    # for possessives — confirm this is intentional.
    "PRP$": {
        'আমার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes',
                 'Case': 'Nom'},
        'তার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Poss': 'Yes',
                'Case': 'Nom'},
        'তোমাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes',
                    'Case': 'Nom'},
        'আমাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes',
                   'Case': 'Nom'},
        'তোমার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes',
                  'Case': 'Nom'},
        'তাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Poss': 'Yes',
                  'Case': 'Nom'},
    },
}
|
58
spacy/bn/tag_map.py
Normal file
58
spacy/bn/tag_map.py
Normal file
|
@ -0,0 +1,58 @@
|
|||
# encoding: utf8
from __future__ import unicode_literals

from ..symbols import *


# Mapping from fine-grained (PTB-style) part-of-speech tags to coarse
# Universal POS tags plus morphological features, extended with the
# Bengali currency sign "৳". Key order is preserved from the original.
TAG_MAP = {
    # Punctuation and symbols.
    ".": {POS: PUNCT, "PunctType": "peri"},
    ",": {POS: PUNCT, "PunctType": "comm"},
    "-LRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "ini"},
    "-RRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "fin"},
    "``": {POS: PUNCT, "PunctType": "quot", "PunctSide": "ini"},
    "\"\"": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
    "''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
    ":": {POS: PUNCT},
    "৳": {POS: SYM, "Other": {"SymType": "currency"}},  # Bengali taka sign
    "#": {POS: SYM, "Other": {"SymType": "numbersign"}},
    # Open-class and closed-class word tags.
    "AFX": {POS: ADJ, "Hyph": "yes"},
    "CC": {POS: CONJ, "ConjType": "coor"},
    "CD": {POS: NUM, "NumType": "card"},
    "DT": {POS: DET},
    "EX": {POS: ADV, "AdvType": "ex"},
    "FW": {POS: X, "Foreign": "yes"},
    "HYPH": {POS: PUNCT, "PunctType": "dash"},
    "IN": {POS: ADP},
    "JJ": {POS: ADJ, "Degree": "pos"},
    "JJR": {POS: ADJ, "Degree": "comp"},
    "JJS": {POS: ADJ, "Degree": "sup"},
    "LS": {POS: PUNCT, "NumType": "ord"},
    "MD": {POS: VERB, "VerbType": "mod"},
    # NOTE(review): empty-string POS value — confirm this is the intended
    # placeholder for untagged tokens.
    "NIL": {POS: ""},
    "NN": {POS: NOUN, "Number": "sing"},
    "NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"},
    "NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"},
    "NNS": {POS: NOUN, "Number": "plur"},
    "PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"},
    "POS": {POS: PART, "Poss": "yes"},
    "PRP": {POS: PRON, "PronType": "prs"},
    "PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"},
    "RB": {POS: ADV, "Degree": "pos"},
    "RBR": {POS: ADV, "Degree": "comp"},
    "RBS": {POS: ADV, "Degree": "sup"},
    "RP": {POS: PART},
    "SYM": {POS: SYM},
    "TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"},
    "UH": {POS: INTJ},
    # Verb tags.
    "VB": {POS: VERB, "VerbForm": "inf"},
    "VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"},
    "VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"},
    "VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"},
    "VBP": {POS: VERB, "VerbForm": "fin", "Tense": "pres"},
    # NOTE(review): Person is an int here while other feature values are
    # strings — confirm downstream consumers accept this.
    "VBZ": {POS: VERB, "VerbForm": "fin", "Tense": "pres", "Number": "sing", "Person": 3},
    # Wh-word tags.
    "WDT": {POS: ADJ, "PronType": "int|rel"},
    "WP": {POS: NOUN, "PronType": "int|rel"},
    "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
    "WRB": {POS: ADV, "PronType": "int|rel"},
    "SP": {POS: SPACE},
}
|
36
spacy/bn/tokenizer_exceptions.py
Normal file
36
spacy/bn/tokenizer_exceptions.py
Normal file
|
@ -0,0 +1,36 @@
|
|||
# coding=utf-8
from __future__ import unicode_literals

from ..symbols import *


# Tokenizer exceptions for Bengali. Each key is a surface string that the
# punctuation rules would otherwise split; the value lists the attribute
# dict(s) of the token(s) it should produce instead, so the ORTH values
# must concatenate back to the key.
TOKENIZER_EXCEPTIONS = {}

# Common Bengali abbreviations (honorifics and measurement units).
ABBREVIATIONS = {
    "ডঃ": [
        {ORTH: "ডঃ", LEMMA: "ডক্টর"},
    ],
    "ডাঃ": [
        {ORTH: "ডাঃ", LEMMA: "ডাক্তার"},
    ],
    "ড.": [
        {ORTH: "ড.", LEMMA: "ডক্টর"},
    ],
    "ডা.": [
        {ORTH: "ডা.", LEMMA: "ডাক্তার"},
    ],
    "মোঃ": [
        {ORTH: "মোঃ", LEMMA: "মোহাম্মদ"},
    ],
    "মো.": [
        {ORTH: "মো.", LEMMA: "মোহাম্মদ"},
    ],
    "সে.": [
        {ORTH: "সে.", LEMMA: "সেলসিয়াস"},
    ],
    # FIX: both the dotted and undotted variants were previously listed
    # under the single key "কি.মি", which would have split "কি.মি" into two
    # tokens (the ORTHs must concatenate to the key) and left "কি.মি."
    # unmatched. Each surface form gets its own key.
    "কি.মি": [
        {ORTH: "কি.মি", LEMMA: "কিলোমিটার"},
    ],
    "কি.মি.": [
        {ORTH: "কি.মি.", LEMMA: "কিলোমিটার"},
    ],
}

TOKENIZER_EXCEPTIONS.update(ABBREVIATIONS)
|
Loading…
Reference in New Issue
Block a user