diff --git a/spacy/bn/__init__.py b/spacy/bn/__init__.py
index 2d3a5d404..198d4d8c6 100644
--- a/spacy/bn/__init__.py
+++ b/spacy/bn/__init__.py
@@ -15,7 +15,9 @@ class Bengali(Language):
         lex_attr_getters[LANG] = lambda text: 'bn'
 
         tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+        tag_map = TAG_MAP
         stop_words = STOP_WORDS
+        lemma_rules = LEMMA_RULES
 
         prefixes = tuple(TOKENIZER_PREFIXES)
         suffixes = tuple(TOKENIZER_SUFFIXES)
diff --git a/spacy/bn/language_data.py b/spacy/bn/language_data.py
index d7da4ec4b..1925f9029 100644
--- a/spacy/bn/language_data.py
+++ b/spacy/bn/language_data.py
@@ -4,15 +4,25 @@ from __future__ import unicode_literals
 from spacy.language_data import strings_to_exc, update_exc
 from .punctuation import *
 from .stop_words import STOP_WORDS
+from .tag_map import TAG_MAP as TAG_MAP_BN
+from .morph_rules import MORPH_RULES
+from .lemma_rules import LEMMA_RULES
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS as TOKENIZER_EXCEPTIONS_BN
 from .. import language_data as base
 
 STOP_WORDS = set(STOP_WORDS)
 
+# Copy the shared base map first: updating it in place would leak Bengali tags into other languages.
+TAG_MAP = dict(base.TAG_MAP)
+TAG_MAP.update(TAG_MAP_BN)
+
 TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
+TOKENIZER_EXCEPTIONS.update(TOKENIZER_EXCEPTIONS_BN)
 
 TOKENIZER_PREFIXES = TOKENIZER_PREFIXES
 TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES
 TOKENIZER_INFIXES = TOKENIZER_INFIXES
 
-__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
+__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "TAG_MAP", "MORPH_RULES", "LEMMA_RULES",
+           "TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
diff --git a/spacy/bn/lemma_rules.py b/spacy/bn/lemma_rules.py
new file mode 100644
index 000000000..59db5d052
--- /dev/null
+++ b/spacy/bn/lemma_rules.py
@@ -0,0 +1,68 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+# Source: উচ্চতর বাংলা ব্যাকরণ ও রচনা - অধ্যাপক নিরঞ্জন অধিকারী ও অধ্যাপক ড. সফিউদ্দিন আহমদ
+
+LEMMA_RULES = {
+    "noun": [
+        ["টা", ""],
+        ["টি", ""],
+        ["খান", ""],
+        ["খানা", ""],
+        ["খানি", ""],
+        ["গাছা", ""],
+        ["গাছি", ""],
+        ["ছড়া", ""],
+
+        ["কে", ""],
+        ["ে", ""],
+        ["তে", ""],
+
+        ["র", ""],
+        ["রা", ""],
+        ["রে", ""],
+        ["ের", ""],  # এর
+        ["েরা", ""],  # এরা
+        ["দের", ""],
+        ["দেরকে", ""],
+        ["গুলা", ""],
+        ["গুলো", ""],
+        ["গুলি", ""],
+
+        ["কুল", ""],
+        ["গণ", ""],
+        ["দল", ""],
+        ["পাল", ""],
+        ["পুঞ্জ", ""],
+        ["মণ্ডলী", ""],
+        ["মালা", ""],
+        ["রাজি", ""],
+        ["বৃন্দ", ""],
+        ["বর্গ", ""],
+        ["শ্রেণী", ""],
+        ["শ্রেনি", ""],
+        ["রাশি", ""],
+        ["সকল", ""],
+        ["মহল", ""],
+        ["াবলি", ""],  # আবলি
+
+        # Bengali digit representations
+        ["০", "0"],
+        ["১", "1"],
+        ["২", "2"],
+        ["৩", "3"],
+        ["৪", "4"],
+        ["৫", "5"],
+        ["৬", "6"],
+        ["৭", "7"],
+        ["৮", "8"],
+        ["৯", "9"],
+    ],
+
+    "punct": [
+        ["“", "\""],
+        ["”", "\""],
+        ["\u2018", "'"],
+        ["\u2019", "'"]
+    ]
+}
diff --git a/spacy/bn/morph_rules.py b/spacy/bn/morph_rules.py
new file mode 100644
index 000000000..3a0cebcbc
--- /dev/null
+++ b/spacy/bn/morph_rules.py
@@ -0,0 +1,55 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+from ..language_data import PRON_LEMMA
+from ..symbols import *
+
+MORPH_RULES = {
+    "PRP": {
+        'ঐ': {LEMMA: PRON_LEMMA, 'PronType': 'Dem'},
+        'আমাকে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Case': 'Acc'},
+        'কি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'},
+        'সে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Nom'},
+        'কিসে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'},
+        'কাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'},
+        'তাকে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Acc'},
+        'স্বয়ং': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'},
+        'কোনগুলো': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'},
+        'তুমি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'},
+        'তাদেরকে': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Acc'},
+        'আমরা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One', 'PronType': 'Prs', 'Case': 'Nom'},
+        'যিনি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Rel', 'Case': 'Nom'},
+        'আমাদেরকে': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One', 'PronType': 'Prs', 'Case': 'Acc'},
+        'কোন': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Int', 'Case': 'Acc'},
+        'কারা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'},
+        'তোমাকে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'},
+        'খোদ': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'},
+        'কে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Int', 'Case': 'Acc'},
+        'যারা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Rel', 'Case': 'Nom'},
+        'যে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Rel', 'Case': 'Nom'},
+        'তোমরা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'},
+        'তোমাদেরকে': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'},
+        'আপন': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'},
+        'এ': {LEMMA: PRON_LEMMA, 'PronType': 'Dem'},
+        'নিজ': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'},
+        'কার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Int', 'Case': 'Acc'},
+        'যা': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Gender': 'Neut', 'PronType': 'Rel', 'Case': 'Nom'},
+        'তারা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Nom'},
+        'আমি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Case': 'Nom'}
+    },
+    "PRP$": {
+
+        'আমার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes',
+                 'Case': 'Nom'},
+        'তার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Poss': 'Yes',
+                'Case': 'Nom'},
+        'তোমাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes',
+                    'Case': 'Nom'},
+        'আমাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes',
+                   'Case': 'Nom'},
+        'তোমার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes',
+                  'Case': 'Nom'},
+        'তাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Poss': 'Yes',
+                  'Case': 'Nom'},
+    },
+}
diff --git a/spacy/bn/tag_map.py b/spacy/bn/tag_map.py
new file mode 100644
index 000000000..3dcd9bdcf
--- /dev/null
+++ b/spacy/bn/tag_map.py
@@ -0,0 +1,58 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+from ..symbols import *
+
+
+TAG_MAP = {
+    ".": {POS: PUNCT, "PunctType": "peri"},
+    ",": {POS: PUNCT, "PunctType": "comm"},
+    "-LRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "ini"},
+    "-RRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "fin"},
+    "``": {POS: PUNCT, "PunctType": "quot", "PunctSide": "ini"},
+    "\"\"": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
+    "''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
+    ":": {POS: PUNCT},
+    "৳": {POS: SYM, "Other": {"SymType": "currency"}},
+    "#": {POS: SYM, "Other": {"SymType": "numbersign"}},
+    "AFX": {POS: ADJ, "Hyph": "yes"},
+    "CC": {POS: CONJ, "ConjType": "coor"},
+    "CD": {POS: NUM, "NumType": "card"},
+    "DT": {POS: DET},
+    "EX": {POS: ADV, "AdvType": "ex"},
+    "FW": {POS: X, "Foreign": "yes"},
+    "HYPH": {POS: PUNCT, "PunctType": "dash"},
+    "IN": {POS: ADP},
+    "JJ": {POS: ADJ, "Degree": "pos"},
+    "JJR": {POS: ADJ, "Degree": "comp"},
+    "JJS": {POS: ADJ, "Degree": "sup"},
+    "LS": {POS: PUNCT, "NumType": "ord"},
+    "MD": {POS: VERB, "VerbType": "mod"},
+    "NIL": {POS: ""},
+    "NN": {POS: NOUN, "Number": "sing"},
+    "NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"},
+    "NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"},
+    "NNS": {POS: NOUN, "Number": "plur"},
+    "PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"},
+    "POS": {POS: PART, "Poss": "yes"},
+    "PRP": {POS: PRON, "PronType": "prs"},
+    "PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"},
+    "RB": {POS: ADV, "Degree": "pos"},
+    "RBR": {POS: ADV, "Degree": "comp"},
+    "RBS": {POS: ADV, "Degree": "sup"},
+    "RP": {POS: PART},
+    "SYM": {POS: SYM},
+    "TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"},
+    "UH": {POS: INTJ},
+    "VB": {POS: VERB, "VerbForm": "inf"},
+    "VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"},
+    "VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"},
+    "VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"},
+    "VBP": {POS: VERB, "VerbForm": "fin", "Tense": "pres"},
+    "VBZ": {POS: VERB, "VerbForm": "fin", "Tense": "pres", "Number": "sing", "Person": 3},
+    "WDT": {POS: ADJ, "PronType": "int|rel"},
+    "WP": {POS: NOUN, "PronType": "int|rel"},
+    "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
+    "WRB": {POS: ADV, "PronType": "int|rel"},
+    "SP": {POS: SPACE},
+}
diff --git a/spacy/bn/tokenizer_exceptions.py b/spacy/bn/tokenizer_exceptions.py
new file mode 100644
index 000000000..7722c9dcc
--- /dev/null
+++ b/spacy/bn/tokenizer_exceptions.py
@@ -0,0 +1,40 @@
+# coding=utf-8
+from __future__ import unicode_literals
+
+from ..symbols import *
+
+TOKENIZER_EXCEPTIONS = {}
+
+# Each key is split into the tokens listed for it, so the ORTH values of an entry must
+# concatenate back to exactly the key; every spelling variant therefore gets its own entry.
+ABBREVIATIONS = {
+    "ডঃ": [
+        {ORTH: "ডঃ", LEMMA: "ডক্টর"},
+    ],
+    "ডাঃ": [
+        {ORTH: "ডাঃ", LEMMA: "ডাক্তার"},
+    ],
+    "ড.": [
+        {ORTH: "ড.", LEMMA: "ডক্টর"},
+    ],
+    "ডা.": [
+        {ORTH: "ডা.", LEMMA: "ডাক্তার"},
+    ],
+    "মোঃ": [
+        {ORTH: "মোঃ", LEMMA: "মোহাম্মদ"},
+    ],
+    "মো.": [
+        {ORTH: "মো.", LEMMA: "মোহাম্মদ"},
+    ],
+    "সে.": [
+        {ORTH: "সে.", LEMMA: "সেলসিয়াস"},
+    ],
+    "কি.মি": [
+        {ORTH: "কি.মি", LEMMA: "কিলোমিটার"},
+    ],
+    "কি.মি.": [
+        {ORTH: "কি.মি.", LEMMA: "কিলোমিটার"},
+    ],
+}
+
+TOKENIZER_EXCEPTIONS.update(ABBREVIATIONS)