Reorganise Bengali language data

This commit is contained in:
ines 2017-05-08 15:43:50 +02:00
parent 607ba458e7
commit 7b3a983f96
5 changed files with 40 additions and 12 deletions

View File

@ -1,10 +1,16 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals, print_function from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .lemmatizer import LEMMA_RULES
from ..language_data import BASE_EXCEPTIONS
from ..language import Language from ..language import Language
from ..attrs import LANG from ..attrs import LANG
from ..util import update_exc
from .language_data import *
class Bengali(Language): class Bengali(Language):
@ -14,7 +20,7 @@ class Bengali(Language):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'bn' lex_attr_getters[LANG] = lambda text: 'bn'
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = TAG_MAP tag_map = TAG_MAP
stop_words = STOP_WORDS stop_words = STOP_WORDS
lemma_rules = LEMMA_RULES lemma_rules = LEMMA_RULES
@ -23,4 +29,5 @@ class Bengali(Language):
suffixes = tuple(TOKENIZER_SUFFIXES) suffixes = tuple(TOKENIZER_SUFFIXES)
infixes = tuple(TOKENIZER_INFIXES) infixes = tuple(TOKENIZER_INFIXES)
EXPORT = Bengali
__all__ = ['Bengali']

View File

@ -1,8 +1,9 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from ..language_data import PRON_LEMMA from ..symbols import LEMMA
from ..symbols import * from ..deprecated import PRON_LEMMA
MORPH_RULES = { MORPH_RULES = {
"PRP": { "PRP": {
@ -51,5 +52,5 @@ MORPH_RULES = {
'Case': 'Nom'}, 'Case': 'Nom'},
'তাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Poss': 'Yes', 'তাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Poss': 'Yes',
'Case': 'Nom'}, 'Case': 'Nom'},
}, }
} }

View File

@ -1,8 +1,10 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from ..language_data.punctuation import ALPHA_LOWER, LIST_ELLIPSES, QUOTES, ALPHA_UPPER, LIST_QUOTES, UNITS, \ from ..language_data.punctuation import ALPHA_LOWER, LIST_ELLIPSES, QUOTES
CURRENCY, LIST_PUNCT, ALPHA, _QUOTES from ..language_data.punctuation import ALPHA_UPPER, LIST_QUOTES, UNITS
from ..language_data.punctuation import CURRENCY, LIST_PUNCT, ALPHA, _QUOTES
CURRENCY_SYMBOLS = r"\$ ¢ £ € ¥ ฿ ৳" CURRENCY_SYMBOLS = r"\$ ¢ £ € ¥ ฿ ৳"
@ -42,4 +44,3 @@ TOKENIZER_INFIXES = (
r'(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])'.format(a=ALPHA, q=_QUOTES.replace("'", "").strip().replace(" ", "")), r'(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])'.format(a=ALPHA, q=_QUOTES.replace("'", "").strip().replace(" ", "")),
] ]
) )
__all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]

View File

@ -1,7 +1,8 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from ..symbols import * from ..symbols import POS, PUNCT, ADJ, CONJ, SCONJ, NUM, DET, ADV, ADP, X, VERB
from ..symbols import CCONJ, NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, SYM
TAG_MAP = { TAG_MAP = {
@ -55,4 +56,22 @@ TAG_MAP = {
"WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"}, "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
"WRB": {POS: ADV, "PronType": "int|rel"}, "WRB": {POS: ADV, "PronType": "int|rel"},
"SP": {POS: SPACE}, "SP": {POS: SPACE},
"ADV": {POS: ADV},
"NOUN": {POS: NOUN},
"ADP": {POS: ADP},
"PRON": {POS: PRON},
"SCONJ": {POS: SCONJ},
"PROPN": {POS: PROPN},
"DET": {POS: DET},
"SYM": {POS: SYM},
"INTJ": {POS: INTJ},
"PUNCT": {POS: PUNCT},
"NUM": {POS: NUM},
"AUX": {POS: AUX},
"X": {POS: X},
"CONJ": {POS: CONJ},
"CCONJ": {POS: CCONJ},
"ADJ": {POS: ADJ},
"VERB": {POS: VERB},
"PART": {POS: PART},
} }