mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Reorganise Bengali language data
This commit is contained in:
parent
607ba458e7
commit
7b3a983f96
|
@ -1,10 +1,16 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||||
|
from .tag_map import TAG_MAP
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
|
from .lemmatizer import LEMMA_RULES
|
||||||
|
|
||||||
|
from ..language_data import BASE_EXCEPTIONS
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..attrs import LANG
|
from ..attrs import LANG
|
||||||
|
from ..util import update_exc
|
||||||
from .language_data import *
|
|
||||||
|
|
||||||
|
|
||||||
class Bengali(Language):
|
class Bengali(Language):
|
||||||
|
@ -14,7 +20,7 @@ class Bengali(Language):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
lex_attr_getters[LANG] = lambda text: 'bn'
|
lex_attr_getters[LANG] = lambda text: 'bn'
|
||||||
|
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
tag_map = TAG_MAP
|
tag_map = TAG_MAP
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
lemma_rules = LEMMA_RULES
|
lemma_rules = LEMMA_RULES
|
||||||
|
@ -23,4 +29,5 @@ class Bengali(Language):
|
||||||
suffixes = tuple(TOKENIZER_SUFFIXES)
|
suffixes = tuple(TOKENIZER_SUFFIXES)
|
||||||
infixes = tuple(TOKENIZER_INFIXES)
|
infixes = tuple(TOKENIZER_INFIXES)
|
||||||
|
|
||||||
EXPORT = Bengali
|
|
||||||
|
__all__ = ['Bengali']
|
||||||
|
|
|
@ -1,8 +1,9 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ..language_data import PRON_LEMMA
|
from ..symbols import LEMMA
|
||||||
from ..symbols import *
|
from ..deprecated import PRON_LEMMA
|
||||||
|
|
||||||
|
|
||||||
MORPH_RULES = {
|
MORPH_RULES = {
|
||||||
"PRP": {
|
"PRP": {
|
||||||
|
@ -51,5 +52,5 @@ MORPH_RULES = {
|
||||||
'Case': 'Nom'},
|
'Case': 'Nom'},
|
||||||
'তাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Poss': 'Yes',
|
'তাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Poss': 'Yes',
|
||||||
'Case': 'Nom'},
|
'Case': 'Nom'},
|
||||||
},
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,8 +1,10 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ..language_data.punctuation import ALPHA_LOWER, LIST_ELLIPSES, QUOTES, ALPHA_UPPER, LIST_QUOTES, UNITS, \
|
from ..language_data.punctuation import ALPHA_LOWER, LIST_ELLIPSES, QUOTES
|
||||||
CURRENCY, LIST_PUNCT, ALPHA, _QUOTES
|
from ..language_data.punctuation import ALPHA_UPPER, LIST_QUOTES, UNITS
|
||||||
|
from ..language_data.punctuation import CURRENCY, LIST_PUNCT, ALPHA, _QUOTES
|
||||||
|
|
||||||
|
|
||||||
CURRENCY_SYMBOLS = r"\$ ¢ £ € ¥ ฿ ৳"
|
CURRENCY_SYMBOLS = r"\$ ¢ £ € ¥ ฿ ৳"
|
||||||
|
|
||||||
|
@ -42,4 +44,3 @@ TOKENIZER_INFIXES = (
|
||||||
r'(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])'.format(a=ALPHA, q=_QUOTES.replace("'", "").strip().replace(" ", "")),
|
r'(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])'.format(a=ALPHA, q=_QUOTES.replace("'", "").strip().replace(" ", "")),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
__all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
|
|
||||||
|
|
|
@ -1,7 +1,8 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ..symbols import *
|
from ..symbols import POS, PUNCT, ADJ, CONJ, SCONJ, NUM, DET, ADV, ADP, X, VERB
|
||||||
|
from ..symbols import CCONJ, NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, SYM
|
||||||
|
|
||||||
|
|
||||||
TAG_MAP = {
|
TAG_MAP = {
|
||||||
|
@ -55,4 +56,22 @@ TAG_MAP = {
|
||||||
"WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
|
"WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
|
||||||
"WRB": {POS: ADV, "PronType": "int|rel"},
|
"WRB": {POS: ADV, "PronType": "int|rel"},
|
||||||
"SP": {POS: SPACE},
|
"SP": {POS: SPACE},
|
||||||
|
"ADV": {POS: ADV},
|
||||||
|
"NOUN": {POS: NOUN},
|
||||||
|
"ADP": {POS: ADP},
|
||||||
|
"PRON": {POS: PRON},
|
||||||
|
"SCONJ": {POS: SCONJ},
|
||||||
|
"PROPN": {POS: PROPN},
|
||||||
|
"DET": {POS: DET},
|
||||||
|
"SYM": {POS: SYM},
|
||||||
|
"INTJ": {POS: INTJ},
|
||||||
|
"PUNCT": {POS: PUNCT},
|
||||||
|
"NUM": {POS: NUM},
|
||||||
|
"AUX": {POS: AUX},
|
||||||
|
"X": {POS: X},
|
||||||
|
"CONJ": {POS: CONJ},
|
||||||
|
"CCONJ": {POS: CCONJ},
|
||||||
|
"ADJ": {POS: ADJ},
|
||||||
|
"VERB": {POS: VERB},
|
||||||
|
"PART": {POS: PART},
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user