mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
Reorganise Bengali language data
This commit is contained in:
parent
607ba458e7
commit
7b3a983f96
|
@ -1,10 +1,16 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals, print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
from .tag_map import TAG_MAP
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lemmatizer import LEMMA_RULES
|
||||
|
||||
from ..language_data import BASE_EXCEPTIONS
|
||||
from ..language import Language
|
||||
from ..attrs import LANG
|
||||
|
||||
from .language_data import *
|
||||
from ..util import update_exc
|
||||
|
||||
|
||||
class Bengali(Language):
|
||||
|
@ -14,7 +20,7 @@ class Bengali(Language):
|
|||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'bn'
|
||||
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tag_map = TAG_MAP
|
||||
stop_words = STOP_WORDS
|
||||
lemma_rules = LEMMA_RULES
|
||||
|
@ -23,4 +29,5 @@ class Bengali(Language):
|
|||
suffixes = tuple(TOKENIZER_SUFFIXES)
|
||||
infixes = tuple(TOKENIZER_INFIXES)
|
||||
|
||||
EXPORT = Bengali
|
||||
|
||||
__all__ = ['Bengali']
|
||||
|
|
|
@ -1,8 +1,9 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..language_data import PRON_LEMMA
|
||||
from ..symbols import *
|
||||
from ..symbols import LEMMA
|
||||
from ..deprecated import PRON_LEMMA
|
||||
|
||||
|
||||
MORPH_RULES = {
|
||||
"PRP": {
|
||||
|
@ -51,5 +52,5 @@ MORPH_RULES = {
|
|||
'Case': 'Nom'},
|
||||
'তাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Poss': 'Yes',
|
||||
'Case': 'Nom'},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..language_data.punctuation import ALPHA_LOWER, LIST_ELLIPSES, QUOTES, ALPHA_UPPER, LIST_QUOTES, UNITS, \
|
||||
CURRENCY, LIST_PUNCT, ALPHA, _QUOTES
|
||||
from ..language_data.punctuation import ALPHA_LOWER, LIST_ELLIPSES, QUOTES
|
||||
from ..language_data.punctuation import ALPHA_UPPER, LIST_QUOTES, UNITS
|
||||
from ..language_data.punctuation import CURRENCY, LIST_PUNCT, ALPHA, _QUOTES
|
||||
|
||||
|
||||
CURRENCY_SYMBOLS = r"\$ ¢ £ € ¥ ฿ ৳"
|
||||
|
||||
|
@ -42,4 +44,3 @@ TOKENIZER_INFIXES = (
|
|||
r'(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])'.format(a=ALPHA, q=_QUOTES.replace("'", "").strip().replace(" ", "")),
|
||||
]
|
||||
)
|
||||
__all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..symbols import *
|
||||
from ..symbols import POS, PUNCT, ADJ, CONJ, SCONJ, NUM, DET, ADV, ADP, X, VERB
|
||||
from ..symbols import CCONJ, NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, SYM
|
||||
|
||||
|
||||
TAG_MAP = {
|
||||
|
@ -55,4 +56,22 @@ TAG_MAP = {
|
|||
"WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
|
||||
"WRB": {POS: ADV, "PronType": "int|rel"},
|
||||
"SP": {POS: SPACE},
|
||||
"ADV": {POS: ADV},
|
||||
"NOUN": {POS: NOUN},
|
||||
"ADP": {POS: ADP},
|
||||
"PRON": {POS: PRON},
|
||||
"SCONJ": {POS: SCONJ},
|
||||
"PROPN": {POS: PROPN},
|
||||
"DET": {POS: DET},
|
||||
"SYM": {POS: SYM},
|
||||
"INTJ": {POS: INTJ},
|
||||
"PUNCT": {POS: PUNCT},
|
||||
"NUM": {POS: NUM},
|
||||
"AUX": {POS: AUX},
|
||||
"X": {POS: X},
|
||||
"CONJ": {POS: CONJ},
|
||||
"CCONJ": {POS: CCONJ},
|
||||
"ADJ": {POS: ADJ},
|
||||
"VERB": {POS: VERB},
|
||||
"PART": {POS: PART},
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user