Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-30 23:47:31 +03:00)
	Reorganise Bengali language data
commit 7b3a983f96
parent 607ba458e7
spacy/bn/__init__.py
@@ -1,10 +1,16 @@
 # coding: utf8
-from __future__ import unicode_literals, print_function
+from __future__ import unicode_literals
 
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from .tag_map import TAG_MAP
+from .stop_words import STOP_WORDS
+from .lemmatizer import LEMMA_RULES
+
+from ..language_data import BASE_EXCEPTIONS
 from ..language import Language
 from ..attrs import LANG
-
-from .language_data import *
+from ..util import update_exc
 
 
 class Bengali(Language):
@@ -14,7 +20,7 @@ class Bengali(Language):
         lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
         lex_attr_getters[LANG] = lambda text: 'bn'
 
-        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
         tag_map = TAG_MAP
         stop_words = STOP_WORDS
         lemma_rules = LEMMA_RULES
@@ -23,4 +29,5 @@ class Bengali(Language):
         suffixes = tuple(TOKENIZER_SUFFIXES)
         infixes = tuple(TOKENIZER_INFIXES)
 
-EXPORT = Bengali
+
+__all__ = ['Bengali']
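Note: the reorganised __init__.py no longer relies on `from .language_data import *`; each resource is imported explicitly, and the tokenizer exceptions are now built by layering the Bengali-specific entries over the shared BASE_EXCEPTIONS with update_exc. A minimal sketch of that layering, assuming update_exc behaves like a copy-and-override dict merge (the real helper in ..util may also validate entries):

# Hedged sketch: the effect of update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS),
# assuming it is essentially a copy-and-override merge of exception dicts.
def merge_exceptions(base, overrides):
    exc = dict(base)       # shared exceptions (emoticons, common abbreviations, ...)
    exc.update(overrides)  # Bengali-specific entries win on conflicting strings
    return exc

# tokenizer_exceptions = merge_exceptions(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)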
spacy/bn/morph_rules.py
@@ -1,8 +1,9 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..language_data import PRON_LEMMA
-from ..symbols import *
+from ..symbols import LEMMA
+from ..deprecated import PRON_LEMMA
 
+
 MORPH_RULES = {
     "PRP":  {
@@ -51,5 +52,5 @@ MORPH_RULES = {
                     'Case': 'Nom'},
         'তাদের':   {LEMMA:  PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Poss': 'Yes',
                     'Case': 'Nom'},
-    },
+    }
 }
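Note: this file only changes its imports (an explicit LEMMA from ..symbols instead of a wildcard, PRON_LEMMA now from ..deprecated) and drops a trailing comma; the rule data is untouched. For reference, MORPH_RULES nests fine-grained tag -> surface form -> lemma plus morphological features, as in this entry copied from the hunk above (LEMMA and PRON_LEMMA are spaCy constants; plain strings stand in for them here):

# Structure of a MORPH_RULES entry, using the 'তাদের' line from the diff.
LEMMA, PRON_LEMMA = "lemma", "-PRON-"   # stand-ins for the spaCy constants
example_rules = {
    "PRP": {
        'তাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three',
                   'PronType': 'Prs', 'Poss': 'Yes', 'Case': 'Nom'},
    },
}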
spacy/bn/punctuation.py
@@ -1,8 +1,10 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..language_data.punctuation import ALPHA_LOWER, LIST_ELLIPSES, QUOTES, ALPHA_UPPER, LIST_QUOTES, UNITS, \
-    CURRENCY, LIST_PUNCT, ALPHA, _QUOTES
+from ..language_data.punctuation import ALPHA_LOWER, LIST_ELLIPSES, QUOTES
+from ..language_data.punctuation import ALPHA_UPPER, LIST_QUOTES, UNITS
+from ..language_data.punctuation import CURRENCY, LIST_PUNCT, ALPHA, _QUOTES
 
+
 CURRENCY_SYMBOLS = r"\$ ¢ £ € ¥ ฿ ৳"
 
@@ -42,4 +44,3 @@ TOKENIZER_INFIXES = (
         r'(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])'.format(a=ALPHA, q=_QUOTES.replace("'", "").strip().replace(" ", "")),
     ]
 )
-__all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
spacy/bn/tag_map.py
@@ -1,7 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..symbols import *
+from ..symbols import POS, PUNCT, ADJ, CONJ, SCONJ, NUM, DET, ADV, ADP, X, VERB
+from ..symbols import CCONJ, NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, SYM
 
 
 TAG_MAP = {
@@ -55,4 +56,22 @@ TAG_MAP = {
     "WP$":      {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
     "WRB":      {POS: ADV, "PronType": "int|rel"},
     "SP":       {POS: SPACE},
+    "ADV":      {POS: ADV},
+    "NOUN":     {POS: NOUN},
+    "ADP":      {POS: ADP},
+    "PRON":     {POS: PRON},
+    "SCONJ":    {POS: SCONJ},
+    "PROPN":    {POS: PROPN},
+    "DET":      {POS: DET},
+    "SYM":      {POS: SYM},
+    "INTJ":     {POS: INTJ},
+    "PUNCT":    {POS: PUNCT},
+    "NUM":      {POS: NUM},
+    "AUX":      {POS: AUX},
+    "X":        {POS: X},
+    "CONJ":     {POS: CONJ},
+    "CCONJ":    {POS: CCONJ},
+    "ADJ":      {POS: ADJ},
+    "VERB":     {POS: VERB},
+    "PART":     {POS: PART},
 }
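Note: the added entries give the universal part-of-speech labels themselves a place in TAG_MAP, so data tagged directly with coarse UD tags resolves through the same table as the treebank-style tags. A small sketch of the lookup this supports, with plain strings standing in for the ..symbols constants:

# Hedged sketch: POS and ADV are spaCy symbol constants; strings stand in here.
POS, ADV = "POS", "ADV"
TAG_MAP = {"WRB": {POS: ADV}, "ADV": {POS: ADV}}   # two entries from the hunk above

def coarse_pos(tag, default="X"):
    # a treebank tag ("WRB") and a universal tag ("ADV") yield the same coarse POS
    return TAG_MAP.get(tag, {}).get(POS, default)

assert coarse_pos("WRB") == coarse_pos("ADV") == "ADV"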