mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	add punctuation rules for Bengali
This commit is contained in:
		
							parent
							
								
									5a4fc09576
								
							
						
					
					
						commit
						d91be7aed4
					
				|  | @ -16,3 +16,7 @@ class Bengali(Language): | |||
| 
 | ||||
|         tokenizer_exceptions = TOKENIZER_EXCEPTIONS | ||||
|         stop_words = STOP_WORDS | ||||
| 
 | ||||
|         prefixes = tuple(TOKENIZER_PREFIXES) | ||||
|         suffixes = tuple(TOKENIZER_SUFFIXES) | ||||
|         infixes = tuple(TOKENIZER_INFIXES) | ||||
|  |  | |||
|  | @ -1,17 +1,18 @@ | |||
# encoding: utf8
"""Language data for Bengali: tokenizer exceptions, stop words, and the
punctuation rules re-exported from .punctuation via the star import."""
from __future__ import unicode_literals

from .. import language_data as base
from ..language_data import update_exc, strings_to_exc

from .punctuation import *
from .stop_words import STOP_WORDS


# Freeze the stop-word collection into a set for O(1) membership tests.
STOP_WORDS = set(STOP_WORDS)


# Seed the exception table with the shared emoticon data, then merge in the
# shared abbreviations (update_exc mutates its first argument in place).
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))


# TOKENIZER_PREFIXES / _SUFFIXES / _INFIXES come from `from .punctuation
# import *` above; the original's `X = X` self-assignments were no-ops and
# its first __all__ was dead (immediately overwritten) — both removed.
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS",
           "TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
|  |  | |||
							
								
								
									
										45
									
								
								spacy/bn/punctuation.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										45
									
								
								spacy/bn/punctuation.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,45 @@ | |||
# encoding: utf8
"""Tokenizer punctuation rules (prefixes, suffixes, infixes) for Bengali."""
from __future__ import unicode_literals

from ..language_data.punctuation import ALPHA_LOWER, LIST_ELLIPSES, QUOTES, ALPHA_UPPER, LIST_QUOTES, UNITS, \
    CURRENCY, LIST_PUNCT, ALPHA, _QUOTES

# Currency symbols recognised as number suffixes, including the Bengali
# taka sign (৳).
CURRENCY_SYMBOLS = r"\$ ¢ £ € ¥ ฿ ৳"

# Bengali sentence-final punctuation: the danda and double danda.
_PUNCT = '। ॥'

# Build a local list rather than calling LIST_PUNCT.extend(...): mutating the
# shared list imported from language_data would leak Bengali punctuation into
# every other language's tokenizer rules. (str.split() with no argument
# already ignores surrounding whitespace, so no strip() is needed.)
_LIST_PUNCT = LIST_PUNCT + _PUNCT.split()

TOKENIZER_PREFIXES = (
    [r'\+'] +
    _LIST_PUNCT +
    LIST_ELLIPSES +
    LIST_QUOTES
)

TOKENIZER_SUFFIXES = (
    _LIST_PUNCT +
    LIST_ELLIPSES +
    LIST_QUOTES +
    [
        r'(?<=[0-9])\+',
        r'(?<=°[FfCcKk])\.',
        r'(?<=[0-9])(?:{c})'.format(c=CURRENCY),
        r'(?<=[0-9])(?:{u})'.format(u=UNITS),
        # NOTE(review): "(?:{q})" expands *inside* a character class, so the
        # "(", "?", ":" and ")" become literal members rather than a group.
        # Kept byte-for-byte to preserve current behavior — confirm whether
        # this was intended.
        r'(?<=[{al}{p}{c}(?:{q})])\.'.format(al=ALPHA_LOWER, p=r'%²\-\)\]\+', q=QUOTES, c=CURRENCY_SYMBOLS),
        # Split the Bengali enclitic "-e" after a letter or closing paren.
        r'(?<=[{al})])-e'.format(al=ALPHA_LOWER)
    ]
)

TOKENIZER_INFIXES = (
    LIST_ELLIPSES +
    [
        r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER),
        # The comma rule appeared twice in the original list; the duplicate
        # was redundant (identical pattern) and has been removed.
        r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
        r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
        r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
        r'(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])'.format(a=ALPHA, q=_QUOTES.replace("'", "").strip().replace(" ", "")),
    ]
)

__all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
		Loading…
	
		Reference in New Issue
	
	Block a user