mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	add punctuation rules for Bengali
This commit is contained in:
		
							parent
							
								
									5a4fc09576
								
							
						
					
					
						commit
						d91be7aed4
					
				|  | @ -16,3 +16,7 @@ class Bengali(Language): | ||||||
| 
 | 
 | ||||||
|         tokenizer_exceptions = TOKENIZER_EXCEPTIONS |         tokenizer_exceptions = TOKENIZER_EXCEPTIONS | ||||||
|         stop_words = STOP_WORDS |         stop_words = STOP_WORDS | ||||||
|  | 
 | ||||||
|  |         prefixes = tuple(TOKENIZER_PREFIXES) | ||||||
|  |         suffixes = tuple(TOKENIZER_SUFFIXES) | ||||||
|  |         infixes = tuple(TOKENIZER_INFIXES) | ||||||
|  |  | ||||||
# encoding: utf8
"""Language data for Bengali (bn): stop words, tokenizer exceptions and
punctuation rules re-exported for the Bengali Language subclass."""
from __future__ import unicode_literals

# Use relative imports consistently (the original mixed an absolute
# `from spacy.language_data import ...` with relative imports, which breaks
# if the package is vendored or renamed) and import explicit names instead
# of `from .punctuation import *`.
from .. import language_data as base
from ..language_data import strings_to_exc, update_exc
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS


# Normalize to a set for O(1) membership tests.
STOP_WORDS = set(STOP_WORDS)


# Start from the shared emoticon exceptions, then merge in abbreviations.
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))


# The no-op re-assignments (TOKENIZER_PREFIXES = TOKENIZER_PREFIXES, ...)
# from the original are gone: the explicit imports above already bind the
# names in this module, so the public interface is unchanged.
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS",
           "TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
							
								
								
									
										45
									
								
								spacy/bn/punctuation.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										45
									
								
								spacy/bn/punctuation.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,45 @@ | ||||||
|  | # encoding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | from ..language_data.punctuation import ALPHA_LOWER, LIST_ELLIPSES, QUOTES, ALPHA_UPPER, LIST_QUOTES, UNITS, \ | ||||||
|  |     CURRENCY, LIST_PUNCT, ALPHA, _QUOTES | ||||||
|  | 
 | ||||||
|  | CURRENCY_SYMBOLS = r"\$ ¢ £ € ¥ ฿ ৳" | ||||||
|  | 
 | ||||||
|  | _PUNCT = '। ॥' | ||||||
|  | 
 | ||||||
|  | LIST_PUNCT.extend(_PUNCT.strip().split()) | ||||||
|  | 
 | ||||||
|  | TOKENIZER_PREFIXES = ( | ||||||
|  |     [r'\+'] + | ||||||
|  |     LIST_PUNCT + | ||||||
|  |     LIST_ELLIPSES + | ||||||
|  |     LIST_QUOTES | ||||||
|  | ) | ||||||
|  | 
 | ||||||
|  | TOKENIZER_SUFFIXES = ( | ||||||
|  |     LIST_PUNCT + | ||||||
|  |     LIST_ELLIPSES + | ||||||
|  |     LIST_QUOTES + | ||||||
|  |     [ | ||||||
|  |         r'(?<=[0-9])\+', | ||||||
|  |         r'(?<=°[FfCcKk])\.', | ||||||
|  |         r'(?<=[0-9])(?:{c})'.format(c=CURRENCY), | ||||||
|  |         r'(?<=[0-9])(?:{u})'.format(u=UNITS), | ||||||
|  |         r'(?<=[{al}{p}{c}(?:{q})])\.'.format(al=ALPHA_LOWER, p=r'%²\-\)\]\+', q=QUOTES, c=CURRENCY_SYMBOLS), | ||||||
|  |         r'(?<=[{al})])-e'.format(al=ALPHA_LOWER) | ||||||
|  |     ] | ||||||
|  | ) | ||||||
|  | 
 | ||||||
|  | TOKENIZER_INFIXES = ( | ||||||
|  |     LIST_ELLIPSES + | ||||||
|  |     [ | ||||||
|  |         r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER), | ||||||
|  |         r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), | ||||||
|  |         r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA), | ||||||
|  |         r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA), | ||||||
|  |         r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), | ||||||
|  |         r'(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])'.format(a=ALPHA, q=_QUOTES.replace("'", "").strip().replace(" ", "")), | ||||||
|  |     ] | ||||||
|  | ) | ||||||
|  | __all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"] | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user