mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	<!--- Provide a general summary of your changes in the title. --> ## Description - [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files. - [x] Update flake8 config to exclude very large files (lemmatization tables etc.) - [x] Update code to be compatible with flake8 rules - [x] Fix various small bugs, inconsistencies and messy stuff in the language data - [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means) Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results. At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information. ### Types of change enhancement, code style ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
		
			
				
	
	
		
			105 lines
		
	
	
		
			1.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			105 lines
		
	
	
		
			1.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # coding: utf8
 | ||
| from __future__ import unicode_literals
 | ||
| from ...attrs import LIKE_NUM
 | ||
| 
 | ||
| 
 | ||
# Suffixes appended to number words below to build the ordinal word set.
MIM = "م"
# The zero-width non-joiner (U+200C) is spelled as an escape sequence: it is
# invisible in editors and is easily dropped when the file is copied or
# re-encoded, which would silently turn this into a plain "ام".
ZWNJ_O_MIM = "\u200cام"
YE_NUN = "ین"


# Cardinal number words. Several numbers are listed in more than one
# spelling; "\u200c" marks a zero-width non-joiner inside a word.
_num_words = set(
    """
صفر
یک
دو
سه
چهار
پنج
شش
شیش
هفت
هشت
نه
ده
یازده
دوازده
سیزده
چهارده
پانزده
پونزده
شانزده
شونزده
هفده
هجده
هیجده
نوزده
بیست
سی
چهل
پنجاه
شصت
هفتاد
هشتاد
نود
صد
یکصد
یک\u200cصد
دویست
سیصد
چهارصد
پانصد
پونصد
ششصد
شیشصد
هفتصد
هفصد
هشتصد
نهصد
هزار
میلیون
میلیارد
بیلیون
بیلیارد
تریلیون
تریلیارد
کوادریلیون
کادریلیارد
کوینتیلیون
""".split()
)

# Ordinals listed explicitly; the rest are derived from the cardinals below.
_ordinal_words = set(
    """
اول
سوم
سی\u200cام""".split()
)

# Cardinal + MIM and cardinal + ZWNJ_O_MIM.
_ordinal_words.update({num + MIM for num in _num_words})
_ordinal_words.update({num + ZWNJ_O_MIM for num in _num_words})
# The comprehension is fully built before update() runs, and it iterates the
# set as extended by the two updates above, so YE_NUN is appended to every
# ordinal collected so far (explicit and derived alike).
_ordinal_words.update({num + YE_NUN for num in _ordinal_words})
 | ||
| 
 | ||
| 
 | ||
def like_num(text):
    """Check whether ``text`` resembles a number.

    The characters ",", ".", "،", "٫" and "/" are removed from the text
    first. What remains counts as number-like if it consists only of digits
    or is an entry of ``_num_words`` or ``_ordinal_words``.

    text: The string to check.
    RETURNS (bool): True if the text looks like a number, False otherwise.
    """
    # Strip separator punctuation so that e.g. "1,234.5" is judged by its
    # digits alone. A string made up solely of these characters becomes
    # empty and is rejected, because "".isdigit() is False.
    for separator in (",", ".", "،", "٫", "/"):
        text = text.replace(separator, "")
    return text.isdigit() or text in _num_words or text in _ordinal_words
 | ||
| 
 | ||
| 
 | ||
# Lexical attribute overrides defined by this module: maps the LIKE_NUM
# attribute ID to the like_num() function above.
LEX_ATTRS = {LIKE_NUM: like_num}
 |