mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-03 22:06:37 +03:00
eddeb36c96
<!--- Provide a general summary of your changes in the title. --> ## Description - [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files. - [x] Update flake8 config to exclude very large files (lemmatization tables etc.) - [x] Update code to be compatible with flake8 rules - [x] Fix various small bugs, inconsistencies and messy stuff in the language data - [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means) Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results. At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information. ### Types of change enhancement, code style ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
105 lines
1.4 KiB
Python
105 lines
1.4 KiB
Python
# coding: utf8
|
||
from __future__ import unicode_literals
|
||
from ...attrs import LIKE_NUM
|
||
|
||
|
||
# Suffixes appended to number words to build ordinal forms.
MIM = "م"
# Zero-width non-joiner (U+200C) followed by "ام". The joiner is invisible, so
# it is written as an escape to keep it from being silently dropped by editors
# or copy/paste; without it the value would not match the constant's name.
ZWNJ_O_MIM = "\u200cام"
YE_NUN = "ین"
|
||
|
||
|
||
_num_words = set(
|
||
"""
|
||
صفر
|
||
یک
|
||
دو
|
||
سه
|
||
چهار
|
||
پنج
|
||
شش
|
||
شیش
|
||
هفت
|
||
هشت
|
||
نه
|
||
ده
|
||
یازده
|
||
دوازده
|
||
سیزده
|
||
چهارده
|
||
پانزده
|
||
پونزده
|
||
شانزده
|
||
شونزده
|
||
هفده
|
||
هجده
|
||
هیجده
|
||
نوزده
|
||
بیست
|
||
سی
|
||
چهل
|
||
پنجاه
|
||
شصت
|
||
هفتاد
|
||
هشتاد
|
||
نود
|
||
صد
|
||
یکصد
|
||
یکصد
|
||
دویست
|
||
سیصد
|
||
چهارصد
|
||
پانصد
|
||
پونصد
|
||
ششصد
|
||
شیشصد
|
||
هفتصد
|
||
هفصد
|
||
هشتصد
|
||
نهصد
|
||
هزار
|
||
میلیون
|
||
میلیارد
|
||
بیلیون
|
||
بیلیارد
|
||
تریلیون
|
||
تریلیارد
|
||
کوادریلیون
|
||
کادریلیارد
|
||
کوینتیلیون
|
||
""".split()
|
||
)
|
||
|
||
# Ordinal forms: a few hand-listed entries, plus every number word with each
# of the two "-m" suffixes, plus every resulting ordinal with YE_NUN appended.
_ordinal_words = {"اول", "سوم", "سیام"}
_ordinal_words |= {
    cardinal + suffix for cardinal in _num_words for suffix in (MIM, ZWNJ_O_MIM)
}
# The right-hand side is fully built before the union, so the set is not
# mutated while it is being iterated.
_ordinal_words |= {ordinal + YE_NUN for ordinal in _ordinal_words}
|
||
|
||
|
||
def like_num(text):
    """Check whether ``text`` resembles a number.

    Separator characters are stripped first. The remainder counts as
    number-like when it consists only of digits or is a known cardinal or
    ordinal word.

    text: The string to check.
    RETURNS (bool): True if the text looks like a number, False otherwise.
    """
    # ASCII comma/period/slash plus the Arabic-script comma (U+060C), decimal
    # separator (U+066B) and thousands separator (U+066C). The last one is
    # written as an escape because it is easily confused with U+060C.
    for separator in (",", ".", "،", "٫", "\u066c", "/"):
        text = text.replace(separator, "")
    return text.isdigit() or text in _num_words or text in _ordinal_words
|
||
|
||
|
||
# Maps the LIKE_NUM attribute to this module's number-likeness check.
LEX_ATTRS = {
    LIKE_NUM: like_num,
}
|