mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
eddeb36c96
<!--- Provide a general summary of your changes in the title. --> ## Description - [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files. - [x] Update flake8 config to exclude very large files (lemmatization tables etc.) - [x] Update code to be compatible with flake8 rules - [x] Fix various small bugs, inconsistencies and messy stuff in the language data - [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means) Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results. At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information. ### Types of change enhancement, code style ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
69 lines
2.7 KiB
Python
69 lines
2.7 KiB
Python
# coding: utf8
|
||
from __future__ import unicode_literals
|
||
|
||
import regex as re
|
||
|
||
# Make "version 1" behaviour the default for the third-party `regex` module
# (imported here under the name `re`). The character classes defined in this
# file use the `&&` / `||` set operators inside brackets, which the regex
# package documents as version-1 syntax.
re.DEFAULT_VERSION = re.VERSION1
|
||
def merge_char_classes(classes):
    """Combine character classes into one bracketed set.

    classes: iterable of strings, each a character class such as "[a-z]".
    RETURNS: the classes joined with "||" and wrapped in "[" ... "]".
    """
    joined = "||".join(classes)
    return "[" + joined + "]"
|
||
def split_chars(char):
    """Split a space-separated string into a list of its items.

    char: string whose items are separated by single spaces.
    RETURNS: list of items; surrounding whitespace is stripped first, and
        consecutive spaces inside the string produce empty items.
    """
    trimmed = char.strip()
    return trimmed.split(" ")
|
||
def merge_chars(char):
    """Turn a space-separated string into a "|"-separated one.

    char: string whose items are separated by single spaces.
    RETURNS: the stripped string with every space replaced by "|".
    """
    pieces = char.strip().split(" ")
    return "|".join(pieces)
|
||
|
||
# Per-script character classes. Inside the brackets, `&&` intersects and `||`
# unions two sets; this is `regex`-module set syntax, so these patterns are not
# meant for the stdlib `re` module.
_bengali = r"[\p{L}&&\p{Bengali}]"
_hebrew = r"[\p{L}&&\p{Hebrew}]"
_latin_lower = r"[\p{Ll}&&\p{Latin}]"
_latin_upper = r"[\p{Lu}&&\p{Latin}]"
_latin = r"[[\p{Ll}||\p{Lu}]&&\p{Latin}]"
# NOTE: the class named "persian" is defined through the Arabic script property.
_persian = r"[\p{L}&&\p{Arabic}]"
# "ё" / "Ё" are listed explicitly because they lie outside the а-я / А-Я ranges.
_russian_lower = r"[ёа-я]"
_russian_upper = r"[ЁА-Я]"
_sinhala = r"[\p{L}&&\p{Sinhala}]"
# Only the letters listed here; presumably these supplement the Russian
# ranges above for Tatar text (TODO confirm).
_tatar_lower = r"[әөүҗңһ]"
_tatar_upper = r"[ӘӨҮҖҢҺ]"
# Base range plus the accented vowels, which sit outside that range.
_greek_lower = r"[α-ωάέίόώήύ]"
_greek_upper = r"[Α-ΩΆΈΊΌΏΉΎ]"

# Groupings by case. Scripts listed in `_uncased` have a single class with no
# upper/lower split.
_upper = [_latin_upper, _russian_upper, _tatar_upper, _greek_upper]
_lower = [_latin_lower, _russian_lower, _tatar_lower, _greek_lower]
_uncased = [_bengali, _hebrew, _persian, _sinhala]
|
||
|
||
# Combined alphabetic classes. merge_char_classes wraps the per-script classes
# into one bracketed set; the uncased scripts are included in all three so
# they count as alphabetic regardless of which case is requested.
ALPHA = merge_char_classes(_upper + _lower + _uncased)
ALPHA_LOWER = merge_char_classes(_lower + _uncased)
ALPHA_UPPER = merge_char_classes(_upper + _uncased)
|
||
|
||
# Unit abbreviations as one space-separated string (Latin, Cyrillic and Arabic
# spellings). The adjacent literals are concatenated by the parser, so every
# literal except the last must end with a space; without it the last unit of
# one line fuses with the first unit of the next into a single bogus item.
_units = (
    "km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft "
    "kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb "
    "TB T G M K % км км² км³ м м² м³ дм дм² дм³ см см² см³ мм мм² мм³ нм "
    "кг г мг м/с км/ч кПа Па мбар Кб КБ кб Мб МБ мб Гб ГБ гб Тб ТБ тб "
    "كم كم² كم³ م م² م³ سم سم² سم³ مم مم² مم³ كم غرام جرام جم كغ ملغ كوب اكواب"
)
|
||
# Currency symbols and prefixed dollar variants, separated by single spaces.
# "$" is written as "\$" (the literal is a raw string, so the backslash is
# kept), presumably so each item can be used inside a regular expression
# unchanged (TODO confirm against callers).
_currency = r"\$ £ € ¥ ฿ US\$ C\$ A\$ ₽ ﷼"
|
||
|
||
# These expressions contain various unicode variations, including characters
# used in Chinese (see #1333, #1340, #1351) – unless there are cross-language
# conflicts, spaCy's base tokenizer should handle all of those by default
#
# Each value is a space-separated list of items. The punctuation and quote
# strings are raw literals, so the backslashes in front of characters such as
# "(" or "'" are part of the stored text.
_punct = (
    r"… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · । ، ؛ ٪"
)
_quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‘‘ ’’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉'
_hyphens = "- – — -- --- —— ~"
|
||
|
||
# Various symbols like dingbats, but also emoji
# Details: https://www.compart.com/en/unicode/category/So
# The class matches one character whose Unicode general category is "So".
_other_symbols = r"[\p{So}]"
|
||
|
||
# "|"-joined versions of the space-separated source strings: merge_chars
# strips each string and replaces every space with "|". ICONS is already a
# single character class, so it is used as-is.
UNITS = merge_chars(_units)
CURRENCY = merge_chars(_currency)
QUOTES = merge_chars(_quotes)
PUNCT = merge_chars(_punct)
HYPHENS = merge_chars(_hyphens)
ICONS = _other_symbols
|
||
|
||
# List versions of the same source strings: split_chars strips each string and
# splits it on single spaces, giving one list entry per item.
LIST_UNITS = split_chars(_units)
LIST_CURRENCY = split_chars(_currency)
LIST_QUOTES = split_chars(_quotes)
LIST_PUNCT = split_chars(_punct)
LIST_HYPHENS = split_chars(_hyphens)
# Two or more consecutive dots (as a pattern), or the single ellipsis character.
LIST_ELLIPSES = [r"\.\.+", "…"]
LIST_ICONS = [_other_symbols]
|