mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-27 10:26:35 +03:00
eddeb36c96
<!--- Provide a general summary of your changes in the title. --> ## Description - [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files. - [x] Update flake8 config to exclude very large files (lemmatization tables etc.) - [x] Update code to be compatible with flake8 rules - [x] Fix various small bugs, inconsistencies and messy stuff in the language data - [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means) Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results. At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information. ### Types of change enhancement, code style ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
79 lines
1.6 KiB
Python
79 lines
1.6 KiB
Python
# coding: utf8
|
|
from __future__ import unicode_literals
|
|
|
|
from ...symbols import ORTH, NORM
|
|
|
|
|
|
# Hand-written splits for fused forms. Each key maps to the list of attribute
# dicts for the pieces it is broken into; the ORTH values of the pieces always
# concatenate back to the key, and NORM is supplied only for some pieces.
_exc = {
    "às": [{ORTH: "à", NORM: "a"}, {ORTH: "s", NORM: "as"}],
    "ao": [{ORTH: "a"}, {ORTH: "o"}],
    "aos": [{ORTH: "a"}, {ORTH: "os"}],
}

# The "àqu..." forms all share one shape: a leading "à" with NORM "a", then the
# remainder with NORM "a" + remainder (e.g. "àquele" -> "à" + "quele"/"aquele").
for _tail in ["quele", "quela", "queles", "quelas", "quilo"]:
    _exc["à" + _tail] = [{ORTH: "à", NORM: "a"}, {ORTH: _tail, NORM: "a" + _tail}]

_exc["aonde"] = [{ORTH: "a"}, {ORTH: "onde"}]
|
|
|
|
|
|
# Contractions
# Word lists that get a one-letter prefix glued on below ("d" or "n").
_per_pron = ["ele", "ela", "eles", "elas"]
# fmt: off
_dem_pron = [
    "este", "esta", "estes", "estas", "isto",
    "esse", "essa", "esses", "essas", "isso",
    "aquele", "aquela", "aqueles", "aquelas", "aquilo",
]
# fmt: on
_und_pron = ["outro", "outra", "outros", "outras"]
_adv = ["aqui", "aí", "ali", "além"]

# The three pronoun lists are prefixed in both loops; `_adv` only in the "d" loop.
_pronouns = _per_pron + _dem_pron + _und_pron

# "d" + word  ->  "d" (NORM "de") followed by the word, e.g. "dele".
for orth in _pronouns + _adv:
    _exc["d" + orth] = [{ORTH: "d", NORM: "de"}, {ORTH: orth}]

# "n" + word  ->  "n" (NORM "em") followed by the word, e.g. "nele".
for orth in _pronouns:
    _exc["n" + orth] = [{ORTH: "n", NORM: "em"}, {ORTH: orth}]
|
|
|
|
|
|
# Period-terminated forms. Each one is registered as a single piece whose ORTH
# is the string itself, so the entry maps the form to itself unsplit.
# fmt: off
_abbrev = [
    "Adm.", "Dr.", "e.g.", "E.g.", "E.G.", "Gen.", "Gov.", "i.e.",
    "I.e.", "I.E.", "Jr.", "Ltd.", "p.m.", "Ph.D.", "Rep.", "Rev.",
    "Sen.", "Sr.", "Sra.", "vs.", "tel.", "pág.", "pag.",
]
# fmt: on
for orth in _abbrev:
    _exc[orth] = [{ORTH: orth}]
|
|
|
|
|
|
# Public name for the table assembled above. This binds the same dict object
# as `_exc` (no copy is made), so it must stay after every statement that
# adds entries to `_exc`.
TOKENIZER_EXCEPTIONS = _exc
|