2017-03-12 15:07:28 +03:00
|
|
|
|
# coding: utf8
|
2017-02-10 15:17:05 +03:00
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
|
2019-02-01 10:05:22 +03:00
|
|
|
|
import re
|
2017-02-10 15:17:05 +03:00
|
|
|
|
|
2017-05-08 23:29:04 +03:00
|
|
|
|
from .punctuation import ELISION, HYPHENS
|
|
|
|
|
from ..tokenizer_exceptions import URL_PATTERN
|
2019-02-01 10:05:22 +03:00
|
|
|
|
from ..char_classes import ALPHA_LOWER, ALPHA
|
2020-03-25 14:28:12 +03:00
|
|
|
|
from ...symbols import ORTH, LEMMA
|
2017-02-24 20:22:40 +03:00
|
|
|
|
|
2019-02-21 00:10:13 +03:00
|
|
|
|
# not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer
|
2020-03-25 14:28:12 +03:00
|
|
|
|
# from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
|
2019-02-21 00:10:13 +03:00
|
|
|
|
FR_BASE_EXCEPTIONS = ["aujourd'hui", "Aujourd'hui"]
|
|
|
|
|
|
2017-02-10 15:17:05 +03:00
|
|
|
|
|
|
|
|
|
def upper_first_letter(text):
|
|
|
|
|
if len(text) == 0:
|
|
|
|
|
return text
|
|
|
|
|
if len(text) == 1:
|
|
|
|
|
return text.upper()
|
|
|
|
|
return text[0].upper() + text[1:]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def lower_first_letter(text):
|
|
|
|
|
if len(text) == 0:
|
|
|
|
|
return text
|
|
|
|
|
if len(text) == 1:
|
|
|
|
|
return text.lower()
|
|
|
|
|
return text[0].lower() + text[1:]
|
|
|
|
|
|
|
|
|
|
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
_exc = {"J.-C.": [{LEMMA: "Jésus", ORTH: "J."}, {LEMMA: "Christ", ORTH: "-C."}]}
|
2017-05-08 16:49:05 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for exc_data in [
|
|
|
|
|
{LEMMA: "avant", ORTH: "av."},
|
|
|
|
|
{LEMMA: "janvier", ORTH: "janv."},
|
|
|
|
|
{LEMMA: "février", ORTH: "févr."},
|
|
|
|
|
{LEMMA: "avril", ORTH: "avr."},
|
|
|
|
|
{LEMMA: "juillet", ORTH: "juill."},
|
|
|
|
|
{LEMMA: "septembre", ORTH: "sept."},
|
|
|
|
|
{LEMMA: "octobre", ORTH: "oct."},
|
|
|
|
|
{LEMMA: "novembre", ORTH: "nov."},
|
|
|
|
|
{LEMMA: "décembre", ORTH: "déc."},
|
|
|
|
|
{LEMMA: "après", ORTH: "apr."},
|
|
|
|
|
{LEMMA: "docteur", ORTH: "Dr."},
|
|
|
|
|
{LEMMA: "monsieur", ORTH: "M."},
|
|
|
|
|
{LEMMA: "monsieur", ORTH: "Mr."},
|
|
|
|
|
{LEMMA: "madame", ORTH: "Mme."},
|
|
|
|
|
{LEMMA: "mademoiselle", ORTH: "Mlle."},
|
|
|
|
|
{LEMMA: "numéro", ORTH: "n°"},
|
|
|
|
|
{LEMMA: "degrés", ORTH: "d°"},
|
|
|
|
|
{LEMMA: "saint", ORTH: "St."},
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
{LEMMA: "sainte", ORTH: "Ste."},
|
|
|
|
|
]:
|
2017-11-02 01:02:45 +03:00
|
|
|
|
_exc[exc_data[ORTH]] = [exc_data]
|
2017-05-08 16:49:05 +03:00
|
|
|
|
|
|
|
|
|
|
2020-03-25 13:27:42 +03:00
|
|
|
|
for orth in [
|
|
|
|
|
"après-midi",
|
|
|
|
|
"au-delà",
|
|
|
|
|
"au-dessus",
|
|
|
|
|
"celle-ci",
|
|
|
|
|
"celles-ci",
|
|
|
|
|
"celui-ci",
|
|
|
|
|
"cf.",
|
|
|
|
|
"ci-dessous",
|
|
|
|
|
"elle-même",
|
|
|
|
|
"en-dessous",
|
|
|
|
|
"etc.",
|
|
|
|
|
"jusque-là",
|
|
|
|
|
"lui-même",
|
|
|
|
|
"MM.",
|
|
|
|
|
"No.",
|
|
|
|
|
"peut-être",
|
|
|
|
|
"pp.",
|
|
|
|
|
"quelques-uns",
|
|
|
|
|
"rendez-vous",
|
|
|
|
|
"Vol.",
|
|
|
|
|
]:
|
2017-05-08 16:49:05 +03:00
|
|
|
|
_exc[orth] = [{ORTH: orth}]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for verb, verb_lemma in [
|
|
|
|
|
("a", "avoir"),
|
|
|
|
|
("est", "être"),
|
|
|
|
|
("semble", "sembler"),
|
|
|
|
|
("indique", "indiquer"),
|
|
|
|
|
("moque", "moquer"),
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
("passe", "passer"),
|
|
|
|
|
]:
|
2017-05-08 16:49:05 +03:00
|
|
|
|
for orth in [verb, verb.title()]:
|
|
|
|
|
for pronoun in ["elle", "il", "on"]:
|
|
|
|
|
token = "{}-t-{}".format(orth, pronoun)
|
|
|
|
|
_exc[token] = [
|
2020-03-25 14:28:12 +03:00
|
|
|
|
{LEMMA: verb_lemma, ORTH: orth}, # , TAG: "VERB"},
|
2017-05-08 16:49:05 +03:00
|
|
|
|
{LEMMA: "t", ORTH: "-t"},
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
{LEMMA: pronoun, ORTH: "-" + pronoun},
|
|
|
|
|
]
|
2017-05-08 16:49:05 +03:00
|
|
|
|
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
for verb, verb_lemma in [("est", "être")]:
|
2017-05-08 16:49:05 +03:00
|
|
|
|
for orth in [verb, verb.title()]:
|
|
|
|
|
token = "{}-ce".format(orth)
|
|
|
|
|
_exc[token] = [
|
2020-03-25 14:28:12 +03:00
|
|
|
|
{LEMMA: verb_lemma, ORTH: orth}, # , TAG: "VERB"},
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
{LEMMA: "ce", ORTH: "-ce"},
|
|
|
|
|
]
|
2017-05-08 16:49:05 +03:00
|
|
|
|
|
|
|
|
|
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
for pre, pre_lemma in [("qu'", "que"), ("n'", "ne")]:
|
|
|
|
|
for orth in [pre, pre.title()]:
|
|
|
|
|
_exc["%sest-ce" % orth] = [
|
2020-03-25 13:27:42 +03:00
|
|
|
|
{LEMMA: pre_lemma, ORTH: orth},
|
|
|
|
|
{LEMMA: "être", ORTH: "est"},
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
{LEMMA: "ce", ORTH: "-ce"},
|
|
|
|
|
]
|
2017-05-08 16:49:05 +03:00
|
|
|
|
|
|
|
|
|
|
2020-03-25 13:27:42 +03:00
|
|
|
|
for verb, pronoun in [("est", "il"), ("EST", "IL")]:
|
|
|
|
|
token = "{}-{}".format(verb, pronoun)
|
|
|
|
|
_exc[token] = [
|
|
|
|
|
{LEMMA: "être", ORTH: verb},
|
|
|
|
|
{LEMMA: pronoun, ORTH: "-" + pronoun},
|
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for s, verb, pronoun in [("s", "est", "il"), ("S", "EST", "IL")]:
|
|
|
|
|
token = "{}'{}-{}".format(s, verb, pronoun)
|
|
|
|
|
_exc[token] = [
|
|
|
|
|
{LEMMA: "se", ORTH: s + "'"},
|
|
|
|
|
{LEMMA: "être", ORTH: verb},
|
|
|
|
|
{LEMMA: pronoun, ORTH: "-" + pronoun},
|
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
|
2017-05-08 16:49:05 +03:00
|
|
|
|
_infixes_exc = []
|
2018-12-16 20:04:55 +03:00
|
|
|
|
orig_elision = "'"
|
2018-12-18 15:48:10 +03:00
|
|
|
|
orig_hyphen = "-"
|
2018-12-16 20:04:55 +03:00
|
|
|
|
|
|
|
|
|
# loop through the elison and hyphen characters, and try to substitute the ones that weren't used in the original list
|
|
|
|
|
for infix in FR_BASE_EXCEPTIONS:
|
|
|
|
|
variants_infix = {infix}
|
|
|
|
|
for elision_char in [x for x in ELISION if x != orig_elision]:
|
2018-12-18 15:48:10 +03:00
|
|
|
|
variants_infix.update(
|
|
|
|
|
[word.replace(orig_elision, elision_char) for word in variants_infix]
|
|
|
|
|
)
|
|
|
|
|
for hyphen_char in [x for x in ["-", "‐"] if x != orig_hyphen]:
|
|
|
|
|
variants_infix.update(
|
|
|
|
|
[word.replace(orig_hyphen, hyphen_char) for word in variants_infix]
|
|
|
|
|
)
|
2018-12-16 20:04:55 +03:00
|
|
|
|
variants_infix.update([upper_first_letter(word) for word in variants_infix])
|
|
|
|
|
_infixes_exc.extend(variants_infix)
|
2017-05-08 16:49:05 +03:00
|
|
|
|
|
2017-05-08 17:11:45 +03:00
|
|
|
|
for orth in _infixes_exc:
|
2017-05-08 16:49:05 +03:00
|
|
|
|
_exc[orth] = [{ORTH: orth}]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_hyphen_prefix = [
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"a[ée]ro",
|
|
|
|
|
"abat",
|
|
|
|
|
"a[fg]ro",
|
|
|
|
|
"after",
|
2018-12-16 20:04:55 +03:00
|
|
|
|
"aigues?",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"am[ée]ricano",
|
|
|
|
|
"anglo",
|
|
|
|
|
"anti",
|
|
|
|
|
"apr[èe]s",
|
|
|
|
|
"arabo",
|
|
|
|
|
"arcs?",
|
|
|
|
|
"archi",
|
|
|
|
|
"arrières?",
|
2019-02-21 00:10:13 +03:00
|
|
|
|
"audio",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"avant",
|
2018-12-16 20:04:55 +03:00
|
|
|
|
"avion",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"auto",
|
|
|
|
|
"banc",
|
|
|
|
|
"bas(?:ses?)?",
|
2018-12-16 20:04:55 +03:00
|
|
|
|
"bateaux?",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"bec?",
|
2018-12-16 20:04:55 +03:00
|
|
|
|
"belles?",
|
|
|
|
|
"beau",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"best",
|
|
|
|
|
"bio?",
|
|
|
|
|
"bien",
|
|
|
|
|
"blanc",
|
|
|
|
|
"bo[îi]te",
|
2018-12-16 20:04:55 +03:00
|
|
|
|
"bonn?e?s?",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"bois",
|
|
|
|
|
"bou(?:c|rg)",
|
|
|
|
|
"b[êe]ta",
|
|
|
|
|
"cache",
|
|
|
|
|
"cap(?:ello)?",
|
2018-12-16 20:04:55 +03:00
|
|
|
|
"casse",
|
|
|
|
|
"castel",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"champ",
|
|
|
|
|
"chapelle",
|
2018-12-16 20:04:55 +03:00
|
|
|
|
"ch[âa]teau(?:neuf)?",
|
|
|
|
|
"chasse",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"cha(?:ud|t)e?s?",
|
2018-12-16 20:04:55 +03:00
|
|
|
|
"chauffe",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"chou",
|
|
|
|
|
"chromo",
|
|
|
|
|
"claire?s?",
|
|
|
|
|
"co(?:de|ca)?",
|
|
|
|
|
"compte",
|
|
|
|
|
"contre",
|
|
|
|
|
"cordon",
|
|
|
|
|
"coupe?",
|
2018-12-16 20:04:55 +03:00
|
|
|
|
"courte?s?",
|
|
|
|
|
"couvre",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"crash",
|
|
|
|
|
"crise",
|
|
|
|
|
"croche",
|
|
|
|
|
"cross",
|
|
|
|
|
"cyber",
|
|
|
|
|
"côte",
|
|
|
|
|
"demi",
|
|
|
|
|
"di(?:sney)?",
|
2018-12-16 20:04:55 +03:00
|
|
|
|
"dix",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"d[ée]s?",
|
|
|
|
|
"dys",
|
2018-12-16 20:04:55 +03:00
|
|
|
|
"ex?",
|
|
|
|
|
"émirato",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"entre",
|
|
|
|
|
"est",
|
|
|
|
|
"ethno",
|
2018-12-16 20:04:55 +03:00
|
|
|
|
"ex",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"extra",
|
|
|
|
|
"extrême",
|
|
|
|
|
"[ée]co",
|
2018-12-16 20:04:55 +03:00
|
|
|
|
"faux",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"fil",
|
|
|
|
|
"fort",
|
|
|
|
|
"franco?s?",
|
|
|
|
|
"gallo",
|
|
|
|
|
"gardes?",
|
|
|
|
|
"gastro",
|
|
|
|
|
"grande?",
|
|
|
|
|
"gratte",
|
|
|
|
|
"gr[ée]co",
|
|
|
|
|
"gros",
|
|
|
|
|
"g[ée]o",
|
|
|
|
|
"haute?s?",
|
2018-12-16 20:04:55 +03:00
|
|
|
|
"homm?es?",
|
|
|
|
|
"hors",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"hyper",
|
|
|
|
|
"indo",
|
|
|
|
|
"infra",
|
|
|
|
|
"inter",
|
|
|
|
|
"intra",
|
|
|
|
|
"islamo",
|
|
|
|
|
"italo",
|
|
|
|
|
"jean",
|
|
|
|
|
"labio",
|
|
|
|
|
"latino",
|
|
|
|
|
"live",
|
|
|
|
|
"lot",
|
|
|
|
|
"louis",
|
|
|
|
|
"m[ai]cro",
|
2018-12-16 20:04:55 +03:00
|
|
|
|
"mal",
|
|
|
|
|
"médio",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"mesnil",
|
|
|
|
|
"mi(?:ni)?",
|
|
|
|
|
"mono",
|
|
|
|
|
"mont?s?",
|
|
|
|
|
"moyen",
|
|
|
|
|
"multi",
|
|
|
|
|
"m[ée]cano",
|
|
|
|
|
"m[ée]dico",
|
|
|
|
|
"m[ée]do",
|
|
|
|
|
"m[ée]ta",
|
|
|
|
|
"mots?",
|
2018-12-16 20:04:55 +03:00
|
|
|
|
"neuro",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"noix",
|
|
|
|
|
"non",
|
|
|
|
|
"nord",
|
|
|
|
|
"notre",
|
|
|
|
|
"n[ée]o",
|
|
|
|
|
"ouest",
|
|
|
|
|
"outre",
|
|
|
|
|
"ouvre",
|
|
|
|
|
"passe",
|
|
|
|
|
"perce",
|
|
|
|
|
"pharmaco",
|
|
|
|
|
"ph[oy]to",
|
2018-12-16 20:04:55 +03:00
|
|
|
|
"pieds?",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"pique",
|
|
|
|
|
"poissons?",
|
|
|
|
|
"ponce",
|
|
|
|
|
"pont",
|
|
|
|
|
"po[rs]t",
|
2018-12-16 20:04:55 +03:00
|
|
|
|
"pousse",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"primo",
|
|
|
|
|
"pro(?:cès|to)?",
|
|
|
|
|
"pare",
|
2018-12-16 20:04:55 +03:00
|
|
|
|
"petite?s?",
|
|
|
|
|
"plessis",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"porte",
|
|
|
|
|
"pré",
|
|
|
|
|
"prêchi",
|
2018-12-16 20:04:55 +03:00
|
|
|
|
"protège",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"pseudo",
|
|
|
|
|
"pêle",
|
|
|
|
|
"péri",
|
|
|
|
|
"puy",
|
|
|
|
|
"quasi",
|
2018-12-16 20:04:55 +03:00
|
|
|
|
"quatre",
|
|
|
|
|
"radio",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"recourt",
|
|
|
|
|
"rythmo",
|
2018-12-16 20:04:55 +03:00
|
|
|
|
"(?:re)?doubles?",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"r[ée]",
|
|
|
|
|
"r[ée]tro",
|
2018-12-16 20:04:55 +03:00
|
|
|
|
"requin",
|
|
|
|
|
"sans?",
|
|
|
|
|
"sa?inte?s?",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"semi",
|
2018-12-16 20:04:55 +03:00
|
|
|
|
"serre",
|
|
|
|
|
"sino",
|
|
|
|
|
"socio",
|
|
|
|
|
"sociale?s?",
|
|
|
|
|
"soixante",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"sous",
|
2018-12-16 20:04:55 +03:00
|
|
|
|
"su[bdrs]",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"super",
|
2018-12-16 20:04:55 +03:00
|
|
|
|
"taille",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"tire",
|
|
|
|
|
"thermo",
|
|
|
|
|
"tiers",
|
2018-12-16 20:04:55 +03:00
|
|
|
|
"tourne",
|
|
|
|
|
"toute?s?",
|
|
|
|
|
"tra[iî]ne?",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"trans",
|
2018-12-16 20:04:55 +03:00
|
|
|
|
"trente",
|
|
|
|
|
"trois",
|
|
|
|
|
"trousse",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"tr(?:i|ou)",
|
|
|
|
|
"t[ée]l[ée]",
|
2018-12-16 20:04:55 +03:00
|
|
|
|
"utéro",
|
|
|
|
|
"vaso",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"vi[cd]e",
|
|
|
|
|
"vid[ée]o",
|
2018-12-16 20:04:55 +03:00
|
|
|
|
"vie(?:ux|i?lles?|i?l)",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"vill(?:e|eneuve|ers|ette|iers|y)",
|
2018-12-16 20:04:55 +03:00
|
|
|
|
"vingt",
|
|
|
|
|
"voitures?",
|
|
|
|
|
"wagons?",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
"ultra",
|
|
|
|
|
"à",
|
|
|
|
|
"[ée]lectro",
|
|
|
|
|
"[ée]qui",
|
2018-12-16 20:04:55 +03:00
|
|
|
|
"Fontaine",
|
|
|
|
|
"La Chapelle",
|
|
|
|
|
"Marie",
|
|
|
|
|
"Le Mesnil",
|
|
|
|
|
"Neuville",
|
|
|
|
|
"Pierre",
|
|
|
|
|
"Val",
|
|
|
|
|
"Vaux",
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
]
|
|
|
|
|
|
2017-05-08 16:49:05 +03:00
|
|
|
|
_regular_exp = [
|
2019-02-01 10:05:22 +03:00
|
|
|
|
"^a[{hyphen}]sexualis[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^arginine[{hyphen}]méthyl[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^binge[{hyphen}]watch[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^black[{hyphen}]out[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^bouche[{hyphen}]por[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^burn[{hyphen}]out[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^by[{hyphen}]pass[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^ch[{elision}]tiis[{al}]+$".format(elision=ELISION, al=ALPHA_LOWER),
|
|
|
|
|
"^chape[{hyphen}]chut[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^down[{hyphen}]load[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^[ée]tats[{hyphen}]uni[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^droits?[{hyphen}]de[{hyphen}]l'homm[{al}]+$".format(
|
|
|
|
|
hyphen=HYPHENS, al=ALPHA_LOWER
|
2018-12-18 15:48:10 +03:00
|
|
|
|
),
|
2019-02-01 10:05:22 +03:00
|
|
|
|
"^fac[{hyphen}]simil[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^fleur[{hyphen}]bleuis[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^flic[{hyphen}]flaqu[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^fox[{hyphen}]trott[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^google[{hyphen}]is[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^hard[{hyphen}]discount[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^hip[{hyphen}]hop[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^jet[{hyphen}]set[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^knock[{hyphen}]out[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^lèche[{hyphen}]bott[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^litho[{hyphen}]typographi[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^lock[{hyphen}]out[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^lombri[{hyphen}]compost[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^mac[{hyphen}]adamis[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^marque[{hyphen}]pag[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^mouton[{hyphen}]noiris[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^new[{hyphen}]york[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^pair[{hyphen}]programm[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^people[{hyphen}]is[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^plan[{hyphen}]socialis[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^premier[{hyphen}]ministr[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^prud[{elision}]hom[{al}]+$".format(elision=ELISION, al=ALPHA_LOWER),
|
|
|
|
|
"^réarc[{hyphen}]bout[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^refox[{hyphen}]trott[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^remicro[{hyphen}]ond[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^repique[{hyphen}]niqu[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^repetit[{hyphen}]déjeun[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^rick[{hyphen}]roll[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^rond[{hyphen}]ponn[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^shift[{hyphen}]cliqu[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^soudo[{hyphen}]bras[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^stabilo[{hyphen}]boss[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^strip[{hyphen}]teas[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^terra[{hyphen}]form[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^teuf[{hyphen}]teuf[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^yo[{hyphen}]yo[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^zig[{hyphen}]zag[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
|
|
|
|
|
"^z[{elision}]yeut[{al}]+$".format(elision=ELISION, al=ALPHA_LOWER),
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
]
|
2018-12-16 20:04:55 +03:00
|
|
|
|
# catching cases like faux-vampire
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
_regular_exp += [
|
2019-02-04 12:37:19 +03:00
|
|
|
|
"^{prefix}[{hyphen}][{al}][{hyphen}{al}{elision}]*$".format(
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
prefix=p,
|
2019-03-09 04:37:41 +03:00
|
|
|
|
hyphen=HYPHENS, # putting the - first in the [] range avoids having to use a backslash
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
elision=ELISION,
|
2019-02-01 10:05:22 +03:00
|
|
|
|
al=ALPHA_LOWER,
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
)
|
|
|
|
|
for p in _hyphen_prefix
|
|
|
|
|
]
|
2018-12-16 20:04:55 +03:00
|
|
|
|
|
|
|
|
|
# catching cases like entr'abat
|
2018-12-18 15:48:10 +03:00
|
|
|
|
_elision_prefix = ["r?é?entr", "grande?s?", "r"]
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
_regular_exp += [
|
2019-02-04 12:37:19 +03:00
|
|
|
|
"^{prefix}[{elision}][{al}][{hyphen}{al}{elision}]*$".format(
|
|
|
|
|
prefix=p, elision=ELISION, hyphen=HYPHENS, al=ALPHA_LOWER
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
)
|
|
|
|
|
for p in _elision_prefix
|
|
|
|
|
]
|
2018-12-16 20:04:55 +03:00
|
|
|
|
|
|
|
|
|
# catching cases like saut-de-ski, pet-en-l'air
|
2018-12-18 15:48:10 +03:00
|
|
|
|
_hyphen_combination = [
|
|
|
|
|
"l[èe]s?",
|
|
|
|
|
"la",
|
|
|
|
|
"en",
|
|
|
|
|
"des?",
|
|
|
|
|
"d[eu]",
|
|
|
|
|
"sur",
|
|
|
|
|
"sous",
|
|
|
|
|
"aux?",
|
|
|
|
|
"à",
|
|
|
|
|
"et",
|
|
|
|
|
"près",
|
|
|
|
|
"saint",
|
|
|
|
|
]
|
2018-12-16 20:04:55 +03:00
|
|
|
|
_regular_exp += [
|
2019-02-01 10:05:22 +03:00
|
|
|
|
"^[{a}]+[{hyphen}]{hyphen_combo}[{hyphen}](?:l[{elision}])?[{a}]+$".format(
|
|
|
|
|
hyphen_combo=hc, elision=ELISION, hyphen=HYPHENS, a=ALPHA
|
2018-12-16 20:04:55 +03:00
|
|
|
|
)
|
|
|
|
|
for hc in _hyphen_combination
|
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
# URLs
|
2017-05-09 01:00:00 +03:00
|
|
|
|
_regular_exp.append(URL_PATTERN)
|
2017-05-08 16:49:05 +03:00
|
|
|
|
|
|
|
|
|
|
2017-10-31 23:05:29 +03:00
|
|
|
|
TOKENIZER_EXCEPTIONS = _exc
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
TOKEN_MATCH = re.compile(
|
2020-05-21 15:14:01 +03:00
|
|
|
|
"(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp)
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
).match
|