# spaCy/spacy/lang/fr/tokenizer_exceptions.py
# coding: utf8
from __future__ import unicode_literals
import regex as re
from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
from .punctuation import ELISION, HYPHENS
from ..tokenizer_exceptions import URL_PATTERN
from ..char_classes import ALPHA_LOWER
from ...symbols import ORTH, LEMMA, TAG
def upper_first_letter(text):
    # Capitalise only the first character, leaving the rest untouched.
    if len(text) == 0:
        return text
    if len(text) == 1:
        return text.upper()
    return text[0].upper() + text[1:]


def lower_first_letter(text):
    # Lowercase only the first character, leaving the rest untouched.
    if len(text) == 0:
        return text
    if len(text) == 1:
        return text.lower()
    return text[0].lower() + text[1:]
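# Exceptions map an exact orthographic string to the list of tokens it should
# produce; "J.-C." (Jésus-Christ) becomes two tokens.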
_exc = {"J.-C.": [{LEMMA: "Jésus", ORTH: "J."}, {LEMMA: "Christ", ORTH: "-C."}]}
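# Common abbreviations: each is kept as a single token whose lemma is the
# expanded form.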
for exc_data in [
    {LEMMA: "avant", ORTH: "av."},
    {LEMMA: "janvier", ORTH: "janv."},
    {LEMMA: "février", ORTH: "févr."},
    {LEMMA: "avril", ORTH: "avr."},
    {LEMMA: "juillet", ORTH: "juill."},
    {LEMMA: "septembre", ORTH: "sept."},
    {LEMMA: "octobre", ORTH: "oct."},
    {LEMMA: "novembre", ORTH: "nov."},
    {LEMMA: "décembre", ORTH: "déc."},
    {LEMMA: "après", ORTH: "apr."},
    {LEMMA: "docteur", ORTH: "Dr."},
    {LEMMA: "monsieur", ORTH: "M."},
    {LEMMA: "monsieur", ORTH: "Mr."},
    {LEMMA: "madame", ORTH: "Mme."},
    {LEMMA: "mademoiselle", ORTH: "Mlle."},
    # "n°" (numéro) and "d°" (degrés) keep the degree sign in the token.
    {LEMMA: "numéro", ORTH: "n°"},
    {LEMMA: "degrés", ORTH: "d°"},
    {LEMMA: "saint", ORTH: "St."},
    {LEMMA: "sainte", ORTH: "Ste."},
]:
    _exc[exc_data[ORTH]] = [exc_data]
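# All generated base exceptions (plus "etc.") are kept as single tokens.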
for orth in FR_BASE_EXCEPTIONS + ["etc."]:
    _exc[orth] = [{ORTH: orth}]
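# Inverted question forms with the euphonic "t" ("a-t-il", "semble-t-elle",
# "passe-t-on", ...) split into verb, "-t" and pronoun.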
for verb, verb_lemma in [
    ("a", "avoir"),
    ("est", "être"),
    ("semble", "sembler"),
    ("indique", "indiquer"),
    ("moque", "moquer"),
    ("passe", "passer"),
]:
    for orth in [verb, verb.title()]:
        for pronoun in ["elle", "il", "on"]:
            token = "{}-t-{}".format(orth, pronoun)
            _exc[token] = [
                {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"},
                {LEMMA: "t", ORTH: "-t"},
                {LEMMA: pronoun, ORTH: "-" + pronoun},
            ]
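# "est-ce" and "Est-ce" split into the verb and "-ce".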
for verb, verb_lemma in [("est", "être")]:
    for orth in [verb, verb.title()]:
        token = "{}-ce".format(orth)
        _exc[token] = [
            {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"},
            {LEMMA: "ce", ORTH: "-ce"},
        ]
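# "qu'est-ce" / "n'est-ce" and their capitalised variants split into the elided
# particle, the verb "est" and "-ce".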
for pre, pre_lemma in [("qu'", "que"), ("n'", "ne")]:
    for orth in [pre, pre.title()]:
        _exc["%sest-ce" % orth] = [
            {LEMMA: pre_lemma, ORTH: orth, TAG: "ADV"},
            {LEMMA: "être", ORTH: "est", TAG: "VERB"},
            {LEMMA: "ce", ORTH: "-ce"},
        ]
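# Build spelling variants of the base exceptions: substitute every elision
# character (apostrophe variants) and hyphen variant, then add forms with the
# first letter capitalised.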
_infixes_exc = []
for elision_char in ELISION:
    # Substitute both the ASCII hyphen-minus and the Unicode hyphen (U+2010).
    for hyphen_char in ["-", "‐"]:
        _infixes_exc += [
            infix.replace("'", elision_char).replace("-", hyphen_char)
            for infix in FR_BASE_EXCEPTIONS
        ]
_infixes_exc += [upper_first_letter(word) for word in _infixes_exc]
_infixes_exc = list(set(_infixes_exc))
for orth in _infixes_exc:
    _exc[orth] = [{ORTH: orth}]
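# Prefixes that commonly attach with a hyphen ("anti-", "après-", ...), written
# as regex fragments; they feed the compound-word patterns built below.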
_hyphen_prefix = [
    "a[ée]ro",
    "abat",
    "a[fg]ro",
    "after",
    "am[ée]ricano",
    "anglo",
    "anti",
    "apr[èe]s",
    "arabo",
    "arcs?",
    "archi",
    "arrières?",
    "avant",
    "auto",
    "banc",
    "bas(?:ses?)?",
    "bec?",
    "best",
    "bio?",
    "bien",
    "blanc",
    "bo[îi]te",
    "bois",
    "bou(?:c|rg)",
    "b[êe]ta",
    "cache",
    "cap(?:ello)?",
    "champ",
    "chapelle",
    "ch[âa]teau",
    "cha(?:ud|t)e?s?",
    "chou",
    "chromo",
    "claire?s?",
    "co(?:de|ca)?",
    "compte",
    "contre",
    "cordon",
    "coupe?",
    "court",
    "crash",
    "crise",
    "croche",
    "cross",
    "cyber",
    "côte",
    "demi",
    "di(?:sney)?",
    "d[ée]s?",
    "double",
    "dys",
    "entre",
    "est",
    "ethno",
    "extra",
    "extrême",
    "[ée]co",
    "fil",
    "fort",
    "franco?s?",
    "gallo",
    "gardes?",
    "gastro",
    "grande?",
    "gratte",
    "gr[ée]co",
    "gros",
    "g[ée]o",
    "haute?s?",
    "hyper",
    "indo",
    "infra",
    "inter",
    "intra",
    "islamo",
    "italo",
    "jean",
    "labio",
    "latino",
    "live",
    "lot",
    "louis",
    "m[ai]cro",
    "mesnil",
    "mi(?:ni)?",
    "mono",
    "mont?s?",
    "moyen",
    "multi",
    "m[ée]cano",
    "m[ée]dico",
    "m[ée]do",
    "m[ée]ta",
    "mots?",
    "noix",
    "non",
    "nord",
    "notre",
    "n[ée]o",
    "ouest",
    "outre",
    "ouvre",
    "passe",
    "perce",
    "pharmaco",
    "ph[oy]to",
    "pique",
    "poissons?",
    "ponce",
    "pont",
    "po[rs]t",
    "primo",
    "pro(?:cès|to)?",
    "pare",
    "petite?",
    "porte",
    "pré",
    "prêchi",
    "pseudo",
    "pêle",
    "péri",
    "puy",
    "quasi",
    "recourt",
    "rythmo",
    "r[ée]",
    "r[ée]tro",
    "sans",
    "sainte?s?",
    "semi",
    "social",
    "sous",
    "su[bdr]",
    "super",
    "tire",
    "thermo",
    "tiers",
    "trans",
    "tr(?:i|ou)",
    "t[ée]l[ée]",
    "vi[cd]e",
    "vid[ée]o",
    "vie(?:ux|illes?)",
    "vill(?:e|eneuve|ers|ette|iers|y)",
    "ultra",
    "à",
    "[ée]lectro",
    "[ée]qui",
]
_elision_prefix = ["entr", "grande?s?"]
_other_hyphens = "".join([h for h in HYPHENS if h != "-"])
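# Regex patterns for whole tokens that must not be split by the usual infix
# rules; they are matched case-insensitively via TOKEN_MATCH at the bottom of
# the file.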
_regular_exp = [
    "^droits?[{hyphen}]de[{hyphen}]l'homm[{alpha}]+$".format(
        hyphen=HYPHENS, alpha=ALPHA_LOWER
    ),
    "^zig[{hyphen}]zag[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
    "^prud[{elision}]homm[{alpha}]*$".format(elision=ELISION, alpha=ALPHA_LOWER),
]
_regular_exp += [
    r"^{prefix}[{hyphen}][{alpha}][{alpha}{elision}{other_hyphen}\-]*$".format(
        prefix=p,
        hyphen=HYPHENS,
        other_hyphen=_other_hyphens,
        elision=ELISION,
        alpha=ALPHA_LOWER,
    )
    for p in _hyphen_prefix
]
_regular_exp += [
    # Elision prefixes are joined by an elision character (not a hyphen), so
    # the character class after the prefix uses ELISION.
    r"^{prefix}[{elision}][{alpha}][{alpha}{elision}{hyphen}\-]*$".format(
        prefix=p, elision=ELISION, hyphen=_other_hyphens, alpha=ALPHA_LOWER
    )
    for p in _elision_prefix
]
_regular_exp.append(URL_PATTERN)
TOKENIZER_EXCEPTIONS = _exc
TOKEN_MATCH = re.compile(
    "|".join("(?:{})".format(m) for m in _regular_exp), re.IGNORECASE
).match
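# Usage sketch (illustration only, not part of the module): in a spaCy v2-style
# setup the French Defaults wire TOKENIZER_EXCEPTIONS and TOKEN_MATCH into the
# tokenizer, so a blank French pipeline already applies these rules:
#
#     import spacy
#     nlp = spacy.blank("fr")
#     doc = nlp("Qu'est-ce ? A-t-elle vécu apr. J.-C. ?")
#     print([t.text for t in doc])
#     # "Qu'est-ce" -> ["Qu'", "est", "-ce"]; "A-t-elle" -> ["A", "-t", "-elle"];
#     # "apr." stays one token and "J.-C." splits into ["J.", "-C."].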