mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 10:46:29 +03:00
eddeb36c96
<!--- Provide a general summary of your changes in the title. --> ## Description - [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files. - [x] Update flake8 config to exclude very large files (lemmatization tables etc.) - [x] Update code to be compatible with flake8 rules - [x] Fix various small bugs, inconsistencies and messy stuff in the language data - [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means) Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results. At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information. ### Types of change enhancement, code style ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
293 lines
6.1 KiB
Python
293 lines
6.1 KiB
Python
# coding: utf8
|
||
from __future__ import unicode_literals
|
||
|
||
import regex as re
|
||
|
||
from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
|
||
from .punctuation import ELISION, HYPHENS
|
||
from ..tokenizer_exceptions import URL_PATTERN
|
||
from ..char_classes import ALPHA_LOWER
|
||
from ...symbols import ORTH, LEMMA, TAG
|
||
|
||
|
||
def upper_first_letter(text):
    """Return ``text`` with only its first character upper-cased.

    The rest of the string is left exactly as it is (unlike
    ``str.capitalize``, which would lower-case it). An empty string is
    returned unchanged.
    """
    if len(text) == 0:
        return text
    # For a one-character string the tail is empty, so a single expression
    # covers both the one-character and the longer case.
    head, tail = text[0], text[1:]
    return head.upper() + tail
|
||
|
||
|
||
def lower_first_letter(text):
    """Return ``text`` with only its first character lower-cased.

    The rest of the string is left exactly as it is. An empty string is
    returned unchanged.
    """
    if len(text) == 0:
        return text
    # For a one-character string the tail is empty, so a single expression
    # covers both the one-character and the longer case.
    head, tail = text[0], text[1:]
    return head.lower() + tail
|
||
|
||
|
||
# Tokenizer exceptions, keyed by the exact surface string. Each value is the
# list of token attribute dicts that the string is analysed as.
# "J.-C." is the one seed entry that is split into two tokens.
_exc = {
    "J.-C.": [
        {LEMMA: "Jésus", ORTH: "J."},
        {LEMMA: "Christ", ORTH: "-C."},
    ]
}

# Abbreviations mapped to a single token, as (surface form, lemma) pairs.
_abbreviations = [
    ("av.", "avant"),
    ("janv.", "janvier"),
    ("févr.", "février"),
    ("avr.", "avril"),
    ("juill.", "juillet"),
    ("sept.", "septembre"),
    ("oct.", "octobre"),
    ("nov.", "novembre"),
    ("déc.", "décembre"),
    ("apr.", "après"),
    ("Dr.", "docteur"),
    ("M.", "monsieur"),
    ("Mr.", "monsieur"),
    ("Mme.", "madame"),
    ("Mlle.", "mademoiselle"),
    ("n°", "numéro"),
    ("d°", "degrés"),
    ("St.", "saint"),
    ("Ste.", "sainte"),
]
for abbrev, abbrev_lemma in _abbreviations:
    _exc[abbrev] = [{LEMMA: abbrev_lemma, ORTH: abbrev}]

# The base exception list (plus "etc.") carries no extra attributes: each
# form simply maps to one token whose ORTH is the form itself.
for base_form in FR_BASE_EXCEPTIONS + ["etc."]:
    _exc[base_form] = [{ORTH: base_form}]
|
||
|
||
|
||
# Inverted verb + "-t-" + subject pronoun (e.g. "a-t-il", "Semble-t-elle"),
# split into three tokens: the verb, "-t" and the hyphenated pronoun.
# Both the lower-case and the title-cased verb form are registered.
_inversion_verbs = (
    ("a", "avoir"),
    ("est", "être"),
    ("semble", "sembler"),
    ("indique", "indiquer"),
    ("moque", "moquer"),
    ("passe", "passer"),
)
for verb, verb_lemma in _inversion_verbs:
    for verb_form in (verb, verb.title()):
        for pronoun in ("elle", "il", "on"):
            _exc["{}-t-{}".format(verb_form, pronoun)] = [
                {LEMMA: verb_lemma, ORTH: verb_form, TAG: "VERB"},
                {LEMMA: "t", ORTH: "-t"},
                {LEMMA: pronoun, ORTH: "-" + pronoun},
            ]

# "est-ce" / "Est-ce": the verb followed by the hyphenated "-ce".
for verb_form in ("est", "Est"):
    _exc["{}-ce".format(verb_form)] = [
        {LEMMA: "être", ORTH: verb_form, TAG: "VERB"},
        {LEMMA: "ce", ORTH: "-ce"},
    ]

# "qu'est-ce" / "n'est-ce" (and their title-cased variants): the elided
# word, then "est", then "-ce".
for elided, elided_lemma in (("qu'", "que"), ("n'", "ne")):
    for elided_form in (elided, elided.title()):
        _exc["{}est-ce".format(elided_form)] = [
            {LEMMA: elided_lemma, ORTH: elided_form, TAG: "ADV"},
            {LEMMA: "être", ORTH: "est", TAG: "VERB"},
            {LEMMA: "ce", ORTH: "-ce"},
        ]
|
||
|
||
|
||
# Spelling variants of the base exceptions: every combination of elision
# character (substituted for "'") and hyphen character (substituted for "-").
_infix_variants = [
    infix.replace("'", elision_char).replace("-", hyphen_char)
    for elision_char in ELISION
    for hyphen_char in ["-", "‐"]
    for infix in FR_BASE_EXCEPTIONS
]
# Also register each variant with its first letter upper-cased, then drop
# the duplicates that the substitutions and capitalisation produce.
_infix_capitalized = [upper_first_letter(word) for word in _infix_variants]
_infixes_exc = list(set(_infix_variants + _infix_capitalized))

# Each variant is a single-token exception.
for variant in _infixes_exc:
    _exc[variant] = [{ORTH: variant}]
|
||
|
||
|
||
_hyphen_prefix = [
|
||
"a[ée]ro",
|
||
"abat",
|
||
"a[fg]ro",
|
||
"after",
|
||
"am[ée]ricano",
|
||
"anglo",
|
||
"anti",
|
||
"apr[èe]s",
|
||
"arabo",
|
||
"arcs?",
|
||
"archi",
|
||
"arrières?",
|
||
"avant",
|
||
"auto",
|
||
"banc",
|
||
"bas(?:ses?)?",
|
||
"bec?",
|
||
"best",
|
||
"bio?",
|
||
"bien",
|
||
"blanc",
|
||
"bo[îi]te",
|
||
"bois",
|
||
"bou(?:c|rg)",
|
||
"b[êe]ta",
|
||
"cache",
|
||
"cap(?:ello)?",
|
||
"champ",
|
||
"chapelle",
|
||
"ch[âa]teau",
|
||
"cha(?:ud|t)e?s?",
|
||
"chou",
|
||
"chromo",
|
||
"claire?s?",
|
||
"co(?:de|ca)?",
|
||
"compte",
|
||
"contre",
|
||
"cordon",
|
||
"coupe?",
|
||
"court",
|
||
"crash",
|
||
"crise",
|
||
"croche",
|
||
"cross",
|
||
"cyber",
|
||
"côte",
|
||
"demi",
|
||
"di(?:sney)?",
|
||
"d[ée]s?",
|
||
"double",
|
||
"dys",
|
||
"entre",
|
||
"est",
|
||
"ethno",
|
||
"extra",
|
||
"extrême",
|
||
"[ée]co",
|
||
"fil",
|
||
"fort",
|
||
"franco?s?",
|
||
"gallo",
|
||
"gardes?",
|
||
"gastro",
|
||
"grande?",
|
||
"gratte",
|
||
"gr[ée]co",
|
||
"gros",
|
||
"g[ée]o",
|
||
"haute?s?",
|
||
"hyper",
|
||
"indo",
|
||
"infra",
|
||
"inter",
|
||
"intra",
|
||
"islamo",
|
||
"italo",
|
||
"jean",
|
||
"labio",
|
||
"latino",
|
||
"live",
|
||
"lot",
|
||
"louis",
|
||
"m[ai]cro",
|
||
"mesnil",
|
||
"mi(?:ni)?",
|
||
"mono",
|
||
"mont?s?",
|
||
"moyen",
|
||
"multi",
|
||
"m[ée]cano",
|
||
"m[ée]dico",
|
||
"m[ée]do",
|
||
"m[ée]ta",
|
||
"mots?",
|
||
"noix",
|
||
"non",
|
||
"nord",
|
||
"notre",
|
||
"n[ée]o",
|
||
"ouest",
|
||
"outre",
|
||
"ouvre",
|
||
"passe",
|
||
"perce",
|
||
"pharmaco",
|
||
"ph[oy]to",
|
||
"pique",
|
||
"poissons?",
|
||
"ponce",
|
||
"pont",
|
||
"po[rs]t",
|
||
"primo",
|
||
"pro(?:cès|to)?",
|
||
"pare",
|
||
"petite?",
|
||
"porte",
|
||
"pré",
|
||
"prêchi",
|
||
"pseudo",
|
||
"pêle",
|
||
"péri",
|
||
"puy",
|
||
"quasi",
|
||
"recourt",
|
||
"rythmo",
|
||
"r[ée]",
|
||
"r[ée]tro",
|
||
"sans",
|
||
"sainte?s?",
|
||
"semi",
|
||
"social",
|
||
"sous",
|
||
"su[bdr]",
|
||
"super",
|
||
"tire",
|
||
"thermo",
|
||
"tiers",
|
||
"trans",
|
||
"tr(?:i|ou)",
|
||
"t[ée]l[ée]",
|
||
"vi[cd]e",
|
||
"vid[ée]o",
|
||
"vie(?:ux|illes?)",
|
||
"vill(?:e|eneuve|ers|ette|iers|y)",
|
||
"ultra",
|
||
"à",
|
||
"[ée]lectro",
|
||
"[ée]qui",
|
||
]
|
||
|
||
# Regular-expression fragments for prefixes that are joined to the rest of
# the word by an elision character instead of a hyphen.
_elision_prefix = ["entr", "grande?s?"]
# Every character of HYPHENS except the ASCII "-". The ASCII hyphen is added
# to the character classes below as an explicitly escaped, trailing "\-", so
# it is always a literal hyphen rather than a range operator.
_other_hyphens = "".join([h for h in HYPHENS if h != "-"])

# Anchored patterns for strings that must be kept as one token. They are
# later OR-ed together into a single expression for TOKEN_MATCH.
_regular_exp = [
    "^droits?[{hyphen}]de[{hyphen}]l'homm[{alpha}]+$".format(
        hyphen=HYPHENS, alpha=ALPHA_LOWER
    ),
    "^zig[{hyphen}]zag[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
    "^prud[{elision}]homm[{alpha}]*$".format(elision=ELISION, alpha=ALPHA_LOWER),
]
# <prefix><hyphen><letter> followed by any mix of letters, elision characters
# and hyphens. Raw strings keep "\-" from being an invalid string escape
# (a warning on Python 3.6+); the resulting pattern text is unchanged.
_regular_exp += [
    r"^{prefix}[{hyphen}][{alpha}][{alpha}{elision}{other_hyphen}\-]*$".format(
        prefix=p,
        hyphen=HYPHENS,
        other_hyphen=_other_hyphens,
        elision=ELISION,
        alpha=ALPHA_LOWER,
    )
    for p in _hyphen_prefix
]
# <prefix><elision><letter> followed by any mix of letters, elision characters
# and hyphens. The {elision} placeholder must be filled with ELISION: filling
# it with HYPHENS (as was done previously) made these patterns match a hyphen
# after the prefix, so elided forms were never recognised.
_regular_exp += [
    r"^{prefix}[{elision}][{alpha}][{alpha}{elision}{hyphen}\-]*$".format(
        prefix=p, elision=ELISION, hyphen=_other_hyphens, alpha=ALPHA_LOWER
    )
    for p in _elision_prefix
]
_regular_exp.append(URL_PATTERN)
|
||
|
||
|
||
# Public names exported by this module.
TOKENIZER_EXCEPTIONS = _exc
# Wrap every pattern in a non-capturing group, OR them together and compile
# the result case-insensitively. TOKEN_MATCH is the compiled pattern's bound
# ``match`` method, so it can be called directly on a string.
TOKEN_MATCH = re.compile(
    "|".join(["(?:{})".format(pattern) for pattern in _regular_exp]),
    flags=re.IGNORECASE,
).match
|