Mirror of https://github.com/explosion/spaCy.git
Commit eddeb36c96
## Description

- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update the flake8 config to exclude very large files (lemmatization tables etc.).
- [x] Update the code to be compatible with the flake8 rules.
- [x] Fix various small bugs, inconsistencies and messy spots in the language data.
- [x] Update the docs to explain the new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on`, and what `# noqa` means).

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results. At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change

enhancement, code style

## Checklist

- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
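For readers unfamiliar with the markers mentioned above: `# fmt: off` / `# fmt: on` fence off a region that `black` should leave untouched, and `# noqa` suppresses flake8 warnings on a single line. A minimal, hypothetical illustration (the `PRONOUN_FEATURES` name is made up; this is not code from this commit):

```python
# "# noqa: F401" silences flake8's F401 "imported but unused" warning
# on this one line, e.g. for names that are only re-exported.
import os  # noqa: F401

# black would normally reflow the aligned dict below onto its own rules;
# the fmt markers tell it to leave this region exactly as written.
# fmt: off
PRONOUN_FEATURES = {
    "je":   {"Person": "One", "Number": "Sing"},
    "tu":   {"Person": "Two", "Number": "Sing"},
}
# fmt: on
```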
293 lines · 6.1 KiB · Python
```python
# coding: utf8
from __future__ import unicode_literals

import regex as re

from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
from .punctuation import ELISION, HYPHENS
from ..tokenizer_exceptions import URL_PATTERN
from ..char_classes import ALPHA_LOWER
from ...symbols import ORTH, LEMMA, TAG


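# Helpers that upper/lowercase only the first character of a string.
# upper_first_letter is used further down to add capitalized variants
# of the infix exceptions.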
def upper_first_letter(text):
    if len(text) == 0:
        return text
    if len(text) == 1:
        return text.upper()
    return text[0].upper() + text[1:]


def lower_first_letter(text):
    if len(text) == 0:
        return text
    if len(text) == 1:
        return text.lower()
    return text[0].lower() + text[1:]


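# Tokenizer exceptions: each surface form maps to the list of tokens it
# should be split into, with per-token ORTH, LEMMA and (optionally) TAG.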
_exc = {"J.-C.": [{LEMMA: "Jésus", ORTH: "J."}, {LEMMA: "Christ", ORTH: "-C."}]}


for exc_data in [
    {LEMMA: "avant", ORTH: "av."},
    {LEMMA: "janvier", ORTH: "janv."},
    {LEMMA: "février", ORTH: "févr."},
    {LEMMA: "avril", ORTH: "avr."},
    {LEMMA: "juillet", ORTH: "juill."},
    {LEMMA: "septembre", ORTH: "sept."},
    {LEMMA: "octobre", ORTH: "oct."},
    {LEMMA: "novembre", ORTH: "nov."},
    {LEMMA: "décembre", ORTH: "déc."},
    {LEMMA: "après", ORTH: "apr."},
    {LEMMA: "docteur", ORTH: "Dr."},
    {LEMMA: "monsieur", ORTH: "M."},
    {LEMMA: "monsieur", ORTH: "Mr."},
    {LEMMA: "madame", ORTH: "Mme."},
    {LEMMA: "mademoiselle", ORTH: "Mlle."},
    {LEMMA: "numéro", ORTH: "n°"},
    {LEMMA: "degrés", ORTH: "d°"},
    {LEMMA: "saint", ORTH: "St."},
    {LEMMA: "sainte", ORTH: "Ste."},
]:
    _exc[exc_data[ORTH]] = [exc_data]


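# Keep the large list of base exceptions (abbreviations, hyphenated
# compounds etc.) as single, unsplit tokens.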
for orth in FR_BASE_EXCEPTIONS + ["etc."]:
    _exc[orth] = [{ORTH: orth}]


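# Subject-verb inversions with the euphonic -t-, e.g. "a-t-elle" or
# "semble-t-il": split into verb + "-t" + pronoun.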
for verb, verb_lemma in [
    ("a", "avoir"),
    ("est", "être"),
    ("semble", "sembler"),
    ("indique", "indiquer"),
    ("moque", "moquer"),
    ("passe", "passer"),
]:
    for orth in [verb, verb.title()]:
        for pronoun in ["elle", "il", "on"]:
            token = "{}-t-{}".format(orth, pronoun)
            _exc[token] = [
                {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"},
                {LEMMA: "t", ORTH: "-t"},
                {LEMMA: pronoun, ORTH: "-" + pronoun},
            ]

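# "est-ce" / "Est-ce": split into the verb and the "-ce" clitic.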
for verb, verb_lemma in [("est", "être")]:
    for orth in [verb, verb.title()]:
        token = "{}-ce".format(orth)
        _exc[token] = [
            {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"},
            {LEMMA: "ce", ORTH: "-ce"},
        ]


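# "qu'est-ce" / "n'est-ce" (and capitalized variants): split into the
# elided particle, the verb and "-ce".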
for pre, pre_lemma in [("qu'", "que"), ("n'", "ne")]:
    for orth in [pre, pre.title()]:
        _exc["%sest-ce" % orth] = [
            {LEMMA: pre_lemma, ORTH: orth, TAG: "ADV"},
            {LEMMA: "être", ORTH: "est", TAG: "VERB"},
            {LEMMA: "ce", ORTH: "-ce"},
        ]


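# Expand the base exceptions with every combination of elision character
# and hyphen variant (ASCII hyphen-minus and U+2010), plus their
# capitalized forms.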
_infixes_exc = []
for elision_char in ELISION:
    for hyphen_char in ["-", "‐"]:
        _infixes_exc += [
            infix.replace("'", elision_char).replace("-", hyphen_char)
            for infix in FR_BASE_EXCEPTIONS
        ]
_infixes_exc += [upper_first_letter(word) for word in _infixes_exc]
_infixes_exc = list(set(_infixes_exc))

for orth in _infixes_exc:
    _exc[orth] = [{ORTH: orth}]


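# Prefixes (as regex fragments) that commonly combine with a hyphen and
# should not cause the compound to be split, e.g. "anti-", "contre-".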
_hyphen_prefix = [
    "a[ée]ro",
    "abat",
    "a[fg]ro",
    "after",
    "am[ée]ricano",
    "anglo",
    "anti",
    "apr[èe]s",
    "arabo",
    "arcs?",
    "archi",
    "arrières?",
    "avant",
    "auto",
    "banc",
    "bas(?:ses?)?",
    "bec?",
    "best",
    "bio?",
    "bien",
    "blanc",
    "bo[îi]te",
    "bois",
    "bou(?:c|rg)",
    "b[êe]ta",
    "cache",
    "cap(?:ello)?",
    "champ",
    "chapelle",
    "ch[âa]teau",
    "cha(?:ud|t)e?s?",
    "chou",
    "chromo",
    "claire?s?",
    "co(?:de|ca)?",
    "compte",
    "contre",
    "cordon",
    "coupe?",
    "court",
    "crash",
    "crise",
    "croche",
    "cross",
    "cyber",
    "côte",
    "demi",
    "di(?:sney)?",
    "d[ée]s?",
    "double",
    "dys",
    "entre",
    "est",
    "ethno",
    "extra",
    "extrême",
    "[ée]co",
    "fil",
    "fort",
    "franco?s?",
    "gallo",
    "gardes?",
    "gastro",
    "grande?",
    "gratte",
    "gr[ée]co",
    "gros",
    "g[ée]o",
    "haute?s?",
    "hyper",
    "indo",
    "infra",
    "inter",
    "intra",
    "islamo",
    "italo",
    "jean",
    "labio",
    "latino",
    "live",
    "lot",
    "louis",
    "m[ai]cro",
    "mesnil",
    "mi(?:ni)?",
    "mono",
    "mont?s?",
    "moyen",
    "multi",
    "m[ée]cano",
    "m[ée]dico",
    "m[ée]do",
    "m[ée]ta",
    "mots?",
    "noix",
    "non",
    "nord",
    "notre",
    "n[ée]o",
    "ouest",
    "outre",
    "ouvre",
    "passe",
    "perce",
    "pharmaco",
    "ph[oy]to",
    "pique",
    "poissons?",
    "ponce",
    "pont",
    "po[rs]t",
    "primo",
    "pro(?:cès|to)?",
    "pare",
    "petite?",
    "porte",
    "pré",
    "prêchi",
    "pseudo",
    "pêle",
    "péri",
    "puy",
    "quasi",
    "recourt",
    "rythmo",
    "r[ée]",
    "r[ée]tro",
    "sans",
    "sainte?s?",
    "semi",
    "social",
    "sous",
    "su[bdr]",
    "super",
    "tire",
    "thermo",
    "tiers",
    "trans",
    "tr(?:i|ou)",
    "t[ée]l[ée]",
    "vi[cd]e",
    "vid[ée]o",
    "vie(?:ux|illes?)",
    "vill(?:e|eneuve|ers|ette|iers|y)",
    "ultra",
    "à",
    "[ée]lectro",
    "[ée]qui",
]

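# Prefixes that combine with an elision apostrophe instead, e.g. "entr'acte".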
_elision_prefix = ["entr", "grande?s?"]
_other_hyphens = "".join([h for h in HYPHENS if h != "-"])

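# Full-token regex patterns: strings matching any of these are kept as
# single tokens instead of being split by the usual punctuation rules.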
_regular_exp = [
    "^droits?[{hyphen}]de[{hyphen}]l'homm[{alpha}]+$".format(
        hyphen=HYPHENS, alpha=ALPHA_LOWER
    ),
    "^zig[{hyphen}]zag[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
    "^prud[{elision}]homm[{alpha}]*$".format(elision=ELISION, alpha=ALPHA_LOWER),
]
_regular_exp += [
    "^{prefix}[{hyphen}][{alpha}][{alpha}{elision}{other_hyphen}\-]*$".format(
        prefix=p,
        hyphen=HYPHENS,
        other_hyphen=_other_hyphens,
        elision=ELISION,
        alpha=ALPHA_LOWER,
    )
    for p in _hyphen_prefix
]
_regular_exp += [
    "^{prefix}[{elision}][{alpha}][{alpha}{elision}{hyphen}\-]*$".format(
        prefix=p, elision=ELISION, hyphen=_other_hyphens, alpha=ALPHA_LOWER
    )
    for p in _elision_prefix
]
_regular_exp.append(URL_PATTERN)


TOKENIZER_EXCEPTIONS = _exc
TOKEN_MATCH = re.compile(
    "|".join("(?:{})".format(m) for m in _regular_exp), re.IGNORECASE
).match
```
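To see the exceptions in action, here is a minimal usage sketch, assuming a spaCy v2.x environment where the blank French pipeline picks up this module's `TOKENIZER_EXCEPTIONS` and `TOKEN_MATCH` via the language defaults (the sample sentence is arbitrary):

```python
import spacy

# Build a blank French pipeline; its tokenizer is configured with the
# exception table and TOKEN_MATCH defined in this module.
nlp = spacy.blank("fr")

# Per the table above, "Qu'est-ce" is split into qu' / est / -ce, and
# the abbreviation "janv." stays one token instead of splitting on ".".
doc = nlp("Qu'est-ce qui se passe en janv. ?")
print([t.text for t in doc])
```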