Merge aee667a781 into b3c46c315e

2025-09-18 10:02:40 +03:00 · 2025-02-04 23:49:59 +01:00 · 2025-02-04 23:49:59 +01:00 · 3e577b358e
commit 3e577b358e
parent b3c46c315e aee667a781
5 changed files with 125 additions and 16097 deletions
--- a/spacy/lang/fr/__init__.py
+++ b/spacy/lang/fr/__init__.py
@ -8,7 +8,7 @@ from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
-from .tokenizer_exceptions import TOKEN_MATCH, TOKENIZER_EXCEPTIONS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


 class FrenchDefaults(BaseDefaults):
@ -16,7 +16,6 @@ class FrenchDefaults(BaseDefaults):
    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES
-    token_match = TOKEN_MATCH
    lex_attr_getters = LEX_ATTRS
    syntax_iterators = SYNTAX_ITERATORS
    stop_words = STOP_WORDS
@ -47,7 +46,12 @@ def make_lemmatizer(
    scorer: Optional[Callable],
 ):
    return FrenchLemmatizer(
-        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+        nlp.vocab,
+        model,
+        name,
+        mode=mode,
+        overwrite=overwrite,
+        scorer=scorer,
    )


--- a/spacy/lang/fr/_tokenizer_exceptions_list.py
+++ b/spacy/lang/fr/_tokenizer_exceptions_list.py
--- a/spacy/lang/fr/examples.py
+++ b/spacy/lang/fr/examples.py
@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
    "Apple cherche à acheter une start-up anglaise pour 1 milliard de dollars",
    "Les voitures autonomes déplacent la responsabilité de l'assurance vers les constructeurs",
@ -19,4 +18,18 @@ sentences = [
    "Qui est le président de la France ?",
    "Où est la capitale des États-Unis ?",
    "Quand est né Barack Obama ?",
+    "Où va-t'on?",
+    "Je ne sais pas mais on y va depuis le 2023-12-21.",
+    "Qu'en est-t-il des autres?",
+    "Sont-iels à Villar-le-ruisseau?",
+    "Et les non-humain-es?",
+    "Et le produit anti-nominaliste?",
+    "T'en as? Tu m'en donnnes?",
+    "Sinon mets-en un peu par terre.",
+    "il n'y a plus rien ici, enfin j'crois, nos p'tites affaires ont été enl'vées.",
+    "aujourd'hui, c'est comme ça.",
+    "un.e directeur.ice, des employé.es.",
+    "des non-humain-es étaient là aussi, visiblement heureux·ses.",
+    "j'ai trouvé ça surhttps://site_inexistant.fr/accueil#milieu ou  www.quelque_part.com/ je pense.",
+    "ou alors le 21/12 oui c'est ça c'était le 21/12/2023... ou alors le 12.02.2005",
 ]
--- a/spacy/lang/fr/punctuation.py
+++ b/spacy/lang/fr/punctuation.py
@ -5,31 +5,83 @@ from ..char_classes import (
    CONCAT_QUOTES,
    CURRENCY,
    LIST_ELLIPSES,
+    LIST_ICONS,
    LIST_PUNCT,
    LIST_QUOTES,
    UNITS,
-    merge_chars,
 )
-from ..punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES

-ELISION = "' ’".replace(" ", "")
+# Characters accepted as an elision mark. They are written space-separated for
+# readability; the spaces are stripped so the string can be used directly
+# inside a regex character class.
+ELISION = " ' ` ´ ’ ".replace(" ", "")
 HYPHENS = r"- – — ‐ ‑".replace(" ", "")
-_prefixes_elision = "d l n"
-_prefixes_elision += " " + _prefixes_elision.upper()
-_hyphen_suffixes = "ce clés elle en il ils je là moi nous on t vous"
-_hyphen_suffixes += " " + _hyphen_suffixes.upper()

-
-_prefixes = TOKENIZER_PREFIXES + [
-    r"(?:({pe})[{el}])(?=[{a}])".format(
-        a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
-    )
+# fmt: off
+# Regex fragments for the words that may follow the hyphen in a verb-subject
+# inversion. They are joined into one alternation when the inversion pattern
+# is built.
+_suffix_inversion = [
+    "je", "tu", "on", "il", "elle", "iel",
+    "nous", "vous", "elles", "ils", "iels",
+    "moi", "toi", "lui", "leur", "eux", "elleux",
+    "ce", "ici", "là",
+    # la/le/les/en only match when no further hyphen follows, to avoid
+    # splitting hyphenated names such as "Villar-le-bois"
+    fr"la(?![{HYPHENS}])",
+    fr"le(?![{HYPHENS}])",
+    fr"les(?![{HYPHENS}])",
+    fr"en(?![{HYPHENS}])", "y",
+    # "t", optionally followed by a hyphen and/or an elision character:
+    # a-t-on, a-t'on
+    fr"t[{HYPHENS}]??[{ELISION}]?",
+    # "m", optionally followed by an elision character
+    # (presumably for forms like "donne-m'en" -- TODO confirm)
+    fr"m[{ELISION}]?",
 ]
+# Regex fragments for the words that can be elided in front of the next word.
+# The elision character itself is appended when the elision pattern is built.
+_prefix_elision = [
+    "n", "s", "c", "d", "j", "m", "t", "l", "qu",
+    # "quelqu" is not split off when it is followed by an elision character and
+    # "un" (plus any trailing "e"/"x" characters): "quelqu'un" is one token, not
+    # quelque + un, which would lose the idea of 'one person'.
+    fr"quelqu(?![{ELISION}]un[ex]*\b)",
+    "jusqu", "presqu", "lorsqu", "puisqu", "quoiqu",
+]
+# fmt: on

-_suffixes = (
+
+def upperandtitle(a):
+    r"""Expand every entry into its lowercase, Titlecase and UPPERCASE forms.
+
+    [alors, on] -> [alors, Alors, ALORS, on, On, ON]
+
+    Entries may be regex fragments, so backslash escapes are left untouched
+    when upper-casing: a plain ``str.upper`` would turn ``\b`` (word boundary)
+    into ``\B`` (non-boundary) and silently invert the pattern. Everything
+    else, including the contents of character classes, is upper-cased.
+
+    a: iterable of non-empty strings (plain words or regex fragments).
+    Returns a flat list in input order. The UPPERCASE form is omitted when it
+    equals the Titlecase form (e.g. single-letter entries such as "y").
+    """
+    import re  # local import keeps this helper self-contained
+
+    # Either a backslash escape (kept verbatim) or a run of ordinary characters.
+    chunks = re.compile(r"\\.?|[^\\]+", re.DOTALL)
+
+    def upper_keep_escapes(pattern):
+        return "".join(
+            part if part.startswith("\\") else part.upper()
+            for part in chunks.findall(pattern)
+        )
+
+    def variants(i):
+        t = i[0].upper() + i[1:]
+        u = upper_keep_escapes(i)
+        return [i, t] if t == u else [i, t, u]
+
+    return [x for i in a for x in variants(i)]
+
+
+# Elision prefix: at a word boundary, one of the elidable words (in any of the
+# case variants produced by upperandtitle) immediately followed by one elision
+# character.
+_elision = rf"(?:\b(?:{r'|'.join(upperandtitle(_prefix_elision))})[{ELISION}])"
+# Inversion infix: a hyphen that comes right after a non-digit word character
+# and is followed by one of the inversion suffixes as a whole word. The match
+# covers both the hyphen and the suffix.
+_inversion = (
+    rf"(?:(?<=[^\W\d])[{HYPHENS}]\b(?:{r'|'.join(upperandtitle(_suffix_inversion))})\b)"
+)
+
+# The prefix rules consist of the elision pattern only.
+TOKENIZER_PREFIXES = [_elision]
+
+# Infix rules: ellipses, icons and a few punctuation splits, without a generic
+# hyphen split, plus a hyphen rule restricted to verb-subject inversion.
+TOKENIZER_INFIXES = (
+    # base list without hyphen regex
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        # period between a lowercase letter (or quote) and an uppercase letter (or quote)
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+        ),
+        # The default rule is r"(?<=[0-9])[+\-\*^](?=[0-9-])"; the "\-" is
+        # dropped here so that a date such as "2024-12-20" stays one token.
+        r"(?<=[0-9])[+\*^](?=[0-9-])",
+        # comma between two letters, or between a digit and a letter (either order)
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=\d),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}]),(?=[\d])".format(a=ALPHA),
+        # ":", "<", ">", "=" or "/" between a letter or digit and a letter
+        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+    ]
+    # conditional hyphen: only split on a hyphen for verb-subject inversion
+    + [_inversion]
+)
+
+TOKENIZER_SUFFIXES = (
+    # base list, minus the hyphen rules and English-specific things such as "'s"
    LIST_PUNCT
    + LIST_ELLIPSES
    + LIST_QUOTES
+    + LIST_ICONS
    + [
        r"(?<=[0-9])\+",
        r"(?<=°[FfCcKk])\.",  # °C. -> ["°C", "."]
@ -40,17 +92,5 @@ _suffixes = (
            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
        ),
        r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
-        r"(?<=[{a}])[{h}]({hs})".format(
-            a=ALPHA, h=HYPHENS, hs=merge_chars(_hyphen_suffixes)
-        ),
    ]
 )
-
-_infixes = TOKENIZER_INFIXES + [
-    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
-]
-
-
-TOKENIZER_PREFIXES = _prefixes
-TOKENIZER_SUFFIXES = _suffixes
-TOKENIZER_INFIXES = _infixes
--- a/spacy/lang/fr/tokenizer_exceptions.py
+++ b/spacy/lang/fr/tokenizer_exceptions.py
@ -1,443 +1,44 @@
-import re
-
-from ...symbols import ORTH
+from ...symbols import NORM, ORTH
 from ...util import update_exc
-from ..char_classes import ALPHA, ALPHA_LOWER
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from .punctuation import ELISION, HYPHENS
-
-# not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer
-# from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
-FR_BASE_EXCEPTIONS = ["aujourd'hui", "Aujourd'hui"]
-
-
-def upper_first_letter(text):
-    if len(text) == 0:
-        return text
-    if len(text) == 1:
-        return text.upper()
-    return text[0].upper() + text[1:]
-
-
-def lower_first_letter(text):
-    if len(text) == 0:
-        return text
-    if len(text) == 1:
-        return text.lower()
-    return text[0].lower() + text[1:]
-
-
-_exc = {"J.-C.": [{ORTH: "J."}, {ORTH: "-C."}]}
-
-
-for exc_data in [
-    {ORTH: "av."},
-    {ORTH: "janv."},
-    {ORTH: "févr."},
-    {ORTH: "avr."},
-    {ORTH: "juill."},
-    {ORTH: "sept."},
-    {ORTH: "oct."},
-    {ORTH: "nov."},
-    {ORTH: "déc."},
-    {ORTH: "apr."},
-    {ORTH: "Dr."},
-    {ORTH: "M."},
-    {ORTH: "Mr."},
-    {ORTH: "Mme."},
-    {ORTH: "Mlle."},
-    {ORTH: "n°"},
-    {ORTH: "d°"},
-    {ORTH: "St."},
-    {ORTH: "Ste."},
-]:
-    _exc[exc_data[ORTH]] = [exc_data]
-
-
-for orth in [
-    "après-midi",
-    "au-delà",
-    "au-dessus",
-    "celle-ci",
-    "celles-ci",
-    "celui-ci",
-    "cf.",
-    "ci-dessous",
-    "elle-même",
-    "en-dessous",
-    "etc.",
-    "jusque-là",
-    "lui-même",
-    "MM.",
-    "No.",
-    "peut-être",
-    "pp.",
-    "quelques-uns",
-    "rendez-vous",
-    "Vol.",
-]:
-    _exc[orth] = [{ORTH: orth}]
-
-
-for verb in [
-    "a",
-    "est",
-    "semble",
-    "indique",
-    "moque",
-    "passe",
-]:
-    for orth in [verb, verb.title()]:
-        for pronoun in ["elle", "il", "on"]:
-            token = f"{orth}-t-{pronoun}"
-            _exc[token] = [{ORTH: orth}, {ORTH: "-t"}, {ORTH: "-" + pronoun}]
-
-for verb in ["est"]:
-    for orth in [verb, verb.title()]:
-        _exc[f"{orth}-ce"] = [{ORTH: orth}, {ORTH: "-ce"}]
-
-
-for pre in ["qu'", "n'"]:
-    for orth in [pre, pre.title()]:
-        _exc[f"{orth}est-ce"] = [{ORTH: orth}, {ORTH: "est"}, {ORTH: "-ce"}]
-
-
-for verb, pronoun in [("est", "il"), ("EST", "IL")]:
-    _exc[f"{verb}-{pronoun}"] = [{ORTH: verb}, {ORTH: "-" + pronoun}]
-
-
-for s, verb, pronoun in [("s", "est", "il"), ("S", "EST", "IL")]:
-    _exc[f"{s}'{verb}-{pronoun}"] = [
-        {ORTH: s + "'"},
-        {ORTH: verb},
-        {ORTH: "-" + pronoun},
-    ]
-
-
-_infixes_exc = []  # type: ignore[var-annotated]
-orig_elision = "'"
-orig_hyphen = "-"
-
-# loop through the elison and hyphen characters, and try to substitute the ones that weren't used in the original list
-for infix in FR_BASE_EXCEPTIONS:
-    variants_infix = {infix}
-    for elision_char in [x for x in ELISION if x != orig_elision]:
-        variants_infix.update(
-            [word.replace(orig_elision, elision_char) for word in variants_infix]
-        )
-    for hyphen_char in [x for x in ["-", "‐"] if x != orig_hyphen]:
-        variants_infix.update(
-            [word.replace(orig_hyphen, hyphen_char) for word in variants_infix]
-        )
-    variants_infix.update([upper_first_letter(word) for word in variants_infix])
-    _infixes_exc.extend(variants_infix)
-
-for orth in _infixes_exc:
-    _exc[orth] = [{ORTH: orth}]
-
-
-_hyphen_prefix = [
-    "a[ée]ro",
-    "abat",
-    "a[fg]ro",
-    "after",
-    "aigues?",
-    "am[ée]ricano",
-    "anglo",
-    "anti",
-    "apr[èe]s",
-    "arabo",
-    "arcs?",
-    "archi",
-    "arrières?",
-    "audio",
-    "avant",
-    "avion",
-    "auto",
-    "banc",
-    "bas(?:ses?)?",
-    "bateaux?",
-    "bec?",
-    "belles?",
-    "beau",
-    "best",
-    "bio?",
-    "bien",
-    "blanc",
-    "bo[îi]te",
-    "bonn?e?s?",
-    "bois",
-    "bou(?:c|rg)",
-    "b[êe]ta",
-    "cache",
-    "cap(?:ello)?",
-    "casse",
-    "castel",
-    "champ",
-    "chapelle",
-    "ch[âa]teau(?:neuf)?",
-    "chasse",
-    "cha(?:ud|t)e?s?",
-    "chauffe",
-    "chou",
-    "chromo",
-    "claire?s?",
-    "co(?:de|ca)?",
-    "compte",
-    "contre",
-    "cordon",
-    "coupe?",
-    "courte?s?",
-    "couvre",
-    "crash",
-    "crise",
-    "croche",
-    "cross",
-    "cyber",
-    "côte",
-    "demi",
-    "di(?:sney)?",
-    "dix",
-    "d[ée]s?",
-    "dys",
-    "ex?",
-    "émirato",
-    "entre",
-    "est",
-    "ethno",
-    "ex",
-    "extra",
-    "extrême",
-    "[ée]co",
-    "faux",
-    "fil",
-    "fort",
-    "franco?s?",
-    "gallo",
-    "gardes?",
-    "gastro",
-    "grande?",
-    "gratte",
-    "gr[ée]co",
-    "gros",
-    "g[ée]o",
-    "haute?s?",
-    "homm?es?",
-    "hors",
-    "hyper",
-    "indo",
-    "infra",
-    "inter",
-    "intra",
-    "islamo",
-    "italo",
-    "jean",
-    "labio",
-    "latino",
-    "live",
-    "lot",
-    "louis",
-    "m[ai]cro",
-    "mal",
-    "médio",
-    "mesnil",
-    "mi(?:ni)?",
-    "mono",
-    "mont?s?",
-    "moyen",
-    "multi",
-    "m[ée]cano",
-    "m[ée]dico",
-    "m[ée]do",
-    "m[ée]ta",
-    "mots?",
-    "neuro",
-    "noix",
-    "non",
-    "nord",
-    "notre",
-    "n[ée]o",
-    "ouest",
-    "outre",
-    "ouvre",
-    "passe",
-    "perce",
-    "pharmaco",
-    "ph[oy]to",
-    "pieds?",
-    "pique",
-    "poissons?",
-    "ponce",
-    "pont",
-    "po[rs]t",
-    "pousse",
-    "primo",
-    "pro(?:cès|to)?",
-    "pare",
-    "petite?s?",
-    "plessis",
-    "porte",
-    "pré",
-    "prêchi",
-    "protège",
-    "pseudo",
-    "pêle",
-    "péri",
-    "puy",
-    "quasi",
-    "quatre",
-    "radio",
-    "recourt",
-    "rythmo",
-    "(?:re)?doubles?",
-    "r[ée]",
-    "r[ée]tro",
-    "requin",
-    "sans?",
-    "sa?inte?s?",
-    "semi",
-    "serre",
-    "sino",
-    "socio",
-    "sociale?s?",
-    "soixante",
-    "sous",
-    "su[bdrs]",
-    "super",
-    "taille",
-    "tire",
-    "thermo",
-    "tiers",
-    "tourne",
-    "toute?s?",
-    "tra[iî]ne?",
-    "trans",
-    "trente",
-    "trois",
-    "trousse",
-    "tr(?:i|ou)",
-    "t[ée]l[ée]",
-    "utéro",
-    "vaso",
-    "vi[cd]e",
-    "vid[ée]o",
-    "vie(?:ux|i?lles?|i?l)",
-    "vill(?:e|eneuve|ers|ette|iers|y)",
-    "vingt",
-    "voitures?",
-    "wagons?",
-    "ultra",
-    "à",
-    "[ée]lectro",
-    "[ée]qui",
-    "Fontaine",
-    "La Chapelle",
-    "Marie",
-    "Le Mesnil",
-    "Neuville",
-    "Pierre",
-    "Val",
-    "Vaux",
-]
-
-_regular_exp = [
-    "^a[{hyphen}]sexualis[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^arginine[{hyphen}]méthyl[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^binge[{hyphen}]watch[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^black[{hyphen}]out[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^bouche[{hyphen}]por[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^burn[{hyphen}]out[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^by[{hyphen}]pass[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^ch[{elision}]tiis[{al}]+$".format(elision=ELISION, al=ALPHA_LOWER),
-    "^chape[{hyphen}]chut[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^down[{hyphen}]load[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^[ée]tats[{hyphen}]uni[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^droits?[{hyphen}]de[{hyphen}]l'homm[{al}]+$".format(
-        hyphen=HYPHENS, al=ALPHA_LOWER
-    ),
-    "^fac[{hyphen}]simil[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^fleur[{hyphen}]bleuis[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^flic[{hyphen}]flaqu[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^fox[{hyphen}]trott[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^google[{hyphen}]is[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^hard[{hyphen}]discount[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^hip[{hyphen}]hop[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^jet[{hyphen}]set[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^knock[{hyphen}]out[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^lèche[{hyphen}]bott[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^litho[{hyphen}]typographi[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^lock[{hyphen}]out[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^lombri[{hyphen}]compost[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^mac[{hyphen}]adamis[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^marque[{hyphen}]pag[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^mouton[{hyphen}]noiris[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^new[{hyphen}]york[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^pair[{hyphen}]programm[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^people[{hyphen}]is[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^plan[{hyphen}]socialis[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^premier[{hyphen}]ministr[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^prud[{elision}]hom[{al}]+$".format(elision=ELISION, al=ALPHA_LOWER),
-    "^réarc[{hyphen}]bout[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^refox[{hyphen}]trott[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^remicro[{hyphen}]ond[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^repique[{hyphen}]niqu[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^repetit[{hyphen}]déjeun[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^rick[{hyphen}]roll[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^rond[{hyphen}]ponn[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^shift[{hyphen}]cliqu[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^soudo[{hyphen}]bras[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^stabilo[{hyphen}]boss[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^strip[{hyphen}]teas[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^terra[{hyphen}]form[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^teuf[{hyphen}]teuf[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^yo[{hyphen}]yo[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^zig[{hyphen}]zag[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^z[{elision}]yeut[{al}]+$".format(elision=ELISION, al=ALPHA_LOWER),
-]
-# catching cases like faux-vampire
-_regular_exp += [
-    "^{prefix}[{hyphen}][{al}][{hyphen}{al}{elision}]*$".format(
-        prefix=p,
-        hyphen=HYPHENS,  # putting the - first in the [] range avoids having to use a backslash
-        elision=ELISION,
-        al=ALPHA_LOWER,
-    )
-    for p in _hyphen_prefix
-]
-
-# catching cases like entr'abat
-_elision_prefix = ["r?é?entr", "grande?s?", "r"]
-_regular_exp += [
-    "^{prefix}[{elision}][{al}][{hyphen}{al}{elision}]*$".format(
-        prefix=p, elision=ELISION, hyphen=HYPHENS, al=ALPHA_LOWER
-    )
-    for p in _elision_prefix
-]
-
-# catching cases like saut-de-ski, pet-en-l'air
-_hyphen_combination = [
-    "l[èe]s?",
-    "la",
-    "en",
-    "des?",
-    "d[eu]",
-    "sur",
-    "sous",
-    "aux?",
-    "à",
-    "et",
-    "près",
-    "saint",
-]
-_regular_exp += [
-    "^[{a}]+[{hyphen}]{hyphen_combo}[{hyphen}](?:l[{elision}])?[{a}]+$".format(
-        hyphen_combo=hc, elision=ELISION, hyphen=HYPHENS, a=ALPHA
-    )
-    for hc in _hyphen_combination
-]

+# Abbreviations that must stay a single token. NORM gives the spelled-out form
+# where the abbreviation is unambiguous.
+_exc = {
+    "St": [{ORTH: "St", NORM: "Saint"}],
+    "St.": [{ORTH: "St.", NORM: "Saint"}],
+    "Ste": [{ORTH: "Ste", NORM: "Sainte"}],
+    "Mme": [{ORTH: "Mme", NORM: "Madame"}],
+    "Mr": [{ORTH: "Mr", NORM: "Monsieur"}],
+    "Mr.": [{ORTH: "Mr.", NORM: "Monsieur"}],
+    "M.": [{ORTH: "M.", NORM: "Monsieur"}],
+    "Mlle": [{ORTH: "Mlle", NORM: "Mademoiselle"}],
+    "Dr": [{ORTH: "Dr", NORM: "Docteur"}],
+    "Dr.": [{ORTH: "Dr.", NORM: "Docteur"}],
+    "Dresse": [{ORTH: "Dresse", NORM: "Doctoresse"}],
+    "Drsse": [{ORTH: "Drsse", NORM: "Doctoresse"}],
+    "etc": [{ORTH: "etc", NORM: "etcaetera"}],
+    "etc.": [{ORTH: "etc.", NORM: "etcaetera"}],
+    # "av." is kept as one token but gets no NORM: it is ambiguous ("avant",
+    # "avenue", ...) and does not abbreviate one specific month.
+    "av.": [{ORTH: "av."}],
+    # months
+    "jan.": [{ORTH: "jan.", NORM: "janvier"}],
+    "janv.": [{ORTH: "janv.", NORM: "janvier"}],
+    "fév.": [{ORTH: "fév.", NORM: "février"}],
+    "févr.": [{ORTH: "févr.", NORM: "février"}],
+    "avr.": [{ORTH: "avr.", NORM: "avril"}],
+    "juil.": [{ORTH: "juil.", NORM: "juillet"}],
+    "juill.": [{ORTH: "juill.", NORM: "juillet"}],
+    "sept.": [{ORTH: "sept.", NORM: "septembre"}],
+    "oct.": [{ORTH: "oct.", NORM: "octobre"}],
+    "nov.": [{ORTH: "nov.", NORM: "novembre"}],
+    "déc.": [{ORTH: "déc.", NORM: "décembre"}],
+    "dec.": [{ORTH: "dec.", NORM: "décembre"}],
+    # days
+    "lun.": [{ORTH: "lun.", NORM: "lundi"}],
+    "mar.": [{ORTH: "mar.", NORM: "mardi"}],
+    "mer.": [{ORTH: "mer.", NORM: "mercredi"}],
+    "jeu.": [{ORTH: "jeu.", NORM: "jeudi"}],
+    "ven.": [{ORTH: "ven.", NORM: "vendredi"}],
+    "sam.": [{ORTH: "sam.", NORM: "samedi"}],
+    "dim.": [{ORTH: "dim.", NORM: "dimanche"}],
+}

 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
-TOKEN_MATCH = re.compile(
-    "(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp)
-).match