spaCy/spacy/lang/fr/tokenizer_exceptions.py

import re

# Characters accepted as the hyphen that attaches an inverted clitic to the
# preceding word.
_hyphen = "-–—"
# Characters accepted as the apostrophe that closes an elided word.
_apostrophe = "'`´’"

# fmt: off
# Alternation of the words that may follow the hyphen in an inversion.
_suffix_inversion = "|".join((
    # subject pronouns
    "je", "tu", "on", "il", "elle", "iel",
    "nous", "vous", "elles", "ils", "iels",
    # stressed / object pronouns
    "moi", "toi", "lui", "leur", "eux",
    # "en" only counts when it is not itself followed by a hyphen
    f"en(?![{_hyphen}])", "y",
    # écoutons-les; same "no hyphen afterwards" restriction as for "en"
    *(f"{clitic}(?![{_hyphen}])" for clitic in ("la", "le", "les")),
    # a-t-il, pourra-t'on, dis-m'en plus
    f"t[{_hyphen}]??[{_apostrophe}]?", f"m[{_apostrophe}]?",
    "là", "ici",
))
# Alternation of the elided words that may precede an apostrophe.
_prefix_elision = "|".join((
    "n", "s", "c", "d", "j", "m", "t", "l", "qu",
    # "quelqu" (quelque) is accepted except in "quelqu'un" and the variants
    # covered by `un[ex]*`: that is a single token, not quelque + un, which
    # lacks the idea of 'one person'.
    r"quelqu(?![" + _apostrophe + r"]un[ex]*\b)",
    "jusqu", "presqu", "lorsqu", "puisqu", "quoiqu",
))
# fmt: on

# An elided word starting at a word boundary, together with its apostrophe.
_elision = r"(?:\b(?:{prefixes})[{apostrophes}])".format(
    prefixes=_prefix_elision,
    apostrophes=_apostrophe,
)
# A hyphen preceded by a non-digit word character, together with the whole
# inversion suffix that follows it.
_inversion = r"(?:(?<=[^\W\d])[{hyphens}]\b(?:{suffixes})\b)".format(
    hyphens=_hyphen,
    suffixes=_suffix_inversion,
)

# Case-insensitive pattern matching either an inversion or an elision.
# NOTE(review): this is the compiled pattern object itself, not a bound
# `.match` callable -- confirm that is what the consumer expects.
TOKEN_MATCH = re.compile(f"(?iu){_inversion}|{_elision}")
# _abbrevs = ["ste?", "mme", "mr?", "mlle", "dr", "etc", "cf"]