spaCy/spacy/lang/fr/tokenizer_exceptions.py
2024-03-15 08:45:23 +01:00

37 lines
1.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
# Characters accepted as the hyphen that attaches an inverted clitic to the
# preceding word: hyphen-minus, en dash and em dash.
_hyphen = "-–—"
# Characters accepted as the elision mark: apostrophe, backtick and acute
# accent.
_apostrophe = "'`´"

# fmt: off
# Alternation of the words that may follow the hyphen in an inversion.
# Every alternative must be non-empty: an empty one would let the pattern
# match a bare hyphen between any two words (e.g. "porte-monnaie"), bypass
# the negative lookaheads below and shadow every alternative listed after it.
_suffix_inversion = r"|".join([
    # Subject pronouns.
    "je", "tu", "on", "il", "elle", "iel",
    "nous", "vous", "elles", "ils", "iels",
    # Stressed / object pronouns.
    "moi", "toi", "lui", "leur",
    "eux",
    # "en" is only accepted when it is not itself followed by a hyphen.
    fr"en(?![{_hyphen}])", "y",
    # écoutons-les; likewise rejected when another hyphen follows, so the
    # "-le" of "donne-le-moi" is not matched.
    fr"la(?![{_hyphen}])", fr"le(?![{_hyphen}])", fr"les(?![{_hyphen}])",
    # a-t-il, pourra-t'on, dis-m'en plus: "t" may carry a (lazily matched)
    # hyphen and/or an apostrophe, "m" may carry an apostrophe.
    fr"t[{_hyphen}]??[{_apostrophe}]?", fr"m[{_apostrophe}]?",
    "ici",
])

# Alternation of the prefixes that may precede the apostrophe in an elision.
_prefix_elision = r"|".join([
    "n", "s", "c", "d", "j", "m", "t", "l", "qu",
    # "quelqu'un" (and the forms accepted by `un[ex]*`) is excluded because
    # it is a single token, not "quelque" + "un", which would lose the idea
    # of 'one person'.
    fr"quelqu(?![{_apostrophe}]un[ex]*\b)",  # quelque
    "jusqu", "presqu", "lorsqu", "puisqu", "quoiqu",
])
# fmt: on

# An elision prefix at a word boundary, followed by the apostrophe.
_elision = rf"(?:\b(?:{_prefix_elision})[{_apostrophe}])"

# A hyphen preceded by a non-digit word character and followed by one of the
# inversion suffixes as a whole word.
_inversion = rf"(?:(?<=[^\W\d])[{_hyphen}]\b(?:{_suffix_inversion})\b)"

# Case-insensitive, Unicode-aware pattern matching either an inversion
# ("-t", "-il", "-les", ...) or an elision ("l'", "qu'", ...). The pattern
# has no capturing groups, so `findall` returns the matched text itself.
TOKEN_MATCH = re.compile(
    r"(?iu)" + r"|".join([_inversion, _elision])
)
# _abbrevs = ["ste?", "mme", "mr?", "mlle", "dr", "etc", "cf"]