mirror of
https://github.com/explosion/spaCy.git
synced 2025-03-12 23:35:47 +03:00
37 lines
1.1 KiB
Python
37 lines
1.1 KiB
Python
import re

# Regex building blocks for French hyphenated inversions ("a-t-il") and
# apostrophe elisions ("l'avion"). The only public name is TOKEN_MATCH.

# Characters accepted as a hyphen and as an apostrophe. Both strings are
# interpolated verbatim inside regex character classes ("[...]"), so "-" has to
# stay the first character of _hyphen to be read as a literal, not a range.
_hyphen = "-–—"
_apostrophe = "'`´’"

# fmt: off
# Alternation of the words that may follow the hyphen in an inversion.
_suffix_inversion = r"|".join([
    "je", "tu", "on", "il", "elle", "iel",
    "nous", "vous", "elles", "ils", "iels",
    # donne-moi
    "moi", "toi", "lui", "leur",
    "eux",
    # "en" only counts when it is not itself followed by another hyphen.
    fr"en(?![{_hyphen}])", "y",
    # écoutons-les (same restriction: no hyphen right after the article)
    fr"la(?![{_hyphen}])", fr"le(?![{_hyphen}])", fr"les(?![{_hyphen}])",
    # a-t-il, pourra-t'on, dis-m'en plus
    # "t" may be followed by a hyphen (lazy: taken only if needed) and/or an
    # apostrophe; "m" may be followed by an apostrophe.
    fr"t[{_hyphen}]??[{_apostrophe}]?", fr"m[{_apostrophe}]?",
    "là", "ici",
])
# Alternation of the word starts that may precede the apostrophe in an elision.
_prefix_elision = r"|".join([
    "n", "s", "c", "d", "j", "m", "t", "l", "qu",
    # "quelqu'un" (and the same word ending in any run of "e"/"x") is excluded
    # on purpose: it is one token, not quelque + un, which would lose the idea
    # of 'one person'.
    fr"quelqu(?![{_apostrophe}]un[ex]*\b)",  # quelque
    "jusqu", "presqu", "lorsqu", "puisqu", "quoiqu",
])
# fmt: on

# A whole-word elision prefix followed by one apostrophe character.
_elision = rf"(?:\b(?:{_prefix_elision})[{_apostrophe}])"
# A hyphen that directly follows a letter or underscore (a word character that
# is not a digit) and is followed by a whole-word inversion suffix.
_inversion = (
    rf"(?:(?<=[^\W\d])[{_hyphen}]\b(?:{_suffix_inversion})\b)"
)

# Compiled pattern matching either an inversion or an elision. The flags are
# kept inline ("(?iu)": ignore case, Unicode) so that they are part of
# TOKEN_MATCH.pattern itself and survive if the pattern text alone is reused.
TOKEN_MATCH = re.compile(
    r"(?iu)" + r"|".join([_inversion, _elision])
)
# _abbrevs = ["ste?", "mme", "mr?", "mlle", "dr", "etc", "cf"]