From ef4c65598a0d28a67fa2f0ba6c08b75e3b295ede Mon Sep 17 00:00:00 2001
From: thjbbvlt
Date: Fri, 15 Mar 2024 11:55:27 +0100
Subject: [PATCH] Rework French tokenizer rules; modified: __init__.py,
 punctuation.py, tokenizer_exceptions.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 spacy/lang/fr/__init__.py             | 16 +++-
 spacy/lang/fr/punctuation.py          | 89 +++++++++++++++++++--------
 spacy/lang/fr/tokenizer_exceptions.py | 49 +++++----------
 3 files changed, 92 insertions(+), 62 deletions(-)

diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py
index a8bc7f53e..b332fe816 100644
--- a/spacy/lang/fr/__init__.py
+++ b/spacy/lang/fr/__init__.py
@@ -5,10 +5,14 @@ from thinc.api import Model
 from ...language import BaseDefaults, Language
 from .lemmatizer import FrenchLemmatizer
 from .lex_attrs import LEX_ATTRS
-from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .punctuation import (
+    TOKENIZER_INFIXES,
+    TOKENIZER_PREFIXES,
+    TOKENIZER_SUFFIXES,
+)
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
-from .tokenizer_exceptions import TOKEN_MATCH, TOKENIZER_EXCEPTIONS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 
 
 class FrenchDefaults(BaseDefaults):
@@ -16,7 +20,6 @@ class FrenchDefaults(BaseDefaults):
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
-    token_match = TOKEN_MATCH
     lex_attr_getters = LEX_ATTRS
     syntax_iterators = SYNTAX_ITERATORS
     stop_words = STOP_WORDS
@@ -47,7 +50,12 @@ def make_lemmatizer(
     scorer: Optional[Callable],
 ):
     return FrenchLemmatizer(
-        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+        nlp.vocab,
+        model,
+        name,
+        mode=mode,
+        overwrite=overwrite,
+        scorer=scorer,
     )
 
 
diff --git a/spacy/lang/fr/punctuation.py b/spacy/lang/fr/punctuation.py
index a3b178a2f..21e92647f 100644
--- a/spacy/lang/fr/punctuation.py
+++ b/spacy/lang/fr/punctuation.py
@@ -7,29 +7,80 @@ from ..char_classes import (
     LIST_ELLIPSES,
     LIST_PUNCT,
     LIST_QUOTES,
+    LIST_ICONS,
     UNITS,
-    merge_chars,
 )
-from ..punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
 
-ELISION = "' ’".replace(" ", "")
+ELISION = " ' ` ´ ’ ".replace(" ", "")
 HYPHENS = r"- – — ‐ ‑".replace(" ", "")
 
-_prefixes_elision = "d l n"
-_prefixes_elision += " " + _prefixes_elision.upper()
-_hyphen_suffixes = "ce clés elle en il ils je là moi nous on t vous"
-_hyphen_suffixes += " " + _hyphen_suffixes.upper()
 
-_prefixes = TOKENIZER_PREFIXES + [
-    r"(?:({pe})[{el}])(?=[{a}])".format(
-        a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
-    )
+# fmt: off
+_suffix_inversion = [
+    "je", "tu", "on", "il", "elle", "iel",
+    "nous", "vous", "elles", "ils", "iels",
+    "moi", "toi", "lui", "leur", "eux",
+    # no trailing hyphen allowed, to avoid matching place names like Villar-le-Bois
+    fr"la(?![{HYPHENS}])",
+    fr"le(?![{HYPHENS}])",
+    fr"les(?![{HYPHENS}])",
+    fr"en(?![{HYPHENS}])", "y",
+    # euphonic t: a-t-on, a-t'on
+    fr"t[{HYPHENS}]??[{ELISION}]?",
+    fr"m[{ELISION}]?",
+    "là", "ici",
 ]
+_prefix_elision = [
+    "n", "s", "c", "d", "j", "m", "t", "l", "qu",
+    # "quelqu'un" and its variants ("quelqu'une", "quelqu'uns", "quelqu'unes") are excluded: each is a single token, not quelque + un, which would lose the sense of 'one person'.
+ fr"quelqu(?![{ELISION}]un[ex]*\b)", + "jusqu", "presqu", "lorsqu", "puisqu", "quoiqu", +] +# fmt: on -_suffixes = ( + +def upperandtitle(a): + """[alors, on] -> [alors, Alors, ALORS, on, On, ON]""" + + def fn(i): + t = i[0].upper() + i[1:] + u = i.upper() + return [i, t] if t == u else [i, t, u] + + return [x for y in [fn(i) for i in a] for x in y] + + +_suffix_inversion = r"|".join(upperandtitle(_suffix_inversion)) +_prefix_elision = r"|".join(upperandtitle(_prefix_elision)) + +_elision = rf"(?:\b(?:{_prefix_elision})[{ELISION}])" +_inversion = ( + rf"(?:(?<=[^\W\d])[{HYPHENS}]\b(?:{_suffix_inversion})\b)" +) + +TOKENIZER_PREFIXES = [_elision] + +TOKENIZER_INFIXES = ( + # base list without hyphen regex + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( + al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES + ), + r"(?<=[0-9])[+\-\*^](?=[0-9-])", + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), + ] + # plus conditionnal hyphen + + [_inversion] +) + +TOKENIZER_SUFFIXES = ( + # base list, les hyphens and english things such as "'s" LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + + LIST_ICONS + [ r"(?<=[0-9])\+", r"(?<=°[FfCcKk])\.", # °C. -> ["°C", "."] @@ -40,17 +91,5 @@ _suffixes = ( al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES ), r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), - r"(?<=[{a}])[{h}]({hs})".format( - a=ALPHA, h=HYPHENS, hs=merge_chars(_hyphen_suffixes) - ), ] ) - -_infixes = TOKENIZER_INFIXES + [ - r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION) -] - - -TOKENIZER_PREFIXES = _prefixes -TOKENIZER_SUFFIXES = _suffixes -TOKENIZER_INFIXES = _infixes diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py index 4e16a7c25..f3471e257 100644 --- a/spacy/lang/fr/tokenizer_exceptions.py +++ b/spacy/lang/fr/tokenizer_exceptions.py @@ -1,36 +1,19 @@ -import re +from ...util import update_exc +from ...symbols import NORM, ORTH +from ..tokenizer_exceptions import BASE_EXCEPTIONS -_hyphen = "-–—" -_apostrophe = "'`´’" -# fmt: off -_suffix_inversion = r"|".join([ - "je", "tu", "on", "il", "elle", "iel", - "nous", "vous", "elles", "ils", "iels", - # écoutons-les - "moi", "toi", "lui", "leur", - "eux", - fr"en(?![{_hyphen}])", "y", - # écoutons-les - fr"la(?![{_hyphen}])", fr"le(?![{_hyphen}])", fr"les(?![{_hyphen}])", - # a-t-il, pourra-t'on, dis-m'en plus - fr"t[{_hyphen}]??[{_apostrophe}]?", fr"m[{_apostrophe}]?", - "là", "ici", -]) -_prefix_elision = r"|".join([ - "n", "s", "c", "d", "j", "m", "t", "l", "qu", - # i exclude "quelqu'un"/"quelqu'un" because it's one token (and not quelque + un, which lack the idea of 'one person'). 
- fr"quelqu(?![{_apostrophe}]un[ex]*\b)", # quelque - "jusqu", "presqu", "lorsqu", "puisqu", "quoiqu", -]) -# fmt: on +_exc = { + "St": [{ORTH: "St", NORM: "Saint"}], + "Ste": [{ORTH: "Ste", NORM: "Sainte"}], + "Mme": [{ORTH: "Mme", NORM: "Madame"}], + "Mr.": [{ORTH: "Mr", NORM: "Monsieur"}], + "M.": [{ORTH: "M.", NORM: "Monsieur"}], + "Mlle": [{ORTH: "Mlle", NORM: "Mademoiselle"}], + "Dr": [{ORTH: "Dr", NORM: "Docteur"}], + "Dresse": [{ORTH: "Dresse", NORM: "Doctoresse"}], + "Drsse": [{ORTH: "Drsse", NORM: "Doctoresse"}], + "etc": [{ORTH: "etc", NORM: "etcaetera"}], +} -_elision = rf"(?:\b(?:{_prefix_elision})[{_apostrophe}])" -_inversion = ( - rf"(?:(?<=[^\W\d])[{_hyphen}]\b(?:{_suffix_inversion})\b)" -) - -TOKEN_MATCH = re.compile( - r"(?iu)" + r"|".join([_inversion, _elision]) -) -# _abbrevs = ["ste?", "mme", "mr?", "mlle", "dr", "etc", "cf"] +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)