modified:   __init__.py
modified:   punctuation.py
modified:   tokenizer_exceptions.py
thjbbvlt 2024-03-15 11:55:27 +01:00
parent 5a3928fe1e
commit ef4c65598a
3 changed files with 92 additions and 62 deletions

__init__.py

@@ -5,10 +5,14 @@ from thinc.api import Model
 from ...language import BaseDefaults, Language
 from .lemmatizer import FrenchLemmatizer
 from .lex_attrs import LEX_ATTRS
-from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .punctuation import (
+    TOKENIZER_INFIXES,
+    TOKENIZER_PREFIXES,
+    TOKENIZER_SUFFIXES,
+)
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
-from .tokenizer_exceptions import TOKEN_MATCH, TOKENIZER_EXCEPTIONS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


 class FrenchDefaults(BaseDefaults):
@@ -16,7 +20,6 @@ class FrenchDefaults(BaseDefaults):
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
-    token_match = TOKEN_MATCH
     lex_attr_getters = LEX_ATTRS
     syntax_iterators = SYNTAX_ITERATORS
     stop_words = STOP_WORDS
@@ -47,7 +50,12 @@ def make_lemmatizer(
     scorer: Optional[Callable],
 ):
     return FrenchLemmatizer(
-        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+        nlp.vocab,
+        model,
+        name,
+        mode=mode,
+        overwrite=overwrite,
+        scorer=scorer,
     )
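
Note: with TOKEN_MATCH gone, elision and inversion splitting now rides entirely on the prefix and infix rules defined in punctuation.py below. A minimal sketch of how one might exercise the new defaults, assuming spaCy is installed (the example strings are illustrative, not taken from the commit):

# Hedged sketch: exercising FrenchDefaults after this change; the exact
# splits depend on the rules defined in punctuation.py.
import spacy

nlp = spacy.blank("fr")  # picks up FrenchDefaults: prefixes/infixes/suffixes, no token_match
print([t.text for t in nlp("l'avion")])  # elision should now be split by the prefix rule
print([t.text for t in nlp("a-t-on")])   # inversion should now be split by the infix rule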

punctuation.py

@@ -7,29 +7,80 @@ from ..char_classes import (
     LIST_ELLIPSES,
     LIST_PUNCT,
     LIST_QUOTES,
+    LIST_ICONS,
     UNITS,
     merge_chars,
 )
-from ..punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES

-ELISION = "' ’".replace(" ", "")
+ELISION = " ' ` ´ ".replace(" ", "")
 HYPHENS = r"- – — ‐ ‑".replace(" ", "")
-_prefixes_elision = "d l n"
-_prefixes_elision += " " + _prefixes_elision.upper()
-_hyphen_suffixes = "ce clés elle en il ils je là moi nous on t vous"
-_hyphen_suffixes += " " + _hyphen_suffixes.upper()
-
-_prefixes = TOKENIZER_PREFIXES + [
-    r"(?:({pe})[{el}])(?=[{a}])".format(
-        a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
-    )
-]
-
+# fmt: off
+_suffix_inversion = [
+    "je", "tu", "on", "il", "elle", "iel",
+    "nous", "vous", "elles", "ils", "iels",
+    "moi", "toi", "lui", "leur", "eux",
+    # to avoid matching: Villar-le-bois
+    fr"la(?![{HYPHENS}])",
+    fr"le(?![{HYPHENS}])",
+    fr"les(?![{HYPHENS}])",
+    fr"en(?![{HYPHENS}])", "y",
+    # a-t-on, a-t'on
+    fr"t[{HYPHENS}]??[{ELISION}]?",
+    fr"m[{ELISION}]?",
+    "", "ici",
+]
+_prefix_elision = [
+    "n", "s", "c", "d", "j", "m", "t", "l", "qu",
+    # I exclude "quelqu'un"/"quelqu'une" because it's one token (not
+    # quelque + un, which would lack the sense of 'one person').
+    fr"quelqu(?![{ELISION}]un[ex]*\b)",
+    "jusqu", "presqu", "lorsqu", "puisqu", "quoiqu",
+]
+# fmt: on
-_suffixes = (
+def upperandtitle(a):
+    """[alors, on] -> [alors, Alors, ALORS, on, On, ON]"""
+
+    def fn(i):
+        t = i[0].upper() + i[1:]
+        u = i.upper()
+        return [i, t] if t == u else [i, t, u]
+
+    return [x for y in [fn(i) for i in a] for x in y]
+
+
+_suffix_inversion = r"|".join(upperandtitle(_suffix_inversion))
+_prefix_elision = r"|".join(upperandtitle(_prefix_elision))
+
+_elision = rf"(?:\b(?:{_prefix_elision})[{ELISION}])"
+_inversion = (
+    rf"(?:(?<=[^\W\d])[{HYPHENS}]\b(?:{_suffix_inversion})\b)"
+)
+
+TOKENIZER_PREFIXES = [_elision]
+
+TOKENIZER_INFIXES = (
+    # base list without hyphen regex
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+        ),
+        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+    ]
+    # plus conditional hyphen
+    + [_inversion]
+)
+TOKENIZER_SUFFIXES = (
+    # base list, minus hyphens and English things such as "'s"
     LIST_PUNCT
     + LIST_ELLIPSES
     + LIST_QUOTES
+    + LIST_ICONS
     + [
         r"(?<=[0-9])\+",
         r"(?<=°[FfCcKk])\.",  # °C. -> ["°C", "."]
@@ -40,17 +91,5 @@ _suffixes = (
             al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
         ),
         r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
-        r"(?<=[{a}])[{h}]({hs})".format(
-            a=ALPHA, h=HYPHENS, hs=merge_chars(_hyphen_suffixes)
-        ),
     ]
 )
-
-_infixes = TOKENIZER_INFIXES + [
-    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
-]
-
-TOKENIZER_PREFIXES = _prefixes
-TOKENIZER_SUFFIXES = _suffixes
-TOKENIZER_INFIXES = _infixes
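
Taken together, punctuation.py now derives one prefix regex (elision) and one infix regex (inversion) from the case-expanded word lists. A standalone sketch of that pattern-building approach; the helper is copied from the diff, but ELISION, HYPHENS, and the word lists are abbreviated stand-ins so the snippet runs on its own, not the committed values:

# Standalone sketch of the pattern construction above (abbreviated inputs).
import re

ELISION = "'`´"
HYPHENS = "-"

def upperandtitle(a):
    """[alors, on] -> [alors, Alors, ALORS, on, On, ON]"""
    def fn(i):
        t = i[0].upper() + i[1:]
        u = i.upper()
        return [i, t] if t == u else [i, t, u]
    return [x for y in [fn(i) for i in a] for x in y]

# abbreviated lists; the real code joins the full _prefix_elision
# and _suffix_inversion lists defined in the diff
_prefix_elision = r"|".join(upperandtitle(["l", "d", "j", "qu"]))
_suffix_inversion = r"|".join(upperandtitle(["on", "il", fr"t[{HYPHENS}]??[{ELISION}]?"]))

_elision = rf"(?:\b(?:{_prefix_elision})[{ELISION}])"
_inversion = rf"(?:(?<=[^\W\d])[{HYPHENS}]\b(?:{_suffix_inversion})\b)"

print(re.match(_elision, "l'avion"))    # matches "l'" at the start
print(re.search(_inversion, "a-t-on"))  # matches "-t"; the tokenizer applies the rule repeatedly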

tokenizer_exceptions.py

@@ -1,36 +1,19 @@
-import re
-
 from ...util import update_exc
 from ...symbols import NORM, ORTH
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-_hyphen = "-–—"
-_apostrophe = "'`´"
-
-# fmt: off
-_suffix_inversion = r"|".join([
-    "je", "tu", "on", "il", "elle", "iel",
-    "nous", "vous", "elles", "ils", "iels",
-    "moi", "toi", "lui", "leur",
-    "eux",
-    fr"en(?![{_hyphen}])", "y",
-    # écoutons-les
-    fr"la(?![{_hyphen}])", fr"le(?![{_hyphen}])", fr"les(?![{_hyphen}])",
-    # a-t-il, pourra-t'on, dis-m'en plus
-    fr"t[{_hyphen}]??[{_apostrophe}]?", fr"m[{_apostrophe}]?",
-    "", "ici",
-])
-_prefix_elision = r"|".join([
-    "n", "s", "c", "d", "j", "m", "t", "l", "qu",
-    # I exclude "quelqu'un"/"quelqu'une" because it's one token (not
-    # quelque + un, which would lack the sense of 'one person').
-    fr"quelqu(?![{_apostrophe}]un[ex]*\b)",  # quelque
-    "jusqu", "presqu", "lorsqu", "puisqu", "quoiqu",
-])
-# fmt: on
+_exc = {
+    "St": [{ORTH: "St", NORM: "Saint"}],
+    "Ste": [{ORTH: "Ste", NORM: "Sainte"}],
+    "Mme": [{ORTH: "Mme", NORM: "Madame"}],
+    "Mr.": [{ORTH: "Mr.", NORM: "Monsieur"}],
+    "M.": [{ORTH: "M.", NORM: "Monsieur"}],
+    "Mlle": [{ORTH: "Mlle", NORM: "Mademoiselle"}],
+    "Dr": [{ORTH: "Dr", NORM: "Docteur"}],
+    "Dresse": [{ORTH: "Dresse", NORM: "Doctoresse"}],
+    "Drsse": [{ORTH: "Drsse", NORM: "Doctoresse"}],
+    "etc": [{ORTH: "etc", NORM: "etcaetera"}],
+}
-_elision = rf"(?:\b(?:{_prefix_elision})[{_apostrophe}])"
-_inversion = (
-    rf"(?:(?<=[^\W\d])[{_hyphen}]\b(?:{_suffix_inversion})\b)"
-)
-
-TOKEN_MATCH = re.compile(
-    r"(?iu)" + r"|".join([_inversion, _elision])
-)
-
-# _abbrevs = ["ste?", "mme", "mr?", "mlle", "dr", "etc", "cf"]
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
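
With the regex logic moved out, this module is left as a plain special-cases table. A quick hedged check of how the entries surface in a pipeline built from this branch, assuming spaCy is installed (the sentence is illustrative):

# Hedged sketch: inspect the special-case norms defined in _exc above.
import spacy

nlp = spacy.blank("fr")
doc = nlp("Dr Dupont habite St Denis.")
print([(t.text, t.norm_) for t in doc])
# expectation per _exc: "Dr" -> norm "Docteur", "St" -> norm "Saint"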