From ef4c65598a0d28a67fa2f0ba6c08b75e3b295ede Mon Sep 17 00:00:00 2001
From: thjbbvlt
Date: Fri, 15 Mar 2024 11:55:27 +0100
Subject: [PATCH] Rework French tokenizer rules; modified: __init__.py,
 punctuation.py, tokenizer_exceptions.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 spacy/lang/fr/__init__.py             | 16 +++-
 spacy/lang/fr/punctuation.py          | 89 +++++++++++++++++++--------
 spacy/lang/fr/tokenizer_exceptions.py | 49 +++++----------
 3 files changed, 92 insertions(+), 62 deletions(-)

diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py
index a8bc7f53e..b332fe816 100644
--- a/spacy/lang/fr/__init__.py
+++ b/spacy/lang/fr/__init__.py
@@ -5,10 +5,14 @@ from thinc.api import Model
 from ...language import BaseDefaults, Language
 from .lemmatizer import FrenchLemmatizer
 from .lex_attrs import LEX_ATTRS
-from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .punctuation import (
+    TOKENIZER_INFIXES,
+    TOKENIZER_PREFIXES,
+    TOKENIZER_SUFFIXES,
+)
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
-from .tokenizer_exceptions import TOKEN_MATCH, TOKENIZER_EXCEPTIONS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 
 
 class FrenchDefaults(BaseDefaults):
@@ -16,7 +20,6 @@ class FrenchDefaults(BaseDefaults):
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
-    token_match = TOKEN_MATCH
     lex_attr_getters = LEX_ATTRS
     syntax_iterators = SYNTAX_ITERATORS
     stop_words = STOP_WORDS
@@ -47,7 +50,12 @@ def make_lemmatizer(
     scorer: Optional[Callable],
 ):
     return FrenchLemmatizer(
-        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+        nlp.vocab,
+        model,
+        name,
+        mode=mode,
+        overwrite=overwrite,
+        scorer=scorer,
     )
 
 
diff --git a/spacy/lang/fr/punctuation.py b/spacy/lang/fr/punctuation.py
index a3b178a2f..21e92647f 100644
--- a/spacy/lang/fr/punctuation.py
+++ b/spacy/lang/fr/punctuation.py
@@ -7,29 +7,80 @@ from ..char_classes import (
     LIST_ELLIPSES,
     LIST_PUNCT,
     LIST_QUOTES,
+    LIST_ICONS,
     UNITS,
-    merge_chars,
 )
-from ..punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
 
-ELISION = "' ’".replace(" ", "")
+ELISION = " ' ` ´ ’ ".replace(" ", "")
 HYPHENS = r"- – — ‐ ‑".replace(" ", "")
 
-_prefixes_elision = "d l n"
-_prefixes_elision += " " + _prefixes_elision.upper()
-_hyphen_suffixes = "ce clés elle en il ils je là moi nous on t vous"
-_hyphen_suffixes += " " + _hyphen_suffixes.upper()
 
-_prefixes = TOKENIZER_PREFIXES + [
-    r"(?:({pe})[{el}])(?=[{a}])".format(
-        a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
-    )
+# fmt: off
+_suffix_inversion = [
+    "je", "tu", "on", "il", "elle", "iel",
+    "nous", "vous", "elles", "ils", "iels",
+    "moi", "toi", "lui", "leur", "eux",
+    # no trailing hyphen allowed, to avoid matching place names like Villar-le-Bois
+    fr"la(?![{HYPHENS}])",
+    fr"le(?![{HYPHENS}])",
+    fr"les(?![{HYPHENS}])",
+    fr"en(?![{HYPHENS}])", "y",
+    # euphonic t: a-t-on, a-t'on
+    fr"t[{HYPHENS}]??[{ELISION}]?",
+    fr"m[{ELISION}]?",
+    "là", "ici",
 ]
+_prefix_elision = [
+    "n", "s", "c", "d", "j", "m", "t", "l", "qu",
+    # "quelqu'un" and its variants ("quelqu'une", "quelqu'uns", "quelqu'unes") are excluded: each is a single token, not quelque + un, which would lose the sense of 'one person'.
+ fr"quelqu(?![{ELISION}]un[ex]*\b)", + "jusqu", "presqu", "lorsqu", "puisqu", "quoiqu", +] +# fmt: on -_suffixes = ( + +def upperandtitle(a): + """[alors, on] -> [alors, Alors, ALORS, on, On, ON]""" + + def fn(i): + t = i[0].upper() + i[1:] + u = i.upper() + return [i, t] if t == u else [i, t, u] + + return [x for y in [fn(i) for i in a] for x in y] + + +_suffix_inversion = r"|".join(upperandtitle(_suffix_inversion)) +_prefix_elision = r"|".join(upperandtitle(_prefix_elision)) + +_elision = rf"(?:\b(?:{_prefix_elision})[{ELISION}])" +_inversion = ( + rf"(?:(?<=[^\W\d])[{HYPHENS}]\b(?:{_suffix_inversion})\b)" +) + +TOKENIZER_PREFIXES = [_elision] + +TOKENIZER_INFIXES = ( + # base list without hyphen regex + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( + al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES + ), + r"(?<=[0-9])[+\-\*^](?=[0-9-])", + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), + ] + # plus conditionnal hyphen + + [_inversion] +) + +TOKENIZER_SUFFIXES = ( + # base list, les hyphens and english things such as "'s" LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + + LIST_ICONS + [ r"(?<=[0-9])\+", r"(?<=°[FfCcKk])\.", # °C. -> ["°C", "."] @@ -40,17 +91,5 @@ _suffixes = ( al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES ), r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), - r"(?<=[{a}])[{h}]({hs})".format( - a=ALPHA, h=HYPHENS, hs=merge_chars(_hyphen_suffixes) - ), ] ) - -_infixes = TOKENIZER_INFIXES + [ - r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION) -] - - -TOKENIZER_PREFIXES = _prefixes -TOKENIZER_SUFFIXES = _suffixes -TOKENIZER_INFIXES = _infixes diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py index 4e16a7c25..f3471e257 100644 --- a/spacy/lang/fr/tokenizer_exceptions.py +++ b/spacy/lang/fr/tokenizer_exceptions.py @@ -1,36 +1,19 @@ -import re +from ...util import update_exc +from ...symbols import NORM, ORTH +from ..tokenizer_exceptions import BASE_EXCEPTIONS -_hyphen = "-–—" -_apostrophe = "'`´’" -# fmt: off -_suffix_inversion = r"|".join([ - "je", "tu", "on", "il", "elle", "iel", - "nous", "vous", "elles", "ils", "iels", - # écoutons-les - "moi", "toi", "lui", "leur", - "eux", - fr"en(?![{_hyphen}])", "y", - # écoutons-les - fr"la(?![{_hyphen}])", fr"le(?![{_hyphen}])", fr"les(?![{_hyphen}])", - # a-t-il, pourra-t'on, dis-m'en plus - fr"t[{_hyphen}]??[{_apostrophe}]?", fr"m[{_apostrophe}]?", - "là", "ici", -]) -_prefix_elision = r"|".join([ - "n", "s", "c", "d", "j", "m", "t", "l", "qu", - # i exclude "quelqu'un"/"quelqu'un" because it's one token (and not quelque + un, which lack the idea of 'one person'). 
- fr"quelqu(?![{_apostrophe}]un[ex]*\b)", # quelque - "jusqu", "presqu", "lorsqu", "puisqu", "quoiqu", -]) -# fmt: on +_exc = { + "St": [{ORTH: "St", NORM: "Saint"}], + "Ste": [{ORTH: "Ste", NORM: "Sainte"}], + "Mme": [{ORTH: "Mme", NORM: "Madame"}], + "Mr.": [{ORTH: "Mr", NORM: "Monsieur"}], + "M.": [{ORTH: "M.", NORM: "Monsieur"}], + "Mlle": [{ORTH: "Mlle", NORM: "Mademoiselle"}], + "Dr": [{ORTH: "Dr", NORM: "Docteur"}], + "Dresse": [{ORTH: "Dresse", NORM: "Doctoresse"}], + "Drsse": [{ORTH: "Drsse", NORM: "Doctoresse"}], + "etc": [{ORTH: "etc", NORM: "etcaetera"}], +} -_elision = rf"(?:\b(?:{_prefix_elision})[{_apostrophe}])" -_inversion = ( - rf"(?:(?<=[^\W\d])[{_hyphen}]\b(?:{_suffix_inversion})\b)" -) - -TOKEN_MATCH = re.compile( - r"(?iu)" + r"|".join([_inversion, _elision]) -) -# _abbrevs = ["ste?", "mme", "mr?", "mlle", "dr", "etc", "cf"] +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)