modified:   __init__.py
modified:   punctuation.py
modified:   tokenizer_exceptions.py
thjbbvlt 2024-03-15 11:55:27 +01:00
parent 5a3928fe1e
commit ef4c65598a
3 changed files with 92 additions and 62 deletions

__init__.py

@@ -5,10 +5,14 @@ from thinc.api import Model
 from ...language import BaseDefaults, Language
 from .lemmatizer import FrenchLemmatizer
 from .lex_attrs import LEX_ATTRS
-from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .punctuation import (
+    TOKENIZER_INFIXES,
+    TOKENIZER_PREFIXES,
+    TOKENIZER_SUFFIXES,
+)
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
-from .tokenizer_exceptions import TOKEN_MATCH, TOKENIZER_EXCEPTIONS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


 class FrenchDefaults(BaseDefaults):
@@ -16,7 +20,6 @@ class FrenchDefaults(BaseDefaults):
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
-    token_match = TOKEN_MATCH
     lex_attr_getters = LEX_ATTRS
     syntax_iterators = SYNTAX_ITERATORS
     stop_words = STOP_WORDS
@@ -47,7 +50,12 @@ def make_lemmatizer(
     scorer: Optional[Callable],
 ):
     return FrenchLemmatizer(
-        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+        nlp.vocab,
+        model,
+        name,
+        mode=mode,
+        overwrite=overwrite,
+        scorer=scorer,
     )
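
Note: with TOKEN_MATCH gone, elision and inversion splitting now rides entirely on the prefix and infix rules defined in punctuation.py below. A minimal sketch of how one might exercise the new defaults, assuming spaCy is installed (the example strings are illustrative, not taken from the commit):

# Hedged sketch: exercising FrenchDefaults after this change; the exact
# splits depend on the rules defined in punctuation.py.
import spacy

nlp = spacy.blank("fr")  # picks up FrenchDefaults: prefixes/infixes/suffixes, no token_match
print([t.text for t in nlp("l'avion")])  # elision should now be split by the prefix rule
print([t.text for t in nlp("a-t-on")])   # inversion should now be split by the infix rule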

punctuation.py

@@ -7,29 +7,80 @@ from ..char_classes import (
     LIST_ELLIPSES,
     LIST_PUNCT,
     LIST_QUOTES,
+    LIST_ICONS,
     UNITS,
     merge_chars,
 )
-from ..punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES

-ELISION = "' ’".replace(" ", "")
+ELISION = " ' ` ´ ".replace(" ", "")
 HYPHENS = r"- – — ‐ ‑".replace(" ", "")
-_prefixes_elision = "d l n"
-_prefixes_elision += " " + _prefixes_elision.upper()
-_hyphen_suffixes = "ce clés elle en il ils je là moi nous on t vous"
-_hyphen_suffixes += " " + _hyphen_suffixes.upper()
-
-_prefixes = TOKENIZER_PREFIXES + [
-    r"(?:({pe})[{el}])(?=[{a}])".format(
-        a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
-    )
-]
-
+# fmt: off
+_suffix_inversion = [
+    "je", "tu", "on", "il", "elle", "iel",
+    "nous", "vous", "elles", "ils", "iels",
+    "moi", "toi", "lui", "leur", "eux",
+    # to avoid matching: Villar-le-bois
+    fr"la(?![{HYPHENS}])",
+    fr"le(?![{HYPHENS}])",
+    fr"les(?![{HYPHENS}])",
+    fr"en(?![{HYPHENS}])", "y",
+    # a-t-on, a-t'on
+    fr"t[{HYPHENS}]??[{ELISION}]?",
+    fr"m[{ELISION}]?",
+    "", "ici",
+]
+_prefix_elision = [
+    "n", "s", "c", "d", "j", "m", "t", "l", "qu",
+    # I exclude "quelqu'un"/"quelqu'une" because it's one token (not
+    # quelque + un, which would lack the sense of 'one person').
+    fr"quelqu(?![{ELISION}]un[ex]*\b)",
+    "jusqu", "presqu", "lorsqu", "puisqu", "quoiqu",
+]
+# fmt: on
-_suffixes = (
+def upperandtitle(a):
+    """[alors, on] -> [alors, Alors, ALORS, on, On, ON]"""
+
+    def fn(i):
+        t = i[0].upper() + i[1:]
+        u = i.upper()
+        return [i, t] if t == u else [i, t, u]
+
+    return [x for y in [fn(i) for i in a] for x in y]
+
+
+_suffix_inversion = r"|".join(upperandtitle(_suffix_inversion))
+_prefix_elision = r"|".join(upperandtitle(_prefix_elision))
+
+_elision = rf"(?:\b(?:{_prefix_elision})[{ELISION}])"
+_inversion = (
+    rf"(?:(?<=[^\W\d])[{HYPHENS}]\b(?:{_suffix_inversion})\b)"
+)
+
+TOKENIZER_PREFIXES = [_elision]
+
+TOKENIZER_INFIXES = (
+    # base list without hyphen regex
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+        ),
+        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+    ]
+    # plus conditional hyphen
+    + [_inversion]
+)
+TOKENIZER_SUFFIXES = (
+    # base list, minus hyphens and English things such as "'s"
     LIST_PUNCT
     + LIST_ELLIPSES
     + LIST_QUOTES
+    + LIST_ICONS
     + [
         r"(?<=[0-9])\+",
         r"(?<=°[FfCcKk])\.",  # °C. -> ["°C", "."]
@@ -40,17 +91,5 @@ _suffixes = (
             al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
         ),
         r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
-        r"(?<=[{a}])[{h}]({hs})".format(
-            a=ALPHA, h=HYPHENS, hs=merge_chars(_hyphen_suffixes)
-        ),
     ]
 )
-
-_infixes = TOKENIZER_INFIXES + [
-    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
-]
-
-TOKENIZER_PREFIXES = _prefixes
-TOKENIZER_SUFFIXES = _suffixes
-TOKENIZER_INFIXES = _infixes
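
Taken together, punctuation.py now derives one prefix regex (elision) and one infix regex (inversion) from the case-expanded word lists. A standalone sketch of that pattern-building approach; the helper is copied from the diff, but ELISION, HYPHENS, and the word lists are abbreviated stand-ins so the snippet runs on its own, not the committed values:

# Standalone sketch of the pattern construction above (abbreviated inputs).
import re

ELISION = "'`´"
HYPHENS = "-"

def upperandtitle(a):
    """[alors, on] -> [alors, Alors, ALORS, on, On, ON]"""
    def fn(i):
        t = i[0].upper() + i[1:]
        u = i.upper()
        return [i, t] if t == u else [i, t, u]
    return [x for y in [fn(i) for i in a] for x in y]

# abbreviated lists; the real code joins the full _prefix_elision
# and _suffix_inversion lists defined in the diff
_prefix_elision = r"|".join(upperandtitle(["l", "d", "j", "qu"]))
_suffix_inversion = r"|".join(upperandtitle(["on", "il", fr"t[{HYPHENS}]??[{ELISION}]?"]))

_elision = rf"(?:\b(?:{_prefix_elision})[{ELISION}])"
_inversion = rf"(?:(?<=[^\W\d])[{HYPHENS}]\b(?:{_suffix_inversion})\b)"

print(re.match(_elision, "l'avion"))    # matches "l'" at the start
print(re.search(_inversion, "a-t-on"))  # matches "-t"; the tokenizer applies the rule repeatedly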

tokenizer_exceptions.py

@@ -1,36 +1,19 @@
-import re
-
 from ...util import update_exc
 from ...symbols import NORM, ORTH
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-_hyphen = "-–—"
-_apostrophe = "'`´"
-
-# fmt: off
-_suffix_inversion = r"|".join([
-    "je", "tu", "on", "il", "elle", "iel",
-    "nous", "vous", "elles", "ils", "iels",
-    "moi", "toi", "lui", "leur",
-    "eux",
-    fr"en(?![{_hyphen}])", "y",
-    # écoutons-les
-    fr"la(?![{_hyphen}])", fr"le(?![{_hyphen}])", fr"les(?![{_hyphen}])",
-    # a-t-il, pourra-t'on, dis-m'en plus
-    fr"t[{_hyphen}]??[{_apostrophe}]?", fr"m[{_apostrophe}]?",
-    "", "ici",
-])
-_prefix_elision = r"|".join([
-    "n", "s", "c", "d", "j", "m", "t", "l", "qu",
-    # I exclude "quelqu'un"/"quelqu'une" because it's one token (not
-    # quelque + un, which would lack the sense of 'one person').
-    fr"quelqu(?![{_apostrophe}]un[ex]*\b)",  # quelque
-    "jusqu", "presqu", "lorsqu", "puisqu", "quoiqu",
-])
-# fmt: on
+_exc = {
+    "St": [{ORTH: "St", NORM: "Saint"}],
+    "Ste": [{ORTH: "Ste", NORM: "Sainte"}],
+    "Mme": [{ORTH: "Mme", NORM: "Madame"}],
+    "Mr.": [{ORTH: "Mr.", NORM: "Monsieur"}],
+    "M.": [{ORTH: "M.", NORM: "Monsieur"}],
+    "Mlle": [{ORTH: "Mlle", NORM: "Mademoiselle"}],
+    "Dr": [{ORTH: "Dr", NORM: "Docteur"}],
+    "Dresse": [{ORTH: "Dresse", NORM: "Doctoresse"}],
+    "Drsse": [{ORTH: "Drsse", NORM: "Doctoresse"}],
+    "etc": [{ORTH: "etc", NORM: "etcaetera"}],
+}
-_elision = rf"(?:\b(?:{_prefix_elision})[{_apostrophe}])"
-_inversion = (
-    rf"(?:(?<=[^\W\d])[{_hyphen}]\b(?:{_suffix_inversion})\b)"
-)
-
-TOKEN_MATCH = re.compile(
-    r"(?iu)" + r"|".join([_inversion, _elision])
-)
-
-# _abbrevs = ["ste?", "mme", "mr?", "mlle", "dr", "etc", "cf"]
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
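
With the regex logic moved out, this module is left as a plain special-cases table. A quick hedged check of how the entries surface in a pipeline built from this branch, assuming spaCy is installed (the sentence is illustrative):

# Hedged sketch: inspect the special-case norms defined in _exc above.
import spacy

nlp = spacy.blank("fr")
doc = nlp("Dr Dupont habite St Denis.")
print([(t.text, t.norm_) for t in doc])
# expectation per _exc: "Dr" -> norm "Docteur", "St" -> norm "Saint"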