Mirror of https://github.com/explosion/spaCy.git (synced 2025-07-12 09:12:21 +03:00)
Merge aee667a781 into b3c46c315e
This commit is contained in: commit 3e577b358e
@@ -8,7 +8,7 @@ from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
-from .tokenizer_exceptions import TOKEN_MATCH, TOKENIZER_EXCEPTIONS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


 class FrenchDefaults(BaseDefaults):
@@ -16,7 +16,6 @@ class FrenchDefaults(BaseDefaults):
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
-    token_match = TOKEN_MATCH
     lex_attr_getters = LEX_ATTRS
     syntax_iterators = SYNTAX_ITERATORS
     stop_words = STOP_WORDS
@@ -47,7 +46,12 @@ def make_lemmatizer(
     scorer: Optional[Callable],
 ):
     return FrenchLemmatizer(
-        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+        nlp.vocab,
+        model,
+        name,
+        mode=mode,
+        overwrite=overwrite,
+        scorer=scorer,
     )
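With `token_match` dropped from `FrenchDefaults` above, strings such as ISO dates rely on the reworked infix rules rather than a whole-token regex. A quick check sketch, assuming spaCy with this branch's French data is installed:

import spacy

nlp = spacy.blank("fr")  # builds the tokenizer from FrenchDefaults
doc = nlp("Je ne sais pas mais on y va depuis le 2023-12-21.")
print([t.text for t in doc])  # "2023-12-21" should come out as a single token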
(File diff suppressed because it is too large)
@@ -5,7 +5,6 @@ Example sentences to test spaCy and its language models.
 >>> docs = nlp.pipe(sentences)
 """

-
 sentences = [
     "Apple cherche à acheter une start-up anglaise pour 1 milliard de dollars",
     "Les voitures autonomes déplacent la responsabilité de l'assurance vers les constructeurs",
@@ -19,4 +18,18 @@ sentences = [
     "Qui est le président de la France ?",
     "Où est la capitale des États-Unis ?",
     "Quand est né Barack Obama ?",
+    "Où va-t'on?",
+    "Je ne sais pas mais on y va depuis le 2023-12-21.",
+    "Qu'en est-t-il des autres?",
+    "Sont-iels à Villar-le-ruisseau?",
+    "Et les non-humain-es?",
+    "Et le produit anti-nominaliste?",
+    "T'en as? Tu m'en donnnes?",
+    "Sinon mets-en un peu par terre.",
+    "il n'y a plus rien ici, enfin j'crois, nos p'tites affaires ont été enl'vées.",
+    "aujourd'hui, c'est comme ça.",
+    "un.e directeur.ice, des employé.es.",
+    "des non-humain-es étaient là aussi, visiblement heureux·ses.",
+    "j'ai trouvé ça surhttps://site_inexistant.fr/accueil#milieu ou www.quelque_part.com/ je pense.",
+    "ou alors le 21/12 oui c'est ça c'était le 21/12/2023... ou alors le 12.02.2005",
 ]
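The added sentences exercise elision, verb-subject inversion, inclusive forms and glued URLs. A usage sketch following the module docstring (assumes spaCy with this branch applied is installed, so that `sentences` is the list above):

import spacy
from spacy.lang.fr.examples import sentences

nlp = spacy.blank("fr")
for doc in nlp.pipe(sentences):
    print([t.text for t in doc])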
@@ -5,31 +5,83 @@ from ..char_classes import (
     CONCAT_QUOTES,
     CURRENCY,
     LIST_ELLIPSES,
+    LIST_ICONS,
     LIST_PUNCT,
     LIST_QUOTES,
     UNITS,
-    merge_chars,
 )
-from ..punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES

-ELISION = "' ’".replace(" ", "")
+ELISION = " ' ` ´ ’ ".replace(" ", "")
 HYPHENS = r"- – — ‐ ‑".replace(" ", "")
-_prefixes_elision = "d l n"
-_prefixes_elision += " " + _prefixes_elision.upper()
-_hyphen_suffixes = "ce clés elle en il ils je là moi nous on t vous"
-_hyphen_suffixes += " " + _hyphen_suffixes.upper()

-_prefixes = TOKENIZER_PREFIXES + [
-    r"(?:({pe})[{el}])(?=[{a}])".format(
-        a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
-    )
+# fmt: off
+_suffix_inversion = [
+    "je", "tu", "on", "il", "elle", "iel",
+    "nous", "vous", "elles", "ils", "iels",
+    "moi", "toi", "lui", "leur", "eux", "elleux",
+    "ce", "ici", "là",
+    # to avoid matching: Villar-le-bois
+    fr"la(?![{HYPHENS}])",
+    fr"le(?![{HYPHENS}])",
+    fr"les(?![{HYPHENS}])",
+    fr"en(?![{HYPHENS}])", "y",
+    # a-t-on, a-t'on
+    fr"t[{HYPHENS}]??[{ELISION}]?",
+    fr"m[{ELISION}]?",
 ]
+_prefix_elision = [
+    "n", "s", "c", "d", "j", "m", "t", "l", "qu",
+    # i exclude "quelqu'un"/"quelqu'un" because it's one token (and not quelque + un, which would lacks the idea of 'one person').
+    fr"quelqu(?![{ELISION}]un[ex]*\b)",
+    "jusqu", "presqu", "lorsqu", "puisqu", "quoiqu",
+]
+# fmt: on

-_suffixes = (
+
+def upperandtitle(a):
+    """[alors, on] -> [alors, Alors, ALORS, on, On, ON]"""
+
+    def fn(i):
+        t = i[0].upper() + i[1:]
+        u = i.upper()
+        return [i, t] if t == u else [i, t, u]
+
+    return [x for y in [fn(i) for i in a] for x in y]
+
+
+_elision = rf"(?:\b(?:{r'|'.join(upperandtitle(_prefix_elision))})[{ELISION}])"
+_inversion = (
+    rf"(?:(?<=[^\W\d])[{HYPHENS}]\b(?:{r'|'.join(upperandtitle(_suffix_inversion))})\b)"
+)
+
+TOKENIZER_PREFIXES = [_elision]
+
+TOKENIZER_INFIXES = (
+    # base list without hyphen regex
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+        ),
+        # r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+        # "2024-12-20" should be one token, so i remove "\-" from the above default regex
+        r"(?<=[0-9])[+\*^](?=[0-9-])",
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=\d),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}]),(?=[\d])".format(a=ALPHA),
+        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+    ]
+    # conditionnal hyphen: verb-subject inversion
+    + [_inversion]
+)
+
+TOKENIZER_SUFFIXES = (
+    # base list, les hyphens and english things such as "'s"
     LIST_PUNCT
     + LIST_ELLIPSES
     + LIST_QUOTES
+    + LIST_ICONS
     + [
         r"(?<=[0-9])\+",
         r"(?<=°[FfCcKk])\.",  # °C. -> ["°C", "."]
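The `upperandtitle` helper introduced above expands each prefix or pronoun into its lowercase, Title and UPPER variants before they are joined into the elision and inversion regexes. A standalone sketch of the same idea, re-implemented here with plain `re` for illustration (not imported from spaCy):

import re

def upperandtitle(words):
    # [alors, on] -> [alors, Alors, ALORS, on, On, ON]
    out = []
    for w in words:
        title = w[0].upper() + w[1:]
        out.extend([w, title] if title == w.upper() else [w, title, w.upper()])
    return out

ELISION = "'’"  # simplified character set for the sketch
prefixes = upperandtitle(["j", "l", "qu"])  # -> ['j', 'J', 'l', 'L', 'qu', 'Qu', 'QU']
elision_re = re.compile(r"\b(?:" + "|".join(prefixes) + r")[" + ELISION + r"]")
print(elision_re.match("j'ai"))   # matches "j'", which the tokenizer splits off as a prefix
print(elision_re.match("Qu'en"))  # matches "Qu'"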
@@ -40,17 +92,5 @@ _suffixes = (
             al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
         ),
         r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
-        r"(?<=[{a}])[{h}]({hs})".format(
-            a=ALPHA, h=HYPHENS, hs=merge_chars(_hyphen_suffixes)
-        ),
     ]
 )
-
-_infixes = TOKENIZER_INFIXES + [
-    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
-]
-
-
-TOKENIZER_PREFIXES = _prefixes
-TOKENIZER_SUFFIXES = _suffixes
-TOKENIZER_INFIXES = _infixes
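spaCy compiles prefix/infix/suffix lists like the ones above into the tokenizer's regexes via `spacy.util.compile_prefix_regex` and friends. A sketch of the standard customization pattern, assuming spaCy is installed (the extra rule is purely hypothetical):

import spacy
from spacy.util import compile_infix_regex

nlp = spacy.blank("fr")
# hypothetical extra infix: also split multiplications like "3x4"
infixes = list(nlp.Defaults.infixes) + [r"(?<=[0-9])x(?=[0-9])"]
nlp.tokenizer.infix_finditer = compile_infix_regex(infixes).finditer
print([t.text for t in nlp("une grille de 3x4 cases")])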
@@ -1,443 +1,44 @@
-import re
+from ...symbols import NORM, ORTH

-from ...symbols import ORTH
 from ...util import update_exc
-from ..char_classes import ALPHA, ALPHA_LOWER
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from .punctuation import ELISION, HYPHENS

-# not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer
-# from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
-FR_BASE_EXCEPTIONS = ["aujourd'hui", "Aujourd'hui"]
-
-
-def upper_first_letter(text):
-    if len(text) == 0:
-        return text
-    if len(text) == 1:
-        return text.upper()
-    return text[0].upper() + text[1:]
-
-
-def lower_first_letter(text):
-    if len(text) == 0:
-        return text
-    if len(text) == 1:
-        return text.lower()
-    return text[0].lower() + text[1:]
-
-
-_exc = {"J.-C.": [{ORTH: "J."}, {ORTH: "-C."}]}
-
-
-for exc_data in [
-    {ORTH: "av."},
-    {ORTH: "janv."},
-    {ORTH: "févr."},
-    {ORTH: "avr."},
-    {ORTH: "juill."},
-    {ORTH: "sept."},
-    {ORTH: "oct."},
-    {ORTH: "nov."},
-    {ORTH: "déc."},
-    {ORTH: "apr."},
-    {ORTH: "Dr."},
-    {ORTH: "M."},
-    {ORTH: "Mr."},
-    {ORTH: "Mme."},
-    {ORTH: "Mlle."},
-    {ORTH: "n°"},
-    {ORTH: "d°"},
-    {ORTH: "St."},
-    {ORTH: "Ste."},
-]:
-    _exc[exc_data[ORTH]] = [exc_data]
-
-
-for orth in [
-    "après-midi",
-    "au-delà",
-    "au-dessus",
-    "celle-ci",
-    "celles-ci",
-    "celui-ci",
-    "cf.",
-    "ci-dessous",
-    "elle-même",
-    "en-dessous",
-    "etc.",
-    "jusque-là",
-    "lui-même",
-    "MM.",
-    "No.",
-    "peut-être",
-    "pp.",
-    "quelques-uns",
-    "rendez-vous",
-    "Vol.",
-]:
-    _exc[orth] = [{ORTH: orth}]
-
-
-for verb in [
-    "a",
-    "est",
-    "semble",
-    "indique",
-    "moque",
-    "passe",
-]:
-    for orth in [verb, verb.title()]:
-        for pronoun in ["elle", "il", "on"]:
-            token = f"{orth}-t-{pronoun}"
-            _exc[token] = [{ORTH: orth}, {ORTH: "-t"}, {ORTH: "-" + pronoun}]
-
-for verb in ["est"]:
-    for orth in [verb, verb.title()]:
-        _exc[f"{orth}-ce"] = [{ORTH: orth}, {ORTH: "-ce"}]
-
-
-for pre in ["qu'", "n'"]:
-    for orth in [pre, pre.title()]:
-        _exc[f"{orth}est-ce"] = [{ORTH: orth}, {ORTH: "est"}, {ORTH: "-ce"}]
-
-
-for verb, pronoun in [("est", "il"), ("EST", "IL")]:
-    _exc[f"{verb}-{pronoun}"] = [{ORTH: verb}, {ORTH: "-" + pronoun}]
-
-
-for s, verb, pronoun in [("s", "est", "il"), ("S", "EST", "IL")]:
-    _exc[f"{s}'{verb}-{pronoun}"] = [
-        {ORTH: s + "'"},
-        {ORTH: verb},
-        {ORTH: "-" + pronoun},
-    ]
-
-
-_infixes_exc = []  # type: ignore[var-annotated]
-orig_elision = "'"
-orig_hyphen = "-"
-
-# loop through the elison and hyphen characters, and try to substitute the ones that weren't used in the original list
-for infix in FR_BASE_EXCEPTIONS:
-    variants_infix = {infix}
-    for elision_char in [x for x in ELISION if x != orig_elision]:
-        variants_infix.update(
-            [word.replace(orig_elision, elision_char) for word in variants_infix]
-        )
-    for hyphen_char in [x for x in ["-", "‐"] if x != orig_hyphen]:
-        variants_infix.update(
-            [word.replace(orig_hyphen, hyphen_char) for word in variants_infix]
-        )
-    variants_infix.update([upper_first_letter(word) for word in variants_infix])
-    _infixes_exc.extend(variants_infix)
-
-for orth in _infixes_exc:
-    _exc[orth] = [{ORTH: orth}]
-
-
-_hyphen_prefix = [
-    "a[ée]ro",
-    "abat",
-    "a[fg]ro",
-    "after",
-    "aigues?",
-    "am[ée]ricano",
-    "anglo",
-    "anti",
-    "apr[èe]s",
-    "arabo",
-    "arcs?",
-    "archi",
-    "arrières?",
-    "audio",
-    "avant",
-    "avion",
-    "auto",
-    "banc",
-    "bas(?:ses?)?",
-    "bateaux?",
-    "bec?",
-    "belles?",
-    "beau",
-    "best",
-    "bio?",
-    "bien",
-    "blanc",
-    "bo[îi]te",
-    "bonn?e?s?",
-    "bois",
-    "bou(?:c|rg)",
-    "b[êe]ta",
-    "cache",
-    "cap(?:ello)?",
-    "casse",
-    "castel",
-    "champ",
-    "chapelle",
-    "ch[âa]teau(?:neuf)?",
-    "chasse",
-    "cha(?:ud|t)e?s?",
-    "chauffe",
-    "chou",
-    "chromo",
-    "claire?s?",
-    "co(?:de|ca)?",
-    "compte",
-    "contre",
-    "cordon",
-    "coupe?",
-    "courte?s?",
-    "couvre",
-    "crash",
-    "crise",
-    "croche",
-    "cross",
-    "cyber",
-    "côte",
-    "demi",
-    "di(?:sney)?",
-    "dix",
-    "d[ée]s?",
-    "dys",
-    "ex?",
-    "émirato",
-    "entre",
-    "est",
-    "ethno",
-    "ex",
-    "extra",
-    "extrême",
-    "[ée]co",
-    "faux",
-    "fil",
-    "fort",
-    "franco?s?",
-    "gallo",
-    "gardes?",
-    "gastro",
-    "grande?",
-    "gratte",
-    "gr[ée]co",
-    "gros",
-    "g[ée]o",
-    "haute?s?",
-    "homm?es?",
-    "hors",
-    "hyper",
-    "indo",
-    "infra",
-    "inter",
-    "intra",
-    "islamo",
-    "italo",
-    "jean",
-    "labio",
-    "latino",
-    "live",
-    "lot",
-    "louis",
-    "m[ai]cro",
-    "mal",
-    "médio",
-    "mesnil",
-    "mi(?:ni)?",
-    "mono",
-    "mont?s?",
-    "moyen",
-    "multi",
-    "m[ée]cano",
-    "m[ée]dico",
-    "m[ée]do",
-    "m[ée]ta",
-    "mots?",
-    "neuro",
-    "noix",
-    "non",
-    "nord",
-    "notre",
-    "n[ée]o",
-    "ouest",
-    "outre",
-    "ouvre",
-    "passe",
-    "perce",
-    "pharmaco",
-    "ph[oy]to",
-    "pieds?",
-    "pique",
-    "poissons?",
-    "ponce",
-    "pont",
-    "po[rs]t",
-    "pousse",
-    "primo",
-    "pro(?:cès|to)?",
-    "pare",
-    "petite?s?",
-    "plessis",
-    "porte",
-    "pré",
-    "prêchi",
-    "protège",
-    "pseudo",
-    "pêle",
-    "péri",
-    "puy",
-    "quasi",
-    "quatre",
-    "radio",
-    "recourt",
-    "rythmo",
-    "(?:re)?doubles?",
-    "r[ée]",
-    "r[ée]tro",
-    "requin",
-    "sans?",
-    "sa?inte?s?",
-    "semi",
-    "serre",
-    "sino",
-    "socio",
-    "sociale?s?",
-    "soixante",
-    "sous",
-    "su[bdrs]",
-    "super",
-    "taille",
-    "tire",
-    "thermo",
-    "tiers",
-    "tourne",
-    "toute?s?",
-    "tra[iî]ne?",
-    "trans",
-    "trente",
-    "trois",
-    "trousse",
-    "tr(?:i|ou)",
-    "t[ée]l[ée]",
-    "utéro",
-    "vaso",
-    "vi[cd]e",
-    "vid[ée]o",
-    "vie(?:ux|i?lles?|i?l)",
-    "vill(?:e|eneuve|ers|ette|iers|y)",
-    "vingt",
-    "voitures?",
-    "wagons?",
-    "ultra",
-    "à",
-    "[ée]lectro",
-    "[ée]qui",
-    "Fontaine",
-    "La Chapelle",
-    "Marie",
-    "Le Mesnil",
-    "Neuville",
-    "Pierre",
-    "Val",
-    "Vaux",
-]
-
-_regular_exp = [
-    "^a[{hyphen}]sexualis[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^arginine[{hyphen}]méthyl[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^binge[{hyphen}]watch[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^black[{hyphen}]out[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^bouche[{hyphen}]por[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^burn[{hyphen}]out[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^by[{hyphen}]pass[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^ch[{elision}]tiis[{al}]+$".format(elision=ELISION, al=ALPHA_LOWER),
-    "^chape[{hyphen}]chut[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^down[{hyphen}]load[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^[ée]tats[{hyphen}]uni[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^droits?[{hyphen}]de[{hyphen}]l'homm[{al}]+$".format(
-        hyphen=HYPHENS, al=ALPHA_LOWER
-    ),
-    "^fac[{hyphen}]simil[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^fleur[{hyphen}]bleuis[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^flic[{hyphen}]flaqu[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^fox[{hyphen}]trott[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^google[{hyphen}]is[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^hard[{hyphen}]discount[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^hip[{hyphen}]hop[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^jet[{hyphen}]set[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^knock[{hyphen}]out[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^lèche[{hyphen}]bott[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^litho[{hyphen}]typographi[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^lock[{hyphen}]out[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^lombri[{hyphen}]compost[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^mac[{hyphen}]adamis[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^marque[{hyphen}]pag[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^mouton[{hyphen}]noiris[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^new[{hyphen}]york[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^pair[{hyphen}]programm[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^people[{hyphen}]is[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^plan[{hyphen}]socialis[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^premier[{hyphen}]ministr[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^prud[{elision}]hom[{al}]+$".format(elision=ELISION, al=ALPHA_LOWER),
-    "^réarc[{hyphen}]bout[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^refox[{hyphen}]trott[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^remicro[{hyphen}]ond[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^repique[{hyphen}]niqu[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^repetit[{hyphen}]déjeun[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^rick[{hyphen}]roll[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^rond[{hyphen}]ponn[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^shift[{hyphen}]cliqu[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^soudo[{hyphen}]bras[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^stabilo[{hyphen}]boss[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^strip[{hyphen}]teas[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^terra[{hyphen}]form[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^teuf[{hyphen}]teuf[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^yo[{hyphen}]yo[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^zig[{hyphen}]zag[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
-    "^z[{elision}]yeut[{al}]+$".format(elision=ELISION, al=ALPHA_LOWER),
-]
-# catching cases like faux-vampire
-_regular_exp += [
-    "^{prefix}[{hyphen}][{al}][{hyphen}{al}{elision}]*$".format(
-        prefix=p,
-        hyphen=HYPHENS,  # putting the - first in the [] range avoids having to use a backslash
-        elision=ELISION,
-        al=ALPHA_LOWER,
-    )
-    for p in _hyphen_prefix
-]
-
-# catching cases like entr'abat
-_elision_prefix = ["r?é?entr", "grande?s?", "r"]
-_regular_exp += [
-    "^{prefix}[{elision}][{al}][{hyphen}{al}{elision}]*$".format(
-        prefix=p, elision=ELISION, hyphen=HYPHENS, al=ALPHA_LOWER
-    )
-    for p in _elision_prefix
-]
-
-# catching cases like saut-de-ski, pet-en-l'air
-_hyphen_combination = [
-    "l[èe]s?",
-    "la",
-    "en",
-    "des?",
-    "d[eu]",
-    "sur",
-    "sous",
-    "aux?",
-    "à",
-    "et",
-    "près",
-    "saint",
-]
-_regular_exp += [
-    "^[{a}]+[{hyphen}]{hyphen_combo}[{hyphen}](?:l[{elision}])?[{a}]+$".format(
-        hyphen_combo=hc, elision=ELISION, hyphen=HYPHENS, a=ALPHA
-    )
-    for hc in _hyphen_combination
-]
-
-
+_exc = {
+    "St": [{ORTH: "St", NORM: "Saint"}],
+    "St.": [{ORTH: "St.", NORM: "Saint"}],
+    "Ste": [{ORTH: "Ste", NORM: "Sainte"}],
+    "Mme": [{ORTH: "Mme", NORM: "Madame"}],
+    "Mr": [{ORTH: "Mr", NORM: "Monsieur"}],
+    "Mr.": [{ORTH: "Mr.", NORM: "Monsieur"}],
+    "M.": [{ORTH: "M.", NORM: "Monsieur"}],
+    "Mlle": [{ORTH: "Mlle", NORM: "Mademoiselle"}],
+    "Dr": [{ORTH: "Dr", NORM: "Docteur"}],
+    "Dr.": [{ORTH: "Dr.", NORM: "Docteur"}],
+    "Dresse": [{ORTH: "Dresse", NORM: "Doctoresse"}],
+    "Drsse": [{ORTH: "Drsse", NORM: "Doctoresse"}],
+    "etc": [{ORTH: "etc", NORM: "etcaetera"}],
+    "etc.": [{ORTH: "etc.", NORM: "etcaetera"}],
+    # months
+    "jan.": [{ORTH: "jan.", NORM: "janvier"}],
+    "janv.": [{ORTH: "janv.", NORM: "janvier"}],
+    "fév.": [{ORTH: "fév.", NORM: "février"}],
+    "févr.": [{ORTH: "févr.", NORM: "avril"}],
+    "avr.": [{ORTH: "avr.", NORM: "avril"}],
+    "av.": [{ORTH: "av.", NORM: "juin"}],
+    "juil.": [{ORTH: "juil.", NORM: "juillet"}],
+    "juill.": [{ORTH: "juill.", NORM: "juillet"}],
+    "sept.": [{ORTH: "sept.", NORM: "septembre"}],
+    "oct.": [{ORTH: "oct.", NORM: "octobre"}],
+    "nov.": [{ORTH: "nov.", NORM: "novembre"}],
+    "déc.": [{ORTH: "déc.", NORM: "décembre"}],
+    "dec.": [{ORTH: "dec.", NORM: "décembre"}],
+    # days
+    "lun.": [{ORTH: "lun.", NORM: "lundi"}],
+    "mar.": [{ORTH: "mar.", NORM: "mardi"}],
+    "mer.": [{ORTH: "mer.", NORM: "mercredi"}],
+    "jeu.": [{ORTH: "jeu.", NORM: "jeudi"}],
+    "ven.": [{ORTH: "ven.", NORM: "vendredi"}],
+    "sam.": [{ORTH: "sam.", NORM: "samedi"}],
+    "dim.": [{ORTH: "dim.", NORM: "dimanche"}],
+}

 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
-TOKEN_MATCH = re.compile(
-    "(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp)
-).match
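The rewritten exceptions keep each abbreviation as a single token but attach a NORM, which surfaces as `Token.norm_`. A quick check sketch (assumes spaCy with this branch's French data is installed):

import spacy

nlp = spacy.blank("fr")
doc = nlp("Mme Dupont passe dim. matin.")
print([(t.text, t.norm_) for t in doc])
# expected to include ("Mme", "Madame") and ("dim.", "dimanche")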