French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)

* merge changes of PR 3023 into develop branch instead of master

* further deletions from exception list according to PR 3023
This commit is contained in:
Sofie 2018-12-16 18:04:55 +01:00 committed by Ines Montani
parent 7bbdffd36e
commit c6ad557cea
3 changed files with 155 additions and 18080 deletions

File diff suppressed because it is too large Load Diff

View File

@ -53,7 +53,7 @@ for exc_data in [
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
for orth in FR_BASE_EXCEPTIONS + ["etc."]: for orth in ["etc."]:
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
@ -93,14 +93,18 @@ for pre, pre_lemma in [("qu'", "que"), ("n'", "ne")]:
_infixes_exc = [] _infixes_exc = []
for elision_char in ELISION: orig_elision = "'"
for hyphen_char in ["-", ""]: orig_hyphen = '-'
_infixes_exc += [
infix.replace("'", elision_char).replace("-", hyphen_char) # loop through the elision and hyphen characters, and try to substitute the ones that weren't used in the original list
for infix in FR_BASE_EXCEPTIONS for infix in FR_BASE_EXCEPTIONS:
] variants_infix = {infix}
_infixes_exc += [upper_first_letter(word) for word in _infixes_exc] for elision_char in [x for x in ELISION if x != orig_elision]:
_infixes_exc = list(set(_infixes_exc)) variants_infix.update([word.replace(orig_elision, elision_char) for word in variants_infix])
for hyphen_char in [x for x in ['-', ''] if x != orig_hyphen]:
variants_infix.update([word.replace(orig_hyphen, hyphen_char) for word in variants_infix])
variants_infix.update([upper_first_letter(word) for word in variants_infix])
_infixes_exc.extend(variants_infix)
for orth in _infixes_exc: for orth in _infixes_exc:
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
@ -111,6 +115,7 @@ _hyphen_prefix = [
"abat", "abat",
"a[fg]ro", "a[fg]ro",
"after", "after",
"aigues?",
"am[ée]ricano", "am[ée]ricano",
"anglo", "anglo",
"anti", "anti",
@ -120,24 +125,33 @@ _hyphen_prefix = [
"archi", "archi",
"arrières?", "arrières?",
"avant", "avant",
"avion",
"auto", "auto",
"banc", "banc",
"bas(?:ses?)?", "bas(?:ses?)?",
"bateaux?",
"bec?", "bec?",
"belles?",
"beau",
"best", "best",
"bio?", "bio?",
"bien", "bien",
"blanc", "blanc",
"bo[îi]te", "bo[îi]te",
"bonn?e?s?",
"bois", "bois",
"bou(?:c|rg)", "bou(?:c|rg)",
"b[êe]ta", "b[êe]ta",
"cache", "cache",
"cap(?:ello)?", "cap(?:ello)?",
"casse",
"castel",
"champ", "champ",
"chapelle", "chapelle",
"ch[âa]teau", "ch[âa]teau(?:neuf)?",
"chasse",
"cha(?:ud|t)e?s?", "cha(?:ud|t)e?s?",
"chauffe",
"chou", "chou",
"chromo", "chromo",
"claire?s?", "claire?s?",
@ -146,7 +160,8 @@ _hyphen_prefix = [
"contre", "contre",
"cordon", "cordon",
"coupe?", "coupe?",
"court", "courte?s?",
"couvre",
"crash", "crash",
"crise", "crise",
"croche", "croche",
@ -155,15 +170,19 @@ _hyphen_prefix = [
"côte", "côte",
"demi", "demi",
"di(?:sney)?", "di(?:sney)?",
"dix",
"d[ée]s?", "d[ée]s?",
"double",
"dys", "dys",
"ex?",
"émirato",
"entre", "entre",
"est", "est",
"ethno", "ethno",
"ex",
"extra", "extra",
"extrême", "extrême",
"[ée]co", "[ée]co",
"faux",
"fil", "fil",
"fort", "fort",
"franco?s?", "franco?s?",
@ -176,6 +195,8 @@ _hyphen_prefix = [
"gros", "gros",
"g[ée]o", "g[ée]o",
"haute?s?", "haute?s?",
"homm?es?",
"hors",
"hyper", "hyper",
"indo", "indo",
"infra", "infra",
@ -190,6 +211,8 @@ _hyphen_prefix = [
"lot", "lot",
"louis", "louis",
"m[ai]cro", "m[ai]cro",
"mal",
"médio",
"mesnil", "mesnil",
"mi(?:ni)?", "mi(?:ni)?",
"mono", "mono",
@ -201,6 +224,7 @@ _hyphen_prefix = [
"m[ée]do", "m[ée]do",
"m[ée]ta", "m[ée]ta",
"mots?", "mots?",
"neuro",
"noix", "noix",
"non", "non",
"nord", "nord",
@ -213,60 +237,137 @@ _hyphen_prefix = [
"perce", "perce",
"pharmaco", "pharmaco",
"ph[oy]to", "ph[oy]to",
"pieds?",
"pique", "pique",
"poissons?", "poissons?",
"ponce", "ponce",
"pont", "pont",
"po[rs]t", "po[rs]t",
"pousse",
"primo", "primo",
"pro(?:cès|to)?", "pro(?:cès|to)?",
"pare", "pare",
"petite?", "petite?s?",
"plessis",
"porte", "porte",
"pré", "pré",
"prêchi", "prêchi",
"protège",
"pseudo", "pseudo",
"pêle", "pêle",
"péri", "péri",
"puy", "puy",
"quasi", "quasi",
"quatre",
"radio",
"recourt", "recourt",
"rythmo", "rythmo",
"(?:re)?doubles?",
"r[ée]", "r[ée]",
"r[ée]tro", "r[ée]tro",
"sans", "requin",
"sainte?s?", "sans?",
"sa?inte?s?",
"semi", "semi",
"social", "serre",
"sino",
"socio",
"sociale?s?",
"soixante",
"sous", "sous",
"su[bdr]", "su[bdrs]",
"super", "super",
"taille",
"tire", "tire",
"thermo", "thermo",
"tiers", "tiers",
"tourne",
"toute?s?",
"tra[iî]ne?",
"trans", "trans",
"trente",
"trois",
"trousse",
"tr(?:i|ou)", "tr(?:i|ou)",
"t[ée]l[ée]", "t[ée]l[ée]",
"utéro",
"vaso",
"vi[cd]e", "vi[cd]e",
"vid[ée]o", "vid[ée]o",
"vie(?:ux|illes?)", "vie(?:ux|i?lles?|i?l)",
"vill(?:e|eneuve|ers|ette|iers|y)", "vill(?:e|eneuve|ers|ette|iers|y)",
"vingt",
"voitures?",
"wagons?",
"ultra", "ultra",
"à", "à",
"[ée]lectro", "[ée]lectro",
"[ée]qui", "[ée]qui",
"Fontaine",
"La Chapelle",
"Marie",
"Le Mesnil",
"Neuville",
"Pierre",
"Val",
"Vaux",
] ]
_elision_prefix = ["entr", "grande?s?"]
_other_hyphens = "".join([h for h in HYPHENS if h != "-"]) _other_hyphens = "".join([h for h in HYPHENS if h != "-"])
_regular_exp = [ _regular_exp = [
"^droits?[{hyphen}]de[{hyphen}]l'homm[{alpha}]+$".format( "^a[{hyphen}]sexualis[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
hyphen=HYPHENS, alpha=ALPHA_LOWER "^arginine[{hyphen}]méthyl[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
), "^binge[{hyphen}]watch[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^black[{hyphen}]out[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^bouche[{hyphen}]por[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^burn[{hyphen}]out[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^by[{hyphen}]pass[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^ch[{elision}]tiis[{alpha}]+$".format(elision=ELISION, alpha=ALPHA_LOWER),
"^chape[{hyphen}]chut[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^down[{hyphen}]load[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^[ée]tats[{hyphen}]uni[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^droits?[{hyphen}]de[{hyphen}]l'homm[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^fac[{hyphen}]simil[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^fleur[{hyphen}]bleuis[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^flic[{hyphen}]flaqu[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^fox[{hyphen}]trott[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^google[{hyphen}]is[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^hard[{hyphen}]discount[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^hip[{hyphen}]hop[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^jet[{hyphen}]set[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^knock[{hyphen}]out[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^lèche[{hyphen}]bott[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^litho[{hyphen}]typographi[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^lock[{hyphen}]out[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^lombri[{hyphen}]compost[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^mac[{hyphen}]adamis[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^marque[{hyphen}]pag[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^mouton[{hyphen}]noiris[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^new[{hyphen}]york[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^pair[{hyphen}]programm[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^people[{hyphen}]is[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^plan[{hyphen}]socialis[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^premier[{hyphen}]ministr[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^prud[{elision}]hom[{alpha}]+$".format(elision=ELISION, alpha=ALPHA_LOWER),
"^réarc[{hyphen}]bout[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^refox[{hyphen}]trott[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^remicro[{hyphen}]ond[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^repique[{hyphen}]niqu[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^repetit[{hyphen}]déjeun[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^rick[{hyphen}]roll[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^rond[{hyphen}]ponn[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^shift[{hyphen}]cliqu[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^soudo[{hyphen}]bras[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^stabilo[{hyphen}]boss[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^strip[{hyphen}]teas[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^terra[{hyphen}]form[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^teuf[{hyphen}]teuf[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^yo[{hyphen}]yo[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^zig[{hyphen}]zag[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER), "^zig[{hyphen}]zag[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
"^prud[{elision}]homm[{alpha}]*$".format(elision=ELISION, alpha=ALPHA_LOWER), "^z[{elision}]yeut[{alpha}]+$".format(elision=ELISION, alpha=ALPHA_LOWER),
] ]
# catching cases like faux-vampire
_regular_exp += [ _regular_exp += [
"^{prefix}[{hyphen}][{alpha}][{alpha}{elision}{other_hyphen}\-]*$".format( "^{prefix}[{hyphen}][{alpha}][{alpha}{elision}{other_hyphen}\-]*$".format(
prefix=p, prefix=p,
@ -277,12 +378,32 @@ _regular_exp += [
) )
for p in _hyphen_prefix for p in _hyphen_prefix
] ]
# catching cases like entr'abat
_elision_prefix = ['r?é?entr', 'grande?s?', 'r']
_regular_exp += [ _regular_exp += [
"^{prefix}[{elision}][{alpha}][{alpha}{elision}{hyphen}\-]*$".format( "^{prefix}[{elision}][{alpha}][{alpha}{elision}{hyphen}\-]*$".format(
prefix=p, elision=HYPHENS, hyphen=_other_hyphens, alpha=ALPHA_LOWER prefix=p,
elision=ELISION,
hyphen=_other_hyphens,
alpha=ALPHA_LOWER,
) )
for p in _elision_prefix for p in _elision_prefix
] ]
# catching cases like saut-de-ski, pet-en-l'air
_hyphen_combination = ['l[èe]s?', 'la', 'en', 'des?', 'd[eu]', 'sur', 'sous', 'aux?', 'à', 'et', "près", "saint"]
_regular_exp += [
"^[{alpha}]+[{hyphen}]{hyphen_combo}[{hyphen}](?:l[{elision}])?[{alpha}]+$".format(
hyphen_combo=hc,
elision=ELISION,
hyphen=HYPHENS,
alpha=ALPHA_LOWER,
)
for hc in _hyphen_combination
]
# URLs
_regular_exp.append(URL_PATTERN) _regular_exp.append(URL_PATTERN)

View File

@ -5,7 +5,15 @@ import pytest
@pytest.mark.parametrize( @pytest.mark.parametrize(
"text", ["aujourd'hui", "Aujourd'hui", "prud'hommes", "prudhommal"] "text", ["aujourd'hui", "Aujourd'hui", "prud'hommes", "prudhommal",
"audio-numérique", "Audio-numérique",
"entr'amis", "entr'abat", "rentr'ouvertes", "grand'hamien",
"Châteauneuf-la-Forêt", "Château-Guibert",
"11-septembre", "11-Septembre", "refox-trottâmes",
"K-POP", "K-Pop", "K-pop", "z'yeutes",
"black-outeront", "états-unienne",
"courtes-pattes", "court-pattes",
"saut-de-ski", "Écourt-Saint-Quentin", "Bout-de-l'Îlien", "pet-en-l'air"]
) )
def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text): def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text):
tokens = fr_tokenizer(text) tokens = fr_tokenizer(text)