mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 09:26:27 +03:00
French regular expressions instead of extensive exceptions list (on develop) (#3046) (resolves #2679)
* merge changes of PR 3023 into develop branch instead of master
* further deletions from exception list according to PR 3023
This commit is contained in:
parent
7bbdffd36e
commit
c6ad557cea
File diff suppressed because it is too large
Load Diff
|
@ -53,7 +53,7 @@ for exc_data in [
|
||||||
_exc[exc_data[ORTH]] = [exc_data]
|
_exc[exc_data[ORTH]] = [exc_data]
|
||||||
|
|
||||||
|
|
||||||
for orth in FR_BASE_EXCEPTIONS + ["etc."]:
|
for orth in ["etc."]:
|
||||||
_exc[orth] = [{ORTH: orth}]
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
|
||||||
|
@ -93,14 +93,18 @@ for pre, pre_lemma in [("qu'", "que"), ("n'", "ne")]:
|
||||||
|
|
||||||
|
|
||||||
_infixes_exc = []
|
_infixes_exc = []
|
||||||
for elision_char in ELISION:
|
orig_elision = "'"
|
||||||
for hyphen_char in ["-", "‐"]:
|
orig_hyphen = '-'
|
||||||
_infixes_exc += [
|
|
||||||
infix.replace("'", elision_char).replace("-", hyphen_char)
|
# loop through the elision and hyphen characters, and try to substitute the ones that weren't used in the original list
|
||||||
for infix in FR_BASE_EXCEPTIONS
|
for infix in FR_BASE_EXCEPTIONS:
|
||||||
]
|
variants_infix = {infix}
|
||||||
_infixes_exc += [upper_first_letter(word) for word in _infixes_exc]
|
for elision_char in [x for x in ELISION if x != orig_elision]:
|
||||||
_infixes_exc = list(set(_infixes_exc))
|
variants_infix.update([word.replace(orig_elision, elision_char) for word in variants_infix])
|
||||||
|
for hyphen_char in [x for x in ['-', '‐'] if x != orig_hyphen]:
|
||||||
|
variants_infix.update([word.replace(orig_hyphen, hyphen_char) for word in variants_infix])
|
||||||
|
variants_infix.update([upper_first_letter(word) for word in variants_infix])
|
||||||
|
_infixes_exc.extend(variants_infix)
|
||||||
|
|
||||||
for orth in _infixes_exc:
|
for orth in _infixes_exc:
|
||||||
_exc[orth] = [{ORTH: orth}]
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
@ -111,6 +115,7 @@ _hyphen_prefix = [
|
||||||
"abat",
|
"abat",
|
||||||
"a[fg]ro",
|
"a[fg]ro",
|
||||||
"after",
|
"after",
|
||||||
|
"aigues?",
|
||||||
"am[ée]ricano",
|
"am[ée]ricano",
|
||||||
"anglo",
|
"anglo",
|
||||||
"anti",
|
"anti",
|
||||||
|
@ -120,24 +125,33 @@ _hyphen_prefix = [
|
||||||
"archi",
|
"archi",
|
||||||
"arrières?",
|
"arrières?",
|
||||||
"avant",
|
"avant",
|
||||||
|
"avion",
|
||||||
"auto",
|
"auto",
|
||||||
"banc",
|
"banc",
|
||||||
"bas(?:ses?)?",
|
"bas(?:ses?)?",
|
||||||
|
"bateaux?",
|
||||||
"bec?",
|
"bec?",
|
||||||
|
"belles?",
|
||||||
|
"beau",
|
||||||
"best",
|
"best",
|
||||||
"bio?",
|
"bio?",
|
||||||
"bien",
|
"bien",
|
||||||
"blanc",
|
"blanc",
|
||||||
"bo[îi]te",
|
"bo[îi]te",
|
||||||
|
"bonn?e?s?",
|
||||||
"bois",
|
"bois",
|
||||||
"bou(?:c|rg)",
|
"bou(?:c|rg)",
|
||||||
"b[êe]ta",
|
"b[êe]ta",
|
||||||
"cache",
|
"cache",
|
||||||
"cap(?:ello)?",
|
"cap(?:ello)?",
|
||||||
|
"casse",
|
||||||
|
"castel",
|
||||||
"champ",
|
"champ",
|
||||||
"chapelle",
|
"chapelle",
|
||||||
"ch[âa]teau",
|
"ch[âa]teau(?:neuf)?",
|
||||||
|
"chasse",
|
||||||
"cha(?:ud|t)e?s?",
|
"cha(?:ud|t)e?s?",
|
||||||
|
"chauffe",
|
||||||
"chou",
|
"chou",
|
||||||
"chromo",
|
"chromo",
|
||||||
"claire?s?",
|
"claire?s?",
|
||||||
|
@ -146,7 +160,8 @@ _hyphen_prefix = [
|
||||||
"contre",
|
"contre",
|
||||||
"cordon",
|
"cordon",
|
||||||
"coupe?",
|
"coupe?",
|
||||||
"court",
|
"courte?s?",
|
||||||
|
"couvre",
|
||||||
"crash",
|
"crash",
|
||||||
"crise",
|
"crise",
|
||||||
"croche",
|
"croche",
|
||||||
|
@ -155,15 +170,19 @@ _hyphen_prefix = [
|
||||||
"côte",
|
"côte",
|
||||||
"demi",
|
"demi",
|
||||||
"di(?:sney)?",
|
"di(?:sney)?",
|
||||||
|
"dix",
|
||||||
"d[ée]s?",
|
"d[ée]s?",
|
||||||
"double",
|
|
||||||
"dys",
|
"dys",
|
||||||
|
"ex?",
|
||||||
|
"émirato",
|
||||||
"entre",
|
"entre",
|
||||||
"est",
|
"est",
|
||||||
"ethno",
|
"ethno",
|
||||||
|
"ex",
|
||||||
"extra",
|
"extra",
|
||||||
"extrême",
|
"extrême",
|
||||||
"[ée]co",
|
"[ée]co",
|
||||||
|
"faux",
|
||||||
"fil",
|
"fil",
|
||||||
"fort",
|
"fort",
|
||||||
"franco?s?",
|
"franco?s?",
|
||||||
|
@ -176,6 +195,8 @@ _hyphen_prefix = [
|
||||||
"gros",
|
"gros",
|
||||||
"g[ée]o",
|
"g[ée]o",
|
||||||
"haute?s?",
|
"haute?s?",
|
||||||
|
"homm?es?",
|
||||||
|
"hors",
|
||||||
"hyper",
|
"hyper",
|
||||||
"indo",
|
"indo",
|
||||||
"infra",
|
"infra",
|
||||||
|
@ -190,6 +211,8 @@ _hyphen_prefix = [
|
||||||
"lot",
|
"lot",
|
||||||
"louis",
|
"louis",
|
||||||
"m[ai]cro",
|
"m[ai]cro",
|
||||||
|
"mal",
|
||||||
|
"médio",
|
||||||
"mesnil",
|
"mesnil",
|
||||||
"mi(?:ni)?",
|
"mi(?:ni)?",
|
||||||
"mono",
|
"mono",
|
||||||
|
@ -201,6 +224,7 @@ _hyphen_prefix = [
|
||||||
"m[ée]do",
|
"m[ée]do",
|
||||||
"m[ée]ta",
|
"m[ée]ta",
|
||||||
"mots?",
|
"mots?",
|
||||||
|
"neuro",
|
||||||
"noix",
|
"noix",
|
||||||
"non",
|
"non",
|
||||||
"nord",
|
"nord",
|
||||||
|
@ -213,60 +237,137 @@ _hyphen_prefix = [
|
||||||
"perce",
|
"perce",
|
||||||
"pharmaco",
|
"pharmaco",
|
||||||
"ph[oy]to",
|
"ph[oy]to",
|
||||||
|
"pieds?",
|
||||||
"pique",
|
"pique",
|
||||||
"poissons?",
|
"poissons?",
|
||||||
"ponce",
|
"ponce",
|
||||||
"pont",
|
"pont",
|
||||||
"po[rs]t",
|
"po[rs]t",
|
||||||
|
"pousse",
|
||||||
"primo",
|
"primo",
|
||||||
"pro(?:cès|to)?",
|
"pro(?:cès|to)?",
|
||||||
"pare",
|
"pare",
|
||||||
"petite?",
|
"petite?s?",
|
||||||
|
"plessis",
|
||||||
"porte",
|
"porte",
|
||||||
"pré",
|
"pré",
|
||||||
"prêchi",
|
"prêchi",
|
||||||
|
"protège",
|
||||||
"pseudo",
|
"pseudo",
|
||||||
"pêle",
|
"pêle",
|
||||||
"péri",
|
"péri",
|
||||||
"puy",
|
"puy",
|
||||||
"quasi",
|
"quasi",
|
||||||
|
"quatre",
|
||||||
|
"radio",
|
||||||
"recourt",
|
"recourt",
|
||||||
"rythmo",
|
"rythmo",
|
||||||
|
"(?:re)?doubles?",
|
||||||
"r[ée]",
|
"r[ée]",
|
||||||
"r[ée]tro",
|
"r[ée]tro",
|
||||||
"sans",
|
"requin",
|
||||||
"sainte?s?",
|
"sans?",
|
||||||
|
"sa?inte?s?",
|
||||||
"semi",
|
"semi",
|
||||||
"social",
|
"serre",
|
||||||
|
"sino",
|
||||||
|
"socio",
|
||||||
|
"sociale?s?",
|
||||||
|
"soixante",
|
||||||
"sous",
|
"sous",
|
||||||
"su[bdr]",
|
"su[bdrs]",
|
||||||
"super",
|
"super",
|
||||||
|
"taille",
|
||||||
"tire",
|
"tire",
|
||||||
"thermo",
|
"thermo",
|
||||||
"tiers",
|
"tiers",
|
||||||
|
"tourne",
|
||||||
|
"toute?s?",
|
||||||
|
"tra[iî]ne?",
|
||||||
"trans",
|
"trans",
|
||||||
|
"trente",
|
||||||
|
"trois",
|
||||||
|
"trousse",
|
||||||
"tr(?:i|ou)",
|
"tr(?:i|ou)",
|
||||||
"t[ée]l[ée]",
|
"t[ée]l[ée]",
|
||||||
|
"utéro",
|
||||||
|
"vaso",
|
||||||
"vi[cd]e",
|
"vi[cd]e",
|
||||||
"vid[ée]o",
|
"vid[ée]o",
|
||||||
"vie(?:ux|illes?)",
|
"vie(?:ux|i?lles?|i?l)",
|
||||||
"vill(?:e|eneuve|ers|ette|iers|y)",
|
"vill(?:e|eneuve|ers|ette|iers|y)",
|
||||||
|
"vingt",
|
||||||
|
"voitures?",
|
||||||
|
"wagons?",
|
||||||
"ultra",
|
"ultra",
|
||||||
"à",
|
"à",
|
||||||
"[ée]lectro",
|
"[ée]lectro",
|
||||||
"[ée]qui",
|
"[ée]qui",
|
||||||
|
"Fontaine",
|
||||||
|
"La Chapelle",
|
||||||
|
"Marie",
|
||||||
|
"Le Mesnil",
|
||||||
|
"Neuville",
|
||||||
|
"Pierre",
|
||||||
|
"Val",
|
||||||
|
"Vaux",
|
||||||
]
|
]
|
||||||
|
|
||||||
_elision_prefix = ["entr", "grande?s?"]
|
|
||||||
_other_hyphens = "".join([h for h in HYPHENS if h != "-"])
|
_other_hyphens = "".join([h for h in HYPHENS if h != "-"])
|
||||||
|
|
||||||
_regular_exp = [
|
_regular_exp = [
|
||||||
"^droits?[{hyphen}]de[{hyphen}]l'homm[{alpha}]+$".format(
|
"^a[{hyphen}]sexualis[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
hyphen=HYPHENS, alpha=ALPHA_LOWER
|
"^arginine[{hyphen}]méthyl[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
),
|
"^binge[{hyphen}]watch[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^black[{hyphen}]out[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^bouche[{hyphen}]por[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^burn[{hyphen}]out[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^by[{hyphen}]pass[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^ch[{elision}]tiis[{alpha}]+$".format(elision=ELISION, alpha=ALPHA_LOWER),
|
||||||
|
"^chape[{hyphen}]chut[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^down[{hyphen}]load[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^[ée]tats[{hyphen}]uni[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^droits?[{hyphen}]de[{hyphen}]l'homm[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^fac[{hyphen}]simil[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^fleur[{hyphen}]bleuis[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^flic[{hyphen}]flaqu[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^fox[{hyphen}]trott[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^google[{hyphen}]is[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^hard[{hyphen}]discount[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^hip[{hyphen}]hop[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^jet[{hyphen}]set[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^knock[{hyphen}]out[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^lèche[{hyphen}]bott[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^litho[{hyphen}]typographi[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^lock[{hyphen}]out[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^lombri[{hyphen}]compost[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^mac[{hyphen}]adamis[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^marque[{hyphen}]pag[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^mouton[{hyphen}]noiris[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^new[{hyphen}]york[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^pair[{hyphen}]programm[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^people[{hyphen}]is[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^plan[{hyphen}]socialis[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^premier[{hyphen}]ministr[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^prud[{elision}]hom[{alpha}]+$".format(elision=ELISION, alpha=ALPHA_LOWER),
|
||||||
|
"^réarc[{hyphen}]bout[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^refox[{hyphen}]trott[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^remicro[{hyphen}]ond[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^repique[{hyphen}]niqu[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^repetit[{hyphen}]déjeun[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^rick[{hyphen}]roll[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^rond[{hyphen}]ponn[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^shift[{hyphen}]cliqu[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^soudo[{hyphen}]bras[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^stabilo[{hyphen}]boss[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^strip[{hyphen}]teas[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^terra[{hyphen}]form[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^teuf[{hyphen}]teuf[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
|
"^yo[{hyphen}]yo[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
"^zig[{hyphen}]zag[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
"^zig[{hyphen}]zag[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
|
||||||
"^prud[{elision}]homm[{alpha}]*$".format(elision=ELISION, alpha=ALPHA_LOWER),
|
"^z[{elision}]yeut[{alpha}]+$".format(elision=ELISION, alpha=ALPHA_LOWER),
|
||||||
]
|
]
|
||||||
|
# catching cases like faux-vampire
|
||||||
_regular_exp += [
|
_regular_exp += [
|
||||||
"^{prefix}[{hyphen}][{alpha}][{alpha}{elision}{other_hyphen}\-]*$".format(
|
"^{prefix}[{hyphen}][{alpha}][{alpha}{elision}{other_hyphen}\-]*$".format(
|
||||||
prefix=p,
|
prefix=p,
|
||||||
|
@ -277,12 +378,32 @@ _regular_exp += [
|
||||||
)
|
)
|
||||||
for p in _hyphen_prefix
|
for p in _hyphen_prefix
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# catching cases like entr'abat
|
||||||
|
_elision_prefix = ['r?é?entr', 'grande?s?', 'r']
|
||||||
_regular_exp += [
|
_regular_exp += [
|
||||||
"^{prefix}[{elision}][{alpha}][{alpha}{elision}{hyphen}\-]*$".format(
|
"^{prefix}[{elision}][{alpha}][{alpha}{elision}{hyphen}\-]*$".format(
|
||||||
prefix=p, elision=HYPHENS, hyphen=_other_hyphens, alpha=ALPHA_LOWER
|
prefix=p,
|
||||||
|
elision=ELISION,
|
||||||
|
hyphen=_other_hyphens,
|
||||||
|
alpha=ALPHA_LOWER,
|
||||||
)
|
)
|
||||||
for p in _elision_prefix
|
for p in _elision_prefix
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# catching cases like saut-de-ski, pet-en-l'air
|
||||||
|
_hyphen_combination = ['l[èe]s?', 'la', 'en', 'des?', 'd[eu]', 'sur', 'sous', 'aux?', 'à', 'et', "près", "saint"]
|
||||||
|
_regular_exp += [
|
||||||
|
"^[{alpha}]+[{hyphen}]{hyphen_combo}[{hyphen}](?:l[{elision}])?[{alpha}]+$".format(
|
||||||
|
hyphen_combo=hc,
|
||||||
|
elision=ELISION,
|
||||||
|
hyphen=HYPHENS,
|
||||||
|
alpha=ALPHA_LOWER,
|
||||||
|
)
|
||||||
|
for hc in _hyphen_combination
|
||||||
|
]
|
||||||
|
|
||||||
|
# URLs
|
||||||
_regular_exp.append(URL_PATTERN)
|
_regular_exp.append(URL_PATTERN)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -5,7 +5,15 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"text", ["aujourd'hui", "Aujourd'hui", "prud'hommes", "prud’hommal"]
|
"text", ["aujourd'hui", "Aujourd'hui", "prud'hommes", "prud’hommal",
|
||||||
|
"audio-numérique", "Audio-numérique",
|
||||||
|
"entr'amis", "entr'abat", "rentr'ouvertes", "grand'hamien",
|
||||||
|
"Châteauneuf-la-Forêt", "Château-Guibert",
|
||||||
|
"11-septembre", "11-Septembre", "refox-trottâmes",
|
||||||
|
"K-POP", "K-Pop", "K-pop", "z'yeutes",
|
||||||
|
"black-outeront", "états-unienne",
|
||||||
|
"courtes-pattes", "court-pattes",
|
||||||
|
"saut-de-ski", "Écourt-Saint-Quentin", "Bout-de-l'Îlien", "pet-en-l'air"]
|
||||||
)
|
)
|
||||||
def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text):
|
def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text):
|
||||||
tokens = fr_tokenizer(text)
|
tokens = fr_tokenizer(text)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user