Merge tokenizer exceptions from PR #802

This commit is contained in:
Raphaël Bournhonesque 2017-02-09 16:30:16 +01:00
parent 85f951ca99
commit 5d706ab95d
2 changed files with 38 additions and 4 deletions

View File

@@ -13039,7 +13039,6 @@ Javerlhac-et-la-Chapelle-Saint-Robert
Javron-les-Chapelles
JAX-RPC
JAX-RS
J.-C.
Jeannois-Mitissien
jeans-de-gand
jeans-de-janten

View File

@@ -75,15 +75,49 @@ def get_tokenizer_exceptions():
"déc.": [
{LEMMA: "décembre", ORTH: "déc."}
],
"apr.": [
{LEMMA: "après", ORTH: "apr."}
],
"J.-C.": [
{LEMMA: "Jésus", ORTH: "J."},
{LEMMA: "Christ", ORTH: "-C."}
],
"Dr.": [
{LEMMA: "docteur", ORTH: "Dr."}
],
"M.": [
{LEMMA: "monsieur", ORTH: "M."}
],
"Mr.": [
{LEMMA: "monsieur", ORTH: "Mr."}
],
"Mme.": [
{LEMMA: "madame", ORTH: "Mme."}
],
"Mlle.": [
{LEMMA: "mademoiselle", ORTH: "Mlle."}
],
"": [
{LEMMA: "numéro", ORTH: ""}
],
"": [
{LEMMA: "degrés", ORTH: ""}
],
"St.": [
{LEMMA: "saint", ORTH: "St."}
],
"Ste.": [
{LEMMA: "sainte", ORTH: "Ste."}
]
}
ABBREVIATIONS_2 = [
"Dr.",
"etc.",
]
VERBS = {}
for verb, verb_lemma in (("a", "avoir"), ("est", "être"), ("semble", "sembler"), ("indique", "indiquer"),
for verb, verb_lemma in (("a", "avoir"), ("est", "être"),
("semble", "sembler"), ("indique", "indiquer"),
("moque", "moquer"), ("passe", "passer")):
for pronoun in ("elle", "il", "on"):
token = "{}-t-{}".format(verb, pronoun)
@@ -98,7 +132,8 @@ def get_tokenizer_exceptions():
{LEMMA: 'ce', ORTH: '-ce'}
]
for pre, pre_lemma in (("qu'", "que"), ("Qu'", "Que"), ("n'", "ne"), ("N'", "Ne")):
for pre, pre_lemma in (("qu'", "que"), ("Qu'", "Que"), ("n'", "ne"),
("N'", "Ne")):
VERBS['{}est-ce'.format(pre)] = [
{LEMMA: pre_lemma, ORTH: pre},
{LEMMA: 'être', ORTH: "est"},