Improvement of rules: they are now title-case insensitive and all share the same declaration format

This commit is contained in:
Gregory Howard 2017-04-27 10:23:56 +02:00
parent ed5f094451
commit ad8129cb45

View File

@@ -117,24 +117,28 @@ def get_tokenizer_exceptions():
for verb, verb_lemma in (("a", "avoir"), ("est", "être"), for verb, verb_lemma in (("a", "avoir"), ("est", "être"),
("semble", "sembler"), ("indique", "indiquer"), ("semble", "sembler"), ("indique", "indiquer"),
("moque", "moquer"), ("passe", "passer")): ("moque", "moquer"), ("passe", "passer")):
for orth in [verb,verb.title()]:
for pronoun in ("elle", "il", "on"): for pronoun in ("elle", "il", "on"):
token = "{}-t-{}".format(verb, pronoun) token = "{}-t-{}".format(orth, pronoun)
VERBS[token] = [ VERBS[token] = [
{LEMMA: verb_lemma, ORTH: verb}, {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"},
{LEMMA: "t", ORTH: "-t"}, {LEMMA: "t", ORTH: "-t"},
{LEMMA: pronoun, ORTH: "-" + pronoun} {LEMMA: pronoun, ORTH: "-" + pronoun}
] ]
VERBS['est-ce'] = [ for verb, verb_lemma in [("est","être")]:
{LEMMA: 'être', ORTH: "est"}, for orth in [verb,verb.title()]:
token = "{}-ce".format(orth)
VERBS[token] = [
{LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"},
{LEMMA: 'ce', ORTH: '-ce'} {LEMMA: 'ce', ORTH: '-ce'}
] ]
for pre, pre_lemma in (("qu'", "que"), ("Qu'", "Que"), ("n'", "ne"), for pre, pre_lemma in (("qu'", "que"), ("n'", "ne")):
("N'", "Ne")): for orth in [pre,pre.title()]:
VERBS['{}est-ce'.format(pre)] = [ VERBS['{}est-ce'.format(orth)] = [
{LEMMA: pre_lemma, ORTH: pre}, {LEMMA: pre_lemma, ORTH: orth},
{LEMMA: 'être', ORTH: "est"}, {LEMMA: 'être', ORTH: "est", TAG: "VERB"},
{LEMMA: 'ce', ORTH: '-ce'} {LEMMA: 'ce', ORTH: '-ce'}
] ]