French lemmatization: adding lemmas for adverbs and irregular lemmas for function words (#3131)

* adding adverbs and irregular cases for empty words

* adding adverbs and irregular cases for empty words

* adding adverbs and irregular cases for empty words

* updating contributor agreement for amperinet
This commit is contained in:
Amandine Périnet 2019-01-10 15:41:15 +01:00 committed by Matthew Honnibal
parent 7b064542f7
commit ee24e2534d
9 changed files with 906 additions and 655 deletions

View File

@ -101,6 +101,6 @@ mark both statements:
| Name | Amandine Périnet |
| Company name (if applicable) | 365Talents |
| Title or role (if applicable) | Data Science Researcher |
| Date | 28/12/2018 |
| Date | 09/01/2019 |
| GitHub username | amperinet |
| Website (optional) | |

View File

@ -4,20 +4,24 @@ from __future__ import unicode_literals
from .lookup import LOOKUP
from ._adjectives import ADJECTIVES
from ._adjectives_irreg import ADJECTIVES_IRREG
from ._adp_irreg import ADP_IRREG
from ._adverbs import ADVERBS
from ._auxiliary_verbs_irreg import AUXILIARY_VERBS_IRREG
from ._cconj_irreg import CCONJ_IRREG
from ._dets_irreg import DETS_IRREG
from ._lemma_rules import ADJECTIVE_RULES, NOUN_RULES, VERB_RULES
from ._nouns import NOUNS
from ._nouns_irreg import NOUNS_IRREG
from ._pronouns_irreg import PRONOUNS_IRREG
from ._sconj_irreg import SCONJ_IRREG
from ._verbs import VERBS
from ._verbs_irreg import VERBS_IRREG
from ._dets_irreg import DETS_IRREG
from ._pronouns_irreg import PRONOUNS_IRREG
from ._auxiliary_verbs_irreg import AUXILIARY_VERBS_IRREG
from ._lemma_rules import ADJECTIVE_RULES, NOUN_RULES, VERB_RULES
LEMMA_INDEX = {'adj': ADJECTIVES, 'adv': ADVERBS, 'noun': NOUNS, 'verb': VERBS}
LEMMA_EXC = {'adj': ADJECTIVES_IRREG, 'noun': NOUNS_IRREG, 'verb': VERBS_IRREG,
'det': DETS_IRREG, 'pron': PRONOUNS_IRREG, 'aux': AUXILIARY_VERBS_IRREG}
LEMMA_EXC = {'adj': ADJECTIVES_IRREG, 'adp': ADP_IRREG, 'aux': AUXILIARY_VERBS_IRREG,
'cconj': CCONJ_IRREG, 'det': DETS_IRREG, 'noun': NOUNS_IRREG, 'verb': VERBS_IRREG,
'pron': PRONOUNS_IRREG, 'sconj': SCONJ_IRREG}
LEMMA_RULES = {'adj': ADJECTIVE_RULES, 'noun': NOUN_RULES, 'verb': VERB_RULES}

View File

@ -0,0 +1,24 @@
# coding: utf8
from __future__ import unicode_literals
# Irregular forms for French adpositions.
# Each key is a surface form (abbreviation, elided or contracted form, or a
# symbol) and each value is a 1-tuple containing the lemma to assign to it.
ADP_IRREG = {
    "a": ("à",),
    "apr.": ("après",),
    "aux": ("à",),
    "av.": ("avant",),
    "avt": ("avant",),
    "cf.": ("cf",),
    "conf.": ("cf",),
    "confer": ("cf",),
    "d'": ("de",),
    "des": ("de",),
    "du": ("de",),
    "jusqu'": ("jusque",),
    "pdt": ("pendant",),
    "+": ("plus",),
    "pr": ("pour",),
    "/": ("sur",),
    "versus": ("vs",),
    "vs.": ("vs",),
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,17 @@
# coding: utf8
from __future__ import unicode_literals
# Irregular forms for French coordinating conjunctions.
# Each key is a surface form (symbol, abbreviation or spelling variant) and
# each value is a 1-tuple containing the lemma to assign to it.
# Every key must appear only once: in a dict literal a repeated key silently
# overrides the earlier entry.
CCONJ_IRREG = {
    "&": ("et",),
    "c-à-d": ("c'est-à-dire",),
    "c.-à.-d.": ("c'est-à-dire",),
    "càd": ("c'est-à-dire",),
    "et|ou": ("et-ou",),
    "et/ou": ("et-ou",),
    "i.e.": ("c'est-à-dire",),
    "ie": ("c'est-à-dire",),
    "ou/et": ("et-ou",),
    "+": ("plus",),
}

View File

@ -4,20 +4,27 @@ from __future__ import unicode_literals
DETS_IRREG = {
"aucune": ("aucun",),
"cents": ("cent",),
"certaine": ("certain",),
"certaines": ("certain",),
"certains": ("certain",),
"ces": ("ce",),
"cet": ("ce",),
"cette": ("ce",),
"cents": ("cent",),
"certaines": ("certains",),
"des": ("un",),
"différentes": ("différents",),
"diverse": ("divers",),
"diverses": ("divers",),
"du": ("de",),
"la": ("le",),
"les": ("le",),
"l'": ("le",),
"laquelle": ("lequel",),
"les": ("le",),
"lesdites": ("ledit",),
"lesdits": ("ledit",),
"leurs": ("leur",),
"lesquelles": ("lequel",),
"lesquels": ("lequel",),
"leurs": ("leur",),
"l'": ("le",),
"mainte": ("maint",),
"maintes": ("maint",),
"maints": ("maint",),
@ -27,23 +34,29 @@ DETS_IRREG = {
"nulle": ("nul",),
"nulles": ("nul",),
"nuls": ("nul",),
"pareille": ("pareil",),
"pareilles": ("pareil",),
"pareils": ("pareil",),
"quelle": ("quel",),
"quelles": ("quel",),
"quels": ("quel",),
"quelqu'": ("quelque",),
"qq": ("quelque",),
"qqes": ("quelque",),
"qqs": ("quelque",),
"quelques": ("quelque",),
"quelqu'": ("quelque",),
"quels": ("quel",),
"sa": ("son",),
"ses": ("son",),
"telle": ("tel",),
"telles": ("tel",),
"tels": ("tel",),
"ta": ("ton",),
"telles": ("tel",),
"telle": ("tel",),
"tels": ("tel",),
"tes": ("ton",),
"tous": ("tout",),
"toute": ("tout",),
"toutes": ("tout",),
"des": ("un",),
"toute": ("tout",),
"une": ("un",),
"vingts": ("vingt",),
"vot'": ("votre",),
"vos": ("votre",)
}

View File

@ -4,37 +4,89 @@ from __future__ import unicode_literals
PRONOUNS_IRREG = {
"aucune": ("aucun",),
"celle-ci": ("celui-ci",),
"celles-ci": ("celui-ci",),
"ceux-ci": ("celui-ci",),
"celle-là": ("celui-là",),
"celles-là": ("celui-là",),
"ceux-là": ("celui-là",),
"autres": ("autre",),
"ça": ("cela",),
"c'": ("ce",),
"celle": ("celui",),
"celle-ci": ("celui-ci",),
"celle-là": ("celui-là",),
"celles": ("celui",),
"ceux": ("celui",),
"celles-ci": ("celui-ci",),
"celles-là": ("celui-là",),
"certaines": ("certains",),
"ceux": ("celui",),
"ceux-ci": ("celui-ci",),
"ceux-là": ("celui-là",),
"chacune": ("chacun",),
"-elle": ("lui",),
"elle": ("lui",),
"elle-même": ("lui-même",),
"-elles": ("lui",),
"elles": ("lui",),
"elles-mêmes": ("lui-même",),
"eux": ("lui",),
"eux-mêmes": ("lui-même",),
"icelle": ("icelui",),
"icelles": ("icelui",),
"iceux": ("icelui",),
"-il": ("il",),
"-ils": ("il",),
"ils": ("il",),
"-je": ("je",),
"j'": ("je",),
"la": ("le",),
"les": ("le",),
"laquelle": ("lequel",),
"l'autre": ("l'autre",),
"les": ("le",),
"lesquelles": ("lequel",),
"lesquels": ("lequel",),
"elle-même": ("lui-même",),
"elles-mêmes": ("lui-même",),
"eux-mêmes": ("lui-même",),
"-leur": ("leur",),
"l'on": ("on",),
"-lui": ("lui",),
"l'une": ("l'un",),
"mêmes": ("même",),
"-m'": ("me",),
"m'": ("me",),
"-moi": ("moi",),
"nous-mêmes": ("nous-même",),
"-nous": ("nous",),
"-on": ("on",),
"qqchose": ("quelque chose",),
"qqch": ("quelque chose",),
"qqc": ("quelque chose",),
"qqn": ("quelqu'un",),
"quelle": ("quel",),
"quelles": ("quel",),
"quels": ("quel",),
"quelques-unes": ("quelqu'un",),
"quelques-uns": ("quelqu'un",),
"quelques-unes": ("quelques-uns",),
"quelque-une": ("quelqu'un",),
"quelqu'une": ("quelqu'un",),
"quels": ("quel",),
"qu": ("que",),
"telle": ("tel",),
"s'": ("se",),
"-t-elle": ("elle",),
"-t-elles": ("elle",),
"telles": ("tel",),
"telle": ("tel",),
"tels": ("tel",),
"toutes": ("tous",),
"-t-en": ("en",),
"-t-il": ("il",),
"-t-ils": ("il",),
"-toi": ("toi",),
"-t-on": ("on",),
"tous": ("tout",),
"toutes": ("tout",),
"toute": ("tout",),
"-t'": ("te",),
"t'": ("te",),
"-tu": ("tu",),
"-t-y": ("y",),
"unes": ("un",),
"une": ("un",),
"uns": ("un",),
"vous-mêmes": ("vous-même",),
"vous-même": ("vous-même",),
"-vous": ("vous",),
"-vs": ("vous",),
"vs": ("vous",),
"-y": ("y",)
}

View File

@ -0,0 +1,19 @@
# coding: utf8
from __future__ import unicode_literals
# Irregular forms for French subordinating conjunctions.
# Each key is a surface form (elided form, abbreviation, informal spelling or
# symbol) and each value is a 1-tuple containing the lemma to assign to it.
SCONJ_IRREG = {
    "lorsqu'": ("lorsque",),
    "pac'que": ("parce que",),
    "pac'qu'": ("parce que",),
    "parc'que": ("parce que",),
    "parc'qu'": ("parce que",),
    "paske": ("parce que",),
    "pask'": ("parce que",),
    "pcq": ("parce que",),
    "+": ("plus",),
    "puisqu'": ("puisque",),
    "qd": ("quand",),
    "quoiqu'": ("quoique",),
    "qu'": ("que",),
}

View File

@ -1,7 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
from ....symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT
from ....symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP, SCONJ, CCONJ
from ....symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
from .lookup import LOOKUP
@ -9,7 +9,7 @@ from .lookup import LOOKUP
French language lemmatizer applies the default rule based lemmatization
procedure with some modifications for better French language support.
The parts of speech 'ADV', 'PRON', 'DET' and 'AUX' are added to use the
The parts of speech 'ADV', 'PRON', 'DET', 'ADP', 'CCONJ', 'SCONJ' and 'AUX' are added to use the
rule-based lemmatization. As a last resort, the lemmatizer checks in
the lookup table.
'''
@ -34,16 +34,22 @@ class FrenchLemmatizer(object):
univ_pos = 'verb'
elif univ_pos in (ADJ, 'ADJ', 'adj'):
univ_pos = 'adj'
elif univ_pos in (ADP, 'ADP', 'adp'):
univ_pos = 'adp'
elif univ_pos in (ADV, 'ADV', 'adv'):
univ_pos = 'adv'
elif univ_pos in (PRON, 'PRON', 'pron'):
univ_pos = 'pron'
elif univ_pos in (DET, 'DET', 'det'):
univ_pos = 'det'
elif univ_pos in (AUX, 'AUX', 'aux'):
univ_pos = 'aux'
elif univ_pos in (CCONJ, 'CCONJ', 'cconj'):
univ_pos = 'cconj'
elif univ_pos in (DET, 'DET', 'det'):
univ_pos = 'det'
elif univ_pos in (PRON, 'PRON', 'pron'):
univ_pos = 'pron'
elif univ_pos in (PUNCT, 'PUNCT', 'punct'):
univ_pos = 'punct'
elif univ_pos in (SCONJ, 'SCONJ', 'sconj'):
univ_pos = 'sconj'
else:
return [self.lookup(string)]
# See Issue #435 for an example of where this logic is required.