mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
French lemmatization: adding lemmas for adverbs and irregular lemmas for function words (#3131)
* adding adverbs and irregular cases for empty words * adding adverbs and irregular cases for empty words * adding adverbs and irregular cases for empty words * updating contributor agreement for amperinet
This commit is contained in:
parent
7b064542f7
commit
ee24e2534d
2
.github/contributors/amperinet.md
vendored
2
.github/contributors/amperinet.md
vendored
|
@ -101,6 +101,6 @@ mark both statements:
|
|||
| Name | Amandine Périnet |
|
||||
| Company name (if applicable) | 365Talents |
|
||||
| Title or role (if applicable) | Data Science Researcher |
|
||||
| Date | 28/12/2018 |
|
||||
| Date | 09/01/2019 |
|
||||
| GitHub username | amperinet |
|
||||
| Website (optional) | |
|
||||
|
|
|
@ -4,20 +4,24 @@ from __future__ import unicode_literals
|
|||
from .lookup import LOOKUP
|
||||
from ._adjectives import ADJECTIVES
|
||||
from ._adjectives_irreg import ADJECTIVES_IRREG
|
||||
from ._adp_irreg import ADP_IRREG
|
||||
from ._adverbs import ADVERBS
|
||||
from ._auxiliary_verbs_irreg import AUXILIARY_VERBS_IRREG
|
||||
from ._cconj_irreg import CCONJ_IRREG
|
||||
from ._dets_irreg import DETS_IRREG
|
||||
from ._lemma_rules import ADJECTIVE_RULES, NOUN_RULES, VERB_RULES
|
||||
from ._nouns import NOUNS
|
||||
from ._nouns_irreg import NOUNS_IRREG
|
||||
from ._pronouns_irreg import PRONOUNS_IRREG
|
||||
from ._sconj_irreg import SCONJ_IRREG
|
||||
from ._verbs import VERBS
|
||||
from ._verbs_irreg import VERBS_IRREG
|
||||
from ._dets_irreg import DETS_IRREG
|
||||
from ._pronouns_irreg import PRONOUNS_IRREG
|
||||
from ._auxiliary_verbs_irreg import AUXILIARY_VERBS_IRREG
|
||||
from ._lemma_rules import ADJECTIVE_RULES, NOUN_RULES, VERB_RULES
|
||||
|
||||
|
||||
LEMMA_INDEX = {'adj': ADJECTIVES, 'adv': ADVERBS, 'noun': NOUNS, 'verb': VERBS}
|
||||
|
||||
LEMMA_EXC = {'adj': ADJECTIVES_IRREG, 'noun': NOUNS_IRREG, 'verb': VERBS_IRREG,
|
||||
'det': DETS_IRREG, 'pron': PRONOUNS_IRREG, 'aux': AUXILIARY_VERBS_IRREG}
|
||||
LEMMA_EXC = {'adj': ADJECTIVES_IRREG, 'adp': ADP_IRREG, 'aux': AUXILIARY_VERBS_IRREG,
|
||||
'cconj': CCONJ_IRREG, 'det': DETS_IRREG, 'noun': NOUNS_IRREG, 'verb': VERBS_IRREG,
|
||||
'pron': PRONOUNS_IRREG, 'sconj': SCONJ_IRREG}
|
||||
|
||||
LEMMA_RULES = {'adj': ADJECTIVE_RULES, 'noun': NOUN_RULES, 'verb': VERB_RULES}
|
||||
|
|
24
spacy/lang/fr/lemmatizer/_adp_irreg.py
Normal file
24
spacy/lang/fr/lemmatizer/_adp_irreg.py
Normal file
|
@ -0,0 +1,24 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
ADP_IRREG = {
|
||||
"a": ("à",),
|
||||
"apr.": ("après",),
|
||||
"aux": ("à",),
|
||||
"av.": ("avant",),
|
||||
"avt": ("avant",),
|
||||
"cf.": ("cf",),
|
||||
"conf.": ("cf",),
|
||||
"confer": ("cf",),
|
||||
"d'": ("de",),
|
||||
"des": ("de",),
|
||||
"du": ("de",),
|
||||
"jusqu'": ("jusque",),
|
||||
"pdt": ("pendant",),
|
||||
"+": ("plus",),
|
||||
"pr": ("pour",),
|
||||
"/": ("sur",),
|
||||
"versus": ("vs",),
|
||||
"vs.": ("vs",)
|
||||
}
|
File diff suppressed because it is too large
Load Diff
17
spacy/lang/fr/lemmatizer/_cconj_irreg.py
Normal file
17
spacy/lang/fr/lemmatizer/_cconj_irreg.py
Normal file
|
@ -0,0 +1,17 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
CCONJ_IRREG = {
|
||||
"&": ("et",),
|
||||
"c-à-d": ("c'est-à-dire",),
|
||||
"c.-à.-d.": ("c'est-à-dire",),
|
||||
"càd": ("c'est-à-dire",),
|
||||
"&": ("et",),
|
||||
"et|ou": ("et-ou",),
|
||||
"et/ou": ("et-ou",),
|
||||
"i.e.": ("c'est-à-dire",),
|
||||
"ie": ("c'est-à-dire",),
|
||||
"ou/et": ("et-ou",),
|
||||
"+": ("plus",)
|
||||
}
|
|
@ -1,49 +1,62 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
DETS_IRREG = {
|
||||
"aucune": ("aucun",),
|
||||
"ces": ("ce",),
|
||||
"cet": ("ce",),
|
||||
"cette": ("ce",),
|
||||
"cents": ("cent",),
|
||||
"certaines": ("certains",),
|
||||
"différentes": ("différents",),
|
||||
"diverses": ("divers",),
|
||||
"la": ("le",),
|
||||
"les": ("le",),
|
||||
"l'": ("le",),
|
||||
"laquelle": ("lequel",),
|
||||
"lesquelles": ("lequel",),
|
||||
"lesquels": ("lequel",),
|
||||
"leurs": ("leur",),
|
||||
"mainte": ("maint",),
|
||||
"maintes": ("maint",),
|
||||
"maints": ("maint",),
|
||||
"ma": ("mon",),
|
||||
"mes": ("mon",),
|
||||
"nos": ("notre",),
|
||||
"nulle": ("nul",),
|
||||
"nulles": ("nul",),
|
||||
"nuls": ("nul",),
|
||||
"quelle": ("quel",),
|
||||
"quelles": ("quel",),
|
||||
"quels": ("quel",),
|
||||
"quelqu'": ("quelque",),
|
||||
"quelques": ("quelque",),
|
||||
"sa": ("son",),
|
||||
"ses": ("son",),
|
||||
"telle": ("tel",),
|
||||
"telles": ("tel",),
|
||||
"tels": ("tel",),
|
||||
"ta": ("ton",),
|
||||
"tes": ("ton",),
|
||||
"tous": ("tout",),
|
||||
"toute": ("tout",),
|
||||
"toutes": ("tout",),
|
||||
"des": ("un",),
|
||||
"une": ("un",),
|
||||
"vingts": ("vingt",),
|
||||
"vos": ("votre",)
|
||||
}
|
||||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
DETS_IRREG = {
|
||||
"aucune": ("aucun",),
|
||||
"cents": ("cent",),
|
||||
"certaine": ("certain",),
|
||||
"certaines": ("certain",),
|
||||
"certains": ("certain",),
|
||||
"ces": ("ce",),
|
||||
"cet": ("ce",),
|
||||
"cette": ("ce",),
|
||||
"des": ("un",),
|
||||
"différentes": ("différents",),
|
||||
"diverse": ("divers",),
|
||||
"diverses": ("divers",),
|
||||
"du": ("de",),
|
||||
"la": ("le",),
|
||||
"laquelle": ("lequel",),
|
||||
"les": ("le",),
|
||||
"lesdites": ("ledit",),
|
||||
"lesdits": ("ledit",),
|
||||
"leurs": ("leur",),
|
||||
"lesquelles": ("lequel",),
|
||||
"lesquels": ("lequel",),
|
||||
"l'": ("le",),
|
||||
"mainte": ("maint",),
|
||||
"maintes": ("maint",),
|
||||
"maints": ("maint",),
|
||||
"ma": ("mon",),
|
||||
"mes": ("mon",),
|
||||
"nos": ("notre",),
|
||||
"nulle": ("nul",),
|
||||
"nulles": ("nul",),
|
||||
"nuls": ("nul",),
|
||||
"pareille": ("pareil",),
|
||||
"pareilles": ("pareil",),
|
||||
"pareils": ("pareil",),
|
||||
"quelle": ("quel",),
|
||||
"quelles": ("quel",),
|
||||
"qq": ("quelque",),
|
||||
"qqes": ("quelque",),
|
||||
"qqs": ("quelque",),
|
||||
"quelques": ("quelque",),
|
||||
"quelqu'": ("quelque",),
|
||||
"quels": ("quel",),
|
||||
"sa": ("son",),
|
||||
"ses": ("son",),
|
||||
"ta": ("ton",),
|
||||
"telles": ("tel",),
|
||||
"telle": ("tel",),
|
||||
"tels": ("tel",),
|
||||
"tes": ("ton",),
|
||||
"tous": ("tout",),
|
||||
"toutes": ("tout",),
|
||||
"toute": ("tout",),
|
||||
"une": ("un",),
|
||||
"vingts": ("vingt",),
|
||||
"vot'": ("votre",),
|
||||
"vos": ("votre",)
|
||||
}
|
||||
|
|
|
@ -1,40 +1,92 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
PRONOUNS_IRREG = {
|
||||
"aucune": ("aucun",),
|
||||
"celle-ci": ("celui-ci",),
|
||||
"celles-ci": ("celui-ci",),
|
||||
"ceux-ci": ("celui-ci",),
|
||||
"celle-là": ("celui-là",),
|
||||
"celles-là": ("celui-là",),
|
||||
"ceux-là": ("celui-là",),
|
||||
"celle": ("celui",),
|
||||
"celles": ("celui",),
|
||||
"ceux": ("celui",),
|
||||
"certaines": ("certains",),
|
||||
"chacune": ("chacun",),
|
||||
"icelle": ("icelui",),
|
||||
"icelles": ("icelui",),
|
||||
"iceux": ("icelui",),
|
||||
"la": ("le",),
|
||||
"les": ("le",),
|
||||
"laquelle": ("lequel",),
|
||||
"lesquelles": ("lequel",),
|
||||
"lesquels": ("lequel",),
|
||||
"elle-même": ("lui-même",),
|
||||
"elles-mêmes": ("lui-même",),
|
||||
"eux-mêmes": ("lui-même",),
|
||||
"quelle": ("quel",),
|
||||
"quelles": ("quel",),
|
||||
"quels": ("quel",),
|
||||
"quelques-unes": ("quelqu'un",),
|
||||
"quelques-uns": ("quelqu'un",),
|
||||
"quelque-une": ("quelqu'un",),
|
||||
"qu": ("que",),
|
||||
"telle": ("tel",),
|
||||
"telles": ("tel",),
|
||||
"tels": ("tel",),
|
||||
"toutes": ("tous",),
|
||||
}
|
||||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
PRONOUNS_IRREG = {
|
||||
"aucune": ("aucun",),
|
||||
"autres": ("autre",),
|
||||
"ça": ("cela",),
|
||||
"c'": ("ce",),
|
||||
"celle": ("celui",),
|
||||
"celle-ci": ("celui-ci",),
|
||||
"celle-là": ("celui-là",),
|
||||
"celles": ("celui",),
|
||||
"celles-ci": ("celui-ci",),
|
||||
"celles-là": ("celui-là",),
|
||||
"certaines": ("certains",),
|
||||
"ceux": ("celui",),
|
||||
"ceux-ci": ("celui-ci",),
|
||||
"ceux-là": ("celui-là",),
|
||||
"chacune": ("chacun",),
|
||||
"-elle": ("lui",),
|
||||
"elle": ("lui",),
|
||||
"elle-même": ("lui-même",),
|
||||
"-elles": ("lui",),
|
||||
"elles": ("lui",),
|
||||
"elles-mêmes": ("lui-même",),
|
||||
"eux": ("lui",),
|
||||
"eux-mêmes": ("lui-même",),
|
||||
"icelle": ("icelui",),
|
||||
"icelles": ("icelui",),
|
||||
"iceux": ("icelui",),
|
||||
"-il": ("il",),
|
||||
"-ils": ("il",),
|
||||
"ils": ("il",),
|
||||
"-je": ("je",),
|
||||
"j'": ("je",),
|
||||
"la": ("le",),
|
||||
"laquelle": ("lequel",),
|
||||
"l'autre": ("l'autre",),
|
||||
"les": ("le",),
|
||||
"lesquelles": ("lequel",),
|
||||
"lesquels": ("lequel",),
|
||||
"-leur": ("leur",),
|
||||
"l'on": ("on",),
|
||||
"-lui": ("lui",),
|
||||
"l'une": ("l'un",),
|
||||
"mêmes": ("même",),
|
||||
"-m'": ("me",),
|
||||
"m'": ("me",),
|
||||
"-moi": ("moi",),
|
||||
"nous-mêmes": ("nous-même",),
|
||||
"-nous": ("nous",),
|
||||
"-on": ("on",),
|
||||
"qqchose": ("quelque chose",),
|
||||
"qqch": ("quelque chose",),
|
||||
"qqc": ("quelque chose",),
|
||||
"qqn": ("quelqu'un",),
|
||||
"quelle": ("quel",),
|
||||
"quelles": ("quel",),
|
||||
"quelques-unes": ("quelques-uns",),
|
||||
"quelque-une": ("quelqu'un",),
|
||||
"quelqu'une": ("quelqu'un",),
|
||||
"quels": ("quel",),
|
||||
"qu": ("que",),
|
||||
"s'": ("se",),
|
||||
"-t-elle": ("elle",),
|
||||
"-t-elles": ("elle",),
|
||||
"telles": ("tel",),
|
||||
"telle": ("tel",),
|
||||
"tels": ("tel",),
|
||||
"-t-en": ("en",),
|
||||
"-t-il": ("il",),
|
||||
"-t-ils": ("il",),
|
||||
"-toi": ("toi",),
|
||||
"-t-on": ("on",),
|
||||
"tous": ("tout",),
|
||||
"toutes": ("tout",),
|
||||
"toute": ("tout",),
|
||||
"-t'": ("te",),
|
||||
"t'": ("te",),
|
||||
"-tu": ("tu",),
|
||||
"-t-y": ("y",),
|
||||
"unes": ("un",),
|
||||
"une": ("un",),
|
||||
"uns": ("un",),
|
||||
"vous-mêmes": ("vous-même",),
|
||||
"vous-même": ("vous-même",),
|
||||
"-vous": ("vous",),
|
||||
"-vs": ("vous",),
|
||||
"vs": ("vous",),
|
||||
"-y": ("y",)
|
||||
}
|
||||
|
|
19
spacy/lang/fr/lemmatizer/_sconj_irreg.py
Normal file
19
spacy/lang/fr/lemmatizer/_sconj_irreg.py
Normal file
|
@ -0,0 +1,19 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
SCONJ_IRREG = {
|
||||
"lorsqu'": ("lorsque",),
|
||||
"pac'que": ("parce que",),
|
||||
"pac'qu'": ("parce que",),
|
||||
"parc'que": ("parce que",),
|
||||
"parc'qu'": ("parce que",),
|
||||
"paske": ("parce que",),
|
||||
"pask'": ("parce que",),
|
||||
"pcq": ("parce que",),
|
||||
"+": ("plus",),
|
||||
"puisqu'": ("puisque",),
|
||||
"qd": ("quand",),
|
||||
"quoiqu'": ("quoique",),
|
||||
"qu'": ("que",)
|
||||
}
|
|
@ -1,7 +1,7 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ....symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT
|
||||
from ....symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP, SCONJ, CCONJ
|
||||
from ....symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
|
||||
from .lookup import LOOKUP
|
||||
|
||||
|
@ -9,7 +9,7 @@ from .lookup import LOOKUP
|
|||
French language lemmatizer applies the default rule based lemmatization
|
||||
procedure with some modifications for better French language support.
|
||||
|
||||
The parts of speech 'ADV', 'PRON', 'DET' and 'AUX' are added to use the
|
||||
The parts of speech 'ADV', 'PRON', 'DET', 'ADP', 'SCONJ', 'CCONJ' and 'AUX' are added to use the
|
||||
rule-based lemmatization. As a last resort, the lemmatizer checks in
|
||||
the lookup table.
|
||||
'''
|
||||
|
@ -34,16 +34,22 @@ class FrenchLemmatizer(object):
|
|||
univ_pos = 'verb'
|
||||
elif univ_pos in (ADJ, 'ADJ', 'adj'):
|
||||
univ_pos = 'adj'
|
||||
elif univ_pos in (ADP, 'ADP', 'adp'):
|
||||
univ_pos = 'adp'
|
||||
elif univ_pos in (ADV, 'ADV', 'adv'):
|
||||
univ_pos = 'adv'
|
||||
elif univ_pos in (PRON, 'PRON', 'pron'):
|
||||
univ_pos = 'pron'
|
||||
elif univ_pos in (DET, 'DET', 'det'):
|
||||
univ_pos = 'det'
|
||||
elif univ_pos in (AUX, 'AUX', 'aux'):
|
||||
univ_pos = 'aux'
|
||||
elif univ_pos in (CCONJ, 'CCONJ', 'cconj'):
|
||||
univ_pos = 'cconj'
|
||||
elif univ_pos in (DET, 'DET', 'det'):
|
||||
univ_pos = 'det'
|
||||
elif univ_pos in (PRON, 'PRON', 'pron'):
|
||||
univ_pos = 'pron'
|
||||
elif univ_pos in (PUNCT, 'PUNCT', 'punct'):
|
||||
univ_pos = 'punct'
|
||||
elif univ_pos in (SCONJ, 'SCONJ', 'sconj'):
|
||||
univ_pos = 'sconj'
|
||||
else:
|
||||
return [self.lookup(string)]
|
||||
# See Issue #435 for example of where this logic is required.
|
||||
|
|
Loading…
Reference in New Issue
Block a user