mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 02:36:32 +03:00
French lemmatization: adding lemmas for adverbs and irregular lemmas for function words (#3131)
* adding adverbs and irregular cases for empty words * adding adverbs and irregular cases for empty words * adding adverbs and irregular cases for empty words * updating contributor agreement for amperinet
This commit is contained in:
parent
7b064542f7
commit
ee24e2534d
2
.github/contributors/amperinet.md
vendored
2
.github/contributors/amperinet.md
vendored
|
@ -101,6 +101,6 @@ mark both statements:
|
||||||
| Name | Amandine Périnet |
|
| Name | Amandine Périnet |
|
||||||
| Company name (if applicable) | 365Talents |
|
| Company name (if applicable) | 365Talents |
|
||||||
| Title or role (if applicable) | Data Science Researcher |
|
| Title or role (if applicable) | Data Science Researcher |
|
||||||
| Date | 28/12/2018 |
|
| Date | 09/01/2019 |
|
||||||
| GitHub username | amperinet |
|
| GitHub username | amperinet |
|
||||||
| Website (optional) | |
|
| Website (optional) | |
|
||||||
|
|
|
@ -4,20 +4,24 @@ from __future__ import unicode_literals
|
||||||
from .lookup import LOOKUP
|
from .lookup import LOOKUP
|
||||||
from ._adjectives import ADJECTIVES
|
from ._adjectives import ADJECTIVES
|
||||||
from ._adjectives_irreg import ADJECTIVES_IRREG
|
from ._adjectives_irreg import ADJECTIVES_IRREG
|
||||||
|
from ._adp_irreg import ADP_IRREG
|
||||||
from ._adverbs import ADVERBS
|
from ._adverbs import ADVERBS
|
||||||
|
from ._auxiliary_verbs_irreg import AUXILIARY_VERBS_IRREG
|
||||||
|
from ._cconj_irreg import CCONJ_IRREG
|
||||||
|
from ._dets_irreg import DETS_IRREG
|
||||||
|
from ._lemma_rules import ADJECTIVE_RULES, NOUN_RULES, VERB_RULES
|
||||||
from ._nouns import NOUNS
|
from ._nouns import NOUNS
|
||||||
from ._nouns_irreg import NOUNS_IRREG
|
from ._nouns_irreg import NOUNS_IRREG
|
||||||
|
from ._pronouns_irreg import PRONOUNS_IRREG
|
||||||
|
from ._sconj_irreg import SCONJ_IRREG
|
||||||
from ._verbs import VERBS
|
from ._verbs import VERBS
|
||||||
from ._verbs_irreg import VERBS_IRREG
|
from ._verbs_irreg import VERBS_IRREG
|
||||||
from ._dets_irreg import DETS_IRREG
|
|
||||||
from ._pronouns_irreg import PRONOUNS_IRREG
|
|
||||||
from ._auxiliary_verbs_irreg import AUXILIARY_VERBS_IRREG
|
|
||||||
from ._lemma_rules import ADJECTIVE_RULES, NOUN_RULES, VERB_RULES
|
|
||||||
|
|
||||||
|
|
||||||
LEMMA_INDEX = {'adj': ADJECTIVES, 'adv': ADVERBS, 'noun': NOUNS, 'verb': VERBS}
|
LEMMA_INDEX = {'adj': ADJECTIVES, 'adv': ADVERBS, 'noun': NOUNS, 'verb': VERBS}
|
||||||
|
|
||||||
LEMMA_EXC = {'adj': ADJECTIVES_IRREG, 'noun': NOUNS_IRREG, 'verb': VERBS_IRREG,
|
LEMMA_EXC = {'adj': ADJECTIVES_IRREG, 'adp': ADP_IRREG, 'aux': AUXILIARY_VERBS_IRREG,
|
||||||
'det': DETS_IRREG, 'pron': PRONOUNS_IRREG, 'aux': AUXILIARY_VERBS_IRREG}
|
'cconj': CCONJ_IRREG, 'det': DETS_IRREG, 'noun': NOUNS_IRREG, 'verb': VERBS_IRREG,
|
||||||
|
'pron': PRONOUNS_IRREG, 'sconj': SCONJ_IRREG}
|
||||||
|
|
||||||
LEMMA_RULES = {'adj': ADJECTIVE_RULES, 'noun': NOUN_RULES, 'verb': VERB_RULES}
|
LEMMA_RULES = {'adj': ADJECTIVE_RULES, 'noun': NOUN_RULES, 'verb': VERB_RULES}
|
||||||
|
|
24
spacy/lang/fr/lemmatizer/_adp_irreg.py
Normal file
24
spacy/lang/fr/lemmatizer/_adp_irreg.py
Normal file
|
@ -0,0 +1,24 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
ADP_IRREG = {
|
||||||
|
"a": ("à",),
|
||||||
|
"apr.": ("après",),
|
||||||
|
"aux": ("à",),
|
||||||
|
"av.": ("avant",),
|
||||||
|
"avt": ("avant",),
|
||||||
|
"cf.": ("cf",),
|
||||||
|
"conf.": ("cf",),
|
||||||
|
"confer": ("cf",),
|
||||||
|
"d'": ("de",),
|
||||||
|
"des": ("de",),
|
||||||
|
"du": ("de",),
|
||||||
|
"jusqu'": ("jusque",),
|
||||||
|
"pdt": ("pendant",),
|
||||||
|
"+": ("plus",),
|
||||||
|
"pr": ("pour",),
|
||||||
|
"/": ("sur",),
|
||||||
|
"versus": ("vs",),
|
||||||
|
"vs.": ("vs",)
|
||||||
|
}
|
File diff suppressed because it is too large
Load Diff
17
spacy/lang/fr/lemmatizer/_cconj_irreg.py
Normal file
17
spacy/lang/fr/lemmatizer/_cconj_irreg.py
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
CCONJ_IRREG = {
|
||||||
|
"&": ("et",),
|
||||||
|
"c-à-d": ("c'est-à-dire",),
|
||||||
|
"c.-à.-d.": ("c'est-à-dire",),
|
||||||
|
"càd": ("c'est-à-dire",),
|
||||||
|
"&": ("et",),
|
||||||
|
"et|ou": ("et-ou",),
|
||||||
|
"et/ou": ("et-ou",),
|
||||||
|
"i.e.": ("c'est-à-dire",),
|
||||||
|
"ie": ("c'est-à-dire",),
|
||||||
|
"ou/et": ("et-ou",),
|
||||||
|
"+": ("plus",)
|
||||||
|
}
|
|
@ -4,20 +4,27 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
DETS_IRREG = {
|
DETS_IRREG = {
|
||||||
"aucune": ("aucun",),
|
"aucune": ("aucun",),
|
||||||
|
"cents": ("cent",),
|
||||||
|
"certaine": ("certain",),
|
||||||
|
"certaines": ("certain",),
|
||||||
|
"certains": ("certain",),
|
||||||
"ces": ("ce",),
|
"ces": ("ce",),
|
||||||
"cet": ("ce",),
|
"cet": ("ce",),
|
||||||
"cette": ("ce",),
|
"cette": ("ce",),
|
||||||
"cents": ("cent",),
|
"des": ("un",),
|
||||||
"certaines": ("certains",),
|
|
||||||
"différentes": ("différents",),
|
"différentes": ("différents",),
|
||||||
|
"diverse": ("divers",),
|
||||||
"diverses": ("divers",),
|
"diverses": ("divers",),
|
||||||
|
"du": ("de",),
|
||||||
"la": ("le",),
|
"la": ("le",),
|
||||||
"les": ("le",),
|
|
||||||
"l'": ("le",),
|
|
||||||
"laquelle": ("lequel",),
|
"laquelle": ("lequel",),
|
||||||
|
"les": ("le",),
|
||||||
|
"lesdites": ("ledit",),
|
||||||
|
"lesdits": ("ledit",),
|
||||||
|
"leurs": ("leur",),
|
||||||
"lesquelles": ("lequel",),
|
"lesquelles": ("lequel",),
|
||||||
"lesquels": ("lequel",),
|
"lesquels": ("lequel",),
|
||||||
"leurs": ("leur",),
|
"l'": ("le",),
|
||||||
"mainte": ("maint",),
|
"mainte": ("maint",),
|
||||||
"maintes": ("maint",),
|
"maintes": ("maint",),
|
||||||
"maints": ("maint",),
|
"maints": ("maint",),
|
||||||
|
@ -27,23 +34,29 @@ DETS_IRREG = {
|
||||||
"nulle": ("nul",),
|
"nulle": ("nul",),
|
||||||
"nulles": ("nul",),
|
"nulles": ("nul",),
|
||||||
"nuls": ("nul",),
|
"nuls": ("nul",),
|
||||||
|
"pareille": ("pareil",),
|
||||||
|
"pareilles": ("pareil",),
|
||||||
|
"pareils": ("pareil",),
|
||||||
"quelle": ("quel",),
|
"quelle": ("quel",),
|
||||||
"quelles": ("quel",),
|
"quelles": ("quel",),
|
||||||
"quels": ("quel",),
|
"qq": ("quelque",),
|
||||||
"quelqu'": ("quelque",),
|
"qqes": ("quelque",),
|
||||||
|
"qqs": ("quelque",),
|
||||||
"quelques": ("quelque",),
|
"quelques": ("quelque",),
|
||||||
|
"quelqu'": ("quelque",),
|
||||||
|
"quels": ("quel",),
|
||||||
"sa": ("son",),
|
"sa": ("son",),
|
||||||
"ses": ("son",),
|
"ses": ("son",),
|
||||||
"telle": ("tel",),
|
|
||||||
"telles": ("tel",),
|
|
||||||
"tels": ("tel",),
|
|
||||||
"ta": ("ton",),
|
"ta": ("ton",),
|
||||||
|
"telles": ("tel",),
|
||||||
|
"telle": ("tel",),
|
||||||
|
"tels": ("tel",),
|
||||||
"tes": ("ton",),
|
"tes": ("ton",),
|
||||||
"tous": ("tout",),
|
"tous": ("tout",),
|
||||||
"toute": ("tout",),
|
|
||||||
"toutes": ("tout",),
|
"toutes": ("tout",),
|
||||||
"des": ("un",),
|
"toute": ("tout",),
|
||||||
"une": ("un",),
|
"une": ("un",),
|
||||||
"vingts": ("vingt",),
|
"vingts": ("vingt",),
|
||||||
|
"vot'": ("votre",),
|
||||||
"vos": ("votre",)
|
"vos": ("votre",)
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,37 +4,89 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
PRONOUNS_IRREG = {
|
PRONOUNS_IRREG = {
|
||||||
"aucune": ("aucun",),
|
"aucune": ("aucun",),
|
||||||
"celle-ci": ("celui-ci",),
|
"autres": ("autre",),
|
||||||
"celles-ci": ("celui-ci",),
|
"ça": ("cela",),
|
||||||
"ceux-ci": ("celui-ci",),
|
"c'": ("ce",),
|
||||||
"celle-là": ("celui-là",),
|
|
||||||
"celles-là": ("celui-là",),
|
|
||||||
"ceux-là": ("celui-là",),
|
|
||||||
"celle": ("celui",),
|
"celle": ("celui",),
|
||||||
|
"celle-ci": ("celui-ci",),
|
||||||
|
"celle-là": ("celui-là",),
|
||||||
"celles": ("celui",),
|
"celles": ("celui",),
|
||||||
"ceux": ("celui",),
|
"celles-ci": ("celui-ci",),
|
||||||
|
"celles-là": ("celui-là",),
|
||||||
"certaines": ("certains",),
|
"certaines": ("certains",),
|
||||||
|
"ceux": ("celui",),
|
||||||
|
"ceux-ci": ("celui-ci",),
|
||||||
|
"ceux-là": ("celui-là",),
|
||||||
"chacune": ("chacun",),
|
"chacune": ("chacun",),
|
||||||
|
"-elle": ("lui",),
|
||||||
|
"elle": ("lui",),
|
||||||
|
"elle-même": ("lui-même",),
|
||||||
|
"-elles": ("lui",),
|
||||||
|
"elles": ("lui",),
|
||||||
|
"elles-mêmes": ("lui-même",),
|
||||||
|
"eux": ("lui",),
|
||||||
|
"eux-mêmes": ("lui-même",),
|
||||||
"icelle": ("icelui",),
|
"icelle": ("icelui",),
|
||||||
"icelles": ("icelui",),
|
"icelles": ("icelui",),
|
||||||
"iceux": ("icelui",),
|
"iceux": ("icelui",),
|
||||||
|
"-il": ("il",),
|
||||||
|
"-ils": ("il",),
|
||||||
|
"ils": ("il",),
|
||||||
|
"-je": ("je",),
|
||||||
|
"j'": ("je",),
|
||||||
"la": ("le",),
|
"la": ("le",),
|
||||||
"les": ("le",),
|
|
||||||
"laquelle": ("lequel",),
|
"laquelle": ("lequel",),
|
||||||
|
"l'autre": ("l'autre",),
|
||||||
|
"les": ("le",),
|
||||||
"lesquelles": ("lequel",),
|
"lesquelles": ("lequel",),
|
||||||
"lesquels": ("lequel",),
|
"lesquels": ("lequel",),
|
||||||
"elle-même": ("lui-même",),
|
"-leur": ("leur",),
|
||||||
"elles-mêmes": ("lui-même",),
|
"l'on": ("on",),
|
||||||
"eux-mêmes": ("lui-même",),
|
"-lui": ("lui",),
|
||||||
|
"l'une": ("l'un",),
|
||||||
|
"mêmes": ("même",),
|
||||||
|
"-m'": ("me",),
|
||||||
|
"m'": ("me",),
|
||||||
|
"-moi": ("moi",),
|
||||||
|
"nous-mêmes": ("nous-même",),
|
||||||
|
"-nous": ("nous",),
|
||||||
|
"-on": ("on",),
|
||||||
|
"qqchose": ("quelque chose",),
|
||||||
|
"qqch": ("quelque chose",),
|
||||||
|
"qqc": ("quelque chose",),
|
||||||
|
"qqn": ("quelqu'un",),
|
||||||
"quelle": ("quel",),
|
"quelle": ("quel",),
|
||||||
"quelles": ("quel",),
|
"quelles": ("quel",),
|
||||||
"quels": ("quel",),
|
"quelques-unes": ("quelques-uns",),
|
||||||
"quelques-unes": ("quelqu'un",),
|
|
||||||
"quelques-uns": ("quelqu'un",),
|
|
||||||
"quelque-une": ("quelqu'un",),
|
"quelque-une": ("quelqu'un",),
|
||||||
|
"quelqu'une": ("quelqu'un",),
|
||||||
|
"quels": ("quel",),
|
||||||
"qu": ("que",),
|
"qu": ("que",),
|
||||||
"telle": ("tel",),
|
"s'": ("se",),
|
||||||
|
"-t-elle": ("elle",),
|
||||||
|
"-t-elles": ("elle",),
|
||||||
"telles": ("tel",),
|
"telles": ("tel",),
|
||||||
|
"telle": ("tel",),
|
||||||
"tels": ("tel",),
|
"tels": ("tel",),
|
||||||
"toutes": ("tous",),
|
"-t-en": ("en",),
|
||||||
|
"-t-il": ("il",),
|
||||||
|
"-t-ils": ("il",),
|
||||||
|
"-toi": ("toi",),
|
||||||
|
"-t-on": ("on",),
|
||||||
|
"tous": ("tout",),
|
||||||
|
"toutes": ("tout",),
|
||||||
|
"toute": ("tout",),
|
||||||
|
"-t'": ("te",),
|
||||||
|
"t'": ("te",),
|
||||||
|
"-tu": ("tu",),
|
||||||
|
"-t-y": ("y",),
|
||||||
|
"unes": ("un",),
|
||||||
|
"une": ("un",),
|
||||||
|
"uns": ("un",),
|
||||||
|
"vous-mêmes": ("vous-même",),
|
||||||
|
"vous-même": ("vous-même",),
|
||||||
|
"-vous": ("vous",),
|
||||||
|
"-vs": ("vous",),
|
||||||
|
"vs": ("vous",),
|
||||||
|
"-y": ("y",)
|
||||||
}
|
}
|
19
spacy/lang/fr/lemmatizer/_sconj_irreg.py
Normal file
19
spacy/lang/fr/lemmatizer/_sconj_irreg.py
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
SCONJ_IRREG = {
|
||||||
|
"lorsqu'": ("lorsque",),
|
||||||
|
"pac'que": ("parce que",),
|
||||||
|
"pac'qu'": ("parce que",),
|
||||||
|
"parc'que": ("parce que",),
|
||||||
|
"parc'qu'": ("parce que",),
|
||||||
|
"paske": ("parce que",),
|
||||||
|
"pask'": ("parce que",),
|
||||||
|
"pcq": ("parce que",),
|
||||||
|
"+": ("plus",),
|
||||||
|
"puisqu'": ("puisque",),
|
||||||
|
"qd": ("quand",),
|
||||||
|
"quoiqu'": ("quoique",),
|
||||||
|
"qu'": ("que",)
|
||||||
|
}
|
|
@ -1,7 +1,7 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ....symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT
|
from ....symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP, SCONJ, CCONJ
|
||||||
from ....symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
|
from ....symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
|
||||||
from .lookup import LOOKUP
|
from .lookup import LOOKUP
|
||||||
|
|
||||||
|
@ -9,7 +9,7 @@ from .lookup import LOOKUP
|
||||||
French language lemmatizer applies the default rule based lemmatization
|
French language lemmatizer applies the default rule based lemmatization
|
||||||
procedure with some modifications for better French language support.
|
procedure with some modifications for better French language support.
|
||||||
|
|
||||||
The parts of speech 'ADV', 'PRON', 'DET' and 'AUX' are added to use the
|
The parts of speech 'ADV', 'PRON', 'DET', 'ADP', 'AUX', 'CCONJ' and 'SCONJ' are added to use the
|
||||||
rule-based lemmatization. As a last resort, the lemmatizer checks in
|
rule-based lemmatization. As a last resort, the lemmatizer checks in
|
||||||
the lookup table.
|
the lookup table.
|
||||||
'''
|
'''
|
||||||
|
@ -34,16 +34,22 @@ class FrenchLemmatizer(object):
|
||||||
univ_pos = 'verb'
|
univ_pos = 'verb'
|
||||||
elif univ_pos in (ADJ, 'ADJ', 'adj'):
|
elif univ_pos in (ADJ, 'ADJ', 'adj'):
|
||||||
univ_pos = 'adj'
|
univ_pos = 'adj'
|
||||||
|
elif univ_pos in (ADP, 'ADP', 'adp'):
|
||||||
|
univ_pos = 'adp'
|
||||||
elif univ_pos in (ADV, 'ADV', 'adv'):
|
elif univ_pos in (ADV, 'ADV', 'adv'):
|
||||||
univ_pos = 'adv'
|
univ_pos = 'adv'
|
||||||
elif univ_pos in (PRON, 'PRON', 'pron'):
|
|
||||||
univ_pos = 'pron'
|
|
||||||
elif univ_pos in (DET, 'DET', 'det'):
|
|
||||||
univ_pos = 'det'
|
|
||||||
elif univ_pos in (AUX, 'AUX', 'aux'):
|
elif univ_pos in (AUX, 'AUX', 'aux'):
|
||||||
univ_pos = 'aux'
|
univ_pos = 'aux'
|
||||||
|
elif univ_pos in (CCONJ, 'CCONJ', 'cconj'):
|
||||||
|
univ_pos = 'cconj'
|
||||||
|
elif univ_pos in (DET, 'DET', 'det'):
|
||||||
|
univ_pos = 'det'
|
||||||
|
elif univ_pos in (PRON, 'PRON', 'pron'):
|
||||||
|
univ_pos = 'pron'
|
||||||
elif univ_pos in (PUNCT, 'PUNCT', 'punct'):
|
elif univ_pos in (PUNCT, 'PUNCT', 'punct'):
|
||||||
univ_pos = 'punct'
|
univ_pos = 'punct'
|
||||||
|
elif univ_pos in (SCONJ, 'SCONJ', 'sconj'):
|
||||||
|
univ_pos = 'sconj'
|
||||||
else:
|
else:
|
||||||
return [self.lookup(string)]
|
return [self.lookup(string)]
|
||||||
# See Issue #435 for an example of where this logic is required.
|
# See Issue #435 for an example of where this logic is required.
|
||||||
|
|
Loading…
Reference in New Issue
Block a user