spaCy/spacy/lang/fr/lemmatizer.py
Paul O'Leary McCann aafee5e1b7
Fix lookup usage in French/Catalan (fix #11347) (#11382)
* Fix lookup usage (fix #11347)

Before using the lookups table in the French (and Catalan) lemmatizers,
there's a check to see if the current term is in the table. But it's
checking a string against hashes, so it's always false. Also the table
lookup function is designed so you don't have to do that anyway.

* Use the lookup table directly

* Use string, not token
2022-08-29 10:32:38 +02:00

88 lines
2.9 KiB
Python

from typing import List, Tuple
from ...pipeline import Lemmatizer
from ...tokens import Token
class FrenchLemmatizer(Lemmatizer):
    """
    French language lemmatizer applies the default rule based lemmatization
    procedure with some modifications for better French language support.

    The parts of speech 'ADV', 'PRON', 'DET', 'ADP' and 'AUX' are added to use
    the rule-based lemmatization. As a last resort, the lemmatizer checks in
    the lookup table.
    """

    @classmethod
    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
        """Return the lookup table names needed for the given mode.

        mode (str): The lemmatizer mode.
        RETURNS (Tuple[List[str], List[str]]): For "rule" mode, the four
            tables read by `rule_lemmatize` as the first element and an empty
            list as the second (presumably the optional tables -- confirm
            against the base class). Any other mode is delegated to the
            parent class.
        """
        if mode == "rule":
            # "lemma_lookup" is listed here because rule_lemmatize uses it as
            # its last-resort fallback.
            required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
            return (required, [])
        else:
            return super().get_lookups_config(mode)

    def rule_lemmatize(self, token: Token) -> List[str]:
        """Lemmatize a token using the index, exception and suffix rule
        tables, falling back to the lookup table and finally to the lowercased
        token text.

        token (Token): The token to lemmatize.
        RETURNS (List[str]): The candidate lemmas, without duplicates and in
            the order they were found.
        """
        # Results are memoized per (orth, pos) pair. Note that the two early
        # returns below (empty/eol/space POS and the lookup delegation) are
        # not written to the cache.
        cache_key = (token.orth, token.pos)
        if cache_key in self.cache:
            return self.cache[cache_key]
        string = token.text
        univ_pos = token.pos_.lower()
        if univ_pos in ("", "eol", "space"):
            return [string.lower()]
        elif "lemma_rules" not in self.lookups or univ_pos not in (
            "noun",
            "verb",
            "adj",
            "adp",
            "adv",
            "aux",
            "cconj",
            "det",
            "pron",
            "punct",
            "sconj",
        ):
            # No rules table available, or a POS the rules do not cover:
            # hand over to the plain lookup lemmatizer.
            return self.lookup_lemmatize(token)
        index_table = self.lookups.get_table("lemma_index", {})
        exc_table = self.lookups.get_table("lemma_exc", {})
        rules_table = self.lookups.get_table("lemma_rules", {})
        lookup_table = self.lookups.get_table("lemma_lookup", {})
        index = index_table.get(univ_pos, {})
        exceptions = exc_table.get(univ_pos, {})
        rules = rules_table.get(univ_pos, [])
        string = string.lower()
        forms = []
        # first try lookup in table based on upos
        if string in index:
            forms.append(string)
            self.cache[cache_key] = forms
            return forms

        # then add anything in the exceptions table
        forms.extend(exceptions.get(string, []))

        # if nothing found yet, use the rules
        oov_forms = []
        if not forms:
            for old, new in rules:
                if string.endswith(old):
                    form = string[: len(string) - len(old)] + new
                    if not form:
                        # the rule consumed the whole string: discard
                        pass
                    elif form in index or not form.isalpha():
                        forms.append(form)
                    else:
                        # plausible but unattested form; only used if no
                        # rule yields an indexed form
                        oov_forms.append(form)

        # if still nothing, add the oov forms from rules
        if not forms:
            forms.extend(oov_forms)

        # use lookups, which fall back to the token itself
        if not forms:
            lemma = lookup_table.get(string, string)
            if not isinstance(lemma, str):
                # Sequence-valued entry: keep its first candidate, or the
                # surface form if the entry is empty.
                lemma = lemma[0] if lemma else string
            # A plain string entry is used whole; indexing it with [0] would
            # reduce the lemma to its first character.
            forms.append(lemma)
        # drop duplicates while preserving order
        forms = list(dict.fromkeys(forms))
        self.cache[cache_key] = forms
        return forms