from typing import List, Tuple from ...pipeline import Lemmatizer from ...tokens import Token class FrenchLemmatizer(Lemmatizer): """ French language lemmatizer applies the default rule based lemmatization procedure with some modifications for better French language support. The parts of speech 'ADV', 'PRON', 'DET', 'ADP' and 'AUX' are added to use the rule-based lemmatization. As a last resort, the lemmatizer checks in the lookup table. """ @classmethod def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]: if mode == "rule": required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"] return (required, []) else: return super().get_lookups_config(mode) def rule_lemmatize(self, token: Token) -> List[str]: cache_key = (token.orth, token.pos) if cache_key in self.cache: return self.cache[cache_key] string = token.text univ_pos = token.pos_.lower() if univ_pos in ("", "eol", "space"): return [string.lower()] elif "lemma_rules" not in self.lookups or univ_pos not in ( "noun", "verb", "adj", "adp", "adv", "aux", "cconj", "det", "pron", "punct", "sconj", ): return self.lookup_lemmatize(token) index_table = self.lookups.get_table("lemma_index", {}) exc_table = self.lookups.get_table("lemma_exc", {}) rules_table = self.lookups.get_table("lemma_rules", {}) lookup_table = self.lookups.get_table("lemma_lookup", {}) index = index_table.get(univ_pos, {}) exceptions = exc_table.get(univ_pos, {}) rules = rules_table.get(univ_pos, []) string = string.lower() forms = [] # first try lookup in table based on upos if string in index: forms.append(string) self.cache[cache_key] = forms return forms # then add anything in the exceptions table forms.extend(exceptions.get(string, [])) # if nothing found yet, use the rules oov_forms = [] if not forms: for old, new in rules: if string.endswith(old): form = string[: len(string) - len(old)] + new if not form: pass elif form in index or not form.isalpha(): forms.append(form) else: oov_forms.append(form) # if still nothing, add the oov forms from rules if not forms: forms.extend(oov_forms) # use lookups, which fall back to the token itself if not forms: forms.append(lookup_table.get(string, [string])[0]) forms = list(dict.fromkeys(forms)) self.cache[cache_key] = forms return forms