import re from typing import List, Optional, Tuple from ...pipeline import Lemmatizer from ...tokens import Token class SpanishLemmatizer(Lemmatizer): """ Spanish rule-based lemmatizer with morph-based rule selection. """ @classmethod def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]: if mode == "rule": required = ["lemma_rules", "lemma_rules_groups", "lemma_index", "lemma_exc"] return (required, []) else: return super().get_lookups_config(mode) def rule_lemmatize(self, token: Token) -> List[str]: cache_key = (token.orth, token.pos, str(token.morph)) if cache_key in self.cache: return self.cache[cache_key] string = token.text pos = token.pos_.lower() features = set(token.morph) if pos in ("", "eol", "space"): return [string.lower()] if pos in ( "adp", "cconj", "intj", "part", "propn", "punct", "sconj", "sym", "x", ): if token.is_sent_start and pos != "propn": return [string.lower()] else: return [string] string = string.lower() exc = self.lookups.get_table("lemma_exc").get(pos, {}).get(string) if exc is not None: lemmas = list(exc) else: if pos == "aux": rule_pos = "verb" else: rule_pos = pos rule = self.select_rule(rule_pos, list(features)) index = self.lookups.get_table("lemma_index").get(rule_pos, []) lemmas = getattr(self, "lemmatize_" + rule_pos)( string, features, rule, index ) # Remove duplicates but preserve the ordering lemmas = list(dict.fromkeys(lemmas)) self.cache[cache_key] = lemmas return lemmas def select_rule(self, pos: str, features: List[str]) -> Optional[str]: groups = self.lookups.get_table("lemma_rules_groups") if pos in groups: for group in groups[pos]: if set(group[1]).issubset(features): return group[0] return None def lemmatize_adj( self, word: str, features: List[str], rule: str, index: List[str] ) -> List[str]: """ Lemmatize an adjective. word (str): The word to lemmatize. features (List[str]): The morphological features as a list of Feat=Val pairs. index (List[str]): The POS-specific lookup list. RETURNS (List[str]): The list of lemmas. """ # Initialize empty lists for the generated lemmas possible_lemmas = [] selected_lemmas = [] # Apply lemmatization rules for old, new in self.lookups.get_table("lemma_rules").get(rule, []): possible_lemma = re.sub(old + "$", new, word) if possible_lemma != word: possible_lemmas.append(possible_lemma) # Additional rule for plurals that go from esdrújula to grave and end in # 'n' or 's', e.g., jóvenes -> joven additional_lemmas = [] if "Number=Plur" in features: for possible_lemma in possible_lemmas: if possible_lemma.endswith("n") or possible_lemma.endswith("s"): for old, new in self.lookups.get_table("lemma_rules").get( "accents", [] ): additional_lemmas.append(re.sub(old, new, possible_lemma)) possible_lemmas.extend(additional_lemmas) for lemma in possible_lemmas: if lemma in index: selected_lemmas.append(lemma) # If one or more of the created possible lemmas are in the lookup list, # return all of them if len(selected_lemmas) > 0: return selected_lemmas elif len(possible_lemmas) > 0: return possible_lemmas else: return [word] def lemmatize_adv( self, word: str, features: List[str], rule: str, index: List[str] ) -> List[str]: """ Lemmatize an adverb. word (str): The word to lemmatize. features (List[str]): The morphological features as a list of Feat=Val pairs. index (List[str]): The POS-specific lookup list. RETURNS (List[str]): The list of lemmas. """ # Apply lemmatization rules for old, new in self.lookups.get_table("lemma_rules").get("adverbs", []): if word == old: return [new] # If none of the rules applies, return the original word return [word] def lemmatize_det( self, word: str, features: List[str], rule: str, index: List[str] ) -> List[str]: """ Lemmatize a determiner. word (str): The word to lemmatize. features (List[str]): The morphological features as a list of Feat=Val pairs. index (List[str]): The POS-specific lookup list. RETURNS (List[str]): The list of lemmas. """ # Initialize empty lists for the generated lemmas possible_lemmas = [] selected_lemmas = [] # First, search in rules specific to determiners for old, new in self.lookups.get_table("lemma_rules").get("det", []): if word == old: return [new] # If none of the specific rules apply, search in the common rules for # determiners and pronouns that follow a unique pattern for # lemmatization. If the word is in the list, return the corresponding # lemma. for old, new in self.lookups.get_table("lemma_rules").get( "det_and_pron_fixed", [] ): if word == old: return [new] # If the word is not in the list of unique determiners and pronouns, # apply general rules of lemmatization. Include the original word in the # list of possible lemmas. for old, new in self.lookups.get_table("lemma_rules").get( "det_and_pron_general", [] ): possible_lemma = re.sub(old + "$", new, word) possible_lemmas.append(possible_lemma) possible_lemmas.append(word) if len(possible_lemmas) == 1: return possible_lemmas elif len(possible_lemmas) > 1: for lemma in possible_lemmas: if lemma in index: selected_lemmas.append(lemma) if len(selected_lemmas) >= 1: return selected_lemmas else: return possible_lemmas else: return [] def lemmatize_noun( self, word: str, features: List[str], rule: str, index: List[str] ) -> List[str]: """ Lemmatize a noun. word (str): The word to lemmatize. features (List[str]): The morphological features as a list of Feat=Val pairs. index (List[str]): The POS-specific lookup list. RETURNS (List[str]): The list of lemmas. """ # Initialize empty lists for the generated lemmas possible_lemmas = [] selected_lemmas = [] # Apply lemmatization rules for old, new in self.lookups.get_table("lemma_rules").get(rule, []): possible_lemma = re.sub(old + "$", new, word) if possible_lemma != word: possible_lemmas.append(possible_lemma) # Additional rule for plurals that go from esdrújula to grave and end in # 'n' or 's', e.g., órdenes -> orden, exámenes -> examen additional_lemmas = [] if "Number=Plur" in features: for possible_lemma in possible_lemmas: if possible_lemma.endswith("n") or possible_lemma.endswith("s"): for old, new in self.lookups.get_table("lemma_rules").get( "accents", [] ): additional_lemmas.append(re.sub(old, new, possible_lemma)) possible_lemmas.extend(additional_lemmas) for lemma in possible_lemmas: if lemma in index: selected_lemmas.append(lemma) # If one or more of the created possible lemmas are in the lookup list, # return all of them if len(selected_lemmas) > 0: return selected_lemmas elif len(possible_lemmas) > 0: return possible_lemmas else: return [word] def lemmatize_num( self, word: str, features: List[str], rule: str, index: List[str] ) -> List[str]: """ Lemmatize a numeral. word (str): The word to lemmatize. features (List[str]): The morphological features as a list of Feat=Val pairs. index (List[str]): The POS-specific lookup list. RETURNS (List[str]): The list of lemmas. """ # If the word is in the list of rules for numerals, return the # corresponding lemma for old, new in self.lookups.get_table("lemma_rules").get("num", []): if word == old: return [new] # Normalize punctuation splitted_word = word.split(",") if re.search(r"(\.)([0-9]{3})$", splitted_word[0]): word = re.sub(r"\.", r"", word) word = re.sub(r",", r".", word) return [word] def lemmatize_pron( self, word: str, features: List[str], rule: Optional[str], index: List[str] ) -> List[str]: """ Lemmatize a pronoun. word (str): The word to lemmatize. features (List[str]): The morphological features as a list of Feat=Val pairs. index (List[str]): The POS-specific lookup list. RETURNS (List[str]): The list of lemmas. """ # Initialize empty lists for the generated lemmas possible_lemmas = [] selected_lemmas = [] # First, search in rules specific to pronouns for old, new in self.lookups.get_table("lemma_rules").get("pron", []): if word == old: return [new] # If none of the specific rules apply, search in the common rules for # determiners and pronouns that follow a unique pattern for # lemmatization. If the word is in the list, return the corresponding # lemma. for old, new in self.lookups.get_table("lemma_rules").get( "det_and_pron_fixed", [] ): if word == old: return [new] # If the word is not in the list of unique determiners and pronouns, # apply general rules of lemmatization. Include the original word in the # list of possible lemmas. for old, new in self.lookups.get_table("lemma_rules").get( "det_and_pron_general", [] ): possible_lemma = re.sub(old + "$", new, word) if possible_lemma != word: possible_lemmas.append(possible_lemma) possible_lemmas.append(word) if len(possible_lemmas) == 1: return possible_lemmas elif len(possible_lemmas) > 1: for lemma in possible_lemmas: if lemma in index: selected_lemmas.append(lemma) if len(selected_lemmas) >= 1: return selected_lemmas else: return possible_lemmas else: return [] def lemmatize_verb( self, word: str, features: List[str], rule: Optional[str], index: List[str] ) -> List[str]: """ Lemmatize a verb. word (str): The word to lemmatize. features (List[str]): The morphological features as a list of Feat=Val pairs. index (List[str]): The POS-specific lookup list. RETURNS (List[str]): The list of lemmas. """ # Exceptions for verb+pronoun(s) if "PronType=Prs" in features: return self.lemmatize_verb_pron(word, features, rule, index) # Initialize empty lists for the generated lemmas possible_lemmas = [] selected_lemmas = [] # Apply lemmatization rules rule = str(rule or "") for old, new in self.lookups.get_table("lemma_rules").get(rule, []): possible_lemma = re.sub(old + "$", new, word) if possible_lemma != word: possible_lemmas.append(possible_lemma) for lemma in possible_lemmas: if lemma in index: selected_lemmas.append(lemma) if len(selected_lemmas) == 0: # If none of the possible lemmas are in the lookup list, # apply vocalic alternation rules and search in the lookup list # again for lemma in possible_lemmas: for old, new in self.lookups.get_table("lemma_rules").get( "voc_alt_1", [] ): if old in lemma: for i, char in enumerate(lemma): if char == old: voc_alt_lemma = lemma[:i] + new + lemma[i + 1 :] if voc_alt_lemma in index: selected_lemmas.append(voc_alt_lemma) for old, new in self.lookups.get_table("lemma_rules").get( "voc_alt_2", [] ): if old in lemma: voc_alt_lemma = lemma.replace(old, new, 1) if voc_alt_lemma in index: selected_lemmas.append(voc_alt_lemma) # Additional rule for verbs that lose the accent mark when lemmatized, # e.g., amplían -> ampliar additional_lemmas = [] for possible_lemma in possible_lemmas: for old, new in self.lookups.get_table("lemma_rules").get("accents", []): additional_lemmas.append(re.sub(old, new, possible_lemma)) possible_lemmas.extend(additional_lemmas) # If one or more of the created possible lemmas are in the lookup list, # return all of them if len(selected_lemmas) > 0: return selected_lemmas elif len(possible_lemmas) > 0: return possible_lemmas else: return [word] def lemmatize_verb_pron( self, word: str, features: List[str], rule: Optional[str], index: List[str] ) -> List[str]: # Strip and collect pronouns pron_patt = "^(.*?)([mts]e|l[aeo]s?|n?os)$" prons: List[str] = [] verb = word m = re.search(pron_patt, verb) while m is not None and len(prons) <= 3: verb = re.sub(m.group(2) + "$", "", verb) prons = [m.group(2)] + prons m = re.search(pron_patt, verb) # Strip accents from verb form for old, new in self.lookups.get_table("lemma_rules").get("accents", []): verb = re.sub(old, new, verb) # Lemmatize the verb and pronouns, checking for exceptions exc = self.lookups.get_table("lemma_exc").get("verb", {}).get(verb) if exc is not None: verb_lemma = exc[0] else: rule = self.select_rule("verb", features) verb_lemma = self.lemmatize_verb( verb, features - {"PronType=Prs"}, rule, index # type: ignore[operator] )[0] pron_lemmas = [] for pron in prons: exc = self.lookups.get_table("lemma_exc").get("pron", {}).get(pron) if exc is not None: pron_lemmas.append(exc[0]) else: rule = self.select_rule("pron", features) pron_lemmas.append(self.lemmatize_pron(pron, features, rule, index)[0]) return [verb_lemma + " " + " ".join(pron_lemmas)]