From d17afb4826229d8c1ec3c2191e8f8adfe53a6b06 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Wed, 27 Jan 2021 12:21:35 +0100
Subject: [PATCH] Add Spanish rule-based lemmatizer (#6833)

* Initial Spanish lemmatizer
* Handle merged verb+pron(s) multi-word tokens
* Use VERB for AUX rule lookup
* Add morph to lemma cache key
* Fix aux lookups, minor refactoring
* Improve verb+pron handling
* Move verb+pron handling into its own method
* Check for exceptions (primarily for se)
* Collect pronouns in the same (not reversed) order
* Only add modified possible lemmas
---
 spacy/lang/es/__init__.py | 13 +
 spacy/lang/es/lemmatizer.py | 423 +++++++++++++++++++++++++++
 spacy/tests/lang/test_lemmatizers.py | 2 +-
 3 files changed, 437 insertions(+), 1 deletion(-)
 create mode 100644 spacy/lang/es/lemmatizer.py

diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py
index 9a47855b1..a0230b850 100644
--- a/spacy/lang/es/__init__.py
+++ b/spacy/lang/es/__init__.py
@@ -1,6 +1,9 @@
+from typing import Optional
+from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
+from .lemmatizer import SpanishLemmatizer
 from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from ...language import Language
@@ -20,4 +23,14 @@ class Spanish(Language):
     Defaults = SpanishDefaults
 
 
+@Spanish.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool):
+    """
+    Build the SpanishLemmatizer registered as the "lemmatizer" factory.
+
+    nlp (Language): The pipeline object; only its vocab is passed on.
+    model (Optional[Model]): Passed on unchanged (None in the default config).
+    name (str): Passed on as the third positional argument.
+    mode (str): The lemmatizer mode ("rule" in the default config).
+    overwrite (bool): Passed on unchanged (False in the default config).
+
+    RETURNS (SpanishLemmatizer): The constructed lemmatizer.
+    """
+    return SpanishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+
+
 __all__ = ["Spanish"]
diff --git a/spacy/lang/es/lemmatizer.py b/spacy/lang/es/lemmatizer.py
new file mode 100644
index 000000000..56f74068d
--- /dev/null
+++ b/spacy/lang/es/lemmatizer.py
@@ -0,0 +1,423 @@
+from typing import Iterable, List, Optional, Set, Tuple
+import re
+
+from ...pipeline import Lemmatizer
+from ...tokens import Token
+
+
+class SpanishLemmatizer(Lemmatizer):
+    """
+    Spanish rule-based lemmatizer with morph-based rule selection.
+
+    In "rule" mode four lookup tables are read: "lemma_exc" (per-POS
+    exceptions, form -> lemmas), "lemma_rules_groups" (per-POS pairs of a rule
+    name and the morphological features it requires), "lemma_rules" (the
+    rewrite rules, keyed by rule name) and "lemma_index" (per-POS lookup lists
+    used to validate candidate lemmas).
+    """
+
+    @classmethod
+    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
+        """
+        Return the names of the lookup tables for the given mode.
+
+        mode (str): The lemmatizer mode.
+
+        RETURNS (Tuple[List[str], List[str]]): For "rule" mode, the required
+            table names and an empty second list; for every other mode,
+            whatever the parent class returns.
+        """
+        if mode == "rule":
+            required = ["lemma_rules", "lemma_rules_groups", "lemma_index", "lemma_exc"]
+            return (required, [])
+        else:
+            return super().get_lookups_config(mode)
+
+    def rule_lemmatize(self, token: Token) -> List[str]:
+        """
+        Lemmatize a token with the POS- and morph-specific rules.
+
+        token (Token): The token to lemmatize.
+
+        RETURNS (List[str]): The list of lemmas.
+        """
+        cache_key = (token.orth, token.pos, str(token.morph))
+        if cache_key in self.cache:
+            return self.cache[cache_key]
+        string = token.text
+        pos = token.pos_.lower()
+        features = set(token.morph)
+        if pos in ("", "eol", "space"):
+            return [string.lower()]
+        if pos in (
+            "adp",
+            "cconj",
+            "intj",
+            "part",
+            "propn",
+            "punct",
+            "sconj",
+            "sym",
+            "x",
+        ):
+            # Not cached: the result depends on is_sent_start, which is not
+            # part of the cache key.
+            if token.is_sent_start and pos != "propn":
+                return [string.lower()]
+            else:
+                return [string]
+
+        string = string.lower()
+        exc = self.lookups.get_table("lemma_exc").get(pos, {}).get(string)
+        if exc is not None:
+            lemmas = list(exc)
+        else:
+            # Auxiliaries share the verb rules and the verb lookup list
+            rule_pos = "verb" if pos == "aux" else pos
+            lemmatize = getattr(self, "lemmatize_" + rule_pos, None)
+            if lemmatize is None:
+                # No POS-specific method: fall back to the lowercased form
+                # instead of raising AttributeError
+                lemmas = [string]
+            else:
+                rule = self.select_rule(rule_pos, features)
+                index = self.lookups.get_table("lemma_index").get(rule_pos, [])
+                lemmas = lemmatize(string, features, rule, index)
+                # Remove duplicates but preserve the ordering
+                lemmas = list(dict.fromkeys(lemmas))
+
+        self.cache[cache_key] = lemmas
+        return lemmas
+
+    def select_rule(self, pos: str, features: Iterable[str]) -> Optional[str]:
+        """
+        Select the name of the rule set for a POS and a set of features.
+
+        pos (str): The lowercased coarse-grained POS.
+        features (Iterable[str]): The morphological features as Feat=Val
+            pairs.
+
+        RETURNS (Optional[str]): The name of the first group for this POS
+            whose required features are all present, or None if there is none.
+        """
+        groups = self.lookups.get_table("lemma_rules_groups")
+        if pos in groups:
+            for group in groups[pos]:
+                # group[0] is the rule name, group[1] its required features
+                if set(group[1]).issubset(features):
+                    return group[0]
+        return None
+
+    def _lemmatize_adj_or_noun(
+        self, word: str, features: Set[str], rule: Optional[str], index: List[str]
+    ) -> List[str]:
+        """Apply the suffix and plural accent rules shared by adjectives and nouns."""
+        rules = self.lookups.get_table("lemma_rules")
+
+        # Apply lemmatization rules, keeping only candidates that differ from
+        # the word
+        possible_lemmas = []
+        for old, new in rules.get(rule, []):
+            possible_lemma = re.sub(old + "$", new, word)
+            if possible_lemma != word:
+                possible_lemmas.append(possible_lemma)
+
+        # Additional rule for plurals that go from esdrújula to grave and end in
+        # 'n' or 's', e.g., jóvenes -> joven, órdenes -> orden
+        if "Number=Plur" in features:
+            accent_rules = rules.get("accents", [])
+            additional_lemmas = [
+                re.sub(old, new, possible_lemma)
+                for possible_lemma in possible_lemmas
+                if possible_lemma.endswith(("n", "s"))
+                for old, new in accent_rules
+            ]
+            possible_lemmas.extend(additional_lemmas)
+
+        # If one or more of the created possible lemmas are in the lookup list,
+        # return all of them
+        selected_lemmas = [lemma for lemma in possible_lemmas if lemma in index]
+        if selected_lemmas:
+            return selected_lemmas
+        if possible_lemmas:
+            return possible_lemmas
+        return [word]
+
+    def _lemmatize_det_or_pron(
+        self, word: str, index: List[str], specific_rules: str
+    ) -> List[str]:
+        """Apply the fixed and general rules shared by determiners and pronouns."""
+        rules = self.lookups.get_table("lemma_rules")
+
+        # First search the POS-specific rules, then the common rules for
+        # determiners and pronouns that follow a unique pattern. If the word
+        # is listed, return the corresponding lemma.
+        for rule_name in (specific_rules, "det_and_pron_fixed"):
+            for old, new in rules.get(rule_name, []):
+                if word == old:
+                    return [new]
+
+        # Otherwise apply the general rules, keeping only candidates that
+        # differ from the word, and include the original word last.
+        possible_lemmas = []
+        for old, new in rules.get("det_and_pron_general", []):
+            possible_lemma = re.sub(old + "$", new, word)
+            if possible_lemma != word:
+                possible_lemmas.append(possible_lemma)
+        possible_lemmas.append(word)
+
+        # Prefer the candidates found in the lookup list; possible_lemmas is
+        # never empty because it always contains the word itself.
+        selected_lemmas = [lemma for lemma in possible_lemmas if lemma in index]
+        return selected_lemmas or possible_lemmas
+
+    def lemmatize_adj(
+        self, word: str, features: Set[str], rule: Optional[str], index: List[str]
+    ) -> List[str]:
+        """
+        Lemmatize an adjective.
+
+        word (str): The word to lemmatize.
+        features (Set[str]): The morphological features as Feat=Val pairs.
+        rule (Optional[str]): The name of the rule set in "lemma_rules".
+        index (List[str]): The POS-specific lookup list.
+
+        RETURNS (List[str]): The list of lemmas.
+        """
+        return self._lemmatize_adj_or_noun(word, features, rule, index)
+
+    def lemmatize_adv(
+        self, word: str, features: Set[str], rule: Optional[str], index: List[str]
+    ) -> List[str]:
+        """
+        Lemmatize an adverb.
+
+        word (str): The word to lemmatize.
+        features (Set[str]): The morphological features as Feat=Val pairs
+            (unused).
+        rule (Optional[str]): The selected rule name (unused; the "adverbs"
+            rules are always applied).
+        index (List[str]): The POS-specific lookup list (unused).
+
+        RETURNS (List[str]): The list of lemmas.
+        """
+
+        # Apply lemmatization rules
+        for old, new in self.lookups.get_table("lemma_rules").get("adverbs", []):
+            if word == old:
+                return [new]
+
+        # If none of the rules applies, return the original word
+        return [word]
+
+    def lemmatize_det(
+        self, word: str, features: Set[str], rule: Optional[str], index: List[str]
+    ) -> List[str]:
+        """
+        Lemmatize a determiner.
+
+        word (str): The word to lemmatize.
+        features (Set[str]): The morphological features as Feat=Val pairs
+            (unused).
+        rule (Optional[str]): The selected rule name (unused; the "det" and
+            "det_and_pron_*" rules are always applied).
+        index (List[str]): The POS-specific lookup list.
+
+        RETURNS (List[str]): The list of lemmas.
+        """
+        return self._lemmatize_det_or_pron(word, index, "det")
+
+    def lemmatize_noun(
+        self, word: str, features: Set[str], rule: Optional[str], index: List[str]
+    ) -> List[str]:
+        """
+        Lemmatize a noun.
+
+        word (str): The word to lemmatize.
+        features (Set[str]): The morphological features as Feat=Val pairs.
+        rule (Optional[str]): The name of the rule set in "lemma_rules".
+        index (List[str]): The POS-specific lookup list.
+
+        RETURNS (List[str]): The list of lemmas.
+        """
+        return self._lemmatize_adj_or_noun(word, features, rule, index)
+
+    def lemmatize_num(
+        self, word: str, features: Set[str], rule: Optional[str], index: List[str]
+    ) -> List[str]:
+        """
+        Lemmatize a numeral.
+
+        word (str): The word to lemmatize.
+        features (Set[str]): The morphological features as Feat=Val pairs
+            (unused).
+        rule (Optional[str]): The selected rule name (unused; the "num" rules
+            are always applied).
+        index (List[str]): The POS-specific lookup list (unused).
+
+        RETURNS (List[str]): The list of lemmas.
+        """
+
+        # If the word is in the list of rules for numerals, return the
+        # corresponding lemma
+        for old, new in self.lookups.get_table("lemma_rules").get("num", []):
+            if word == old:
+                return [new]
+
+        # Normalize punctuation: if the part before the first comma ends in a
+        # dot followed by three digits, drop all dots and turn commas into
+        # dots, e.g., 1.000,5 -> 1000.5
+        splitted_word = word.split(",")
+        if re.search(r"(\.)([0-9]{3})$", splitted_word[0]):
+            word = re.sub(r"\.", r"", word)
+            word = re.sub(r",", r".", word)
+        return [word]
+
+    def lemmatize_pron(
+        self, word: str, features: Set[str], rule: Optional[str], index: List[str]
+    ) -> List[str]:
+        """
+        Lemmatize a pronoun.
+
+        word (str): The word to lemmatize.
+        features (Set[str]): The morphological features as Feat=Val pairs
+            (unused).
+        rule (Optional[str]): The selected rule name (unused; the "pron" and
+            "det_and_pron_*" rules are always applied).
+        index (List[str]): The POS-specific lookup list.
+
+        RETURNS (List[str]): The list of lemmas.
+        """
+        return self._lemmatize_det_or_pron(word, index, "pron")
+
+    def lemmatize_verb(
+        self, word: str, features: Set[str], rule: Optional[str], index: List[str]
+    ) -> List[str]:
+        """
+        Lemmatize a verb.
+
+        word (str): The word to lemmatize.
+        features (Set[str]): The morphological features as Feat=Val pairs.
+        rule (Optional[str]): The name of the rule set in "lemma_rules".
+        index (List[str]): The POS-specific lookup list.
+
+        RETURNS (List[str]): The list of lemmas.
+        """
+        # Exceptions for verb+pronoun(s)
+        if "PronType=Prs" in features:
+            return self.lemmatize_verb_pron(word, features, rule, index)
+
+        rules = self.lookups.get_table("lemma_rules")
+
+        # Apply lemmatization rules, keeping only candidates that differ from
+        # the word
+        possible_lemmas = []
+        for old, new in rules.get(rule, []):
+            possible_lemma = re.sub(old + "$", new, word)
+            if possible_lemma != word:
+                possible_lemmas.append(possible_lemma)
+
+        selected_lemmas = [lemma for lemma in possible_lemmas if lemma in index]
+        if not selected_lemmas:
+            # If none of the possible lemmas are in the lookup list,
+            # apply vocalic alternation rules and search in the lookup list
+            # again
+            for lemma in possible_lemmas:
+                for old, new in rules.get("voc_alt_1", []):
+                    if old in lemma:
+                        # Try the replacement at each position separately
+                        for i, char in enumerate(lemma):
+                            if char == old:
+                                voc_alt_lemma = lemma[:i] + new + lemma[i + 1 :]
+                                if voc_alt_lemma in index:
+                                    selected_lemmas.append(voc_alt_lemma)
+                for old, new in rules.get("voc_alt_2", []):
+                    if old in lemma:
+                        # Only the first occurrence is replaced
+                        voc_alt_lemma = lemma.replace(old, new, 1)
+                        if voc_alt_lemma in index:
+                            selected_lemmas.append(voc_alt_lemma)
+
+        # Additional rule for verbs that lose the accent mark when lemmatized,
+        # e.g., amplían -> ampliar. These candidates are added after the lookup
+        # above, so they are only returned when nothing was found in the
+        # lookup list.
+        additional_lemmas = []
+        for possible_lemma in possible_lemmas:
+            for old, new in rules.get("accents", []):
+                additional_lemmas.append(re.sub(old, new, possible_lemma))
+        possible_lemmas.extend(additional_lemmas)
+
+        # If one or more of the created possible lemmas are in the lookup list,
+        # return all of them
+        if selected_lemmas:
+            return selected_lemmas
+        if possible_lemmas:
+            return possible_lemmas
+        return [word]
+
+    def lemmatize_verb_pron(
+        self, word: str, features: Set[str], rule: Optional[str], index: List[str]
+    ) -> List[str]:
+        """
+        Lemmatize a merged verb+pronoun(s) token.
+
+        word (str): The word to lemmatize.
+        features (Set[str]): The morphological features as Feat=Val pairs.
+        rule (Optional[str]): Ignored; the verb rule is selected again here.
+        index (List[str]): The lookup list for verbs.
+
+        RETURNS (List[str]): A single-item list with the verb lemma followed
+            by the pronoun lemmas, separated by single spaces.
+        """
+        # Strip and collect pronouns in their original order, e.g., dámelo ->
+        # dá + [me, lo]. Stop before the whole word would be consumed so that
+        # the verb part is never empty.
+        pron_patt = "^(.*?)([mts]e|l[aeo]s?|n?os)$"
+        prons = []
+        verb = word
+        m = re.search(pron_patt, verb)
+        while m is not None and m.group(1) and len(prons) <= 3:
+            verb = m.group(1)
+            prons.insert(0, m.group(2))
+            m = re.search(pron_patt, verb)
+        # Strip accents from verb form
+        for old, new in self.lookups.get_table("lemma_rules").get("accents", []):
+            verb = re.sub(old, new, verb)
+        # Lemmatize the verb and pronouns, checking for exceptions
+        exc = self.lookups.get_table("lemma_exc").get("verb", {}).get(verb)
+        if exc is not None:
+            verb_lemma = exc[0]
+        else:
+            rule = self.select_rule("verb", features)
+            # Drop PronType=Prs so that lemmatize_verb does not call back into
+            # this method; set() also accepts features passed as a list
+            verb_lemma = self.lemmatize_verb(
+                verb, set(features) - {"PronType=Prs"}, rule, index
+            )[0]
+        # Validate pronoun candidates against the pronoun lookup list rather
+        # than the verb list received in `index`
+        pron_index = self.lookups.get_table("lemma_index").get("pron", [])
+        pron_lemmas = []
+        for pron in prons:
+            exc = self.lookups.get_table("lemma_exc").get("pron", {}).get(pron)
+            if exc is not None:
+                pron_lemmas.append(exc[0])
+            else:
+                rule = self.select_rule("pron", features)
+                pron_lemmas.append(
+                    self.lemmatize_pron(pron, features, rule, pron_index)[0]
+                )
+        # Joining the parts avoids a trailing space when no pronoun was stripped
+        return [" ".join([verb_lemma] + pron_lemmas)]
diff --git a/spacy/tests/lang/test_lemmatizers.py b/spacy/tests/lang/test_lemmatizers.py
index
a49d70d6b..e755da22d 100644 --- a/spacy/tests/lang/test_lemmatizers.py +++ b/spacy/tests/lang/test_lemmatizers.py @@ -7,7 +7,7 @@ from spacy.util import get_lang_class # fmt: off # Only include languages with no external dependencies # excluded: ru, uk -# excluded for custom tables: pl +# excluded for custom tables: es, pl LANGUAGES = ["bn", "el", "en", "fa", "fr", "nb", "nl", "sv"] # fmt: on