From 64b22be76e5e9e6fa4e375e395fbd732cf6c2958 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 4 Sep 2024 14:22:13 +0200
Subject: [PATCH] Fix apparent bug in Spanish lemmatizer. Not sure why this emerges in v4 not in v3

---
 spacy/lang/es/lemmatizer.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/spacy/lang/es/lemmatizer.py b/spacy/lang/es/lemmatizer.py
index ee5d38e84..8989000e0 100644
--- a/spacy/lang/es/lemmatizer.py
+++ b/spacy/lang/es/lemmatizer.py
@@ -53,12 +53,15 @@ class SpanishLemmatizer(Lemmatizer):
         else:
             rule_pos = pos
         rule = self.select_rule(rule_pos, list(features))
-        index = self.lookups.get_table("lemma_index").get(rule_pos, [])
-        lemmas = getattr(self, "lemmatize_" + rule_pos)(
-            string, features, rule, index
-        )
-        # Remove duplicates but preserve the ordering
-        lemmas = list(dict.fromkeys(lemmas))
+        if rule is None:
+            lemmas = [string]
+        else:
+            index = self.lookups.get_table("lemma_index").get(rule_pos, [])
+            lemmas = getattr(self, "lemmatize_" + rule_pos)(
+                string, features, rule, index
+            )
+            # Remove duplicates but preserve the ordering
+            lemmas = list(dict.fromkeys(lemmas))
 
         self.cache[cache_key] = lemmas
         return lemmas
@@ -203,6 +206,7 @@ class SpanishLemmatizer(Lemmatizer):
         word (str): The word to lemmatize.
         features (List[str]): The morphological features as a list of Feat=Val
             pairs.
+        rule (str): The rule ID to use
         index (List[str]): The POS-specific lookup list.
 
         RETURNS (List[str]): The list of lemmas.