Fix apparent bug in Spanish lemmatizer. Not sure why this emerges in v4 but not in v3.

This commit is contained in:
Matthew Honnibal 2024-09-04 14:22:13 +02:00
parent 4eec3bfad1
commit 64b22be76e

View File

@@ -53,12 +53,15 @@ class SpanishLemmatizer(Lemmatizer):
else:
rule_pos = pos
rule = self.select_rule(rule_pos, list(features))
index = self.lookups.get_table("lemma_index").get(rule_pos, [])
lemmas = getattr(self, "lemmatize_" + rule_pos)(
string, features, rule, index
)
# Remove duplicates but preserve the ordering
lemmas = list(dict.fromkeys(lemmas))
if rule is None:
lemmas = [string]
else:
index = self.lookups.get_table("lemma_index").get(rule_pos, [])
lemmas = getattr(self, "lemmatize_" + rule_pos)(
string, features, rule, index
)
# Remove duplicates but preserve the ordering
lemmas = list(dict.fromkeys(lemmas))
self.cache[cache_key] = lemmas
return lemmas
@@ -203,6 +206,7 @@ class SpanishLemmatizer(Lemmatizer):
word (str): The word to lemmatize.
features (List[str]): The morphological features as a list of Feat=Val
pairs.
rule (str): The rule ID to use
index (List[str]): The POS-specific lookup list.
RETURNS (List[str]): The list of lemmas.