Fix inconsistent lemmas (#9405)

* Add util function to unique lists and preserve order

* Use unique function instead of list(set())

  list(set()) has the issue that it's not consistent between runs of the
  Python interpreter, so order can vary.

  list(set()) calls were left in a few places where they were behind calls
  to sorted(). I think in this case the calls to list() can be removed, but
  this commit doesn't do that.

* Use the existing pattern for this
2025-08-01 19:00:20 +03:00 · 2021-10-11 09:38:45 +00:00 · 2021-10-11 09:38:45 +00:00 · fd759a881b
commit fd759a881b
parent fd7edbc645
5 changed files with 6 additions and 6 deletions
--- a/spacy/lang/ca/lemmatizer.py
+++ b/spacy/lang/ca/lemmatizer.py
@@ -76,6 +76,6 @@ class CatalanLemmatizer(Lemmatizer):
            forms.append(self.lookup_lemmatize(token)[0])
        if not forms:
            forms.append(string)
-        forms = list(set(forms))
+        forms = list(dict.fromkeys(forms))
        self.cache[cache_key] = forms
        return forms
--- a/spacy/lang/fr/lemmatizer.py
+++ b/spacy/lang/fr/lemmatizer.py
@@ -75,6 +75,6 @@ class FrenchLemmatizer(Lemmatizer):
            forms.append(self.lookup_lemmatize(token)[0])
        if not forms:
            forms.append(string)
-        forms = list(set(forms))
+        forms = list(dict.fromkeys(forms))
        self.cache[cache_key] = forms
        return forms
--- a/spacy/lang/nl/lemmatizer.py
+++ b/spacy/lang/nl/lemmatizer.py
@@ -97,7 +97,7 @@ class DutchLemmatizer(Lemmatizer):
                    return forms
                else:
                    oov_forms.append(form)
-        forms = list(set(oov_forms))
+        forms = list(dict.fromkeys(oov_forms))
        # Back-off through remaining return value candidates.
        if forms:
            for form in forms:
--- a/spacy/lang/ru/lemmatizer.py
+++ b/spacy/lang/ru/lemmatizer.py
@@ -56,7 +56,7 @@ class RussianLemmatizer(Lemmatizer):
        if not len(filtered_analyses):
            return [string.lower()]
        if morphology is None or (len(morphology) == 1 and POS in morphology):
-            return list(set([analysis.normal_form for analysis in filtered_analyses]))
+            return list(dict.fromkeys([analysis.normal_form for analysis in filtered_analyses]))
        if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
            features_to_compare = ["Case", "Number", "Gender"]
        elif univ_pos == "NUM":
@@ -87,7 +87,7 @@ class RussianLemmatizer(Lemmatizer):
                filtered_analyses.append(analysis)
        if not len(filtered_analyses):
            return [string.lower()]
-        return list(set([analysis.normal_form for analysis in filtered_analyses]))
+        return list(dict.fromkeys([analysis.normal_form for analysis in filtered_analyses]))

    def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]:
        string = token.text
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1403,7 +1403,7 @@ def get_arg_names(func: Callable) -> List[str]:
    RETURNS (List[str]): The argument names.
    """
    argspec = inspect.getfullargspec(func)
-    return list(set([*argspec.args, *argspec.kwonlyargs]))
+    return list(dict.fromkeys([*argspec.args, *argspec.kwonlyargs]))


 def combine_score_weights(