From fd759a881b02a7bc3488b1d9c005d5849cfc05f9 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 11 Oct 2021 09:38:45 +0000 Subject: [PATCH] Fix inconsistent lemmas (#9405) * Add util function to unique lists and preserve order * Use unique function instead of list(set()) list(set()) has the issue that it's not consistent between runs of the Python interpreter, so order can vary. list(set()) calls were left in a few places where they were behind calls to sorted(). I think in this case the calls to list() can be removed, but this commit doesn't do that. * Use the existing pattern for this --- spacy/lang/ca/lemmatizer.py | 2 +- spacy/lang/fr/lemmatizer.py | 2 +- spacy/lang/nl/lemmatizer.py | 2 +- spacy/lang/ru/lemmatizer.py | 4 ++-- spacy/util.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/spacy/lang/ca/lemmatizer.py b/spacy/lang/ca/lemmatizer.py index 2518eb720..2fd012912 100644 --- a/spacy/lang/ca/lemmatizer.py +++ b/spacy/lang/ca/lemmatizer.py @@ -76,6 +76,6 @@ class CatalanLemmatizer(Lemmatizer): forms.append(self.lookup_lemmatize(token)[0]) if not forms: forms.append(string) - forms = list(set(forms)) + forms = list(dict.fromkeys(forms)) self.cache[cache_key] = forms return forms diff --git a/spacy/lang/fr/lemmatizer.py b/spacy/lang/fr/lemmatizer.py index bb5a270ab..c6422cf96 100644 --- a/spacy/lang/fr/lemmatizer.py +++ b/spacy/lang/fr/lemmatizer.py @@ -75,6 +75,6 @@ class FrenchLemmatizer(Lemmatizer): forms.append(self.lookup_lemmatize(token)[0]) if not forms: forms.append(string) - forms = list(set(forms)) + forms = list(dict.fromkeys(forms)) self.cache[cache_key] = forms return forms diff --git a/spacy/lang/nl/lemmatizer.py b/spacy/lang/nl/lemmatizer.py index 6c025dcf6..4f6b2ef30 100644 --- a/spacy/lang/nl/lemmatizer.py +++ b/spacy/lang/nl/lemmatizer.py @@ -97,7 +97,7 @@ class DutchLemmatizer(Lemmatizer): return forms else: oov_forms.append(form) - forms = list(set(oov_forms)) + forms = list(dict.fromkeys(oov_forms)) # Back-off through remaining return value candidates. if forms: for form in forms: diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py index 399cd174c..92bec4c8c 100644 --- a/spacy/lang/ru/lemmatizer.py +++ b/spacy/lang/ru/lemmatizer.py @@ -56,7 +56,7 @@ class RussianLemmatizer(Lemmatizer): if not len(filtered_analyses): return [string.lower()] if morphology is None or (len(morphology) == 1 and POS in morphology): - return list(set([analysis.normal_form for analysis in filtered_analyses])) + return list(dict.fromkeys([analysis.normal_form for analysis in filtered_analyses])) if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"): features_to_compare = ["Case", "Number", "Gender"] elif univ_pos == "NUM": @@ -87,7 +87,7 @@ class RussianLemmatizer(Lemmatizer): filtered_analyses.append(analysis) if not len(filtered_analyses): return [string.lower()] - return list(set([analysis.normal_form for analysis in filtered_analyses])) + return list(dict.fromkeys([analysis.normal_form for analysis in filtered_analyses])) def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]: string = token.text diff --git a/spacy/util.py b/spacy/util.py index b49bd096f..0aa7c4c17 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1403,7 +1403,7 @@ def get_arg_names(func: Callable) -> List[str]: RETURNS (List[str]): The argument names. """ argspec = inspect.getfullargspec(func) - return list(set([*argspec.args, *argspec.kwonlyargs])) + return list(dict.fromkeys([*argspec.args, *argspec.kwonlyargs])) def combine_score_weights(