Fix inconsistent lemmas (#9405)

* Add util function to deduplicate lists while preserving order

* Use unique function instead of list(set())

list(set()) has the issue that the order of its elements is not
consistent between runs of the Python interpreter, so the order of the
resulting list can vary from run to run.

list(set()) calls were left in a few places where they were wrapped in
calls to sorted(). I think in these cases the calls to list() can be
removed, since sorted() accepts any iterable and returns a list, but
this commit doesn't do that.

* Use the existing pattern for this, list(dict.fromkeys(...))
This commit is contained in:
Paul O'Leary McCann 2021-10-11 09:38:45 +00:00 committed by GitHub
parent fd7edbc645
commit fd759a881b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 6 additions and 6 deletions

View File

@ -76,6 +76,6 @@ class CatalanLemmatizer(Lemmatizer):
forms.append(self.lookup_lemmatize(token)[0]) forms.append(self.lookup_lemmatize(token)[0])
if not forms: if not forms:
forms.append(string) forms.append(string)
forms = list(set(forms)) forms = list(dict.fromkeys(forms))
self.cache[cache_key] = forms self.cache[cache_key] = forms
return forms return forms

View File

@ -75,6 +75,6 @@ class FrenchLemmatizer(Lemmatizer):
forms.append(self.lookup_lemmatize(token)[0]) forms.append(self.lookup_lemmatize(token)[0])
if not forms: if not forms:
forms.append(string) forms.append(string)
forms = list(set(forms)) forms = list(dict.fromkeys(forms))
self.cache[cache_key] = forms self.cache[cache_key] = forms
return forms return forms

View File

@ -97,7 +97,7 @@ class DutchLemmatizer(Lemmatizer):
return forms return forms
else: else:
oov_forms.append(form) oov_forms.append(form)
forms = list(set(oov_forms)) forms = list(dict.fromkeys(oov_forms))
# Back-off through remaining return value candidates. # Back-off through remaining return value candidates.
if forms: if forms:
for form in forms: for form in forms:

View File

@ -56,7 +56,7 @@ class RussianLemmatizer(Lemmatizer):
if not len(filtered_analyses): if not len(filtered_analyses):
return [string.lower()] return [string.lower()]
if morphology is None or (len(morphology) == 1 and POS in morphology): if morphology is None or (len(morphology) == 1 and POS in morphology):
return list(set([analysis.normal_form for analysis in filtered_analyses])) return list(dict.fromkeys([analysis.normal_form for analysis in filtered_analyses]))
if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"): if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
features_to_compare = ["Case", "Number", "Gender"] features_to_compare = ["Case", "Number", "Gender"]
elif univ_pos == "NUM": elif univ_pos == "NUM":
@ -87,7 +87,7 @@ class RussianLemmatizer(Lemmatizer):
filtered_analyses.append(analysis) filtered_analyses.append(analysis)
if not len(filtered_analyses): if not len(filtered_analyses):
return [string.lower()] return [string.lower()]
return list(set([analysis.normal_form for analysis in filtered_analyses])) return list(dict.fromkeys([analysis.normal_form for analysis in filtered_analyses]))
def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]: def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]:
string = token.text string = token.text

View File

@ -1403,7 +1403,7 @@ def get_arg_names(func: Callable) -> List[str]:
RETURNS (List[str]): The argument names. RETURNS (List[str]): The argument names.
""" """
argspec = inspect.getfullargspec(func) argspec = inspect.getfullargspec(func)
return list(set([*argspec.args, *argspec.kwonlyargs])) return list(dict.fromkeys([*argspec.args, *argspec.kwonlyargs]))
def combine_score_weights( def combine_score_weights(