mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Fix inconsistent lemmas (#9405)
* Add util function to unique lists and preserve order

* Use unique function instead of list(set())

  list(set()) has the issue that it's not consistent between runs of the
  Python interpreter, so order can vary.

  list(set()) calls were left in a few places where they were behind calls
  to sorted(). I think in this case the calls to list() can be removed,
  but this commit doesn't do that.

* Use the existing pattern for this
This commit is contained in:
parent
fd7edbc645
commit
fd759a881b
|
@@ -76,6 +76,6 @@ class CatalanLemmatizer(Lemmatizer):
|
||||||
forms.append(self.lookup_lemmatize(token)[0])
|
forms.append(self.lookup_lemmatize(token)[0])
|
||||||
if not forms:
|
if not forms:
|
||||||
forms.append(string)
|
forms.append(string)
|
||||||
forms = list(set(forms))
|
forms = list(dict.fromkeys(forms))
|
||||||
self.cache[cache_key] = forms
|
self.cache[cache_key] = forms
|
||||||
return forms
|
return forms
|
||||||
|
|
|
@@ -75,6 +75,6 @@ class FrenchLemmatizer(Lemmatizer):
|
||||||
forms.append(self.lookup_lemmatize(token)[0])
|
forms.append(self.lookup_lemmatize(token)[0])
|
||||||
if not forms:
|
if not forms:
|
||||||
forms.append(string)
|
forms.append(string)
|
||||||
forms = list(set(forms))
|
forms = list(dict.fromkeys(forms))
|
||||||
self.cache[cache_key] = forms
|
self.cache[cache_key] = forms
|
||||||
return forms
|
return forms
|
||||||
|
|
|
@@ -97,7 +97,7 @@ class DutchLemmatizer(Lemmatizer):
|
||||||
return forms
|
return forms
|
||||||
else:
|
else:
|
||||||
oov_forms.append(form)
|
oov_forms.append(form)
|
||||||
forms = list(set(oov_forms))
|
forms = list(dict.fromkeys(oov_forms))
|
||||||
# Back-off through remaining return value candidates.
|
# Back-off through remaining return value candidates.
|
||||||
if forms:
|
if forms:
|
||||||
for form in forms:
|
for form in forms:
|
||||||
|
|
|
@@ -56,7 +56,7 @@ class RussianLemmatizer(Lemmatizer):
|
||||||
if not len(filtered_analyses):
|
if not len(filtered_analyses):
|
||||||
return [string.lower()]
|
return [string.lower()]
|
||||||
if morphology is None or (len(morphology) == 1 and POS in morphology):
|
if morphology is None or (len(morphology) == 1 and POS in morphology):
|
||||||
return list(set([analysis.normal_form for analysis in filtered_analyses]))
|
return list(dict.fromkeys([analysis.normal_form for analysis in filtered_analyses]))
|
||||||
if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
|
if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
|
||||||
features_to_compare = ["Case", "Number", "Gender"]
|
features_to_compare = ["Case", "Number", "Gender"]
|
||||||
elif univ_pos == "NUM":
|
elif univ_pos == "NUM":
|
||||||
|
@@ -87,7 +87,7 @@ class RussianLemmatizer(Lemmatizer):
|
||||||
filtered_analyses.append(analysis)
|
filtered_analyses.append(analysis)
|
||||||
if not len(filtered_analyses):
|
if not len(filtered_analyses):
|
||||||
return [string.lower()]
|
return [string.lower()]
|
||||||
return list(set([analysis.normal_form for analysis in filtered_analyses]))
|
return list(dict.fromkeys([analysis.normal_form for analysis in filtered_analyses]))
|
||||||
|
|
||||||
def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]:
|
def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]:
|
||||||
string = token.text
|
string = token.text
|
||||||
|
|
|
@@ -1403,7 +1403,7 @@ def get_arg_names(func: Callable) -> List[str]:
|
||||||
RETURNS (List[str]): The argument names.
|
RETURNS (List[str]): The argument names.
|
||||||
"""
|
"""
|
||||||
argspec = inspect.getfullargspec(func)
|
argspec = inspect.getfullargspec(func)
|
||||||
return list(set([*argspec.args, *argspec.kwonlyargs]))
|
return list(dict.fromkeys([*argspec.args, *argspec.kwonlyargs]))
|
||||||
|
|
||||||
|
|
||||||
def combine_score_weights(
|
def combine_score_weights(
|
||||||
|
|
Loading…
Reference in New Issue
Block a user