Always make lemmatizer return a list of lemmas, not a set

This commit is contained in:
ines 2017-10-24 16:00:54 +02:00
parent 95f866f99f
commit 8492d5be6d
2 changed files with 4 additions and 4 deletions

View File

@@ -26,10 +26,10 @@ class Lemmatizer(object):
elif univ_pos in (PUNCT, 'PUNCT', 'punct'): elif univ_pos in (PUNCT, 'PUNCT', 'punct'):
univ_pos = 'punct' univ_pos = 'punct'
else: else:
return set([string.lower()]) return list(set([string.lower()]))
# See Issue #435 for example of where this logic is required. # See Issue #435 for example of where this logic is required.
if self.is_base_form(univ_pos, morphology): if self.is_base_form(univ_pos, morphology):
return set([string.lower()]) return list(set([string.lower()]))
lemmas = lemmatize(string, self.index.get(univ_pos, {}), lemmas = lemmatize(string, self.index.get(univ_pos, {}),
self.exc.get(univ_pos, {}), self.exc.get(univ_pos, {}),
self.rules.get(univ_pos, [])) self.rules.get(univ_pos, []))
@@ -108,4 +108,4 @@ def lemmatize(string, index, exceptions, rules):
forms.extend(oov_forms) forms.extend(oov_forms)
if not forms: if not forms:
forms.append(string) forms.append(string)
return set(forms) return list(set(forms))

View File

@@ -172,7 +172,7 @@ cdef class Morphology:
cdef unicode py_string = self.strings[orth] cdef unicode py_string = self.strings[orth]
if self.lemmatizer is None: if self.lemmatizer is None:
return self.strings.add(py_string.lower()) return self.strings.add(py_string.lower())
cdef set lemma_strings cdef list lemma_strings
cdef unicode lemma_string cdef unicode lemma_string
lemma_strings = self.lemmatizer(py_string, univ_pos, morphology) lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
lemma_string = sorted(lemma_strings)[0] lemma_string = sorted(lemma_strings)[0]