From 8492d5be6dd7b9d10bccd97c70fd157a3122abd7 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 24 Oct 2017 16:00:54 +0200 Subject: [PATCH] Always make lemmatizer return a list of lemmas, not a set --- spacy/lemmatizer.py | 6 +++--- spacy/morphology.pyx | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 1f401f63c..f3327a1d7 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -26,10 +26,10 @@ class Lemmatizer(object): elif univ_pos in (PUNCT, 'PUNCT', 'punct'): univ_pos = 'punct' else: - return set([string.lower()]) + return list(set([string.lower()])) # See Issue #435 for example of where this logic is requied. if self.is_base_form(univ_pos, morphology): - return set([string.lower()]) + return list(set([string.lower()])) lemmas = lemmatize(string, self.index.get(univ_pos, {}), self.exc.get(univ_pos, {}), self.rules.get(univ_pos, [])) @@ -108,4 +108,4 @@ def lemmatize(string, index, exceptions, rules): forms.extend(oov_forms) if not forms: forms.append(string) - return set(forms) + return list(set(forms)) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 7845ab4e7..090a07fe8 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -172,7 +172,7 @@ cdef class Morphology: cdef unicode py_string = self.strings[orth] if self.lemmatizer is None: return self.strings.add(py_string.lower()) - cdef set lemma_strings + cdef list lemma_strings cdef unicode lemma_string lemma_strings = self.lemmatizer(py_string, univ_pos, morphology) lemma_string = sorted(lemma_strings)[0]