# Patch summary (commentary placed before the first "diff --git" header).
#
# spacy/en/pos.pyx (inside cdef class EnPosTagger):
#   The removed line passed the UTF-8 encoded lemma to self.strings.intern()
#   together with len(lemma_string), i.e. the length of the *unicode* string.
#   For lemmas containing non-ASCII characters the encoded byte string is
#   longer than the unicode string, so the two arguments disagreed.
#   The added lines encode once into raw_string and pass len(raw_string),
#   so the length always matches the bytes actually handed over.
#   NOTE(review): presumably the old length caused the interned lemma to be
#   cut short -- intern() is not visible in this patch, confirm there.
#
# tests/test_unicode_lemmas.py (new file):
#   Regression test: runs English() over u'ćode codé' and asserts that
#   lemma_ of each of the two tokens equals the original non-ASCII word.
#
# NOTE(review): line breaks and indentation below were reconstructed from a
# whitespace-collapsed copy of the patch; line counts match the @@ headers,
# but verify the exact indentation against the repository before applying.
diff --git a/spacy/en/pos.pyx b/spacy/en/pos.pyx
index 1094d5c09..a38745165 100644
--- a/spacy/en/pos.pyx
+++ b/spacy/en/pos.pyx
@@ -335,7 +335,8 @@ cdef class EnPosTagger:
         cdef unicode lemma_string
         lemma_strings = self.lemmatizer(py_string, pos)
         lemma_string = sorted(lemma_strings)[0]
-        lemma = self.strings.intern(lemma_string.encode('utf8'), len(lemma_string)).i
+        raw_string = lemma_string.encode('utf8')
+        lemma = self.strings.intern(raw_string, len(raw_string)).i
         return lemma
 
     def load_morph_exceptions(self, dict exc):
diff --git a/tests/test_unicode_lemmas.py b/tests/test_unicode_lemmas.py
new file mode 100644
index 000000000..42e5866d6
--- /dev/null
+++ b/tests/test_unicode_lemmas.py
@@ -0,0 +1,15 @@
+# encoding=utf8
+from __future__ import unicode_literals
+
+from spacy.en import English
+import pytest
+
+
+@pytest.fixture
+def tokens():
+    return English()(u'ćode codé')
+
+
+def test_unicode(tokens):
+    assert tokens[0].lemma_ == u'ćode'
+    assert tokens[1].lemma_ == u'codé'