This commit is contained in:
rsomeon 2015-03-25 15:00:49 +00:00
commit ccbb88951b
2 changed files with 17 additions and 1 deletion

View File

@@ -335,7 +335,8 @@ cdef class EnPosTagger:
cdef unicode lemma_string cdef unicode lemma_string
lemma_strings = self.lemmatizer(py_string, pos) lemma_strings = self.lemmatizer(py_string, pos)
lemma_string = sorted(lemma_strings)[0] lemma_string = sorted(lemma_strings)[0]
lemma = self.strings.intern(lemma_string.encode('utf8'), len(lemma_string)).i raw_string = lemma_string.encode('utf8')
lemma = self.strings.intern(raw_string, len(raw_string)).i
return lemma return lemma
def load_morph_exceptions(self, dict exc): def load_morph_exceptions(self, dict exc):

View File

@@ -0,0 +1,15 @@
# encoding=utf8
from __future__ import unicode_literals
from spacy.en import English
import pytest
@pytest.fixture
def tokens():
    """Return the result of running ``English`` on a two-word accented string.

    A fresh ``English`` instance is constructed for each use of the fixture
    and applied to a string whose two words each contain one non-ASCII
    character.
    """
    nlp = English()
    return nlp(u'ćode codé')
def test_unicode(tokens):
    """Check that the first two tokens keep their accented text as lemma.

    ``tokens`` is supplied by the fixture of the same name; items 0 and 1
    must expose a ``lemma_`` equal to the corresponding input word.
    """
    # NOTE(review): presumably a regression check for lemmas containing
    # multi-byte UTF-8 characters -- confirm against the accompanying fix.
    expected_lemmas = (u'ćode', u'codé')
    for position, expected in enumerate(expected_lemmas):
        assert tokens[position].lemma_ == expected