From ec25976416131c50f235d2e9ba63c8180252281a Mon Sep 17 00:00:00 2001
From: Someon
Date: Wed, 11 Mar 2015 01:23:06 +0200
Subject: [PATCH 1/2] Test lemma_ with unicode input

---
Review notes (commentary after the "---" marker, not part of the commit message):
- Adds tests/test_unicode_lemmas.py, a regression test for lemma_ on
  non-ASCII input.
- The `tokens` fixture runs English() on u'ćode codé'; each of the two
  words contains exactly one non-ASCII character.
- test_unicode asserts that tokens[0].lemma_ == u'ćode' and
  tokens[1].lemma_ == u'codé', i.e. the lemma text equals the input text.
- NOTE(review): presumably this test fails before PATCH 2/2 is applied --
  confirm by running it against the parent commit.

 tests/test_unicode_lemmas.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 tests/test_unicode_lemmas.py

diff --git a/tests/test_unicode_lemmas.py b/tests/test_unicode_lemmas.py
new file mode 100644
index 000000000..42e5866d6
--- /dev/null
+++ b/tests/test_unicode_lemmas.py
@@ -0,0 +1,15 @@
+# encoding=utf8
+from __future__ import unicode_literals
+
+from spacy.en import English
+import pytest
+
+
+@pytest.fixture
+def tokens():
+    return English()(u'ćode codé')
+
+
+def test_unicode(tokens):
+    assert tokens[0].lemma_ == u'ćode'
+    assert tokens[1].lemma_ == u'codé'

From 5b8e8b4f7220abe80eb0b63fb6a145477f2832be Mon Sep 17 00:00:00 2001
From: Someon
Date: Wed, 11 Mar 2015 01:26:59 +0200
Subject: [PATCH 2/2] Fix lemma_ unicode handling

---
Review notes (commentary after the "---" marker, not part of the commit message):
- Before: self.strings.intern() received lemma_string.encode('utf8') as
  the data but len(lemma_string) -- the length of the unicode string -- as
  the length.
- After: the encoded value is stored once in raw_string, and both the data
  and the length passed to intern() come from raw_string.
- The two lengths differ whenever the lemma contains non-ASCII characters,
  because such characters take more than one byte in UTF-8.
- NOTE(review): presumably intern() uses the length argument to bound how
  many bytes it reads -- confirm against the intern() implementation.
- NOTE(review): the leading whitespace of the context lines below must
  match spacy/en/pos.pyx exactly for the hunk to apply -- verify.

 spacy/en/pos.pyx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/en/pos.pyx b/spacy/en/pos.pyx
index 1094d5c09..a38745165 100644
--- a/spacy/en/pos.pyx
+++ b/spacy/en/pos.pyx
@@ -335,7 +335,8 @@ cdef class EnPosTagger:
         cdef unicode lemma_string
         lemma_strings = self.lemmatizer(py_string, pos)
         lemma_string = sorted(lemma_strings)[0]
-        lemma = self.strings.intern(lemma_string.encode('utf8'), len(lemma_string)).i
+        raw_string = lemma_string.encode('utf8')
+        lemma = self.strings.intern(raw_string, len(raw_string)).i
         return lemma
 
     def load_morph_exceptions(self, dict exc):