Add Japanese lemmas (#2543)

This information was already available from MeCab; it just hadn't been added before.
Paul O'Leary McCann 2018-07-13 17:55:14 +09:00 committed by Ines Montani
parent 6042723535
commit 1987f3f784
3 changed files with 25 additions and 1 deletion


@@ -82,6 +82,7 @@ class JapaneseTokenizer(object):
         for token, dtoken in zip(doc, dtokens):
             token._.mecab_tag = dtoken.pos
             token.tag_ = resolve_pos(dtoken)
+            token.lemma_ = dtoken.lemma
         return doc

     # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
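For context, here is a minimal standalone sketch (not the spaCy source) of where the lemma comes from. It assumes an IPAdic-style dictionary, where MeCab's node.feature is a comma-separated string whose seventh field holds the base (dictionary) form; the mecab_lemmas helper and the field index are assumptions about that layout, not spaCy's own helper.

    # Sketch: read (surface, lemma) pairs directly from MeCab output.
    import MeCab

    def mecab_lemmas(text):
        tagger = MeCab.Tagger()
        tagger.parse('')  # workaround for a surface-decoding quirk in some bindings
        node = tagger.parseToNode(text)
        pairs = []
        while node:
            if node.surface:  # skip BOS/EOS nodes, which have an empty surface
                fields = node.feature.split(',')
                # field 6 is the base form in IPAdic-style output; '*' means none available
                lemma = fields[6] if len(fields) > 6 and fields[6] != '*' else node.surface
                pairs.append((node.surface, lemma))
            node = node.next
        return pairs

    print(mecab_lemmas('いただきました'))  # prints a (surface, lemma) pair per token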


@@ -49,6 +49,11 @@ def RU(request):
     pymorphy = pytest.importorskip('pymorphy2')
     return util.get_lang_class('ru')()

+@pytest.fixture()
+def JA(request):
+    mecab = pytest.importorskip("MeCab")
+    return util.get_lang_class('ja')()
+
 #@pytest.fixture(params=_languages)
 #def tokenizer(request):
@@ -142,7 +147,7 @@ def da_tokenizer():
 @pytest.fixture
 def ja_tokenizer():
-    janome = pytest.importorskip("MeCab")
+    mecab = pytest.importorskip("MeCab")
     return util.get_lang_class('ja').Defaults.create_tokenizer()

 @pytest.fixture


@@ -0,0 +1,18 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+LEMMAS = (
+    ('新しく', '新しい'),
+    ('赤く', '赤い'),
+    ('すごく', '凄い'),
+    ('いただきました', '頂く'),
+    ('なった', '成る'))
+
+
+@pytest.mark.parametrize('word,lemma', LEMMAS)
+def test_japanese_lemmas(JA, word, lemma):
+    test_lemma = JA(word)[0].lemma_
+    assert test_lemma == lemma
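These tests are skipped automatically when MeCab is not installed, because the JA fixture uses pytest.importorskip. For a quick manual check of the new behaviour, a sketch (assuming spaCy with its Japanese language data plus MeCab and its Python bindings are installed) could look like this:

    # Sketch: inspect the lemmas the Japanese tokenizer now sets.
    import spacy

    nlp = spacy.blank('ja')
    doc = nlp('いただきました')
    print([(t.text, t.lemma_) for t in doc])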