mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Add Japanese lemmas (#2543)
This information was already available from MeCab; it simply had not been added before.
This commit is contained in:
parent
6042723535
commit
1987f3f784
|
@ -82,6 +82,7 @@ class JapaneseTokenizer(object):
|
|||
for token, dtoken in zip(doc, dtokens):
|
||||
token._.mecab_tag = dtoken.pos
|
||||
token.tag_ = resolve_pos(dtoken)
|
||||
token.lemma_ = dtoken.lemma
|
||||
return doc
|
||||
|
||||
# add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
|
||||
|
|
|
@ -49,6 +49,11 @@ def RU(request):
|
|||
pymorphy = pytest.importorskip('pymorphy2')
|
||||
return util.get_lang_class('ru')()
|
||||
|
||||
@pytest.fixture()
def JA(request):
    """Build the Japanese language object, skipping the test without MeCab.

    ``request`` mirrors the signature of the neighbouring ``RU`` fixture; it
    is not read here.
    """
    # importorskip is called purely for its skip-on-ImportError side effect;
    # the returned module object is not needed, so it is not bound to a name.
    pytest.importorskip("MeCab")
    return util.get_lang_class('ja')()
|
||||
|
||||
|
||||
#@pytest.fixture(params=_languages)
|
||||
#def tokenizer(request):
|
||||
|
@ -142,7 +147,7 @@ def da_tokenizer():
|
|||
|
||||
@pytest.fixture
def ja_tokenizer():
    """Create the default Japanese tokenizer, skipping when MeCab is absent."""
    # A single importorskip call is sufficient: it skips the requesting test
    # if MeCab cannot be imported. The result is intentionally not bound to a
    # name (the earlier ``janome`` binding was both unused and misleading).
    pytest.importorskip("MeCab")
    return util.get_lang_class('ja').Defaults.create_tokenizer()
|
||||
|
||||
@pytest.fixture
|
||||
|
|
18
spacy/tests/lang/ja/test_lemma.py
Normal file
18
spacy/tests/lang/ja/test_lemma.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
# (surface form, expected lemma) pairs fed to the parametrized lemma test.
LEMMAS = (
    ('新しく', '新しい'),
    ('赤く', '赤い'),
    ('すごく', '凄い'),
    ('いただきました', '頂く'),
    ('なった', '成る'),
)
|
||||
|
||||
@pytest.mark.parametrize('word,lemma', LEMMAS)
def test_japanese_lemmas(JA, word, lemma):
    """Check that the first token produced for ``word`` has lemma ``lemma``.

    ``JA`` is the fixture-provided Japanese pipeline; it is called on the
    raw string and only the first resulting token is inspected.
    """
    doc = JA(word)
    assert doc[0].lemma_ == lemma
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user