mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Add Japanese lemmas (#2543)
This information was already available from MeCab; it simply had not been added before.
This commit is contained in:
parent
6042723535
commit
1987f3f784
|
@ -82,6 +82,7 @@ class JapaneseTokenizer(object):
|
||||||
for token, dtoken in zip(doc, dtokens):
|
for token, dtoken in zip(doc, dtokens):
|
||||||
token._.mecab_tag = dtoken.pos
|
token._.mecab_tag = dtoken.pos
|
||||||
token.tag_ = resolve_pos(dtoken)
|
token.tag_ = resolve_pos(dtoken)
|
||||||
|
token.lemma_ = dtoken.lemma
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
# add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
|
# add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
|
||||||
|
|
|
@ -49,6 +49,11 @@ def RU(request):
|
||||||
pymorphy = pytest.importorskip('pymorphy2')
|
pymorphy = pytest.importorskip('pymorphy2')
|
||||||
return util.get_lang_class('ru')()
|
return util.get_lang_class('ru')()
|
||||||
|
|
||||||
|
@pytest.fixture()
def JA(request):
    """Provide an instance of the 'ja' language class to tests.

    The requesting test is skipped when the ``MeCab`` module cannot be
    imported. Otherwise the class returned by ``util.get_lang_class('ja')``
    is instantiated with no arguments and returned.
    """
    # Called only for its skip-on-missing-import effect; the module object
    # itself is not used here, so it is not bound to a local name.
    pytest.importorskip("MeCab")
    return util.get_lang_class('ja')()
|
||||||
|
|
||||||
|
|
||||||
#@pytest.fixture(params=_languages)
|
#@pytest.fixture(params=_languages)
|
||||||
#def tokenizer(request):
|
#def tokenizer(request):
|
||||||
|
@ -142,7 +147,7 @@ def da_tokenizer():
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def ja_tokenizer():
|
def ja_tokenizer():
|
||||||
janome = pytest.importorskip("MeCab")
|
mecab = pytest.importorskip("MeCab")
|
||||||
return util.get_lang_class('ja').Defaults.create_tokenizer()
|
return util.get_lang_class('ja').Defaults.create_tokenizer()
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
|
18
spacy/tests/lang/ja/test_lemma.py
Normal file
18
spacy/tests/lang/ja/test_lemma.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
# Each entry pairs an input word with the lemma the test below expects for
# the first token produced from that word.
LEMMAS = (
    ('新しく', '新しい'),
    ('赤く', '赤い'),
    ('すごく', '凄い'),
    ('いただきました', '頂く'),
    ('なった', '成る'),
)
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('word,lemma', LEMMAS)
def test_japanese_lemmas(JA, word, lemma):
    """Check that the first token of ``JA(word)`` has ``lemma`` as its lemma_."""
    first_token = JA(word)[0]
    assert first_token.lemma_ == lemma
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user