Add Japanese lemmas (#2543)

This info was already available from MeCab; I just forgot to add it before.
2025-07-16 03:02:41 +03:00 · 2018-07-13 17:55:14 +09:00 · 2018-07-13 17:55:14 +09:00 · 1987f3f784
commit 1987f3f784
parent 6042723535
3 changed files with 25 additions and 1 deletions
--- a/spacy/lang/ja/init.py
+++ b/spacy/lang/ja/init.py
@ -82,6 +82,7 @@ class JapaneseTokenizer(object):
        for token, dtoken in zip(doc, dtokens):
            token._.mecab_tag = dtoken.pos
            token.tag_ = resolve_pos(dtoken)
+            token.lemma_ = dtoken.lemma
        return doc

    # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@ -49,6 +49,11 @@ def RU(request):
    pymorphy = pytest.importorskip('pymorphy2')
    return util.get_lang_class('ru')()

+@pytest.fixture()
+def JA(request):  # fixture for the 'ja' language; `request` is not used in the body
+    mecab = pytest.importorskip("MeCab")  # skip the requesting test if MeCab cannot be imported; the binding is unused
+    return util.get_lang_class('ja')()  # call (instantiate) whatever util.get_lang_class('ja') returns
+

 #@pytest.fixture(params=_languages)
 #def tokenizer(request):
@ -142,7 +147,7 @@ def da_tokenizer():

@pytest.fixture
 def ja_tokenizer():
-    janome = pytest.importorskip("MeCab")
+    mecab = pytest.importorskip("MeCab")  # local renamed to match the imported module; it only serves as a skip guard
    return util.get_lang_class('ja').Defaults.create_tokenizer()

@pytest.fixture
--- a/spacy/tests/lang/ja/test_lemma.py
+++ b/spacy/tests/lang/ja/test_lemma.py
@ -0,0 +1,18 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+LEMMAS = (  # (input word, expected lemma) pairs used to parametrize test_japanese_lemmas
+        ('新しく', '新しい'),
+        ('赤く', '赤い'),
+        ('すごく', '凄い'),
+        ('いただきました', '頂く'),
+        ('なった', '成る'))
+
+@pytest.mark.parametrize('word,lemma', LEMMAS)  # one test case per (word, lemma) pair
+def test_japanese_lemmas(JA, word, lemma):  # JA is supplied by the pytest fixture of the same name
+    test_lemma = JA(word)[0].lemma_  # only the first token's lemma is checked
+    assert test_lemma == lemma
+
+