From 1987f3f784ca3f701868a402069a773dd7a4f352 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Fri, 13 Jul 2018 17:55:14 +0900
Subject: [PATCH] Add Japanese lemmas (#2543)

This info was already available from Mecab, forgot to add it before.
---
 spacy/lang/ja/__init__.py         |  1 +
 spacy/tests/conftest.py           |  7 ++++++-
 spacy/tests/lang/ja/test_lemma.py | 18 ++++++++++++++++++
 3 files changed, 25 insertions(+), 1 deletion(-)
 create mode 100644 spacy/tests/lang/ja/test_lemma.py

diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py
index 08b9de758..b8553559a 100644
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -82,6 +82,7 @@ class JapaneseTokenizer(object):
         for token, dtoken in zip(doc, dtokens):
             token._.mecab_tag = dtoken.pos
             token.tag_ = resolve_pos(dtoken)
+            token.lemma_ = dtoken.lemma
         return doc
 
     # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index b4dff4d22..ce2618970 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -49,6 +49,11 @@ def RU(request):
     pymorphy = pytest.importorskip('pymorphy2')
     return util.get_lang_class('ru')()
 
+@pytest.fixture()
+def JA(request):
+    mecab = pytest.importorskip("MeCab")
+    return util.get_lang_class('ja')()
+
 
 #@pytest.fixture(params=_languages)
 #def tokenizer(request):
@@ -142,7 +147,7 @@ def da_tokenizer():
 
 @pytest.fixture
 def ja_tokenizer():
-    janome = pytest.importorskip("MeCab")
+    mecab = pytest.importorskip("MeCab")
     return util.get_lang_class('ja').Defaults.create_tokenizer()
 
 @pytest.fixture
diff --git a/spacy/tests/lang/ja/test_lemma.py b/spacy/tests/lang/ja/test_lemma.py
new file mode 100644
index 000000000..9730b8b78
--- /dev/null
+++ b/spacy/tests/lang/ja/test_lemma.py
@@ -0,0 +1,18 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+LEMMAS = (
+    ('新しく', '新しい'),
+    ('赤く', '赤い'),
+    ('すごく', '凄い'),
+    ('いただきました', '頂く'),
+    ('なった', '成る'))
+
+@pytest.mark.parametrize('word,lemma', LEMMAS)
+def test_japanese_lemmas(JA, word, lemma):
+    test_lemma = JA(word)[0].lemma_
+    assert test_lemma == lemma
+
+