Port over changes from #1157

ines 2017-10-14 13:11:39 +02:00
parent 9b3f8f9ec3
commit 612224c10d
4 changed files with 50 additions and 6 deletions

View File

@@ -4,18 +4,36 @@ from __future__ import unicode_literals, print_function
 from ...language import Language
 from ...attrs import LANG
 from ...tokens import Doc
+from ...tokenizer import Tokenizer
+
+
+class JapaneseTokenizer(object):
+    def __init__(self, cls, nlp=None):
+        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
+        try:
+            from janome.tokenizer import Tokenizer
+        except ImportError:
+            raise ImportError("The Japanese tokenizer requires the Janome "
+                              "library: https://github.com/mocobeta/janome")
+        self.tokenizer = Tokenizer()
+
+    def __call__(self, text):
+        words = [x.surface for x in self.tokenizer.tokenize(text)]
+        return Doc(self.vocab, words=words, spaces=[False]*len(words))
+
+
+class JapaneseDefaults(Language.Defaults):
+    @classmethod
+    def create_tokenizer(cls, nlp=None):
+        return JapaneseTokenizer(cls, nlp)
+
 
 class Japanese(Language):
     lang = 'ja'
+    Defaults = JapaneseDefaults
 
     def make_doc(self, text):
-        try:
-            from janome.tokenizer import Tokenizer
-        except ImportError:
-            raise ImportError("The Japanese tokenizer requires the Janome library: "
-                              "https://github.com/mocobeta/janome")
-        words = [x.surface for x in Tokenizer().tokenize(text)]
+        words = self.tokenizer(text)
         return Doc(self.vocab, words=words, spaces=[False]*len(words))
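
For context, the new tokenizer can be exercised the same way the test fixture below does it. A minimal usage sketch (not part of this commit; assumes the janome package is installed):

from spacy.lang.ja import Japanese

# JapaneseDefaults.create_tokenizer() builds a JapaneseTokenizer;
# with no nlp object passed, the vocab comes from cls.create_vocab(None)
tokenizer = Japanese.Defaults.create_tokenizer()
doc = tokenizer('日本語だよ')
print([t.text for t in doc])  # ['日本語', 'だ', 'よ']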

View File

@@ -117,6 +117,13 @@ def he_tokenizer():
 def nb_tokenizer():
     return util.get_lang_class('nb').Defaults.create_tokenizer()
 
+
+@pytest.fixture
+def ja_tokenizer():
+    janome = pytest.importorskip("janome")
+    return util.get_lang_class('ja').Defaults.create_tokenizer()
+
+
 @pytest.fixture
 def th_tokenizer():
     pythainlp = pytest.importorskip("pythainlp")
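
For reference, pytest.importorskip() returns the imported module when it is available and skips the requesting test otherwise, so a missing optional dependency never turns into a hard failure. A standalone sketch of the pattern (hypothetical package name, not from this commit):

import pytest

@pytest.fixture
def optional_tokenizer():
    # Skips any test using this fixture if the (hypothetical)
    # dependency is not installed, instead of raising ImportError
    mod = pytest.importorskip("some_optional_package")
    return mod.Tokenizer()  # hypothetical attribute on that module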

View File

View File

@@ -0,0 +1,19 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+TOKENIZER_TESTS = [
+    ("日本語だよ", ['日本語', 'だ', 'よ']),
+    ("東京タワーの近くに住んでいます。", ['東京', 'タワー', 'の', '近く', 'に', '住ん', 'で', 'い', 'ます', '。']),
+    ("吾輩は猫である。", ['吾輩', 'は', '猫', 'で', 'ある', '。']),
+    ("月に代わって、お仕置きよ!", ['月', 'に', '代わっ', 'て', '、', 'お仕置き', 'よ', '!']),
+    ("すもももももももものうち", ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち'])
+]
+
+
+@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS)
+def test_japanese_tokenizer(ja_tokenizer, text, expected_tokens):
+    tokens = [token.text for token in ja_tokenizer(text)]
+    assert tokens == expected_tokens
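
The expected segmentations can be cross-checked against Janome directly, since the spaCy tokenizer only wraps the surface forms it produces. A sketch (assumes janome is installed; spaCy itself is not needed here):

from janome.tokenizer import Tokenizer

tokenizer = Tokenizer()
print([token.surface for token in tokenizer.tokenize('すもももももももものうち')])
# ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち']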