Port over changes from #1157

This commit is contained in:
ines 2017-10-14 13:11:39 +02:00
parent 9b3f8f9ec3
commit 612224c10d
4 changed files with 50 additions and 6 deletions

View File

@ -4,18 +4,36 @@ from __future__ import unicode_literals, print_function
from ...language import Language
from ...attrs import LANG
from ...tokens import Doc
from ...tokenizer import Tokenizer
class JapaneseTokenizer(object):
def __init__(self, cls, nlp=None):
self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
try:
from janome.tokenizer import Tokenizer
except ImportError:
raise ImportError("The Japanese tokenizer requires the Janome "
"library: https://github.com/mocobeta/janome")
self.tokenizer = Tokenizer()
def __call__(self, text):
words = [x.surface for x in self.tokenizer.tokenize(text)]
return Doc(self.vocab, words=words, spaces=[False]*len(words))
class JapaneseDefaults(Language.Defaults):
@classmethod
def create_tokenizer(cls, nlp=None):
return JapaneseTokenizer(cls, nlp)
class Japanese(Language):
lang = 'ja'
Defaults = JapaneseDefaults
def make_doc(self, text):
try:
from janome.tokenizer import Tokenizer
except ImportError:
raise ImportError("The Japanese tokenizer requires the Janome library: "
"https://github.com/mocobeta/janome")
words = [x.surface for x in Tokenizer().tokenize(text)]
words = self.tokenizer(text)
return Doc(self.vocab, words=words, spaces=[False]*len(words))

View File

@ -117,6 +117,13 @@ def he_tokenizer():
def nb_tokenizer():
return util.get_lang_class('nb').Defaults.create_tokenizer()
@pytest.fixture
def ja_tokenizer():
janome = pytest.importorskip("janome")
return util.get_lang_class('ja').Defaults.create_tokenizer()
@pytest.fixture
def th_tokenizer():
pythainlp = pytest.importorskip("pythainlp")

View File

View File

@ -0,0 +1,19 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
TOKENIZER_TESTS = [
("日本語だよ", ['日本語', '', '']),
("東京タワーの近くに住んでいます。", ['東京', 'タワー', '', '近く', '', '住ん', '', '', 'ます', '']),
("吾輩は猫である。", ['吾輩', '', '', '', 'ある', '']),
("月に代わって、お仕置きよ!", ['', '', '代わっ', '', '', 'お仕置き', '', '!']),
("すもももももももものうち", ['すもも', '', 'もも', '', 'もも', '', 'うち'])
]
@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS)
def test_japanese_tokenizer(ja_tokenizer, text, expected_tokens):
tokens = [token.text for token in ja_tokenizer(text)]
assert tokens == expected_tokens