Merge pull request #1157 from polm/master

Add basic Japanese Tokenizer Test
Commit 9eca6503c1
Merged by Ines Montani on 2017-07-10 13:07:11 +02:00, committed via GitHub
4 changed files with 50 additions and 8 deletions

spacy/ja/__init__.py

@@ -3,21 +3,39 @@ from __future__ import unicode_literals, print_function
 
 from os import path
 
-from ..language import Language
+from ..language import Language, BaseDefaults
+from ..tokenizer import Tokenizer
+from ..attrs import LANG
 from ..tokens import Doc
 
 from .language_data import *
 
 
-class Japanese(Language):
-    lang = 'ja'
-
-    def make_doc(self, text):
+class JapaneseTokenizer(object):
+    def __init__(self, cls, nlp=None):
+        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
         try:
             from janome.tokenizer import Tokenizer
         except ImportError:
             raise ImportError("The Japanese tokenizer requires the Janome library: "
                               "https://github.com/mocobeta/janome")
-        words = [x.surface for x in Tokenizer().tokenize(text)]
+        self.tokenizer = Tokenizer()
+
+    def __call__(self, text):
+        words = [x.surface for x in self.tokenizer.tokenize(text)]
         return Doc(self.vocab, words=words, spaces=[False]*len(words))
+
+
+class JapaneseDefaults(BaseDefaults):
+    @classmethod
+    def create_tokenizer(cls, nlp=None):
+        return JapaneseTokenizer(cls, nlp)
+
+
+class Japanese(Language):
+    lang = 'ja'
+    Defaults = JapaneseDefaults
+
+    def make_doc(self, text):
+        words = self.tokenizer(text)
+        return Doc(self.vocab, words=words, spaces=[False]*len(words))
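
The refactor moves tokenization out of Japanese.make_doc and into a standalone JapaneseTokenizer that JapaneseDefaults.create_tokenizer() constructs, so tests can build a tokenizer without a full pipeline. A minimal usage sketch, assuming Janome is installed and the spaCy 1.x module layout (spacy.ja):

from spacy.ja import Japanese

# Same call the test fixture below uses; builds a JapaneseTokenizer.
tokenizer = Japanese.Defaults.create_tokenizer()

doc = tokenizer('日本語だよ')
print([token.text for token in doc])  # expected: ['日本語', 'だ', 'よ']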

spacy/tests/conftest.py

@@ -5,6 +5,7 @@ from ..en import English
 from ..de import German
 from ..es import Spanish
 from ..it import Italian
+from ..ja import Japanese
 from ..fr import French
 from ..pt import Portuguese
 from ..nl import Dutch
@@ -26,7 +27,7 @@ from pathlib import Path
 import os
 import pytest
 
+# These languages get run through generic tokenizer tests
 LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch,
              Swedish, Hungarian, Finnish, Bengali, Norwegian]
@@ -76,6 +77,12 @@ def fi_tokenizer():
     return Finnish.Defaults.create_tokenizer()
 
 
+@pytest.fixture
+def ja_tokenizer():
+    janome = pytest.importorskip("janome")
+    return Japanese.Defaults.create_tokenizer()
+
+
 @pytest.fixture
 def sv_tokenizer():
     return Swedish.Defaults.create_tokenizer()
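
The ja_tokenizer fixture relies on pytest.importorskip, which imports and returns the named module, or marks the test as skipped rather than failed when the import raises ImportError. That keeps the suite green on machines without Janome. A minimal illustration of the pattern (hypothetical test name):

import pytest

def test_requires_janome():
    # Skipped cleanly, not failed, when Janome is absent.
    janome = pytest.importorskip("janome")
    assert janome is not None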

spacy/tests/ja/__init__.py (new, empty file)

spacy/tests/ja/test_tokenizer.py
@@ -0,0 +1,17 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+TOKENIZER_TESTS = [
+    ("日本語だよ", ['日本語', 'だ', 'よ']),
+    ("東京タワーの近くに住んでいます。", ['東京', 'タワー', 'の', '近く', 'に', '住ん', 'で', 'い', 'ます', '。']),
+    ("吾輩は猫である。", ['吾輩', 'は', '猫', 'で', 'ある', '。']),
+    ("月に代わって、お仕置きよ!", ['月', 'に', '代わっ', 'て', '、', 'お仕置き', 'よ', '!']),
+    ("すもももももももものうち", ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち'])
+]
+
+@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS)
+def test_japanese_tokenizer(ja_tokenizer, text, expected_tokens):
+    tokens = [token.text for token in ja_tokenizer(text)]
+    assert tokens == expected_tokens
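
Since the tokenizer builds each Doc with spaces=[False]*len(words) (Japanese has no whitespace between tokens), joining the token texts should reproduce the input string exactly. A quick sketch of that property, again assuming Janome is installed:

from spacy.ja import Japanese

tokenizer = Japanese.Defaults.create_tokenizer()
doc = tokenizer('吾輩は猫である。')
assert doc.text == '吾輩は猫である。'  # no spaces re-inserted between tokens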