Merge pull request #1157 from polm/master

Add basic Japanese Tokenizer Test
Ines Montani 2017-07-10 13:07:11 +02:00 committed by GitHub
commit 9eca6503c1
4 changed files with 50 additions and 8 deletions
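
In brief: the commit moves the Janome calls behind a JapaneseTokenizer class wired up through JapaneseDefaults.create_tokenizer(), and adds a pytest fixture plus parametrized tokenizer tests. A minimal usage sketch of the new entry point, assuming Janome is installed (this mirrors the test fixture below and is not part of the commit itself):

from spacy.ja import Japanese

ja_tokenizer = Japanese.Defaults.create_tokenizer()  # requires `pip install janome`
doc = ja_tokenizer("日本語だよ")                       # __call__ returns a spaCy Doc
print([t.text for t in doc])                          # expected: ['日本語', 'だ', 'よ']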

spacy/ja/__init__.py

@@ -3,21 +3,39 @@ from __future__ import unicode_literals, print_function
 
 from os import path
 
-from ..language import Language
+from ..language import Language, BaseDefaults
+from ..tokenizer import Tokenizer
 from ..attrs import LANG
 from ..tokens import Doc
 
 from .language_data import *
 
 
-class Japanese(Language):
-    lang = 'ja'
-
-    def make_doc(self, text):
+class JapaneseTokenizer(object):
+    def __init__(self, cls, nlp=None):
+        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
+
         try:
             from janome.tokenizer import Tokenizer
         except ImportError:
             raise ImportError("The Japanese tokenizer requires the Janome library: "
                               "https://github.com/mocobeta/janome")
-        words = [x.surface for x in Tokenizer().tokenize(text)]
+        self.tokenizer = Tokenizer()
+
+    def __call__(self, text):
+        words = [x.surface for x in self.tokenizer.tokenize(text)]
         return Doc(self.vocab, words=words, spaces=[False]*len(words))
+
+
+class JapaneseDefaults(BaseDefaults):
+    @classmethod
+    def create_tokenizer(cls, nlp=None):
+        return JapaneseTokenizer(cls, nlp)
+
+class Japanese(Language):
+    lang = 'ja'
+    Defaults = JapaneseDefaults
+
+    def make_doc(self, text):
+        words = self.tokenizer(text)
+        return Doc(self.vocab, words=words, spaces=[False]*len(words))
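
For reference, the Janome call that the new __call__ wraps can be exercised on its own; a quick sketch using the same API as the diff above (expected output taken from the test cases further down):

from janome.tokenizer import Tokenizer  # pip install janome

janome_tokenizer = Tokenizer()
words = [x.surface for x in janome_tokenizer.tokenize("吾輩は猫である。")]
print(words)  # expected: ['吾輩', 'は', '猫', 'で', 'ある', '。']
# Each surface form becomes one Doc token; spaces=[False]*len(words) because
# Japanese text carries no whitespace between tokens.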

spacy/tests/conftest.py

@@ -5,6 +5,7 @@ from ..en import English
 from ..de import German
 from ..es import Spanish
 from ..it import Italian
+from ..ja import Japanese
 from ..fr import French
 from ..pt import Portuguese
 from ..nl import Dutch
@@ -26,7 +27,7 @@ from pathlib import Path
 import os
 import pytest
 
+# These languages get run through generic tokenizer tests
 LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch,
              Swedish, Hungarian, Finnish, Bengali, Norwegian]
@@ -76,6 +77,12 @@ def fi_tokenizer():
     return Finnish.Defaults.create_tokenizer()
 
 
+@pytest.fixture
+def ja_tokenizer():
+    janome = pytest.importorskip("janome")
+    return Japanese.Defaults.create_tokenizer()
+
+
 @pytest.fixture
 def sv_tokenizer():
     return Swedish.Defaults.create_tokenizer()
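
Note on the fixture: pytest.importorskip("janome") means any test requesting ja_tokenizer is reported as skipped rather than failing on machines without Janome. A hypothetical test using the fixture (not part of this commit) would look like:

import pytest

def test_ja_tokenizer_smoke(ja_tokenizer):
    # Skipped automatically when janome is missing, thanks to importorskip
    # inside the fixture; otherwise returns a Doc with one token per surface form.
    doc = ja_tokenizer("日本語だよ")
    assert [t.text for t in doc] == ['日本語', 'だ', 'よ']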

spacy/tests/ja/__init__.py (new, empty file)

spacy/tests/ja/test_tokenizer.py

@@ -0,0 +1,17 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+TOKENIZER_TESTS = [
+    ("日本語だよ", ['日本語', 'だ', 'よ']),
+    ("東京タワーの近くに住んでいます。", ['東京', 'タワー', 'の', '近く', 'に', '住ん', 'で', 'い', 'ます', '。']),
+    ("吾輩は猫である。", ['吾輩', 'は', '猫', 'で', 'ある', '。']),
+    ("月に代わって、お仕置きよ!", ['月', 'に', '代わっ', 'て', '、', 'お仕置き', 'よ', '!']),
+    ("すもももももももものうち", ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち'])
+]
+
+@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS)
+def test_japanese_tokenizer(ja_tokenizer, text, expected_tokens):
+    tokens = [token.text for token in ja_tokenizer(text)]
+    assert tokens == expected_tokens
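
To run just these cases locally, one option is invoking pytest programmatically; the module path below is assumed from this diff's layout rather than stated in the commit:

import pytest

# Assumed path for the new test module; adjust if the checkout differs.
pytest.main(["spacy/tests/ja/test_tokenizer.py", "-v"])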