mirror of
https://github.com/explosion/spaCy.git
synced 2024-09-21 19:39:13 +03:00
Merge pull request #1157 from polm/master
Add basic Japanese Tokenizer Test
This commit is contained in:
commit
9eca6503c1
|
@ -3,21 +3,39 @@ from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
from os import path
|
from os import path
|
||||||
|
|
||||||
from ..language import Language
|
from ..language import Language, BaseDefaults
|
||||||
|
from ..tokenizer import Tokenizer
|
||||||
from ..attrs import LANG
|
from ..attrs import LANG
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
|
|
||||||
from .language_data import *
|
from .language_data import *
|
||||||
|
|
||||||
|
class JapaneseTokenizer(object):
    """Thin wrapper that turns Janome's tokenization into spaCy Doc objects."""

    def __init__(self, cls, nlp=None):
        """Set up the vocab and the underlying Janome tokenizer.

        cls: the language Defaults class; used to create a vocab when no
            `nlp` object is supplied.
        nlp: optional Language instance whose vocab is reused when given.

        Raises ImportError when the optional Janome dependency is missing.
        """
        if nlp is None:
            self.vocab = cls.create_vocab(nlp)
        else:
            self.vocab = nlp.vocab
        # Janome is an optional dependency, only needed for Japanese, so it
        # is imported lazily here rather than at module level.
        try:
            from janome.tokenizer import Tokenizer
        except ImportError:
            raise ImportError("The Japanese tokenizer requires the Janome library: "
                              "https://github.com/mocobeta/janome")
        self.tokenizer = Tokenizer()

    def __call__(self, text):
        """Tokenize `text` and return a Doc (no token is followed by a space)."""
        surfaces = [token.surface for token in self.tokenizer.tokenize(text)]
        return Doc(self.vocab, words=surfaces, spaces=[False] * len(surfaces))
||||||
|
class JapaneseDefaults(BaseDefaults):
    """Defaults for Japanese; overrides tokenizer creation to use Janome."""

    @classmethod
    def create_tokenizer(cls, nlp=None):
        # Japanese text has no whitespace between words, so the standard
        # tokenizer cannot segment it; hand back the Janome-backed one.
        return JapaneseTokenizer(cls, nlp)
|
|
||||||
|
class Japanese(Language):
    """Japanese language class, backed by the Janome tokenizer."""
    lang = 'ja'

    Defaults = JapaneseDefaults

    def make_doc(self, text):
        """Tokenize `text` and return the resulting Doc.

        The tokenizer created by JapaneseDefaults (a JapaneseTokenizer)
        already returns a fully constructed Doc, so it must be returned
        as-is. Wrapping it in a second Doc call — passing the Doc where a
        list of word strings is expected via `words=` — was a bug.
        """
        return self.tokenizer(text)
|
|
||||||
|
|
||||||
|
|
|
@ -5,6 +5,7 @@ from ..en import English
|
||||||
from ..de import German
|
from ..de import German
|
||||||
from ..es import Spanish
|
from ..es import Spanish
|
||||||
from ..it import Italian
|
from ..it import Italian
|
||||||
|
from ..ja import Japanese
|
||||||
from ..fr import French
|
from ..fr import French
|
||||||
from ..pt import Portuguese
|
from ..pt import Portuguese
|
||||||
from ..nl import Dutch
|
from ..nl import Dutch
|
||||||
|
@ -26,7 +27,7 @@ from pathlib import Path
|
||||||
import os
|
import os
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
# These languages get run through generic tokenizer tests
# NOTE(review): Japanese is deliberately absent here — its tokenizer needs
# the optional Janome package, so it is exercised only via the ja_tokenizer
# fixture, which skips when janome is not installed.
LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch,
             Swedish, Hungarian, Finnish, Bengali, Norwegian]
|
||||||
|
|
||||||
|
@ -76,6 +77,12 @@ def fi_tokenizer():
|
||||||
return Finnish.Defaults.create_tokenizer()
|
return Finnish.Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def ja_tokenizer():
    """Japanese tokenizer fixture; skips the test when Janome is missing."""
    pytest.importorskip("janome")
    return Japanese.Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def sv_tokenizer():
    """Swedish tokenizer fixture for the generic tokenizer tests."""
    tokenizer = Swedish.Defaults.create_tokenizer()
    return tokenizer
|
||||||
|
|
0
spacy/tests/ja/__init__.py
Normal file
0
spacy/tests/ja/__init__.py
Normal file
17
spacy/tests/ja/test_tokenizer.py
Normal file
17
spacy/tests/ja/test_tokenizer.py
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
# (input text, expected token surfaces) pairs for the Janome-backed tokenizer.
TOKENIZER_TESTS = [
    ("日本語だよ",
     ['日本語', 'だ', 'よ']),
    ("東京タワーの近くに住んでいます。",
     ['東京', 'タワー', 'の', '近く', 'に', '住ん', 'で', 'い', 'ます', '。']),
    ("吾輩は猫である。",
     ['吾輩', 'は', '猫', 'で', 'ある', '。']),
    ("月に代わって、お仕置きよ!",
     ['月', 'に', '代わっ', 'て', '、', 'お仕置き', 'よ', '!']),
    ("すもももももももものうち",
     ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち']),
]
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS)
def test_japanese_tokenizer(ja_tokenizer, text, expected_tokens):
    """Each input sentence should segment into the expected surface forms."""
    doc = ja_tokenizer(text)
    observed = [token.text for token in doc]
    assert observed == expected_tokens
|
Loading…
Reference in New Issue
Block a user