mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Port over changes from #1157
This commit is contained in:
parent
9b3f8f9ec3
commit
612224c10d
|
@ -4,18 +4,36 @@ from __future__ import unicode_literals, print_function
|
|||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...tokens import Doc
|
||||
from ...tokenizer import Tokenizer
|
||||
|
||||
|
||||
class JapaneseTokenizer(object):
    """Callable tokenizer that segments Japanese text with Janome.

    Calling an instance with a string returns a ``Doc`` whose words are the
    surface forms of the Janome tokens, with no trailing whitespace on any
    token.
    """

    def __init__(self, cls, nlp=None):
        """Set up the vocab and the Janome backend.

        cls: object providing ``create_vocab``; only consulted when ``nlp``
            is None.
        nlp: optional object whose ``vocab`` attribute is reused.

        Raises ImportError if the Janome package cannot be imported.
        """
        if nlp is not None:
            self.vocab = nlp.vocab
        else:
            self.vocab = cls.create_vocab(nlp)
        # Imported here rather than at module level, so Janome is only needed
        # once a Japanese tokenizer is actually constructed.
        try:
            from janome.tokenizer import Tokenizer as JanomeTokenizer
        except ImportError:
            raise ImportError("The Japanese tokenizer requires the Janome "
                              "library: https://github.com/mocobeta/janome")
        self.tokenizer = JanomeTokenizer()

    def __call__(self, text):
        """Tokenize ``text`` and return the result as a ``Doc``."""
        surfaces = []
        for morpheme in self.tokenizer.tokenize(text):
            surfaces.append(morpheme.surface)
        # Every token is marked as not followed by a space.
        no_spaces = [False] * len(surfaces)
        return Doc(self.vocab, words=surfaces, spaces=no_spaces)
|
||||
|
||||
|
||||
class JapaneseDefaults(Language.Defaults):
    """Language defaults for Japanese; supplies the tokenizer factory."""

    @classmethod
    def create_tokenizer(cls, nlp=None):
        """Return a ``JapaneseTokenizer`` built from this class and ``nlp``."""
        tokenizer = JapaneseTokenizer(cls, nlp)
        return tokenizer
|
||||
|
||||
|
||||
class Japanese(Language):
    """Language subclass for Japanese (``lang = 'ja'``)."""
    lang = 'ja'
    Defaults = JapaneseDefaults

    def make_doc(self, text):
        """Tokenize ``text`` and return the resulting document.

        The previous body tokenized the text with a fresh Janome tokenizer,
        threw that result away, and then wrapped the output of
        ``self.tokenizer`` in a second ``Doc``. Tokenization is now done
        exactly once, by ``self.tokenizer``.
        """
        # NOTE(review): assumes self.tokenizer is the JapaneseTokenizer created
        # by JapaneseDefaults.create_tokenizer, whose __call__ already returns
        # a Doc -- confirm against Language.__init__.
        return self.tokenizer(text)
|
||||
|
||||
|
||||
|
|
|
@ -117,6 +117,13 @@ def he_tokenizer():
|
|||
def nb_tokenizer():
    """Return a tokenizer created from the 'nb' language class defaults."""
    nb_cls = util.get_lang_class('nb')
    return nb_cls.Defaults.create_tokenizer()
|
||||
|
||||
|
||||
@pytest.fixture
def ja_tokenizer():
    """Return a tokenizer created from the 'ja' language class defaults.

    Tests using this fixture are skipped when Janome cannot be imported.
    """
    # Called only for its skip-on-missing-module effect; the returned module
    # object is not needed here, so it is not bound to a name.
    pytest.importorskip("janome")
    return util.get_lang_class('ja').Defaults.create_tokenizer()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def th_tokenizer():
|
||||
pythainlp = pytest.importorskip("pythainlp")
|
||||
|
|
0
spacy/tests/lang/ja/__init__.py
Normal file
0
spacy/tests/lang/ja/__init__.py
Normal file
19
spacy/tests/lang/ja/test_tokenizer.py
Normal file
19
spacy/tests/lang/ja/test_tokenizer.py
Normal file
|
@ -0,0 +1,19 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# (input text, expected token texts) pairs for the Japanese tokenizer test.
TOKENIZER_TESTS = [
    ("日本語だよ",
     ['日本語', 'だ', 'よ']),
    ("東京タワーの近くに住んでいます。",
     ['東京', 'タワー', 'の', '近く', 'に', '住ん', 'で', 'い', 'ます', '。']),
    ("吾輩は猫である。",
     ['吾輩', 'は', '猫', 'で', 'ある', '。']),
    ("月に代わって、お仕置きよ!",
     ['月', 'に', '代わっ', 'て', '、', 'お仕置き', 'よ', '!']),
    ("すもももももももものうち",
     ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち']),
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS)
def test_japanese_tokenizer(ja_tokenizer, text, expected_tokens):
    """Check that tokenizing ``text`` yields exactly ``expected_tokens``."""
    doc = ja_tokenizer(text)
    actual_tokens = []
    for token in doc:
        actual_tokens.append(token.text)
    assert actual_tokens == expected_tokens
|
Loading…
Reference in New Issue
Block a user