Make create_tokenizer work with Japanese

This commit is contained in:
Paul O'Leary McCann 2017-06-28 01:18:05 +09:00
parent f69ff15089
commit 84041a2bb5

View File

@@ -3,21 +3,39 @@ from __future__ import unicode_literals, print_function
from os import path from os import path
from ..language import Language from ..language import Language, BaseDefaults
from ..tokenizer import Tokenizer
from ..attrs import LANG from ..attrs import LANG
from ..tokens import Doc from ..tokens import Doc
from .language_data import * from .language_data import *
class JapaneseTokenizer(object):
    """Tokenizer for Japanese text, backed by the Janome morphological analyzer.

    Produces a Doc whose words are the surface forms of Janome tokens.
    Japanese text carries no inter-token whitespace, so every entry of
    `spaces` is False.
    """

    def __init__(self, cls, nlp=None):
        """Create the tokenizer.

        cls: the Defaults class, used to build a vocab when no `nlp` is given.
        nlp: optional Language instance whose vocab should be reused.

        Raises ImportError if Janome is not installed.
        """
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        try:
            # Imported lazily so spaCy can be loaded without Janome installed.
            from janome.tokenizer import Tokenizer
        except ImportError:
            raise ImportError("The Japanese tokenizer requires the Janome library: "
                              "https://github.com/mocobeta/janome")
        self.tokenizer = Tokenizer()

    def __call__(self, text):
        """Tokenize `text` and return a Doc with no trailing spaces."""
        words = [x.surface for x in self.tokenizer.tokenize(text)]
        return Doc(self.vocab, words=words, spaces=[False] * len(words))
class JapaneseDefaults(BaseDefaults):
    """Language defaults for Japanese; wires in the Janome-based tokenizer."""

    @classmethod
    def create_tokenizer(cls, nlp=None):
        # Japanese cannot use spaCy's rule-based Tokenizer, so hand back the
        # custom Janome-backed tokenizer instead.
        return JapaneseTokenizer(cls, nlp)
class Japanese(Language):
    """spaCy language class for Japanese ('ja')."""

    lang = 'ja'
    Defaults = JapaneseDefaults

    def make_doc(self, text):
        """Tokenize `text` with the Janome-backed tokenizer and return a Doc.

        JapaneseTokenizer.__call__ already returns a finished Doc; wrapping
        its result in another Doc(words=...) would feed Token objects (not
        strings) as words and produce a broken document, so the tokenizer's
        Doc is returned directly.
        """
        return self.tokenizer(text)