From 84041a2bb517841d725781bdd72b1daf4f8e603d Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Wed, 28 Jun 2017 01:18:05 +0900
Subject: [PATCH] Make create_tokenizer work with Japanese

---
 spacy/ja/__init__.py | 32 +++++++++++++++++++++++++-------
 1 file changed, 25 insertions(+), 7 deletions(-)

diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py
index 07e40ada6..1c85ded95 100644
--- a/spacy/ja/__init__.py
+++ b/spacy/ja/__init__.py
@@ -3,21 +3,39 @@ from __future__ import unicode_literals, print_function
 
 from os import path
 
-from ..language import Language
+from ..language import Language, BaseDefaults
+from ..tokenizer import Tokenizer
 from ..attrs import LANG
 from ..tokens import Doc
 
 from .language_data import *
 
-
-class Japanese(Language):
-    lang = 'ja'
-
-    def make_doc(self, text):
+class JapaneseTokenizer(object):
+    def __init__(self, cls, nlp=None):
+        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
         try:
             from janome.tokenizer import Tokenizer
         except ImportError:
             raise ImportError("The Japanese tokenizer requires the Janome library: "
                               "https://github.com/mocobeta/janome")
-        words = [x.surface for x in Tokenizer().tokenize(text)]
+        self.tokenizer = Tokenizer()
+
+    def __call__(self, text):
+        words = [x.surface for x in self.tokenizer.tokenize(text)]
         return Doc(self.vocab, words=words, spaces=[False]*len(words))
+
+class JapaneseDefaults(BaseDefaults):
+    @classmethod
+    def create_tokenizer(cls, nlp=None):
+        return JapaneseTokenizer(cls, nlp)
+
+class Japanese(Language):
+    lang = 'ja'
+
+    Defaults = JapaneseDefaults
+
+    def make_doc(self, text):
+        words = self.tokenizer(text)
+        return Doc(self.vocab, words=words, spaces=[False]*len(words))
+
+
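
Note on the final hunk: JapaneseTokenizer.__call__ already returns a Doc,
so in the new make_doc the name words is bound to a Doc, and the second
Doc(self.vocab, words=words, ...) call hands Token objects to a constructor
that expects strings. At best this re-wraps work already done; more likely
it fails at runtime. A minimal follow-up sketch (not part of this patch)
would delegate to the tokenizer directly:

    def make_doc(self, text):
        # JapaneseTokenizer.__call__ builds the Doc itself, so return
        # its result rather than wrapping it in a second Doc.
        return self.tokenizer(text)

With the Defaults hook in place, Japanese() should then pick the tokenizer
up automatically via JapaneseDefaults.create_tokenizer, e.g. (a usage
sketch, assuming Janome is installed):

    nlp = Japanese()
    doc = nlp.make_doc('日本語の文です')
    print([w.text for w in doc])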