Mirror of https://github.com/explosion/spaCy.git
Make create_tokenizer work with Japanese

commit 84041a2bb5
parent f69ff15089
@@ -3,21 +3,39 @@ from __future__ import unicode_literals, print_function

 from os import path

-from ..language import Language
+from ..language import Language, BaseDefaults
+from ..tokenizer import Tokenizer
 from ..attrs import LANG
 from ..tokens import Doc

 from .language_data import *


-class Japanese(Language):
-    lang = 'ja'
-
-    def make_doc(self, text):
+class JapaneseTokenizer(object):
+    def __init__(self, cls, nlp=None):
+        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
         try:
             from janome.tokenizer import Tokenizer
         except ImportError:
             raise ImportError("The Japanese tokenizer requires the Janome library: "
                               "https://github.com/mocobeta/janome")
-        words = [x.surface for x in Tokenizer().tokenize(text)]
+        self.tokenizer = Tokenizer()
+
+    def __call__(self, text):
+        words = [x.surface for x in self.tokenizer.tokenize(text)]
         return Doc(self.vocab, words=words, spaces=[False]*len(words))
+
+
+class JapaneseDefaults(BaseDefaults):
+    @classmethod
+    def create_tokenizer(cls, nlp=None):
+        return JapaneseTokenizer(cls, nlp)
+
+
+class Japanese(Language):
+    lang = 'ja'
+    Defaults = JapaneseDefaults
+
+    def make_doc(self, text):
+        words = self.tokenizer(text)
+        return Doc(self.vocab, words=words, spaces=[False]*len(words))
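Not part of the commit: a minimal sketch of what the refactor amounts to at the Janome level, assuming only that Janome is installed (pip install janome). Janome's Tokenizer loads its dictionary up front when it is constructed, so building it once in JapaneseTokenizer.__init__ and reusing it in __call__ avoids repeating that work for every document; the tokenize() helper below is hypothetical and simply mirrors the list comprehension from the diff.

# Sketch only, not from the commit; assumes `pip install janome`.
from janome.tokenizer import Tokenizer

# Build Janome's tokenizer once, as JapaneseTokenizer.__init__ now does;
# construction is the expensive step (dictionary loading).
janome_tokenizer = Tokenizer()

def tokenize(text):
    # Mirrors the diff: Janome yields Token objects whose .surface
    # attribute is the raw text of each token.
    return [token.surface for token in janome_tokenizer.tokenize(text)]

# Prints the surface forms Janome produces for the input sentence.
print(tokenize('すもももももももものうち'))

In the commit itself, this reusable tokenizer is exposed through JapaneseDefaults.create_tokenizer(), so machinery that asks the language defaults for a tokenizer receives a JapaneseTokenizer instance instead of Japanese relying solely on an overridden make_doc().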