Add option to not use Janome for Japanese tokenization

This commit is contained in:
Matthew Honnibal 2018-02-26 09:39:46 +01:00
parent 9b406181cd
commit 5faae803c6
2 changed files with 20 additions and 1 deletions

View File

@ -29,6 +29,7 @@ import conll17_ud_eval
import spacy.lang.zh
import spacy.lang.ja

# Disable external tokenizers so evaluation is deterministic and does not
# require jieba/janome to be installed.
spacy.lang.zh.Chinese.Defaults.use_jieba = False
# Bug fix: the flag was set on ``spacy.lang.ja.Chinese`` (a copy-paste typo;
# no such attribute exists in the ja module) — it belongs on Japanese.
# Also import spacy.lang.ja, which the original never did.
spacy.lang.ja.Japanese.Defaults.use_janome = False
random.seed(0)
numpy.random.seed(0)

View File

@ -35,14 +35,32 @@ class JapaneseTokenizer(object):
def from_disk(self, path, **exclude):
    """Load the tokenizer from disk — a no-op, since this tokenizer
    holds no serializable state.

    path: ignored.
    **exclude: ignored.
    RETURNS: the tokenizer itself, unchanged.
    """
    return self
class JapaneseCharacterSegmenter(object):
    """Fallback tokenizer that segments Japanese text into single characters.

    Used when ``JapaneseDefaults.use_janome`` is False, so no external
    morphological analyzer (Janome) is required.
    """

    def __init__(self, vocab):
        # Vocab used to construct the returned Doc.
        self.vocab = vocab

    def __call__(self, text):
        """Segment ``text`` into one token per non-space character.

        text (unicode): the text to segment.
        RETURNS (Doc): a Doc with one token per character; whitespace is
            folded into the preceding token's trailing-space flag.
        """
        # Bug fix: the original iterated ``self.tokenizer(text)``, but no
        # ``tokenizer`` attribute is ever assigned in __init__, so every
        # call raised AttributeError (it also left a dead
        # ``doc = self.tokenizer(text)`` line).  Segment directly instead.
        words = []
        spaces = []
        for ch in text:
            if ch.isspace():
                # Attach whitespace to the previous token; leading
                # whitespace has no token to attach to and is dropped.
                if spaces:
                    spaces[-1] = True
            else:
                words.append(ch)
                spaces.append(False)
        return Doc(self.vocab, words=words, spaces=spaces)
class JapaneseDefaults(Language.Defaults):
    """Defaults for the Japanese language class."""

    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'ja'
    # When True, tokenize with the Janome morphological analyzer; when
    # False, fall back to naive per-character segmentation.
    use_janome = True

    @classmethod
    def create_tokenizer(cls, nlp=None):
        """Create the tokenizer selected by ``cls.use_janome``.

        nlp (Language): the pipeline object; its vocab is needed for the
            character segmenter.
        RETURNS: a JapaneseTokenizer or JapaneseCharacterSegmenter.
        """
        if cls.use_janome:
            return JapaneseTokenizer(cls, nlp)
        # Bug fix: the original called JapaneseCharacterSegmenter(cls,
        # nlp.vocab), but its __init__ takes only a vocab, so this path
        # always raised TypeError.
        # NOTE(review): assumes nlp is not None on this path — confirm
        # with callers that pass nlp=None.
        return JapaneseCharacterSegmenter(nlp.vocab)
class Japanese(Language):