mirror of
https://github.com/explosion/spaCy.git
synced 2025-05-29 18:23:06 +03:00
Add option to not use Janome for Japanese tokenization
This commit is contained in:
parent
9b406181cd
commit
5faae803c6
|
@ -29,6 +29,7 @@ import conll17_ud_eval
|
||||||
import spacy.lang.zh
|
import spacy.lang.zh
|
||||||
|
|
||||||
spacy.lang.zh.Chinese.Defaults.use_jieba = False
|
spacy.lang.zh.Chinese.Defaults.use_jieba = False
|
||||||
|
# Disable Janome for Japanese. The defaults class in spacy.lang.ja is
# `Japanese` (not `Chinese`); import the submodule explicitly so the
# attribute lookup does not depend on another module having loaded it.
import spacy.lang.ja
spacy.lang.ja.Japanese.Defaults.use_janome = False
|
||||||
|
|
||||||
random.seed(0)
|
random.seed(0)
|
||||||
numpy.random.seed(0)
|
numpy.random.seed(0)
|
||||||
|
|
|
@ -35,14 +35,32 @@ class JapaneseTokenizer(object):
|
||||||
def from_disk(self, path, **exclude):
|
def from_disk(self, path, **exclude):
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
class JapaneseCharacterSegmenter(object):
    """Tokenizer that emits one token per character of the input text.

    It needs no external segmentation library: the text is split purely by
    character, and single spaces following a token are recorded as that
    token's trailing whitespace so the original text can be reconstructed.
    """

    def __init__(self, vocab):
        """Create the segmenter.

        vocab: The vocabulary handed to every `Doc` this segmenter builds.
        """
        self.vocab = vocab

    @staticmethod
    def _segment(text):
        """Split `text` into per-character words and trailing-space flags.

        text (unicode): The text to segment.
        RETURNS (tuple): `(words, spaces)` -- two lists of equal length,
            where `spaces[i]` is True if `words[i]` is followed by a single
            space character.
        """
        words = []
        spaces = []
        for char in text:
            # A space directly after a token that has no trailing space yet
            # is stored as that token's whitespace flag. Every other
            # character (including leading or repeated spaces, tabs and
            # newlines) becomes a token of its own, which keeps the
            # segmentation lossless.
            if char == ' ' and words and not spaces[-1]:
                spaces[-1] = True
            else:
                words.append(char)
                spaces.append(False)
        return words, spaces

    def __call__(self, text):
        """Segment `text` into single-character tokens.

        text (unicode): The text to segment.
        RETURNS (Doc): A `Doc` built from the per-character words and their
            trailing-space flags.
        """
        words, spaces = self._segment(text)
        return Doc(self.vocab, words=words, spaces=spaces)
|
||||||
|
|
||||||
|
|
||||||
class JapaneseDefaults(Language.Defaults):
    """Language defaults for Japanese."""
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'ja'
    # When True, create_tokenizer returns a JapaneseTokenizer; when False it
    # returns a JapaneseCharacterSegmenter instead.
    use_janome = True

    @classmethod
    def create_tokenizer(cls, nlp=None):
        """Create the tokenizer selected by `use_janome`.

        nlp: Optional pipeline object; its `vocab` is used for the
            character segmenter when provided.
        RETURNS: A `JapaneseTokenizer` if `cls.use_janome` is true,
            otherwise a `JapaneseCharacterSegmenter`.
        """
        if cls.use_janome:
            return JapaneseTokenizer(cls, nlp)
        # JapaneseCharacterSegmenter takes only a vocab. Fall back to a
        # freshly created vocab when no pipeline is supplied, since `nlp`
        # defaults to None.
        vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        return JapaneseCharacterSegmenter(vocab)
|
||||||
|
|
||||||
|
|
||||||
class Japanese(Language):
|
class Japanese(Language):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user