mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 18:56:36 +03:00
Add Chinese.Defaults.use_jieba setting, for UD
This commit is contained in:
parent
9ccd0c643b
commit
9b406181cd
|
@ -26,6 +26,10 @@ import cytoolz
|
|||
|
||||
import conll17_ud_eval
|
||||
|
||||
import spacy.lang.zh
|
||||
|
||||
spacy.lang.zh.Chinese.Defaults.use_jieba = False
|
||||
|
||||
random.seed(0)
|
||||
numpy.random.seed(0)
|
||||
|
||||
|
|
|
@ -9,6 +9,7 @@ from ...tokens import Doc
|
|||
class ChineseDefaults(Language.Defaults):
    """Default settings for the Chinese language class."""

    # Class-level switch consulted by Chinese.make_doc: when true, text is
    # segmented with Jieba; when false, make_doc falls back to splitting the
    # regular tokenizer's output into single characters.
    use_jieba = True

    # Work on a copy so the getters registered on Language.Defaults are not
    # mutated when the LANG entry is overridden below.
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update({LANG: lambda text: 'zh'})  # for pickling
|
||||
|
||||
|
||||
class Chinese(Language):
|
||||
|
@ -16,14 +17,25 @@ class Chinese(Language):
|
|||
Defaults = ChineseDefaults # override defaults
|
||||
|
||||
def make_doc(self, text):
    """Build a Doc from raw Chinese text.

    When ``self.Defaults.use_jieba`` is true, the text is segmented with
    Jieba (``cut_all=False``), empty segments are dropped, and every word
    is created without trailing whitespace. Otherwise the text is run
    through ``self.tokenizer`` and each resulting token is split into
    single-character words; only the last character of a token carries
    that token's trailing-whitespace flag.

    text: The string to tokenize.
    RETURNS: A ``Doc`` built on ``self.vocab``.
    RAISES: ImportError if Jieba segmentation is enabled but the ``jieba``
        package cannot be imported.
    """
    if not self.Defaults.use_jieba:
        words = []
        spaces = []
        # Tokenize exactly once; the result is only needed for iteration.
        for token in self.tokenizer(text):
            chars = token.text
            # Extending a list with a string adds one item per character.
            words.extend(chars)
            spaces.extend([False] * len(chars))
            # Whitespace after the token belongs to its final character.
            spaces[-1] = bool(token.whitespace_)
        return Doc(self.vocab, words=words, spaces=spaces)

    # Imported lazily so Jieba is only required when it is actually used.
    try:
        import jieba
    except ImportError:
        # Name the flag that is really read above (Defaults.use_jieba) so
        # the suggested workaround has an effect.
        msg = ("Jieba not installed. Either set "
               "Chinese.Defaults.use_jieba = False, "
               "or install it https://github.com/fxsjy/jieba")
        raise ImportError(msg)
    words = [word for word in jieba.cut(text, cut_all=False) if word]
    return Doc(self.vocab, words=words, spaces=[False] * len(words))
|
||||
|
||||
|
||||
__all__ = ['Chinese']
|
||||
|
|
Loading…
Reference in New Issue
Block a user