spaCy/spacy/zh/__init__.py
2016-04-28 14:32:27 +02:00

34 lines
884 B
Python

import jieba
from ..language import Language
from ..tokenizer import Tokenizer
from ..tokens.doc import Doc
class JiebaTokenizer(Tokenizer):
def __call__(self, text):
orths = []
spaces = []
for orth, start, end in jieba.tokenize(text):
# TODO: This is wrong if multiple spaces in a row.
if orth == u' ':
spaces[-1] = True
else:
orths.append(orth)
spaces.append(False)
return Doc(self.vocab, orths_and_spaces=zip(orths, spaces))
class CharacterTokenizer(Tokenizer):
def __call__(self, text):
return self.tokens_from_list(list(text))
class Chinese(Language):
lang = u'zh'
@classmethod
def default_tokenizer(cls, package, vocab):
'''Return Jieba-wrapper tokenizer.'''
return JiebaTokenizer.from_package(package, vocab)