spaCy/spacy/zh/__init__.py

import jieba
from ..language import Language
from ..tokenizer import Tokenizer
from ..tokens.doc import Doc


class JiebaTokenizer(Tokenizer):
    def __call__(self, text):
        orths = []
        spaces = []
        # jieba.tokenize() yields (word, start, end) tuples; only the surface
        # form is used here, and a space merely marks the previous token as
        # being followed by whitespace.
        for orth, start, end in jieba.tokenize(text):
            # TODO: This is wrong if multiple spaces in a row.
            if orth == u' ':
                # Guard against a leading space, which would otherwise raise
                # an IndexError on the empty list.
                if spaces:
                    spaces[-1] = True
            else:
                orths.append(orth)
                spaces.append(False)
        return Doc(self.vocab, orths_and_spaces=zip(orths, spaces))


class CharacterTokenizer(Tokenizer):
    def __call__(self, text):
        # Fall back to one token per character, with no word segmentation.
        return self.tokens_from_list(list(text))


class Chinese(Language):
    lang = u'zh'

    @classmethod
    def default_tokenizer(cls, package, vocab):
        '''Return a Jieba-wrapped tokenizer.'''
        return JiebaTokenizer.from_package(package, vocab)
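

# Usage sketch: a minimal, illustrative way to drive the Jieba-backed pipeline,
# assuming jieba and the zh package data are installed and that the Language
# subclass can be instantiated and called on raw text (both are assumptions,
# not guaranteed by this module):
#
#     nlp = Chinese()
#     doc = nlp(u'我来到北京清华大学')
#     print([(w.orth_, w.whitespace_) for w in doc])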