* Add character tagger for Chinese

This commit is contained in:
Matthew Honnibal 2016-04-25 22:20:01 +02:00
parent b6ccd8d76a
commit e3de3f62cb

View File

@ -1,6 +1,27 @@
import jieba
from ..language import Language from ..language import Language
from .jieba import JiebaTokenizer
from ..tokenizer import Tokenizer
from ..tokens.doc import Doc
class JiebaTokenizer(Tokenizer):
    '''Tokenizer that delegates word segmentation to jieba.'''

    def __call__(self, text):
        '''Segment `text` with jieba and return the result as a Doc.

        A single space that follows a token is recorded as that token's
        trailing-space flag. A space with no token to attach to (at the
        start of the text, or after a token that already owns a trailing
        space) is kept as a token of its own, so no whitespace is dropped.
        '''
        orths = []
        spaces = []
        for orth, _start, _end in jieba.tokenize(text):
            # Only fold the space into the previous token when there is a
            # previous token and it has no trailing space yet. The former
            # unconditional `spaces[-1] = True` raised IndexError on a
            # leading space and silently swallowed repeated spaces.
            if orth == u' ' and spaces and not spaces[-1]:
                spaces[-1] = True
            else:
                orths.append(orth)
                spaces.append(False)
        return Doc(self.vocab, orths_and_spaces=zip(orths, spaces))
class CharacterTokenizer(Tokenizer):
    '''Tokenizer that treats every character of the input as one token.'''

    def __call__(self, text):
        '''Split `text` into its characters and build tokens from that list.'''
        characters = list(text)
        return self.tokens_from_list(characters)
class Chinese(Language):
    '''Language subclass for Chinese (language code `zh`).'''

    lang = u'zh'

    @classmethod
    def default_tokenizer(cls, package, vocab):
        '''Return a CharacterTokenizer built from `package` and `vocab`.'''
        return CharacterTokenizer.from_package(package, vocab)