From e3de3f62cb6f5a87621bdcf5c55bf6310261af6e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 25 Apr 2016 22:20:01 +0200
Subject: [PATCH] * Add character tagger for Chinese

---
Review note: the added lines below were revised after the original commit
(leading/consecutive-space handling in JiebaTokenizer, absolute imports,
docstrings), so the commit id above and the post-image blob id in the
index line are stale.

 spacy/zh/__init__.py | 42 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 37 insertions(+), 5 deletions(-)

diff --git a/spacy/zh/__init__.py b/spacy/zh/__init__.py
index d034c1de8..2667d5442 100644
--- a/spacy/zh/__init__.py
+++ b/spacy/zh/__init__.py
@@ -1,6 +1,40 @@
+# Absolute imports: on Python 2 a bare `import jieba` inside this package
+# would otherwise pick up a sibling module named jieba, if one exists.
+from __future__ import absolute_import
+
+import jieba
+
 from ..language import Language
-from .jieba import JiebaTokenizer
+from ..tokenizer import Tokenizer
+from ..tokens.doc import Doc
+
+
+class JiebaTokenizer(Tokenizer):
+    '''Word tokenizer that delegates segmentation to jieba.'''
+
+    def __call__(self, text):
+        '''Segment text with jieba and return the result as a Doc.'''
+        orths = []
+        spaces = []
+        for orth, _start, _end in jieba.tokenize(text):
+            # A lone space becomes the trailing-space flag of the token
+            # before it. With no token before it, or one that already
+            # owns a space, it is kept as a token so nothing is dropped.
+            if orth == u' ' and spaces and not spaces[-1]:
+                spaces[-1] = True
+            else:
+                orths.append(orth)
+                spaces.append(False)
+        return Doc(self.vocab, orths_and_spaces=zip(orths, spaces))
+
+
+class CharacterTokenizer(Tokenizer):
+    '''Tokenizer that makes every character of the text a token.'''
+
+    def __call__(self, text):
+        '''Pass the characters of text to tokens_from_list.'''
+        return self.tokens_from_list(list(text))
 
 
 class Chinese(Language):
     lang = u'zh'
@@ -8,6 +42,4 @@ class Chinese(Language):
     @classmethod
     def default_tokenizer(cls, package, vocab):
-        '''Return Jieba-wrapper tokenizer.'''
-        return JiebaTokenizer.from_package(package, vocab)
-
-
+        '''Return the character-level tokenizer.'''
+        return CharacterTokenizer.from_package(package, vocab)