From 9bbd6cf03178cff323272cc6eefb90411c02c8a2 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 5 May 2016 11:39:12 +0200
Subject: [PATCH] * Work on Chinese support

---
 spacy/zh/__init__.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/spacy/zh/__init__.py b/spacy/zh/__init__.py
index edd3337b2..243d8525c 100644
--- a/spacy/zh/__init__.py
+++ b/spacy/zh/__init__.py
@@ -1,5 +1,30 @@
 from ..language import Language
+from ..tokenizer import Tokenizer
+from ..tagger import Tagger
+
+
+class CharacterTokenizer(Tokenizer):
+    def __call__(self, text):
+        return self.tokens_from_list(list(text))
 
 
 class Chinese(Language):
     lang = u'zh'
+
+    def __call__(self, text):
+        doc = self.tokenizer.tokens_from_list(list(text))
+        self.tagger(doc)
+        self.merge_characters(doc)
+        return doc
+
+    def merge_characters(self, doc):
+        start = 0
+        chunks = []
+        for token in doc:
+            if token.tag_ != 'CHAR':
+                chunk = doc[start : token.i + 1]
+                chunks.append(chunk)
+                start = token.i + 1
+        text = doc.text
+        for chunk in chunks:
+            chunk.merge(chunk[-1].tag_, chunk.text, u'')
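
The patch drives Chinese word segmentation from the tagger: the input is first split into single-character tokens, each character is POS-tagged, and merge_characters() then joins each run of characters ending at a token whose tag is not 'CHAR' into one word-level token. The following is a minimal usage sketch, not part of the patch; the sample text, the bare Chinese() constructor, and the availability of a trained character-level 'zh' tagger (non-final characters tagged 'CHAR', final characters carrying the word's real tag) are all assumptions.

    # Sketch only: assumes spacy.zh.Chinese can be constructed as below and
    # that a character-level Chinese tagger model is installed.
    from spacy.zh import Chinese

    nlp = Chinese()                      # assumed: loads the Chinese data/tagger
    doc = nlp(u'我爱自然语言处理')        # characters tokenized, tagged, then merged
    for token in doc:
        # after merge_characters(), each token is a merged word carrying the
        # tag of its final character
        print(token.text + u'/' + token.tag_)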