Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-12 18:26:30 +03:00.
Commit e3de3f62cb — "Add character tagger for Chinese" (parent: b6ccd8d76a).
|
@ -1,6 +1,27 @@
|
|||
import jieba
|
||||
from ..language import Language
|
||||
from .jieba import JiebaTokenizer
|
||||
|
||||
from ..tokenizer import Tokenizer
|
||||
from ..tokens.doc import Doc
|
||||
|
||||
|
||||
class JiebaTokenizer(Tokenizer):
    """Tokenizer that wraps Jieba word segmentation for Chinese text.

    Segments the input with ``jieba.tokenize`` and builds a ``Doc`` whose
    tokens are the non-space segments; a space segment is recorded as
    trailing whitespace on the preceding token.
    """

    def __call__(self, text):
        """Tokenize *text* and return a ``Doc``.

        Parameters
        ----------
        text : unicode string to segment.

        Returns
        -------
        Doc built from ``(orth, has_trailing_space)`` pairs.
        """
        orths = []
        spaces = []
        for orth, start, end in jieba.tokenize(text):
            if orth == u' ':
                # Attach the space to the preceding token.  Guard against an
                # empty list so leading whitespace no longer raises
                # IndexError; a run of spaces collapses to a single trailing
                # space flag (Doc spaces are boolean, so the exact count
                # cannot be represented anyway).
                if spaces:
                    spaces[-1] = True
            else:
                orths.append(orth)
                spaces.append(False)
        return Doc(self.vocab, orths_and_spaces=zip(orths, spaces))
|
||||
|
||||
|
||||
class CharacterTokenizer(Tokenizer):
    """Tokenizer that treats every character of the input as its own token."""

    def __call__(self, text):
        """Split *text* into single-character tokens and return the result
        of ``tokens_from_list`` over them.
        """
        characters = [character for character in text]
        return self.tokens_from_list(characters)
|
||||
|
||||
class Chinese(Language):
    """Language subclass for Chinese (``zh``), using Jieba segmentation."""

    lang = u'zh'

    @classmethod
    def default_tokenizer(cls, package, vocab):
        '''Return Jieba-wrapper tokenizer.'''
        # NOTE(review): the original body had a second, unreachable
        # ``return CharacterTokenizer.from_package(package, vocab)`` after
        # this line (dead code from a merged diff).  The effective behavior
        # is the Jieba tokenizer; to switch to per-character tokenization,
        # replace this return with the CharacterTokenizer call.
        return JiebaTokenizer.from_package(package, vocab)
|
||||
|
|
Loading…
Reference in New Issue
Block a user