mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
* Add character tagger for Chinese
This commit is contained in:
parent
b6ccd8d76a
commit
e3de3f62cb
|
@ -1,6 +1,27 @@
|
||||||
|
import jieba
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from .jieba import JiebaTokenizer
|
|
||||||
|
|
||||||
|
from ..tokenizer import Tokenizer
|
||||||
|
from ..tokens.doc import Doc
|
||||||
|
|
||||||
|
|
||||||
|
class JiebaTokenizer(Tokenizer):
    """Tokenizer that segments Chinese text with the third-party Jieba library."""

    def __call__(self, text):
        """Segment *text* with ``jieba.tokenize`` and build a ``Doc``.

        Space segments are not emitted as tokens; instead the preceding
        token's trailing-space flag is set, matching the
        ``(orth, space)`` pair convention of ``Doc(orths_and_spaces=...)``.
        """
        orths = []
        spaces = []
        for orth, start, end in jieba.tokenize(text):
            if orth == u' ':
                # Fold the space into the previous token's flag.
                # Guard against a leading space, where no previous token
                # exists yet (the unguarded original raised IndexError).
                # NOTE: consecutive spaces still collapse into one flag.
                if spaces:
                    spaces[-1] = True
            else:
                orths.append(orth)
                spaces.append(False)
        return Doc(self.vocab, orths_and_spaces=zip(orths, spaces))
|
||||||
|
|
||||||
|
|
||||||
|
class CharacterTokenizer(Tokenizer):
    """Tokenizer that treats every character of the input as its own token."""

    def __call__(self, text):
        """Split *text* into single characters and build tokens from them."""
        characters = list(text)
        return self.tokens_from_list(characters)
|
||||||
|
|
||||||
class Chinese(Language):
    """Chinese language class using character-level tokenization by default."""

    lang = u'zh'

    @classmethod
    def default_tokenizer(cls, package, vocab):
        # Docstring previously claimed a Jieba-wrapper tokenizer, but the
        # code returns the character tokenizer — keep doc and code in sync.
        '''Return a character-level tokenizer (one token per character).'''
        return CharacterTokenizer.from_package(package, vocab)
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user