mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 18:06:29 +03:00
* Work on Chinese support
This commit is contained in:
parent
a6a25166ba
commit
9bbd6cf031
|
@ -1,5 +1,30 @@
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
|
from ..tokenizer import Tokenizer
|
||||||
|
from ..tagger import Tagger
|
||||||
|
|
||||||
|
|
||||||
|
class CharacterTokenizer(Tokenizer):
    """Tokenizer that treats every character of the input as one token."""

    def __call__(self, text):
        """Split ``text`` into single characters and build tokens from them.

        The character list is passed to ``tokens_from_list`` and its result
        is returned unchanged. ``tokens_from_list`` is not defined in this
        class; it is presumably inherited from ``Tokenizer`` -- confirm.
        """
        characters = list(text)
        return self.tokens_from_list(characters)
|
||||||
|
|
||||||
|
|
||||||
class Chinese(Language):
    """Language subclass for Chinese (``lang`` is ``u'zh'``).

    Text is split into single characters, tagged, and runs of characters
    are then merged back into larger tokens based on the assigned tags.
    """
    lang = u'zh'

    def __call__(self, text):
        """Process ``text`` and return the resulting document.

        Steps, in order:
          1. build a document with one token per character via
             ``self.tokenizer.tokens_from_list``;
          2. run ``self.tagger`` on it;
          3. merge character runs with ``merge_characters``.
        """
        doc = self.tokenizer.tokens_from_list(list(text))
        self.tagger(doc)
        self.merge_characters(doc)
        return doc

    def merge_characters(self, doc):
        """Merge runs of character tokens in ``doc`` in place.

        Every token whose ``tag_`` is not ``'CHAR'`` closes a chunk: the
        chunk spans from the token after the previous chunk (or the start
        of the document) up to and including that token. Each chunk is then
        merged via its ``merge`` method.

        Tokens tagged ``'CHAR'`` that follow the last non-``'CHAR'`` token
        are not part of any chunk and are left untouched.

        Returns ``None``; the document is modified through ``chunk.merge``.
        """
        start = 0
        chunks = []
        for token in doc:
            if token.tag_ != 'CHAR':
                # This token ends the current chunk; include it in the slice.
                chunks.append(doc[start : token.i + 1])
                start = token.i + 1
        # NOTE(review): chunks are merged in document order after all slices
        # were taken; this assumes the remaining slices stay valid once
        # earlier ones have been merged -- confirm against the Span API.
        for chunk in chunks:
            # The chunk's last token is the non-'CHAR' one, so its tag is
            # used for the merged token. The three positional arguments are
            # presumably (tag, lemma, entity type) -- confirm with merge().
            chunk.merge(chunk[-1].tag_, chunk.text, u'')
|
||||||
|
|
Loading…
Reference in New Issue
Block a user