mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	* Add character tagger for Chinese
This commit is contained in:
		
							parent
							
								
									b6ccd8d76a
								
							
						
					
					
						commit
						e3de3f62cb
					
				| 
						 | 
					@ -1,6 +1,27 @@
 | 
				
			||||||
 | 
					import jieba
 | 
				
			||||||
from ..language import Language
 | 
					from ..language import Language
 | 
				
			||||||
from .jieba import JiebaTokenizer
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from ..tokenizer import Tokenizer
 | 
				
			||||||
 | 
					from ..tokens.doc import Doc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class JiebaTokenizer(Tokenizer):
 | 
				
			||||||
 | 
					    def __call__(self, text):
 | 
				
			||||||
 | 
					        orths = []
 | 
				
			||||||
 | 
					        spaces = []
 | 
				
			||||||
 | 
					        for orth, start, end in jieba.tokenize(text):
 | 
				
			||||||
 | 
					            # TODO: This is wrong if multiple spaces in a row.
 | 
				
			||||||
 | 
					            if orth == u' ':
 | 
				
			||||||
 | 
					                spaces[-1] = True
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
 | 
					                orths.append(orth)
 | 
				
			||||||
 | 
					                spaces.append(False)
 | 
				
			||||||
 | 
					        return Doc(self.vocab, orths_and_spaces=zip(orths, spaces))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class CharacterTokenizer(Tokenizer):
 | 
				
			||||||
 | 
					    def __call__(self, text):
 | 
				
			||||||
 | 
					        return self.tokens_from_list(list(text))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class Chinese(Language):
    # ISO 639-1 language code for Chinese; used by spaCy to identify
    # this Language subclass.
    lang = u'zh'
 | 
				
			||||||
| 
						 | 
					@ -8,6 +29,4 @@ class Chinese(Language):
 | 
				
			||||||
    @classmethod
    def default_tokenizer(cls, package, vocab):
        '''Return the default tokenizer for Chinese: a CharacterTokenizer
        that emits one token per character (not the jieba wrapper).'''
        return CharacterTokenizer.from_package(package, vocab)
 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user