mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-01 00:17:44 +03:00 
			
		
		
		
	* Work on Chinese support
This commit is contained in:
		
							parent
							
								
									a6a25166ba
								
							
						
					
					
						commit
						9bbd6cf031
					
				|  | @ -1,5 +1,30 @@ | ||||||
| from ..language import Language | from ..language import Language | ||||||
|  | from ..tokenizer import Tokenizer | ||||||
|  | from ..tagger import Tagger | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
class CharacterTokenizer(Tokenizer):
    """Tokenizer that emits one token per character of the input text."""

    def __call__(self, text):
        # Split the string into its individual characters and hand the
        # resulting list to the inherited list-based tokenization path.
        characters = [character for character in text]
        return self.tokens_from_list(characters)
class Chinese(Language):
    """Chinese language pipeline.

    Tokenizes input one character per token, runs the tagger over the
    characters, then merges tagged character runs back into
    multi-character (word-level) tokens.
    """
    lang = u'zh'

    def __call__(self, text):
        """Process *text* and return the resulting Doc.

        text -- the raw input string.
        Returns the tagged Doc with character runs merged into words.
        """
        doc = self.tokenizer.tokens_from_list(list(text))
        self.tagger(doc)
        self.merge_characters(doc)
        return doc

    def merge_characters(self, doc):
        """Merge consecutive character tokens of *doc* into single tokens.

        A token tagged 'CHAR' continues the current word; a token with
        any other tag closes the word at that token (inclusive).
        NOTE(review): characters after the last non-'CHAR' token are
        never merged — confirm the tagger always closes the final word.
        """
        start = 0
        chunks = []
        for token in doc:
            if token.tag_ != 'CHAR':
                # Close the current word span, including this token.
                chunk = doc[start : token.i + 1]
                chunks.append(chunk)
                start = token.i + 1
        # Fix: removed dead local `text = doc.text` (assigned, never used).
        # Merge only after collecting every span: merging mutates the doc,
        # so it must not happen while iterating over doc's tokens.
        for chunk in chunks:
            chunk.merge(chunk[-1].tag_, chunk.text, u'')
		Loading…
	
		Reference in New Issue
	
	Block a user