Mirror of https://github.com/explosion/spaCy.git
* Work on Chinese support

parent a6a25166ba
commit 9bbd6cf031
@@ -1,5 +1,30 @@
from ..language import Language
from ..tokenizer import Tokenizer
from ..tagger import Tagger


class CharacterTokenizer(Tokenizer):
    def __call__(self, text):
        return self.tokens_from_list(list(text))


class Chinese(Language):
    lang = u'zh'

    def __call__(self, text):
        doc = self.tokenizer.tokens_from_list(list(text))
        self.tagger(doc)
        self.merge_characters(doc)
        return doc

    def merge_characters(self, doc):
        start = 0
        chunks = []
        for token in doc:
            if token.tag_ != 'CHAR':
                chunk = doc[start : token.i + 1]
                chunks.append(chunk)
                start = token.i + 1
        text = doc.text
        for chunk in chunks:
            chunk.merge(chunk[-1].tag_, chunk.text, u'')
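The added code tokenizes Chinese text into individual characters, tags them, and then merges each run of characters that ends on a non-'CHAR' tag back into a single multi-character token. As a rough standalone illustration of that chunking step (a sketch only: the sample text, tag values, and the plain-list merge_characters helper below are invented for the example and are not part of the commit or of spaCy's API):

# Sketch of the merge-by-tag idea from the diff above, using plain Python
# lists instead of spaCy's Doc/Token objects. Hypothetical convention:
# every character is tagged 'CHAR' except the last character of a word,
# which carries the word-level tag and closes the chunk.
def merge_characters(chars, tags):
    chunks = []
    start = 0
    for i, tag in enumerate(tags):
        if tag != 'CHAR':
            # Mirrors the commit's slice doc[start : token.i + 1]
            chunks.append((''.join(chars[start:i + 1]), tag))
            start = i + 1
    return chunks

chars = list(u'北京大学')               # character-level "tokens"
tags = ['CHAR', 'NR', 'CHAR', 'NN']     # hypothetical word-final tags
print(merge_characters(chars, tags))    # [('北京', 'NR'), ('大学', 'NN')]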