Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-31 16:07:41 +03:00)
Port BenDerPan's Chinese changes to v2 (finally) (#2591)

* add template files for Chinese
* add template files for Chinese, and test directory
This commit is contained in:
parent f2e3e039b7
commit 66983d8412
@@ -4,12 +4,16 @@ from __future__ import unicode_literals
from ...attrs import LANG
from ...language import Language
from ...tokens import Doc
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS


class ChineseDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'zh'  # for pickling

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    tag_map = TAG_MAP
    stop_words = STOP_WORDS

class Chinese(Language):
    lang = 'zh'
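The hunk above hooks the new Chinese language data into a Language subclass. As a rough usage sketch (not part of this commit, and assuming spaCy plus a Chinese segmentation backend such as jieba are installed), the class can be instantiated and used to tokenize text:

# Usage sketch only; assumes spaCy and a Chinese tokenizer backend (e.g. jieba)
# are installed. Not part of the diff above.
from spacy.lang.zh import Chinese

nlp = Chinese()                 # blank pipeline built from ChineseDefaults
doc = nlp(u"我爱自然语言处理")    # calling the pipeline tokenizes the text
print([t.text for t in doc])    # list of segmented tokens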
							
								
								
									
spacy/lang/zh/stop_words.py (new file, 1901 lines)
File diff suppressed because it is too large.
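The stop-word list is too large to display here. As an illustrative sketch only (a handful of common words, not the actual 1901-line list), a stop_words.py module of this shape exposes a single STOP_WORDS set:

# Illustrative sketch -- the real spacy/lang/zh/stop_words.py defines a much
# larger set. The module exposes one name, STOP_WORDS, built by splitting a
# whitespace-separated block of words.
STOP_WORDS = set("""
的 了 和 是 在 我 有 他 这 就 不 人 都 一 也 很 到 说 要 去 你 会 着 没有
""".split())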
											
										
									
								
							
							
								
								
									
spacy/lang/zh/tag_map.py (new file, 24 lines)
@@ -0,0 +1,24 @@
# encoding: utf8
from __future__ import unicode_literals

from ..symbols import *


TAG_MAP = {
    "ADV":      {POS: ADV},
    "NOUN":     {POS: NOUN},
    "ADP":      {POS: ADP},
    "PRON":     {POS: PRON},
    "SCONJ":    {POS: SCONJ},
    "PROPN":    {POS: PROPN},
    "DET":      {POS: DET},
    "SYM":      {POS: SYM},
    "INTJ":     {POS: INTJ},
    "PUNCT":    {POS: PUNCT},
    "NUM":      {POS: NUM},
    "AUX":      {POS: AUX},
    "X":        {POS: X},
    "CONJ":     {POS: CONJ},
    "ADJ":      {POS: ADJ},
    "VERB":     {POS: VERB}
}
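Each entry maps a coarse tag string produced by the tagger to its part-of-speech attributes. The standalone sketch below (plain strings instead of spacy.symbols, and a hypothetical coarse_pos() helper, not spaCy internals) shows the kind of lookup this table supports:

# Standalone illustration of how a tag map is consumed; plain strings stand in
# for the spacy.symbols constants used above.
TAG_MAP = {
    "NOUN": {"pos": "NOUN"},
    "VERB": {"pos": "VERB"},
    "ADJ":  {"pos": "ADJ"},
}

def coarse_pos(tag, tag_map, default="X"):
    # Tags missing from the map fall back to "X" (other).
    return tag_map.get(tag, {"pos": default})["pos"]

print(coarse_pos("VERB", TAG_MAP))   # -> VERB
print(coarse_pos("NR", TAG_MAP))     # -> X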
							
								
								
									
spacy/lang/zh/tokenizer_exceptions.py (new file, 46 lines)
@@ -0,0 +1,46 @@
# encoding: utf8
from __future__ import unicode_literals

from ..symbols import *
from ..language_data import PRON_LEMMA


TOKENIZER_EXCEPTIONS = {
    "Jan.": [
        {ORTH: "Jan.", LEMMA: "January"}
    ]
}


# exceptions mapped to a single token containing only ORTH property
# example: {"string": [{ORTH: "string"}]}
# converted using strings_to_exc() util

ORTH_ONLY = [
    "a.",
    "b.",
    "c.",
    "d.",
    "e.",
    "f.",
    "g.",
    "h.",
    "i.",
    "j.",
    "k.",
    "l.",
    "m.",
    "n.",
    "o.",
    "p.",
    "q.",
    "r.",
    "s.",
    "t.",
    "u.",
    "v.",
    "w.",
    "x.",
    "y.",
    "z."
]
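ORTH_ONLY is kept as plain strings and converted into full exception entries elsewhere, via the strings_to_exc() util named in the comment above. The snippet below is a sketch of what such a conversion might look like (an assumption, not spaCy's actual implementation):

# Hypothetical sketch of a strings_to_exc()-style conversion: each plain string
# becomes an exception entry of the form {string: [{ORTH: string}]}.
ORTH = "orth"  # stand-in for the spacy.symbols.ORTH constant

def strings_to_exc(strings):
    return {s: [{ORTH: s}] for s in strings}

print(strings_to_exc(["a.", "b."]))
# {'a.': [{'orth': 'a.'}], 'b.': [{'orth': 'b.'}]}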
							
								
								
									
spacy/tests/zh/__init__.py (new file, 0 lines)
spacy/zh/language_data.py (new file, 30 lines)
@@ -0,0 +1,30 @@
# encoding: utf8
from __future__ import unicode_literals


# import base language data
from .. import language_data as base


# import util functions
from ..language_data import update_exc, strings_to_exc


# import language-specific data from files
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY


TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
TAG_MAP = dict(TAG_MAP)
STOP_WORDS = set(STOP_WORDS)


# customize tokenizer exceptions
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))


# export
__all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS"]
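Here update_exc() is called for its side effect: the ORTH_ONLY strings and the shared emoticon exceptions are merged into TOKENIZER_EXCEPTIONS. Below is a rough sketch of a merge helper compatible with both call styles seen in this commit (return value ignored here, assigned in __init__.py); this is an assumption, not spaCy's actual code:

# Sketch of an update_exc()-style merge; not spaCy's actual implementation.
def update_exc(base_exceptions, additions):
    # Merge in place and also return the merged dict, so both call styles work.
    for orth, token_attrs in additions.items():
        base_exceptions[orth] = token_attrs
    return base_exceptions

exc = {"Jan.": [{"orth": "Jan.", "lemma": "January"}]}
update_exc(exc, {"a.": [{"orth": "a."}]})
print(sorted(exc))   # ['Jan.', 'a.']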