Mirror of https://github.com/explosion/spaCy.git
commit f0d7a87e18

setup.py (3 changed lines)
@@ -36,7 +36,8 @@ PACKAGES = [
     'spacy.fi',
     'spacy.bn',
     'spacy.he',
-    'spacy.nb',     
+    'spacy.nb',
+    'spacy.ja',
     'spacy.en.lemmatizer',
     'spacy.cli.converters',
     'spacy.language_data',

spacy/__init__.py
@@ -5,12 +5,12 @@ from . import util
 from .deprecated import resolve_model_name
 from .cli.info import info
 
-from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb
+from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja
 
 
 _languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French,
              it.Italian, hu.Hungarian, zh.Chinese, nl.Dutch, sv.Swedish,
-             fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian)
+             fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian, ja.Japanese)
 
 
 for _lang in _languages:
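
For orientation (not part of the commit): the loop this hunk ends on presumably registers each language class under its lang code, which is how 'ja' becomes resolvable like the other languages. A minimal sketch under that assumption; util.set_lang_class and util.get_lang_class are assumed helper names and may differ from the actual code.

# Sketch only, not part of the commit; the registry helper names are assumed.
from spacy import util
from spacy.ja import Japanese

util.set_lang_class(Japanese.lang, Japanese)    # roughly what the loop would do for 'ja'
assert util.get_lang_class('ja') is Japanese    # later lookups by code find the class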
							
								
								
									
spacy/ja/__init__.py (new file, 23 lines)
# encoding: utf8
from __future__ import unicode_literals, print_function

from os import path

from ..language import Language
from ..attrs import LANG
from ..tokens import Doc

from .language_data import *


class Japanese(Language):
    lang = 'ja'

    def make_doc(self, text):
        try:
            from janome.tokenizer import Tokenizer
        except ImportError:
            raise ImportError("The Japanese tokenizer requires the Janome library: "
                              "https://github.com/mocobeta/janome")
        words = [x.surface for x in Tokenizer().tokenize(text)]
        return Doc(self.vocab, words=words, spaces=[False]*len(words))
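
A minimal usage sketch (not part of the commit), assuming Janome is installed (pip install janome) and that the class can be instantiated without a statistical model; the sample sentence is illustrative only.

from spacy.ja import Japanese

nlp = Japanese()
doc = nlp.make_doc('日本語の文章を解析します。')
print([token.text for token in doc])   # one spaCy token per Janome surface form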
							
								
								
									
spacy/ja/language_data.py (new file, 23 lines)
# encoding: utf8
from __future__ import unicode_literals


# import base language data
from .. import language_data as base


# import util functions
from ..language_data import update_exc, strings_to_exc


# import language-specific data from files
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS


TAG_MAP = dict(TAG_MAP)
STOP_WORDS = set(STOP_WORDS)


# export
__all__ = ["TAG_MAP", "STOP_WORDS"]
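
A small import check (not part of the commit): the module re-exports exactly the two language-specific tables, already coerced to a dict and a set above.

from spacy.ja.language_data import STOP_WORDS, TAG_MAP

assert isinstance(TAG_MAP, dict) and isinstance(STOP_WORDS, set)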
							
								
								
									
spacy/ja/stop_words.py (new file, 9 lines)
# encoding: utf8
from __future__ import unicode_literals


# stop words as whitespace-separated list
STOP_WORDS = set("""
。
、
""".split())
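
A tiny check (not part of the commit): splitting the triple-quoted block on whitespace leaves exactly the two punctuation entries in the set.

from spacy.ja.stop_words import STOP_WORDS

assert STOP_WORDS == {"。", "、"}   # ideographic full stop and ideographic comma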
							
								
								
									
spacy/ja/tag_map.py (new file, 24 lines)
# encoding: utf8
from __future__ import unicode_literals

from ..symbols import *


TAG_MAP = {
    "ADV":      {POS: ADV},
    "NOUN":     {POS: NOUN},
    "ADP":      {POS: ADP},
    "PRON":     {POS: PRON},
    "SCONJ":    {POS: SCONJ},
    "PROPN":    {POS: PROPN},
    "DET":      {POS: DET},
    "SYM":      {POS: SYM},
    "INTJ":     {POS: INTJ},
    "PUNCT":    {POS: PUNCT},
    "NUM":      {POS: NUM},
    "AUX":      {POS: AUX},
    "X":        {POS: X},
    "CONJ":     {POS: CONJ},
    "ADJ":      {POS: ADJ},
    "VERB":     {POS: VERB}
}
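
A small lookup sketch (not part of the commit): each key is a fine-grained tag string and its value assigns the coarse part-of-speech symbol; how spaCy consumes the map internally is not shown here.

from spacy.symbols import POS, VERB
from spacy.ja.tag_map import TAG_MAP

entry = TAG_MAP["VERB"]      # attribute dict for the fine-grained tag "VERB"
assert entry[POS] == VERB    # it maps to the coarse part-of-speech symbol VERB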

website docs (languages table)
@@ -36,7 +36,7 @@ p
     |  the existing language data and extending the tokenization patterns.
 
 +table([ "Language", "Source" ])
-    each language, code in { zh: "Chinese", es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", hu: "Hungarian", bn: "Bengali", he: "Hebrew" }
+    each language, code in { ja: "Japanese", zh: "Chinese", es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", hu: "Hungarian", bn: "Bengali", he: "Hebrew" }
         +row
             +cell #{language} #[code=code]
             +cell