mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	
						commit
						f0d7a87e18
					
				
							
								
								
									
										1
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										1
									
								
								setup.py
									
									
									
									
									
								
							| 
						 | 
					@ -37,6 +37,7 @@ PACKAGES = [
 | 
				
			||||||
    'spacy.bn',
 | 
					    'spacy.bn',
 | 
				
			||||||
    'spacy.he',
 | 
					    'spacy.he',
 | 
				
			||||||
    'spacy.nb',
 | 
					    'spacy.nb',
 | 
				
			||||||
 | 
					    'spacy.ja',
 | 
				
			||||||
    'spacy.en.lemmatizer',
 | 
					    'spacy.en.lemmatizer',
 | 
				
			||||||
    'spacy.cli.converters',
 | 
					    'spacy.cli.converters',
 | 
				
			||||||
    'spacy.language_data',
 | 
					    'spacy.language_data',
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -5,12 +5,12 @@ from . import util
 | 
				
			||||||
from .deprecated import resolve_model_name
 | 
					from .deprecated import resolve_model_name
 | 
				
			||||||
from .cli.info import info
 | 
					from .cli.info import info
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb
 | 
					from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
_languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French,
 | 
					_languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French,
 | 
				
			||||||
             it.Italian, hu.Hungarian, zh.Chinese, nl.Dutch, sv.Swedish,
 | 
					             it.Italian, hu.Hungarian, zh.Chinese, nl.Dutch, sv.Swedish,
 | 
				
			||||||
             fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian)
 | 
					             fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian, ja.Japanese)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for _lang in _languages:
 | 
					for _lang in _languages:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										23
									
								
								spacy/ja/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								spacy/ja/__init__.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,23 @@
 | 
				
			||||||
 | 
					# encoding: utf8
 | 
				
			||||||
 | 
					from __future__ import unicode_literals, print_function
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from os import path
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from ..language import Language
 | 
				
			||||||
 | 
					from ..attrs import LANG
 | 
				
			||||||
 | 
					from ..tokens import Doc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from .language_data import *
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class Japanese(Language):
 | 
				
			||||||
 | 
					    lang = 'ja'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def make_doc(self, text):
 | 
				
			||||||
 | 
					        try:
 | 
				
			||||||
 | 
					            from janome.tokenizer import Tokenizer
 | 
				
			||||||
 | 
					        except ImportError:
 | 
				
			||||||
 | 
					            raise ImportError("The Japanese tokenizer requires the Janome library: "
 | 
				
			||||||
 | 
					                              "https://github.com/mocobeta/janome")
 | 
				
			||||||
 | 
					        words = [x.surface for x in Tokenizer().tokenize(text)]
 | 
				
			||||||
 | 
					        return Doc(self.vocab, words=words, spaces=[False]*len(words))
 | 
				
			||||||
							
								
								
									
										23
									
								
								spacy/ja/language_data.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								spacy/ja/language_data.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,23 @@
 | 
				
			||||||
 | 
					# encoding: utf8
 | 
				
			||||||
 | 
					from __future__ import unicode_literals
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# import base language data
 | 
				
			||||||
 | 
					from .. import language_data as base
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# import util functions
 | 
				
			||||||
 | 
					from ..language_data import update_exc, strings_to_exc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# import language-specific data from files
 | 
				
			||||||
 | 
					from .tag_map import TAG_MAP
 | 
				
			||||||
 | 
					from .stop_words import STOP_WORDS
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					TAG_MAP = dict(TAG_MAP)
 | 
				
			||||||
 | 
					STOP_WORDS = set(STOP_WORDS)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# export
 | 
				
			||||||
 | 
					__all__ = ["TAG_MAP", "STOP_WORDS"]
 | 
				
			||||||
							
								
								
									
										9
									
								
								spacy/ja/stop_words.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										9
									
								
								spacy/ja/stop_words.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,9 @@
 | 
				
			||||||
 | 
					# encoding: utf8
 | 
				
			||||||
 | 
					from __future__ import unicode_literals
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# stop words as whitespace-separated list
 | 
				
			||||||
 | 
					STOP_WORDS = set("""
 | 
				
			||||||
 | 
					。
 | 
				
			||||||
 | 
					、
 | 
				
			||||||
 | 
					""".split())
 | 
				
			||||||
							
								
								
									
										24
									
								
								spacy/ja/tag_map.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										24
									
								
								spacy/ja/tag_map.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,24 @@
 | 
				
			||||||
 | 
					# encoding: utf8
 | 
				
			||||||
 | 
					from __future__ import unicode_literals
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from ..symbols import *
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					TAG_MAP = {
 | 
				
			||||||
 | 
					    "ADV":      {POS: ADV},
 | 
				
			||||||
 | 
					    "NOUN":     {POS: NOUN},
 | 
				
			||||||
 | 
					    "ADP":      {POS: ADP},
 | 
				
			||||||
 | 
					    "PRON":     {POS: PRON},
 | 
				
			||||||
 | 
					    "SCONJ":    {POS: SCONJ},
 | 
				
			||||||
 | 
					    "PROPN":    {POS: PROPN},
 | 
				
			||||||
 | 
					    "DET":      {POS: DET},
 | 
				
			||||||
 | 
					    "SYM":      {POS: SYM},
 | 
				
			||||||
 | 
					    "INTJ":     {POS: INTJ},
 | 
				
			||||||
 | 
					    "PUNCT":    {POS: PUNCT},
 | 
				
			||||||
 | 
					    "NUM":      {POS: NUM},
 | 
				
			||||||
 | 
					    "AUX":      {POS: AUX},
 | 
				
			||||||
 | 
					    "X":        {POS: X},
 | 
				
			||||||
 | 
					    "CONJ":     {POS: CONJ},
 | 
				
			||||||
 | 
					    "ADJ":      {POS: ADJ},
 | 
				
			||||||
 | 
					    "VERB":     {POS: VERB}
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
| 
						 | 
					@ -36,7 +36,7 @@ p
 | 
				
			||||||
    |  the existing language data and extending the tokenization patterns.
 | 
					    |  the existing language data and extending the tokenization patterns.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
+table([ "Language", "Source" ])
 | 
					+table([ "Language", "Source" ])
 | 
				
			||||||
    each language, code in { zh: "Chinese", es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", hu: "Hungarian", bn: "Bengali", he: "Hebrew" }
 | 
					    each language, code in { ja: "Japanese", zh: "Chinese", es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", hu: "Hungarian", bn: "Bengali", he: "Hebrew" }
 | 
				
			||||||
        +row
 | 
					        +row
 | 
				
			||||||
            +cell #{language} #[code=code]
 | 
					            +cell #{language} #[code=code]
 | 
				
			||||||
            +cell
 | 
					            +cell
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user