mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-24 20:51:30 +03:00 
			
		
		
		
	Add Japanese lemmas (#2543)
This information was already available from MeCab; it simply had not been added before.
This commit is contained in:
		
							parent
							
								
									6042723535
								
							
						
					
					
						commit
						1987f3f784
					
				|  | @ -82,6 +82,7 @@ class JapaneseTokenizer(object): | ||||||
|         for token, dtoken in zip(doc, dtokens): |         for token, dtoken in zip(doc, dtokens): | ||||||
|             token._.mecab_tag = dtoken.pos |             token._.mecab_tag = dtoken.pos | ||||||
|             token.tag_ = resolve_pos(dtoken) |             token.tag_ = resolve_pos(dtoken) | ||||||
|  |             token.lemma_ = dtoken.lemma | ||||||
|         return doc |         return doc | ||||||
| 
 | 
 | ||||||
|     # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to |     # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to | ||||||
|  |  | ||||||
|  | @ -49,6 +49,11 @@ def RU(request): | ||||||
|     pymorphy = pytest.importorskip('pymorphy2') |     pymorphy = pytest.importorskip('pymorphy2') | ||||||
|     return util.get_lang_class('ru')() |     return util.get_lang_class('ru')() | ||||||
| 
 | 
 | ||||||
|  | @pytest.fixture() | ||||||
|  | def JA(request): | ||||||
|  |     mecab = pytest.importorskip("MeCab") | ||||||
|  |     return util.get_lang_class('ja')() | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| #@pytest.fixture(params=_languages) | #@pytest.fixture(params=_languages) | ||||||
| #def tokenizer(request): | #def tokenizer(request): | ||||||
|  | @ -142,7 +147,7 @@ def da_tokenizer(): | ||||||
| 
 | 
 | ||||||
| @pytest.fixture | @pytest.fixture | ||||||
| def ja_tokenizer(): | def ja_tokenizer(): | ||||||
|     janome = pytest.importorskip("MeCab") |     mecab = pytest.importorskip("MeCab") | ||||||
|     return util.get_lang_class('ja').Defaults.create_tokenizer() |     return util.get_lang_class('ja').Defaults.create_tokenizer() | ||||||
| 
 | 
 | ||||||
| @pytest.fixture | @pytest.fixture | ||||||
|  |  | ||||||
							
								
								
									
										18
									
								
								spacy/tests/lang/ja/test_lemma.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										18
									
								
								spacy/tests/lang/ja/test_lemma.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,18 @@ | ||||||
|  | # coding: utf-8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | import pytest | ||||||
|  | 
 | ||||||
|  | LEMMAS = ( | ||||||
|  |         ('新しく', '新しい'), | ||||||
|  |         ('赤く', '赤い'), | ||||||
|  |         ('すごく', '凄い'), | ||||||
|  |         ('いただきました', '頂く'), | ||||||
|  |         ('なった', '成る')) | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize('word,lemma', LEMMAS) | ||||||
|  | def test_japanese_lemmas(JA, word, lemma): | ||||||
|  |     test_lemma = JA(word)[0].lemma_ | ||||||
|  |     assert test_lemma == lemma | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user