mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 05:01:02 +03:00 
			
		
		
		
	Add Japanese lemmas (#2543)
This information was already available from MeCab; I forgot to add it before.
This commit is contained in:
		
							parent
							
								
									6042723535
								
							
						
					
					
						commit
						1987f3f784
					
				|  | @ -82,6 +82,7 @@ class JapaneseTokenizer(object): | |||
|         for token, dtoken in zip(doc, dtokens): | ||||
|             token._.mecab_tag = dtoken.pos | ||||
|             token.tag_ = resolve_pos(dtoken) | ||||
|             token.lemma_ = dtoken.lemma | ||||
|         return doc | ||||
| 
 | ||||
|     # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to | ||||
|  |  | |||
|  | @ -49,6 +49,11 @@ def RU(request): | |||
|     pymorphy = pytest.importorskip('pymorphy2') | ||||
|     return util.get_lang_class('ru')() | ||||
| 
 | ||||
@pytest.fixture()
def JA(request):
    """Return a fresh instance of the language class registered under 'ja'.

    The test using this fixture is skipped when the ``MeCab`` module cannot
    be imported.
    """
    # Only the skip-on-missing side effect is wanted; the module is not used.
    pytest.importorskip("MeCab")
    japanese_cls = util.get_lang_class('ja')
    return japanese_cls()
| 
 | ||||
| 
 | ||||
| #@pytest.fixture(params=_languages) | ||||
| #def tokenizer(request): | ||||
|  | @ -142,7 +147,7 @@ def da_tokenizer(): | |||
| 
 | ||||
@pytest.fixture
def ja_tokenizer():
    """Return the tokenizer created by the 'ja' language class defaults.

    The test using this fixture is skipped when the ``MeCab`` module cannot
    be imported.
    """
    # A single importorskip call is sufficient; the returned module object is
    # not needed, so it is not bound to a (previously misnamed) local.
    pytest.importorskip("MeCab")
    return util.get_lang_class('ja').Defaults.create_tokenizer()
| 
 | ||||
| @pytest.fixture | ||||
|  |  | |||
							
								
								
									
										18
									
								
								spacy/tests/lang/ja/test_lemma.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										18
									
								
								spacy/tests/lang/ja/test_lemma.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,18 @@ | |||
| # coding: utf-8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| import pytest | ||||
| 
 | ||||
# (word, expected lemma) pairs fed to the parametrized lemma test.
LEMMAS = (
    ('新しく', '新しい'),
    ('赤く', '赤い'),
    ('すごく', '凄い'),
    ('いただきました', '頂く'),
    ('なった', '成る'),
)
| 
 | ||||
@pytest.mark.parametrize('word,lemma', LEMMAS)
def test_japanese_lemmas(JA, word, lemma):
    """Check that the first item of ``JA(word)`` has ``lemma`` as its ``lemma_``."""
    first_token = JA(word)[0]
    assert first_token.lemma_ == lemma
| 
 | ||||
| 
 | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user