spaCy/spacy/tests/lang/ko/test_lemmatization.py
Adriane Boyd a2a0e1abf1 Switch to mecab-ko as default Korean tokenizer
Switch to the (confusingly-named) mecab-ko python module for default Korean
tokenization.

Maintain the previous `natto-py` tokenizer as
`spacy.KoreanNattoTokenizer.v1`.
2022-08-11 13:49:23 +02:00

18 lines
622 B
Python

import pytest
@pytest.mark.parametrize(
    "word,lemma", [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", ""), ("뭡니까", ""), ("됐다", "")]
)
def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma):
    """The default (mecab-ko) tokenizer assigns the expected lemma to each token."""
    assert ko_tokenizer(word)[0].lemma_ == lemma
@pytest.mark.parametrize(
    "word,lemma", [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", ""), ("뭡니까", ""), ("됐다", "")]
)
def test_ko_lemmatizer_natto_assigns(ko_tokenizer_natto, word, lemma):
    """The legacy natto-py tokenizer assigns the same lemmas as the default one."""
    assert ko_tokenizer_natto(word)[0].lemma_ == lemma