This commit is contained in:
Matthew Honnibal 2025-01-13 14:00:06 +01:00
parent 6c59f6c623
commit 685a386106
6 changed files with 24 additions and 4 deletions

View File

@@ -1,4 +1,5 @@
"""Test that longer and mixed texts are tokenized correctly.""" """Test that longer and mixed texts are tokenized correctly."""
import pytest import pytest

View File

@@ -3,7 +3,13 @@ import pytest
@pytest.mark.parametrize( @pytest.mark.parametrize(
"word,lemma", "word,lemma",
[("新しく", "新しい"), ("赤く", "赤い"), ("すごく", "すごい"), ("いただきました", "いただく"), ("なった", "なる")], [
("新しく", "新しい"),
("赤く", "赤い"),
("すごく", "すごい"),
("いただきました", "いただく"),
("なった", "なる"),
],
) )
def test_ja_lemmatizer_assigns(ja_tokenizer, word, lemma): def test_ja_lemmatizer_assigns(ja_tokenizer, word, lemma):
test_lemma = ja_tokenizer(word)[0].lemma_ test_lemma = ja_tokenizer(word)[0].lemma_

View File

@@ -143,7 +143,12 @@ def test_ja_tokenizer_sub_tokens(
[ [
( (
"取ってつけた", "取ってつけた",
(["五段-ラ行;連用形-促音便"], [], ["下一段-カ行;連用形-一般"], ["助動詞-タ;終止形-一般"]), (
["五段-ラ行;連用形-促音便"],
[],
["下一段-カ行;連用形-一般"],
["助動詞-タ;終止形-一般"],
),
(["トッ"], [""], ["ツケ"], [""]), (["トッ"], [""], ["ツケ"], [""]),
), ),
("2=3", ([], [], []), ([""], ["_"], ["サン"])), ("2=3", ([], [], []), ([""], ["_"], ["サン"])),

View File

@@ -2,7 +2,14 @@ import pytest
@pytest.mark.parametrize( @pytest.mark.parametrize(
"word,lemma", [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", ""), ("뭡니까", ""), ("됐다", "")] "word,lemma",
[
("새로운", "새롭"),
("빨간", "빨갛"),
("클수록", ""),
("뭡니까", ""),
("됐다", ""),
],
) )
def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma): def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma):
test_lemma = ko_tokenizer(word)[0].lemma_ test_lemma = ko_tokenizer(word)[0].lemma_

View File

@@ -1,4 +1,5 @@
"""Words like numbers are recognized correctly.""" """Words like numbers are recognized correctly."""
import pytest import pytest

View File

@@ -265,7 +265,7 @@ def test_pretraining_tagger():
# Try to debug segfault on windows # Try to debug segfault on windows
#def test_pretraining_training(): # def test_pretraining_training():
# """Test that training can use a pretrained Tok2Vec model""" # """Test that training can use a pretrained Tok2Vec model"""
# config = Config().from_str(pretrain_string_internal) # config = Config().from_str(pretrain_string_internal)
# nlp = util.load_model_from_config(config, auto_fill=True, validate=False) # nlp = util.load_model_from_config(config, auto_fill=True, validate=False)