From bcf2b9b4f5e12951394bbc2e77daf5a1763ec9e5 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Tue, 22 Aug 2017 00:03:11 +0900
Subject: [PATCH] Update tagger & tokenizer tests

Tagger is now parametrized and has two sentences with more tag
coverage. The tokenizer tests are updated to reflect differences in
tokenization between IPAdic and Unidic. -POLM
---
 spacy/tests/ja/test_tagger.py    | 33 +++++++++++++++++++++++++++-----
 spacy/tests/ja/test_tokenizer.py |  4 ++--
 2 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/spacy/tests/ja/test_tagger.py b/spacy/tests/ja/test_tagger.py
index 43259fb49..629cc795f 100644
--- a/spacy/tests/ja/test_tagger.py
+++ b/spacy/tests/ja/test_tagger.py
@@ -3,8 +3,31 @@ from __future__ import unicode_literals
 
 import pytest
 
-def test_japanese_tagger(japanese):
-    doc = japanese.make_doc("このファイルには小さなテストが入っているよ")
-    # note these both have the same raw tag, '連体詞,*,*,*'
-    assert doc[0].pos_ == "DET"
-    assert doc[4].pos_ == "ADJ"
+TAGGER_TESTS = [
+    ('あれならそこにあるよ',
+     (('代名詞,*,*,*', 'PRON'),
+      ('助動詞,*,*,*', 'AUX'),
+      ('代名詞,*,*,*', 'PRON'),
+      ('助詞,格助詞,*,*', 'ADP'),
+      ('動詞,非自立可能,*,*', 'VERB'),
+      ('助詞,終助詞,*,*', 'PART'))),
+    ('このファイルには小さなテストが入っているよ',
+     (('連体詞,*,*,*,DET', 'DET'),
+      ('名詞,普通名詞,サ変可能,*', 'NOUN'),
+      ('助詞,格助詞,*,*', 'ADP'),
+      ('助詞,係助詞,*,*', 'ADP'),
+      ('連体詞,*,*,*,ADJ', 'ADJ'),
+      ('名詞,普通名詞,サ変可能,*', 'NOUN'),
+      ('助詞,格助詞,*,*', 'ADP'),
+      ('動詞,一般,*,*', 'VERB'),
+      ('助詞,接続助詞,*,*', 'SCONJ'),
+      ('動詞,非自立可能,*,*', 'VERB'),
+      ('助詞,終助詞,*,*', 'PART')))
+]
+
+@pytest.mark.parametrize('text,expected_tags', TAGGER_TESTS)
+def test_japanese_tagger(japanese, text, expected_tags):
+    tokens = japanese.make_doc(text)
+    assert len(tokens) == len(expected_tags)
+    for token, res in zip(tokens, expected_tags):
+        assert token.tag_ == res[0] and token.pos_ == res[1]
diff --git a/spacy/tests/ja/test_tokenizer.py b/spacy/tests/ja/test_tokenizer.py
index 58700b353..17411aee2 100644
--- a/spacy/tests/ja/test_tokenizer.py
+++ b/spacy/tests/ja/test_tokenizer.py
@@ -4,10 +4,10 @@ from __future__ import unicode_literals
 import pytest
 
 TOKENIZER_TESTS = [
-    ("日本語だよ", ['日本語', 'だ', 'よ']),
+    ("日本語だよ", ['日本', '語', 'だ', 'よ']),
     ("東京タワーの近くに住んでいます。", ['東京', 'タワー', 'の', '近く', 'に', '住ん', 'で', 'い', 'ます', '。']),
     ("吾輩は猫である。", ['吾輩', 'は', '猫', 'で', 'ある', '。']),
-    ("月に代わって、お仕置きよ!", ['月', 'に', '代わっ', 'て', '、', 'お仕置き', 'よ', '!']),
+    ("月に代わって、お仕置きよ!", ['月', 'に', '代わっ', 'て', '、', 'お', '仕置き', 'よ', '!']),
     ("すもももももももものうち", ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち'])
 ]