mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-11 20:28:20 +03:00
1ee6541ab0
* Use morph for extra Japanese tokenizer info Previously Japanese tokenizer info that didn't correspond to Token fields was put in user data. Since spaCy core should avoid touching user data, this moves most information to the Token.morph attribute. It also adds the normalized form, which wasn't exposed before. The subtokens, which are a list of full tokens, are still added to user data, except with the default tokenizer granularity. With the default tokenizer settings the subtokens are all None, so in this case the user data is simply not set. * Update tests Also adds a new test for norm data. * Update docs * Add Japanese morphologizer factory Set the default to `extend=True` so that the morphologizer does not clobber the values set by the tokenizer. * Use the norm_ field for normalized forms Before this commit, normalized forms were put in the "norm" field in the morph attributes. I am not sure why I did that instead of using the token norm; I think I just forgot about it. * Skip test if sudachipy is not installed * Fix import Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
155 lines
7.4 KiB
Python
155 lines
7.4 KiB
Python
import pytest
|
|
|
|
from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS
|
|
from spacy.lang.ja import Japanese, DetailedToken
|
|
|
|
# fmt: off
|
|
# (text, expected token texts) pairs; each token list concatenates back to
# its input text.
TOKENIZER_TESTS = [
    ("日本語だよ", ['日本', '語', 'だ', 'よ']),
    ("東京タワーの近くに住んでいます。", ['東京', 'タワー', 'の', '近く', 'に', '住ん', 'で', 'い', 'ます', '。']),
    ("吾輩は猫である。", ['吾輩', 'は', '猫', 'で', 'ある', '。']),
    ("月に代わって、お仕置きよ!", ['月', 'に', '代わっ', 'て', '、', 'お', '仕置き', 'よ', '!']),
    ("すもももももももものうち", ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち'])
]
|
|
|
|
# (text, expected fine-grained tags) pairs, one tag string per token.
TAG_TESTS = [
    ("日本語だよ", ['名詞-固有名詞-地名-国', '名詞-普通名詞-一般', '助動詞', '助詞-終助詞']),
    ("東京タワーの近くに住んでいます。", ['名詞-固有名詞-地名-一般', '名詞-普通名詞-一般', '助詞-格助詞', '名詞-普通名詞-副詞可能', '助詞-格助詞', '動詞-一般', '助詞-接続助詞', '動詞-非自立可能', '助動詞', '補助記号-句点']),
    ("吾輩は猫である。", ['代名詞', '助詞-係助詞', '名詞-普通名詞-一般', '助動詞', '動詞-非自立可能', '補助記号-句点']),
    ("月に代わって、お仕置きよ!", ['名詞-普通名詞-助数詞可能', '助詞-格助詞', '動詞-一般', '助詞-接続助詞', '補助記号-読点', '接頭辞', '名詞-普通名詞-一般', '助詞-終助詞', '補助記号-句点']),
    ("すもももももももものうち", ['名詞-普通名詞-一般', '助詞-係助詞', '名詞-普通名詞-一般', '助詞-係助詞', '名詞-普通名詞-一般', '助詞-格助詞', '名詞-普通名詞-副詞可能'])
]
|
|
|
|
# (text, expected coarse part-of-speech labels) pairs, one label per token.
POS_TESTS = [
    ('日本語だよ', ['PROPN', 'NOUN', 'AUX', 'PART']),
    ('東京タワーの近くに住んでいます。', ['PROPN', 'NOUN', 'ADP', 'NOUN', 'ADP', 'VERB', 'SCONJ', 'AUX', 'AUX', 'PUNCT']),
    ('吾輩は猫である。', ['PRON', 'ADP', 'NOUN', 'AUX', 'AUX', 'PUNCT']),
    ('月に代わって、お仕置きよ!', ['NOUN', 'ADP', 'VERB', 'SCONJ', 'PUNCT', 'NOUN', 'NOUN', 'PART', 'PUNCT']),
    ('すもももももももものうち', ['NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'])
]
|
|
|
|
# (text, expected sentence strings) pairs; the second case keeps a quoted
# sentence-final "。" inside a single sentence.
SENTENCE_TESTS = [
    ("あれ。これ。", ["あれ。", "これ。"]),
    ("「伝染るんです。」という漫画があります。", ["「伝染るんです。」という漫画があります。"]),
]
|
|
|
|
# Sub-token analysis of "委員会" as two units.
tokens1 = [
    DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", norm="委員", reading="イイン", sub_tokens=None),
    DetailedToken(surface="会", tag="名詞-普通名詞-一般", inf="", lemma="会", norm="会", reading="カイ", sub_tokens=None),
]
# Sub-token analysis of "選挙管理委員会" as four units.
tokens2 = [
    DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", norm="選挙", reading="センキョ", sub_tokens=None),
    DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", norm="管理", reading="カンリ", sub_tokens=None),
    DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", norm="委員", reading="イイン", sub_tokens=None),
    DetailedToken(surface="会", tag="名詞-普通名詞-一般", inf="", lemma="会", norm="会", reading="カイ", sub_tokens=None),
]
# Sub-token analysis of "選挙管理委員会" as three units.
tokens3 = [
    DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", norm="選挙", reading="センキョ", sub_tokens=None),
    DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", norm="管理", reading="カンリ", sub_tokens=None),
    DetailedToken(surface="委員会", tag="名詞-普通名詞-一般", inf="", lemma="委員会", norm="委員会", reading="イインカイ", sub_tokens=None),
]
# (text, expected "sub_tokens" user data in split mode B, same for mode C).
# Each expected value holds one entry per token: None, or a list of
# alternative sub-token sequences.
SUB_TOKEN_TESTS = [
    ("選挙管理委員会", [None, None, [tokens1]], [[tokens2, tokens3]])
]
|
|
# fmt: on
|
|
|
|
|
|
@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
def test_ja_tokenizer(ja_tokenizer, text, expected_tokens):
    """Check that tokenizing ``text`` yields the expected token texts.

    ja_tokenizer: callable under test (presumably a pytest fixture).
    text: input string.
    expected_tokens: expected ``token.text`` values, in order.
    """
    tokens = [token.text for token in ja_tokenizer(text)]
    assert tokens == expected_tokens
|
|
|
|
|
|
@pytest.mark.parametrize("text,expected_tags", TAG_TESTS)
def test_ja_tokenizer_tags(ja_tokenizer, text, expected_tags):
    """Check that each token's ``tag_`` matches the expected tag.

    ja_tokenizer: callable under test (presumably a pytest fixture).
    text: input string.
    expected_tags: expected ``token.tag_`` values, in order.
    """
    tags = [token.tag_ for token in ja_tokenizer(text)]
    assert tags == expected_tags
|
|
|
|
|
|
@pytest.mark.parametrize("text,expected_pos", POS_TESTS)
def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
    """Check that each token's ``pos_`` matches the expected label.

    ja_tokenizer: callable under test (presumably a pytest fixture).
    text: input string.
    expected_pos: expected ``token.pos_`` values, in order.
    """
    pos = [token.pos_ for token in ja_tokenizer(text)]
    assert pos == expected_pos
|
|
|
|
|
|
@pytest.mark.skip(reason="sentence segmentation in tokenizer is buggy")
@pytest.mark.parametrize("text,expected_sents", SENTENCE_TESTS)
def test_ja_tokenizer_sents(ja_tokenizer, text, expected_sents):
    """Check the sentence boundaries of the tokenized ``text``.

    Currently skipped unconditionally; see the skip reason above.

    ja_tokenizer: callable under test (presumably a pytest fixture).
    text: input string.
    expected_sents: expected sentence strings, in order.
    """
    sents = [str(sent) for sent in ja_tokenizer(text).sents]
    assert sents == expected_sents
|
|
|
|
|
|
def test_ja_tokenizer_extra_spaces(ja_tokenizer):
    """Check that a run of extra spaces is kept as its own token.

    ja_tokenizer: callable under test (presumably a pytest fixture).
    """
    # note: three spaces after "I"
    # NOTE(review): the run of spaces had been collapsed to a single space,
    # contradicting the note above; restored to three. The expected
    # two-space token assumes the first space is attached to "I" as
    # trailing whitespace -- confirm against the tokenizer.
    tokens = ja_tokenizer("I   like cheese.")
    assert tokens[1].orth_ == "  "
|
|
|
|
|
|
@pytest.mark.parametrize("text", NAUGHTY_STRINGS)
def test_ja_tokenizer_naughty_strings(ja_tokenizer, text):
    """Check that tokenization round-trips each naughty string.

    The tokenized result's ``text_with_ws`` must equal the input exactly.

    ja_tokenizer: callable under test (presumably a pytest fixture).
    text: one entry of ``NAUGHTY_STRINGS``.
    """
    tokens = ja_tokenizer(text)
    assert tokens.text_with_ws == text
|
|
|
|
|
|
@pytest.mark.parametrize(
    "text,len_a,len_b,len_c",
    [
        ("選挙管理委員会", 4, 3, 1),
        ("客室乗務員", 3, 2, 1),
        ("労働者協同組合", 4, 3, 1),
        ("機能性食品", 3, 2, 1),
    ],
)
def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
    """Check the token count produced by each tokenizer split mode.

    The ``ja_tokenizer`` argument is asserted to give the same count as an
    explicitly configured split mode "A".

    ja_tokenizer: callable under test (presumably a pytest fixture).
    text: input string.
    len_a, len_b, len_c: expected token counts for modes "A", "B" and "C".
    """
    nlp_a = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "A"}}})
    nlp_b = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "B"}}})
    nlp_c = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "C"}}})

    assert len(ja_tokenizer(text)) == len_a
    assert len(nlp_a(text)) == len_a
    assert len(nlp_b(text)) == len_b
    assert len(nlp_c(text)) == len_c
|
|
|
|
|
|
@pytest.mark.parametrize("text,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS)
def test_ja_tokenizer_sub_tokens(
    ja_tokenizer, text, sub_tokens_list_b, sub_tokens_list_c
):
    """Check the "sub_tokens" entry of ``user_data`` for each split mode.

    For the ``ja_tokenizer`` argument and for split mode "A" the entry must
    be absent or None; for modes "B" and "C" it must equal the expected
    lists.

    ja_tokenizer: callable under test (presumably a pytest fixture).
    text: input string.
    sub_tokens_list_b: expected "sub_tokens" value in split mode "B".
    sub_tokens_list_c: expected "sub_tokens" value in split mode "C".
    """
    nlp_a = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "A"}}})
    nlp_b = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "B"}}})
    nlp_c = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "C"}}})

    # .get() is used where the key may not be set at all.
    assert ja_tokenizer(text).user_data.get("sub_tokens") is None
    assert nlp_a(text).user_data.get("sub_tokens") is None
    assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b
    assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c
|
|
|
|
|
|
@pytest.mark.parametrize(
    "text,inflections,reading_forms",
    [
        (
            "取ってつけた",
            ("五段-ラ行,連用形-促音便", "", "下一段-カ行,連用形-一般", "助動詞-タ,終止形-一般"),
            ("トッ", "テ", "ツケ", "タ"),
        ),
    ],
)
def test_ja_tokenizer_inflections_reading_forms(
    ja_tokenizer, text, inflections, reading_forms
):
    """Check the "inflection" and "reading" values stored in ``token.morph``.

    ja_tokenizer: callable under test (presumably a pytest fixture).
    text: input string.
    inflections: expected per-token inflection strings ("" when none).
    reading_forms: expected per-token readings.
    """
    tokens = ja_tokenizer(text)
    # morph.get() returns a sequence of values; the inflection parts are
    # joined with "," so that an empty result compares equal to "".
    test_inflections = [",".join(tt.morph.get("inflection")) for tt in tokens]
    assert test_inflections == list(inflections)
    # Only the first "reading" value of each token is compared.
    test_readings = [tt.morph.get("reading")[0] for tt in tokens]
    assert test_readings == list(reading_forms)
|
|
|
|
|
|
def test_ja_tokenizer_emptyish_texts(ja_tokenizer):
    """Check token counts for empty and whitespace-only input.

    An empty string must give zero tokens; a single space and a mixed run
    of newlines, tabs and spaces must each give exactly one token.

    ja_tokenizer: callable under test (presumably a pytest fixture).
    """
    doc = ja_tokenizer("")
    assert len(doc) == 0
    doc = ja_tokenizer(" ")
    assert len(doc) == 1
    doc = ja_tokenizer("\n\n\n \t\t \n\n\n")
    assert len(doc) == 1
|