2017-10-14 14:11:39 +03:00
|
|
|
import pytest
|
|
|
|
|
2020-06-08 17:29:05 +03:00
|
|
|
from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS
|
2020-06-22 15:32:25 +03:00
|
|
|
from spacy.lang.ja import Japanese, DetailedToken
|
2017-10-14 14:11:39 +03:00
|
|
|
|
2018-11-27 03:09:36 +03:00
|
|
|
# fmt: off
|
2017-10-14 14:11:39 +03:00
|
|
|
# (text, expected token texts) pairs consumed by test_ja_tokenizer.
# Concatenating the expected tokens of each case reproduces its input text.
TOKENIZER_TESTS = [
    ("日本語だよ", ['日本', '語', 'だ', 'よ']),
    ("東京タワーの近くに住んでいます。", ['東京', 'タワー', 'の', '近く', 'に', '住ん', 'で', 'い', 'ます', '。']),
    ("吾輩は猫である。", ['吾輩', 'は', '猫', 'で', 'ある', '。']),
    ("月に代わって、お仕置きよ!", ['月', 'に', '代わっ', 'て', '、', 'お', '仕置き', 'よ', '!']),
    ("すもももももももものうち", ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち']),
]
|
|
|
|
|
2018-05-03 19:38:26 +03:00
|
|
|
# (text, expected fine-grained tags) pairs consumed by test_ja_tokenizer_tags,
# which compares them against `token.tag_`. Tag fields are joined with "-".
TAG_TESTS = [
    ("日本語だよ", ['名詞-固有名詞-地名-国', '名詞-普通名詞-一般', '助動詞', '助詞-終助詞']),
    ("東京タワーの近くに住んでいます。", ['名詞-固有名詞-地名-一般', '名詞-普通名詞-一般', '助詞-格助詞', '名詞-普通名詞-副詞可能', '助詞-格助詞', '動詞-一般', '助詞-接続助詞', '動詞-非自立可能', '助動詞', '補助記号-句点']),
    ("吾輩は猫である。", ['代名詞', '助詞-係助詞', '名詞-普通名詞-一般', '助動詞', '動詞-非自立可能', '補助記号-句点']),
    ("月に代わって、お仕置きよ!", ['名詞-普通名詞-助数詞可能', '助詞-格助詞', '動詞-一般', '助詞-接続助詞', '補助記号-読点', '接頭辞', '名詞-普通名詞-一般', '助詞-終助詞', '補助記号-句点']),
    ("すもももももももものうち", ['名詞-普通名詞-一般', '助詞-係助詞', '名詞-普通名詞-一般', '助詞-係助詞', '名詞-普通名詞-一般', '助詞-格助詞', '名詞-普通名詞-副詞可能']),
]
|
|
|
|
|
|
|
|
# (text, expected coarse POS labels) pairs consumed by test_ja_tokenizer_pos,
# which compares them against `token.pos_`.
POS_TESTS = [
    ('日本語だよ', ['PROPN', 'NOUN', 'AUX', 'PART']),
    ('東京タワーの近くに住んでいます。', ['PROPN', 'NOUN', 'ADP', 'NOUN', 'ADP', 'VERB', 'SCONJ', 'AUX', 'AUX', 'PUNCT']),
    ('吾輩は猫である。', ['PRON', 'ADP', 'NOUN', 'AUX', 'AUX', 'PUNCT']),
    ('月に代わって、お仕置きよ!', ['NOUN', 'ADP', 'VERB', 'SCONJ', 'PUNCT', 'NOUN', 'NOUN', 'PART', 'PUNCT']),
    ('すもももももももものうち', ['NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN']),
]
|
Add Japanese Model (#5544)
* Add more rules to deal with Japanese UD mappings
Japanese UD rules sometimes give different UD tags to tokens with the
same underlying POS tag. The UD spec indicates these cases should be
disambiguated using the output of a tool called "comainu", but rules are
enough to get the right result.
These rules are taken from Ginza at time of writing, see #3756.
* Add new tags from GSD
This is a few rare tags that aren't in Unidic but are in the GSD data.
* Add basic Japanese sentencization
This code is taken from Ginza again.
* Add sentenceizer quote handling
Could probably add more paired characters but this will do for now. Also
includes some tests.
* Replace fugashi with SudachiPy
* Modify tag format to match GSD annotations
Some of the tests still need to be updated, but I want to get this up
for testing training.
* Deal with case with closing punct without opening
* refactor resolve_pos()
* change tag field separator from "," to "-"
* add TAG_ORTH_MAP
* add TAG_BIGRAM_MAP
* revise rules for 連体詞
* revise rules for 連体詞
* improve POS about 2%
* add syntax_iterator.py (not mature yet)
* improve syntax_iterators.py
* improve syntax_iterators.py
* add phrases including nouns and drop NPs consist of STOP_WORDS
* First take at noun chunks
This works in many situations but still has issues in others.
If the start of a subtree has no noun, then nested phrases can be
generated.
また行きたい、そんな気持ちにさせてくれるお店です。
[そんな気持ち, また行きたい、そんな気持ちにさせてくれるお店]
For some reason て gets included sometimes. Not sure why.
ゲンに連れ添って円盤生物を調査するパートナーとなる。
[て円盤生物, ...]
Some phrases that look like they should be split are grouped together;
not entirely sure that's wrong. This whole thing becomes one chunk:
道の駅遠山郷北側からかぐら大橋南詰現道交点までの1.060kmのみ開通済み
* Use new generic get_words_and_spaces
The new get_words_and_spaces function is simpler than what was used in
Japanese, so it's good to be able to switch to it. However, there was an
issue. The new function works just on text, so POS info could get out of
sync. Fixing this required a small change to the way dtokens (tokens
with POS and lemma info) were generated.
Specifically, multiple extraneous spaces now become a single token, so
when generating dtokens multiple space tokens should be created in a
row.
* Fix noun_chunks, should be working now
* Fix some tests, add naughty strings tests
Some of the existing tests changed because the tokenization mode of
Sudachi changed to the more fine-grained A mode.
Sudachi also has issues with some strings, so this adds a test against
the naughty strings.
* Remove empty Sudachi tokens
Not doing this creates zero-length tokens and causes errors in the
internal spaCy processing.
* Add yield_bunsetu back in as a separate piece of code
Co-authored-by: Hiroshi Matsuda <40782025+hiroshi-matsuda-rit@users.noreply.github.com>
Co-authored-by: hiroshi <hiroshi_matsuda@megagon.ai>
2020-06-04 20:15:43 +03:00
|
|
|
|
|
|
|
# (text, expected sentence strings) pairs consumed by test_ja_tokenizer_sents.
# The second case keeps a sentence-final "。" inside 「」 quotes in one sentence.
SENTENCE_TESTS = [
    ("あれ。これ。", ["あれ。", "これ。"]),
    ("「伝染るんです。」という漫画があります。", ["「伝染るんです。」という漫画があります。"]),
]
|
2020-07-25 16:01:15 +03:00
|
|
|
|
|
|
|
# Expected DetailedToken sequences used to build SUB_TOKEN_TESTS below.
# tokens1: "委員会" split into two tokens.
tokens1 = [
    DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", norm="委員", reading="イイン", sub_tokens=None),
    DetailedToken(surface="会", tag="名詞-普通名詞-一般", inf="", lemma="会", norm="会", reading="カイ", sub_tokens=None),
]
# tokens2: "選挙管理委員会" split into four tokens.
tokens2 = [
    DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", norm="選挙", reading="センキョ", sub_tokens=None),
    DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", norm="管理", reading="カンリ", sub_tokens=None),
    DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", norm="委員", reading="イイン", sub_tokens=None),
    DetailedToken(surface="会", tag="名詞-普通名詞-一般", inf="", lemma="会", norm="会", reading="カイ", sub_tokens=None),
]
# tokens3: "選挙管理委員会" split into three tokens ("委員会" kept whole).
tokens3 = [
    DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", norm="選挙", reading="センキョ", sub_tokens=None),
    DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", norm="管理", reading="カンリ", sub_tokens=None),
    DetailedToken(surface="委員会", tag="名詞-普通名詞-一般", inf="", lemma="委員会", norm="委員会", reading="イインカイ", sub_tokens=None),
]
# (text, expected "sub_tokens" for split mode B, expected "sub_tokens" for split mode C)
# triples consumed by test_ja_tokenizer_sub_tokens; each list has one entry per token.
SUB_TOKEN_TESTS = [
    ("選挙管理委員会", [None, None, [tokens1]], [[tokens2, tokens3]]),
]
|
2018-11-27 03:09:36 +03:00
|
|
|
# fmt: on
|
2018-05-03 19:38:26 +03:00
|
|
|
|
2017-10-14 14:11:39 +03:00
|
|
|
|
2021-12-04 22:34:48 +03:00
|
|
|
@pytest.mark.issue(2901)
def test_issue2901():
    """Test that `nlp` doesn't fail."""
    try:
        nlp = Japanese()
    except ImportError:
        # Constructing the pipeline could not import something it needs;
        # skip instead of reporting a failure.
        pytest.skip()

    doc = nlp("pythonが大好きです")
    assert doc
|
|
|
|
|
|
|
|
|
2018-11-27 03:09:36 +03:00
|
|
|
@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
def test_ja_tokenizer(ja_tokenizer, text, expected_tokens):
    """Check that tokenizing `text` yields exactly the expected token texts."""
    tokens = [token.text for token in ja_tokenizer(text)]
    assert tokens == expected_tokens
|
2018-05-03 19:38:26 +03:00
|
|
|
|
2018-07-25 00:38:44 +03:00
|
|
|
|
2018-11-27 03:09:36 +03:00
|
|
|
@pytest.mark.parametrize("text,expected_tags", TAG_TESTS)
def test_ja_tokenizer_tags(ja_tokenizer, text, expected_tags):
    """Check the `tag_` value of every token produced for `text`."""
    tags = [token.tag_ for token in ja_tokenizer(text)]
    assert tags == expected_tags
|
|
|
|
|
2018-07-25 00:38:44 +03:00
|
|
|
|
2018-11-27 03:09:36 +03:00
|
|
|
@pytest.mark.parametrize("text,expected_pos", POS_TESTS)
def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
    """Check the `pos_` value of every token produced for `text`."""
    pos = [token.pos_ for token in ja_tokenizer(text)]
    assert pos == expected_pos
|
2019-09-13 17:28:12 +03:00
|
|
|
|
2020-06-08 17:29:05 +03:00
|
|
|
|
2020-06-09 13:00:59 +03:00
|
|
|
@pytest.mark.skip(reason="sentence segmentation in tokenizer is buggy")
@pytest.mark.parametrize("text,expected_sents", SENTENCE_TESTS)
def test_ja_tokenizer_sents(ja_tokenizer, text, expected_sents):
    """Check the sentence strings exposed by `.sents` on the tokenized text.

    Currently skipped unconditionally; see the skip reason on the decorator.
    """
    sents = [str(sent) for sent in ja_tokenizer(text).sents]
    assert sents == expected_sents
|
|
|
|
|
2019-09-14 13:58:06 +03:00
|
|
|
|
2020-06-08 17:29:05 +03:00
|
|
|
def test_ja_tokenizer_extra_spaces(ja_tokenizer):
    """Check that surplus spaces between words come out as their own token."""
    # note: three spaces after "I"
    # The first space is expected to attach to "I" as trailing whitespace,
    # leaving the other two as a single separate token at index 1
    # -- TODO confirm against the tokenizer implementation.
    tokens = ja_tokenizer("I   like cheese.")
    assert tokens[1].orth_ == "  "
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("text", NAUGHTY_STRINGS)
def test_ja_tokenizer_naughty_strings(ja_tokenizer, text):
    """Check that tokenizing an awkward string preserves the text exactly."""
    tokens = ja_tokenizer(text)
    # text_with_ws must reproduce the input, whitespace included.
    assert tokens.text_with_ws == text
|
|
|
|
|
2020-06-08 17:29:05 +03:00
|
|
|
|
2020-06-21 23:38:04 +03:00
|
|
|
@pytest.mark.parametrize(
    "text,len_a,len_b,len_c",
    [
        ("選挙管理委員会", 4, 3, 1),
        ("客室乗務員", 3, 2, 1),
        ("労働者協同組合", 4, 3, 1),
        ("機能性食品", 3, 2, 1),
    ],
)
def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
    """Check the number of tokens produced for `text` under each split mode.

    `len_a`, `len_b` and `len_c` are the expected token counts for split
    modes "A", "B" and "C" respectively.
    """
    nlp_a = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "A"}}})
    nlp_b = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "B"}}})
    nlp_c = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "C"}}})

    # The fixture tokenizer is expected to give the same count as mode "A".
    assert len(ja_tokenizer(text)) == len_a
    assert len(nlp_a(text)) == len_a
    assert len(nlp_b(text)) == len_b
    assert len(nlp_c(text)) == len_c
|
2020-06-08 22:09:23 +03:00
|
|
|
|
|
|
|
|
2021-10-01 20:19:26 +03:00
|
|
|
@pytest.mark.parametrize("text,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS)
def test_ja_tokenizer_sub_tokens(
    ja_tokenizer, text, sub_tokens_list_b, sub_tokens_list_c
):
    """Check the "sub_tokens" entry of `user_data` under each split mode.

    The fixture tokenizer and split mode "A" must not provide sub-token
    data; modes "B" and "C" must match the expected lists.
    """
    nlp_a = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "A"}}})
    nlp_b = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "B"}}})
    nlp_c = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "C"}}})

    assert ja_tokenizer(text).user_data.get("sub_tokens") is None
    assert nlp_a(text).user_data.get("sub_tokens") is None
    assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b
    assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c
|
2020-06-22 15:32:25 +03:00
|
|
|
|
|
|
|
|
2020-07-25 16:01:15 +03:00
|
|
|
@pytest.mark.parametrize(
    "text,inflections,reading_forms",
    [
        (
            "取ってつけた",
            (["五段-ラ行;連用形-促音便"], [], ["下一段-カ行;連用形-一般"], ["助動詞-タ;終止形-一般"]),
            (["トッ"], ["テ"], ["ツケ"], ["タ"]),
        ),
        ("2=3", ([], [], []), (["ニ"], ["_"], ["サン"])),
    ],
)
def test_ja_tokenizer_inflections_reading_forms(
    ja_tokenizer, text, inflections, reading_forms
):
    """Check the "Inflection" and "Reading" morph values of every token.

    `inflections` and `reading_forms` hold one list per token, compared
    against `token.morph.get(...)`; an empty list means no value is expected.
    """
    tokens = ja_tokenizer(text)
    test_inflections = [tt.morph.get("Inflection") for tt in tokens]
    assert test_inflections == list(inflections)
    test_readings = [tt.morph.get("Reading") for tt in tokens]
    assert test_readings == list(reading_forms)
|
2020-06-22 15:32:25 +03:00
|
|
|
|
|
|
|
|
2020-06-08 22:09:23 +03:00
|
|
|
def test_ja_tokenizer_emptyish_texts(ja_tokenizer):
    """Check token counts for empty and whitespace-only input."""
    # Empty input produces no tokens.
    doc = ja_tokenizer("")
    assert len(doc) == 0
    # Whitespace-only input produces a single token.
    doc = ja_tokenizer(" ")
    assert len(doc) == 1
    doc = ja_tokenizer("\n\n\n \t\t \n\n\n")
    assert len(doc) == 1
|