Disable sentence segmentation in ja tokenizer (#5566)

adrianeboyd 2020-06-09 12:00:59 +02:00 committed by GitHub
parent 86112d2168
commit b7e6e1b9a7
2 changed files with 1 addition and 1 deletion


@@ -209,7 +209,6 @@ class JapaneseTokenizer(DummyTokenizer):
             token.lemma_ = lemma
         doc.user_data["unidic_tags"] = unidic_tags
-        separate_sentences(doc)
         return doc
 
     def _get_config(self):
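
With the separate_sentences(doc) call removed, the Japanese tokenizer no longer marks sentence boundaries itself. A minimal usage sketch of one way to get sentence boundaries back after this change, assuming the spaCy v2.x pipeline API and the rule-based sentencizer (this snippet is illustrative and not part of the commit):

import spacy

# Assumes spaCy v2.x with SudachiPy and a Sudachi dictionary installed,
# which the Japanese tokenizer requires.
nlp = spacy.blank("ja")

# The tokenizer alone no longer sets doc.sents, so add a downstream
# sentence segmenter such as the rule-based sentencizer.
nlp.add_pipe(nlp.create_pipe("sentencizer"))

doc = nlp("これは文です。これも文です。")
# With the sentencizer's default punctuation rules this should yield
# ['これは文です。', 'これも文です。']
print([sent.text for sent in doc.sents])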


@@ -58,6 +58,7 @@ def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
     assert pos == expected_pos
 
 
+@pytest.mark.skip(reason="sentence segmentation in tokenizer is buggy")
 @pytest.mark.parametrize("text,expected_sents", SENTENCE_TESTS)
 def test_ja_tokenizer_pos(ja_tokenizer, text, expected_sents):
     sents = [str(sent) for sent in ja_tokenizer(text).sents]
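
The added decorator tells pytest to collect the parametrized sentence tests but not run them. A self-contained sketch of the same skip-plus-parametrize pattern; the test name and data here are illustrative and not taken from the spaCy test suite:

import pytest

# Illustrative data; the real SENTENCE_TESTS live in spaCy's ja test module.
SENTENCE_TESTS = [
    ("これは文です。これも文です。", ["これは文です。", "これも文です。"]),
]

@pytest.mark.skip(reason="sentence segmentation in tokenizer is buggy")
@pytest.mark.parametrize("text,expected_sents", SENTENCE_TESTS)
def test_sents_example(text, expected_sents):
    # Never executed while the skip marker is present; pytest reports
    # one skipped test per parametrized case instead.
    assert len(expected_sents) == 2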