Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24 17:06:29 +03:00)
Disable sentence segmentation in ja tokenizer (#5566)
This commit is contained in: parent 86112d2168, commit b7e6e1b9a7
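The first hunk drops the separate_sentences(doc) call from JapaneseTokenizer, so the tokenizer no longer performs sentence segmentation on the Doc it returns: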
@@ -209,7 +209,6 @@ class JapaneseTokenizer(DummyTokenizer):
             token.lemma_ = lemma
         doc.user_data["unidic_tags"] = unidic_tags
 
-        separate_sentences(doc)
         return doc
 
     def _get_config(self):
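In the test suite, the corresponding sentence-segmentation test is marked as skipped until segmentation is reworked: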
@@ -58,6 +58,7 @@ def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
     assert pos == expected_pos
 
 
+@pytest.mark.skip(reason="sentence segmentation in tokenizer is buggy")
 @pytest.mark.parametrize("text,expected_sents", SENTENCE_TESTS)
 def test_ja_tokenizer_pos(ja_tokenizer, text, expected_sents):
     sents = [str(sent) for sent in ja_tokenizer(text).sents]
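With the call removed, a Doc from the Japanese tokenizer carries no sentence boundaries, so iterating doc.sents fails until some pipeline component sets them. A minimal sketch of one workaround, assuming the spaCy v2.x pipeline API (nlp.create_pipe; in v3.x the equivalent is nlp.add_pipe("sentencizer")) and that SudachiPy is installed so the ja tokenizer can load:

import spacy

# Blank Japanese pipeline; after this commit tokenization alone no
# longer sets sentence boundaries on the Doc.
nlp = spacy.blank("ja")

# Workaround sketch (an assumption, not part of this diff): the
# rule-based Sentencizer sets boundaries on sentence-final
# punctuation such as 。
nlp.add_pipe(nlp.create_pipe("sentencizer"))

doc = nlp("これは文です。これも文です。")
print([sent.text for sent in doc.sents])
# should print something like: ['これは文です。', 'これも文です。']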