From a55f5a744f64e2509c324cfca6f8e1692fe853a5 Mon Sep 17 00:00:00 2001
From: Bae Yong-Ju
Date: Fri, 6 Sep 2019 17:29:40 +0900
Subject: [PATCH] Fix ValueError exception on empty Korean text. (#4245)

---
 spacy/lang/ko/__init__.py             | 3 ++-
 spacy/tests/lang/ko/test_tokenizer.py | 5 +++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py
index 6dc6456e5..c8cd9c3fd 100644
--- a/spacy/lang/ko/__init__.py
+++ b/spacy/lang/ko/__init__.py
@@ -58,7 +58,8 @@ def check_spaces(text, tokens):
             yield prev_end != idx
         prev_end = idx + len(token)
         start = prev_end
-    yield False
+    if start > 0:
+        yield False
 
 
 class KoreanTokenizer(DummyTokenizer):
diff --git a/spacy/tests/lang/ko/test_tokenizer.py b/spacy/tests/lang/ko/test_tokenizer.py
index 531a41d0b..b8fe7959c 100644
--- a/spacy/tests/lang/ko/test_tokenizer.py
+++ b/spacy/tests/lang/ko/test_tokenizer.py
@@ -45,3 +45,8 @@ def test_ko_tokenizer_full_tags(ko_tokenizer, text, expected_tags):
 def test_ko_tokenizer_pos(ko_tokenizer, text, expected_pos):
     pos = [token.pos_ for token in ko_tokenizer(text)]
     assert pos == expected_pos.split()
+
+
+def test_ko_empty_doc(ko_tokenizer):
+    tokens = ko_tokenizer("")
+    assert len(tokens) == 0