diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index 6dc6456e5..c8cd9c3fd 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -58,7 +58,8 @@ def check_spaces(text, tokens): yield prev_end != idx prev_end = idx + len(token) start = prev_end - yield False + if start > 0: + yield False class KoreanTokenizer(DummyTokenizer): diff --git a/spacy/tests/lang/ko/test_tokenizer.py b/spacy/tests/lang/ko/test_tokenizer.py index 531a41d0b..b8fe7959c 100644 --- a/spacy/tests/lang/ko/test_tokenizer.py +++ b/spacy/tests/lang/ko/test_tokenizer.py @@ -45,3 +45,8 @@ def test_ko_tokenizer_full_tags(ko_tokenizer, text, expected_tags): def test_ko_tokenizer_pos(ko_tokenizer, text, expected_pos): pos = [token.pos_ for token in ko_tokenizer(text)] assert pos == expected_pos.split() + + +def test_ko_empty_doc(ko_tokenizer): + tokens = ko_tokenizer("") + assert len(tokens) == 0 diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 5c1e56157..66ad816f5 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -26,6 +26,14 @@ import PosDeps101 from 'usage/101/\_pos-deps.md' + + +For a list of the fine-grained and coarse-grained part-of-speech tags assigned +by spaCy's models across different languages, see the +[POS tag scheme documentation](/api/annotation#pos-tagging). + + + ### Rule-based morphology {#rule-based-morphology} Inflectional morphology is the process by which a root form of a word is @@ -61,14 +69,7 @@ of the two. The system works as follows: morphological information, without consulting the context of the token. The lemmatizer also accepts list-based exception files, acquired from [WordNet](https://wordnet.princeton.edu/). - - - -For a list of the fine-grained and coarse-grained part-of-speech tags assigned -by spaCy's models across different languages, see the -[POS tag scheme documentation](/api/annotation#pos-tagging). - - + ## Dependency Parsing {#dependency-parse model="parser"} @@ -289,7 +290,7 @@ for token in doc: For a list of the syntactic dependency labels assigned by spaCy's models across different languages, see the -[dependency label scheme documentation](/api/annotation#pos-tagging). +[dependency label scheme documentation](/api/annotation#dependency-parsing).