diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py
index dfd0bca5b..2f85406c0 100644
--- a/spacy/ja/__init__.py
+++ b/spacy/ja/__init__.py
@@ -66,11 +66,15 @@ def detailed_tokens(tokenizer, text):
     node = node.next # first node is beginning of sentence and empty, skip it
     words = []
     while node.posid != 0:
+        surface = node.surface
+        base = surface
         parts = node.feature.split(',')
         pos = ','.join(parts[0:4])
-        reading = parts[6]
-        base = parts[7]
-        surface = parts[8]
+
+        if len(parts) > 6:
+            # this information is only available for words in the tokenizer dictionary
+            reading = parts[6]
+            base = parts[7]
         words.append( ShortUnitWord(surface, base, pos) )
         node = node.next
     return words
diff --git a/spacy/tests/ja/test_tagger.py b/spacy/tests/ja/test_tagger.py
index 629cc795f..85f653836 100644
--- a/spacy/tests/ja/test_tagger.py
+++ b/spacy/tests/ja/test_tagger.py
@@ -22,7 +22,12 @@ TAGGER_TESTS = [
      ('動詞,一般,*,*', 'VERB'),
      ('助詞,接続助詞,*,*', 'SCONJ'),
      ('動詞,非自立可能,*,*', 'VERB'),
-     ('助詞,終助詞,*,*', 'PART')))
+     ('助詞,終助詞,*,*', 'PART'))),
+    ('プププランドに行きたい',
+     (('名詞,普通名詞,一般,*', 'NOUN'),
+      ('助詞,格助詞,*,*', 'ADP'),
+      ('動詞,非自立可能,*,*', 'VERB'),
+      ('助動詞,*,*,*', 'AUX')))
 ]
 
 @pytest.mark.parametrize('text,expected_tags', TAGGER_TESTS)
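
Note on the first hunk: MeCab only fills in the full feature string (reading, lemma, and so on) for words that are in the tokenizer dictionary. For an out-of-dictionary word such as プププランド, node.feature comes back with fewer comma-separated fields, so the old unconditional parts[6]/parts[7]/parts[8] lookups raise an IndexError. The patch takes the surface form from node.surface instead and uses it as the fallback lemma. Below is a minimal standalone sketch of the patched loop, not the exact module code: it assumes the mecab-python3 Tagger/parseToNode API with a UniDic-style dictionary installed, redeclares ShortUnitWord as a plain namedtuple (the real one is defined elsewhere in spacy/ja/__init__.py), and guards on the lemma field at index 7, since that is the highest field it reads.

# Standalone sketch, assuming mecab-python3 and a UniDic-style dictionary.
from collections import namedtuple

import MeCab

# Stand-in for the namedtuple defined in spacy/ja/__init__.py.
ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'lemma', 'pos'])


def detailed_tokens(tokenizer, text):
    node = tokenizer.parseToNode(text)
    node = node.next  # the first node is an empty beginning-of-sentence marker
    words = []
    while node.posid != 0:
        surface = node.surface
        base = surface  # fallback lemma for words missing from the dictionary
        parts = node.feature.split(',')
        pos = ','.join(parts[0:4])
        if len(parts) > 7:
            # the reading and lemma fields only exist for dictionary words;
            # unknown words return a shorter feature string, which is what
            # made the old parts[6:9] indexing crash
            base = parts[7]
        words.append(ShortUnitWord(surface, base, pos))
        node = node.next
    return words


tagger = MeCab.Tagger()
for word in detailed_tokens(tagger, 'プププランドに行きたい'):
    print(word.surface, word.lemma, word.pos)

Run against the sentence from the new test case, the out-of-dictionary word keeps its surface form as its lemma while the dictionary words (に, 行き, たい) pick theirs up from the feature string; the added TAGGER_TESTS entry pins down the expected fine-grained POS strings for that sentence.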