From 8b3e1f7b5b2d29ca3b70e5681daa095574b694be Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Tue, 29 Aug 2017 23:58:42 +0900
Subject: [PATCH] Handle out-of-vocab words

Wasn't handling words out of the tokenizer dictionary vocabulary
properly. This adds a fix and test for that.

-POLM
---
 spacy/ja/__init__.py          | 10 +++++++---
 spacy/tests/ja/test_tagger.py |  7 ++++++-
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py
index dfd0bca5b..2f85406c0 100644
--- a/spacy/ja/__init__.py
+++ b/spacy/ja/__init__.py
@@ -66,11 +66,15 @@ def detailed_tokens(tokenizer, text):
     node = node.next # first node is beginning of sentence and empty, skip it
     words = []
     while node.posid != 0:
+        surface = node.surface
+        base = surface
         parts = node.feature.split(',')
         pos = ','.join(parts[0:4])
-        reading = parts[6]
-        base = parts[7]
-        surface = parts[8]
+
+        if len(parts) > 6:
+            # this information is only available for words in the tokenizer dictionary
+            reading = parts[6]
+            base = parts[7]
         words.append( ShortUnitWord(surface, base, pos) )
         node = node.next
     return words

diff --git a/spacy/tests/ja/test_tagger.py b/spacy/tests/ja/test_tagger.py
index 629cc795f..85f653836 100644
--- a/spacy/tests/ja/test_tagger.py
+++ b/spacy/tests/ja/test_tagger.py
@@ -22,7 +22,12 @@ TAGGER_TESTS = [
         ('動詞,一般,*,*', 'VERB'),
         ('助詞,接続助詞,*,*', 'SCONJ'),
         ('動詞,非自立可能,*,*', 'VERB'),
-        ('助詞,終助詞,*,*', 'PART')))
+        ('助詞,終助詞,*,*', 'PART'))),
+    ('プププランドに行きたい',
+        (('名詞,普通名詞,一般,*', 'NOUN'),
+        ('助詞,格助詞,*,*', 'ADP'),
+        ('動詞,非自立可能,*,*', 'VERB'),
+        ('助動詞,*,*,*', 'AUX')))
 ]

 @pytest.mark.parametrize('text,expected_tags', TAGGER_TESTS)
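
For context: the len(parts) guard above is needed because MeCab only emits
the full feature string for tokens found in its dictionary; an unknown token
gets a short feature list with no reading or lemma fields, so the old code's
parts[6] lookup would raise IndexError. Below is a minimal sketch (not part
of the patch) showing the difference, assuming mecab-python3 with a
UniDic-style dictionary installed as the default; プププランド is the
made-up place name from the test above:

    import MeCab

    tagger = MeCab.Tagger()
    tagger.parse('')  # workaround for a node.surface quirk in older mecab-python3 versions
    node = tagger.parseToNode('プププランドに行きたい')
    node = node.next  # first node is beginning of sentence and empty, skip it
    while node.posid != 0:
        parts = node.feature.split(',')
        # In-dictionary tokens carry reading/lemma at indices 6 and 7; for an
        # out-of-vocab token like プププランド, parts is shorter, which is why
        # the patch falls back to using the surface form as the base form.
        print(node.surface, len(parts))
        node = node.next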