From 8b3e1f7b5b2d29ca3b70e5681daa095574b694be Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Tue, 29 Aug 2017 23:58:42 +0900
Subject: [PATCH] Handle out-of-vocab words

Wasn't handling words out of the tokenizer dictionary vocabulary
properly. This adds a fix and test for that.

-POLM
---
 spacy/ja/__init__.py          | 10 +++++++---
 spacy/tests/ja/test_tagger.py |  7 ++++++-
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py
index dfd0bca5b..2f85406c0 100644
--- a/spacy/ja/__init__.py
+++ b/spacy/ja/__init__.py
@@ -66,11 +66,15 @@ def detailed_tokens(tokenizer, text):
     node = node.next # first node is beginning of sentence and empty, skip it
     words = []
     while node.posid != 0:
+        surface = node.surface
+        base = surface
         parts = node.feature.split(',')
         pos = ','.join(parts[0:4])
-        reading = parts[6]
-        base = parts[7]
-        surface = parts[8]
+
+        if len(parts) > 6:
+            # this information is only available for words in the tokenizer dictionary
+            reading = parts[6]
+            base = parts[7]
         words.append( ShortUnitWord(surface, base, pos) )
         node = node.next
     return words

diff --git a/spacy/tests/ja/test_tagger.py b/spacy/tests/ja/test_tagger.py
index 629cc795f..85f653836 100644
--- a/spacy/tests/ja/test_tagger.py
+++ b/spacy/tests/ja/test_tagger.py
@@ -22,7 +22,12 @@ TAGGER_TESTS = [
         ('動詞,一般,*,*', 'VERB'),
         ('助詞,接続助詞,*,*', 'SCONJ'),
         ('動詞,非自立可能,*,*', 'VERB'),
-        ('助詞,終助詞,*,*', 'PART')))
+        ('助詞,終助詞,*,*', 'PART'))),
+    ('プププランドに行きたい',
+        (('名詞,普通名詞,一般,*', 'NOUN'),
+        ('助詞,格助詞,*,*', 'ADP'),
+        ('動詞,非自立可能,*,*', 'VERB'),
+        ('助動詞,*,*,*', 'AUX')))
 ]

 @pytest.mark.parametrize('text,expected_tags', TAGGER_TESTS)
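
For context: the len(parts) guard above is needed because MeCab only emits
the full feature string for tokens found in its dictionary; an unknown token
gets a short feature list with no reading or lemma fields, so the old code's
parts[6] lookup would raise IndexError. Below is a minimal sketch (not part
of the patch) showing the difference, assuming mecab-python3 with a
UniDic-style dictionary installed as the default; プププランド is the
made-up place name from the test above:

    import MeCab

    tagger = MeCab.Tagger()
    tagger.parse('')  # workaround for a node.surface quirk in older mecab-python3 versions
    node = tagger.parseToNode('プププランドに行きたい')
    node = node.next  # first node is beginning of sentence and empty, skip it
    while node.posid != 0:
        parts = node.feature.split(',')
        # In-dictionary tokens carry reading/lemma at indices 6 and 7; for an
        # out-of-vocab token like プププランド, parts is shorter, which is why
        # the patch falls back to using the surface form as the base form.
        print(node.surface, len(parts))
        node = node.next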