Handle out-of-vocab words

Words outside the tokenizer's dictionary vocabulary weren't being handled
properly. This adds a fix and a test for that. -POLM
Paul O'Leary McCann 2017-08-29 23:58:42 +09:00
parent 95050201ce
commit 8b3e1f7b5b
2 changed files with 13 additions and 4 deletions
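Background for the fix (not part of the commit): MeCab exposes each token as a node whose feature attribute is a comma-separated string, and for words outside the dictionary only the first few fields are filled in, so blindly indexing the reading/lemma fields raises an IndexError. A minimal sketch of the failure mode, assuming the mecab-python3 binding and an installed dictionary:

import MeCab

tagger = MeCab.Tagger()
node = tagger.parseToNode('プププランドに行きたい')
node = node.next  # skip the empty beginning-of-sentence node
while node.posid != 0:
    parts = node.feature.split(',')
    # in-dictionary words carry reading and lemma fields at parts[6] and parts[7];
    # out-of-vocab words report fewer fields, so those indexes may not exist
    print(node.surface, len(parts))
    node = node.next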


@@ -66,11 +66,15 @@ def detailed_tokens(tokenizer, text):
     node = node.next # first node is beginning of sentence and empty, skip it
     words = []
     while node.posid != 0:
+        surface = node.surface
+        base = surface
         parts = node.feature.split(',')
         pos = ','.join(parts[0:4])
-        reading = parts[6]
-        base = parts[7]
-        surface = parts[8]
+        if len(parts) > 6:
+            # this information is only available for words in the tokenizer dictionary
+            reading = parts[6]
+            base = parts[7]
         words.append( ShortUnitWord(surface, base, pos) )
         node = node.next
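Not part of the diff: a rough usage sketch of the patched helper, assuming ShortUnitWord is a (surface, lemma, pos) namedtuple defined earlier in the same module and that a MeCab tagger is what the function expects as its tokenizer argument:

import MeCab

tagger = MeCab.Tagger()
# 'プププランド' is a made-up place name, so it is not in the dictionary;
# with the len(parts) > 6 guard it falls back to base = surface
# instead of raising an IndexError.
for word in detailed_tokens(tagger, 'プププランドに行きたい'):
    print(word)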


@@ -22,7 +22,12 @@ TAGGER_TESTS = [
       ('動詞,一般,*,*', 'VERB'),
       ('助詞,接続助詞,*,*', 'SCONJ'),
       ('動詞,非自立可能,*,*', 'VERB'),
-      ('助詞,終助詞,*,*', 'PART')))
+      ('助詞,終助詞,*,*', 'PART'))),
+    ('プププランドに行きたい',
+     (('名詞,普通名詞,一般,*', 'NOUN'),
+      ('助詞,格助詞,*,*', 'ADP'),
+      ('動詞,非自立可能,*,*', 'VERB'),
+      ('助動詞,*,*,*', 'AUX')))
 ]
@pytest.mark.parametrize('text,expected_tags', TAGGER_TESTS)