From 0a7fcebdf7d05ec961bd940c6217988e34b1fced Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Fri, 30 Jan 2015 12:33:38 +1100
Subject: [PATCH] * Fix Issue #12: Incorrect token.idx calculations for some
 punctuation, in the presence of token cache

---
 spacy/tokenizer.pyx | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index f540eeb88..0f96c058e 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -95,7 +95,6 @@ cdef class Tokenizer:
         return tokens
 
     cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
-        #cached = <Cached*>self._specials.get(key)
         cached = <_Cached*>self._cache.get(key)
         if cached == NULL:
             return False
@@ -176,7 +175,12 @@ cdef class Tokenizer:
         if string.n != 0:
             cache_hit = self._try_cache(idx, string.key, tokens)
             if cache_hit:
-                idx = tokens.data[tokens.length - 1].idx + 1
+                # Start from the idx of the last token now in `tokens`
+                idx = tokens.data[tokens.length - 1].idx
+                # Advance past the last token by adding its lexeme length
+                idx += tokens.data[tokens.length - 1].lex.length
+                # Add 1 for space
+                idx += 1
             else:
                 split = self._find_infix(string.chars, string.n)
                 if split == 0 or split == -1: