From 0a7fcebdf7d05ec961bd940c6217988e34b1fced Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 30 Jan 2015 12:33:38 +1100
Subject: [PATCH] * Fix Issue #12: Incorrect token.idx calculations for some punctuation, in the presence of token cache

---
 spacy/tokenizer.pyx | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index f540eeb88..0f96c058e 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -95,7 +95,6 @@ cdef class Tokenizer:
         return tokens
 
     cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
-        #cached = self._specials.get(key)
         cached = <_Cached*>self._cache.get(key)
         if cached == NULL:
             return False
@@ -176,7 +175,12 @@ cdef class Tokenizer:
         if string.n != 0:
             cache_hit = self._try_cache(idx, string.key, tokens)
             if cache_hit:
-                idx = tokens.data[tokens.length - 1].idx + 1
+                # Get last idx
+                idx = tokens.data[tokens.length - 1].idx
+                # Increment by last length
+                idx += tokens.data[tokens.length - 1].lex.length
+                # Add 1 for space
+                idx += 1
             else:
                 split = self._find_infix(string.chars, string.n)
                 if split == 0 or split == -1: