* Fix Issue #351: Indices are off when the string has leading whitespace

This commit is contained in:
Matthew Honnibal 2016-05-04 15:53:36 +02:00
parent b4bfc6ae55
commit 519366f677

View File

@ -98,7 +98,7 @@ cdef class Tokenizer:
cdef int i = 0 cdef int i = 0
cdef int start = 0 cdef int start = 0
cdef bint cache_hit cdef bint cache_hit
cdef bint in_ws = False cdef bint in_ws = string[0].isspace()
cdef unicode span cdef unicode span
# The task here is much like string.split, but not quite # The task here is much like string.split, but not quite
# We find spans of whitespace and non-space characters, and ignore # We find spans of whitespace and non-space characters, and ignore
@ -116,12 +116,12 @@ cdef class Tokenizer:
cache_hit = self._try_cache(key, tokens) cache_hit = self._try_cache(key, tokens)
if not cache_hit: if not cache_hit:
self._tokenize(tokens, span, key) self._tokenize(tokens, span, key)
in_ws = not in_ws
if uc == ' ': if uc == ' ':
tokens.c[tokens.length - 1].spacy = True tokens.c[tokens.length - 1].spacy = True
start = i + 1 start = i + 1
else: else:
start = i start = i
in_ws = not in_ws
i += 1 i += 1
i += 1 i += 1
if start < i: if start < i: