* Fix Issue #351: Indices off when leading whitespace

2025-11-11 05:19:52 +03:00 · 2016-05-04 15:53:36 +02:00 · 2016-05-04 15:53:36 +02:00 · 519366f677
commit 519366f677
parent b4bfc6ae55
1 changed files with 2 additions and 2 deletions
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -98,7 +98,7 @@ cdef class Tokenizer:
        cdef int i = 0
        cdef int start = 0
        cdef bint cache_hit
-        cdef bint in_ws = False
+        cdef bint in_ws = string[0].isspace()
        cdef unicode span
        # The task here is much like string.split, but not quite
        # We find spans of whitespace and non-space characters, and ignore
@@ -116,12 +116,12 @@ cdef class Tokenizer:
                    cache_hit = self._try_cache(key, tokens)
                    if not cache_hit:
                        self._tokenize(tokens, span, key)
-                in_ws = not in_ws
                if uc == ' ':
                    tokens.c[tokens.length - 1].spacy = True
                    start = i + 1
                else:
                    start = i
+                in_ws = not in_ws
            i += 1
        i += 1
        if start < i: