mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-10 16:22:29 +03:00
* Fix Issue #351: Indices are off when the string has leading whitespace
This commit is contained in:
parent
b4bfc6ae55
commit
519366f677
|
@ -98,7 +98,7 @@ cdef class Tokenizer:
|
||||||
cdef int i = 0
|
cdef int i = 0
|
||||||
cdef int start = 0
|
cdef int start = 0
|
||||||
cdef bint cache_hit
|
cdef bint cache_hit
|
||||||
cdef bint in_ws = False
|
cdef bint in_ws = string[0].isspace()
|
||||||
cdef unicode span
|
cdef unicode span
|
||||||
# The task here is much like string.split, but not quite
|
# The task here is much like string.split, but not quite
|
||||||
# We find spans of whitespace and non-space characters, and ignore
|
# We find spans of whitespace and non-space characters, and ignore
|
||||||
|
@ -116,12 +116,12 @@ cdef class Tokenizer:
|
||||||
cache_hit = self._try_cache(key, tokens)
|
cache_hit = self._try_cache(key, tokens)
|
||||||
if not cache_hit:
|
if not cache_hit:
|
||||||
self._tokenize(tokens, span, key)
|
self._tokenize(tokens, span, key)
|
||||||
in_ws = not in_ws
|
|
||||||
if uc == ' ':
|
if uc == ' ':
|
||||||
tokens.c[tokens.length - 1].spacy = True
|
tokens.c[tokens.length - 1].spacy = True
|
||||||
start = i + 1
|
start = i + 1
|
||||||
else:
|
else:
|
||||||
start = i
|
start = i
|
||||||
|
in_ws = not in_ws
|
||||||
i += 1
|
i += 1
|
||||||
i += 1
|
i += 1
|
||||||
if start < i:
|
if start < i:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user