mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-03 20:00:21 +03:00
Fix wild pointer problem
This commit is contained in:
parent
54bdc11353
commit
5b29568fb7
|
@ -27,4 +27,4 @@ cdef class StringStore:
|
||||||
|
|
||||||
cdef const Utf8Str* intern_unicode(self, str py_string)
|
cdef const Utf8Str* intern_unicode(self, str py_string)
|
||||||
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
|
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
|
||||||
cdef const unsigned char* utf8_ptr(self, attr_t hash_val)
|
cdef (const unsigned char*, int) utf8_ptr(self, const attr_t hash_val)
|
||||||
|
|
|
@ -317,23 +317,17 @@ cdef class StringStore:
|
||||||
return value
|
return value
|
||||||
|
|
||||||
@cython.boundscheck(False) # Deactivate bounds checking
|
@cython.boundscheck(False) # Deactivate bounds checking
|
||||||
cdef const unsigned char* utf8_ptr(self, const attr_t hash_val):
|
cdef (const unsigned char*, int) utf8_ptr(self, const attr_t hash_val):
|
||||||
if hash_val == 0:
|
# Returns a pointer to the UTF-8 string together with its length in bytes.
|
||||||
return b""
|
# This method presumes the calling code has already checked that *hash_val*
|
||||||
elif hash_val < len(SYMBOLS_BY_INT):
|
# is not 0 and does not refer to a member of *SYMBOLS_BY_INT*.
|
||||||
return SYMBOLS_BY_INT[hash_val].encode("utf-8")
|
|
||||||
cdef Utf8Str* string = <Utf8Str*>self._map.get(hash_val)
|
cdef Utf8Str* string = <Utf8Str*>self._map.get(hash_val)
|
||||||
if string.s[0] < sizeof(string.s) and string.s[0] != 0:
|
if string.s[0] < sizeof(string.s) and string.s[0] != 0:
|
||||||
return string.s[1:string.s[0]+1]
|
return &string.s[1], string.s[0]
|
||||||
elif string.p[0] < 255:
|
cdef length=0, i=0
|
||||||
return string.p[1:string.p[0]+1]
|
|
||||||
cdef int i, length
|
|
||||||
i = 0
|
|
||||||
length = 0
|
|
||||||
while string.p[i] == 255:
|
while string.p[i] == 255:
|
||||||
i += 1
|
i += 1
|
||||||
length += 255
|
length += 255
|
||||||
length += string.p[i]
|
length += string.p[i]
|
||||||
i += 1
|
return &string.p[i + 1], length
|
||||||
return string.p[i:length + i]
|
|
||||||
|
|
||||||
|
|
|
@ -21,6 +21,7 @@ from .span cimport Span
|
||||||
from .token cimport MISSING_DEP
|
from .token cimport MISSING_DEP
|
||||||
from ._dict_proxies import SpanGroups
|
from ._dict_proxies import SpanGroups
|
||||||
from .token cimport Token
|
from .token cimport Token
|
||||||
|
from ..symbols import NAMES as SYMBOLS_BY_INT
|
||||||
from ..lexeme cimport Lexeme, EMPTY_LEXEME
|
from ..lexeme cimport Lexeme, EMPTY_LEXEME
|
||||||
from ..typedefs cimport attr_t, flags_t
|
from ..typedefs cimport attr_t, flags_t
|
||||||
from ..attrs cimport attr_id_t
|
from ..attrs cimport attr_id_t
|
||||||
|
@ -1820,16 +1821,24 @@ cdef class Doc:
|
||||||
|
|
||||||
# Define working variables
|
# Define working variables
|
||||||
cdef TokenC tok_c
|
cdef TokenC tok_c
|
||||||
cdef int tok_i, tok_str_l
|
cdef int tok_i, tok_str_l, working_store_i
|
||||||
cdef attr_t num_tok_attr
|
cdef attr_t num_tok_attr
|
||||||
|
cdef bytes tok_str_bytes
|
||||||
cdef const unsigned char* tok_str
|
cdef const unsigned char* tok_str
|
||||||
cdef np.uint64_t* w_hashes_ptr = hashes_ptr
|
cdef np.uint64_t* w_hashes_ptr = hashes_ptr
|
||||||
|
|
||||||
for tok_i in range(doc_l):
|
for tok_i in range(doc_l):
|
||||||
tok_c = self.c[tok_i]
|
tok_c = self.c[tok_i]
|
||||||
num_tok_attr = tok_c.lex.orth if cs else tok_c.lex.lower
|
num_tok_attr = tok_c.lex.orth if cs else tok_c.lex.lower
|
||||||
tok_str = self.vocab.strings.utf8_ptr(num_tok_attr)
|
if num_tok_attr < len(SYMBOLS_BY_INT): # hardly ever happens
|
||||||
tok_str_l = strlen(<char*> tok_str)
|
if num_tok_attr == 0:
|
||||||
|
tok_str_bytes = b""
|
||||||
|
else:
|
||||||
|
tok_str_bytes = SYMBOLS_BY_INT[num_tok_attr].encode("UTF-8")
|
||||||
|
tok_str = tok_str_bytes
|
||||||
|
tok_str_l = len(tok_str_bytes)
|
||||||
|
else:
|
||||||
|
tok_str, tok_str_l = self.vocab.strings.utf8_ptr(num_tok_attr)
|
||||||
|
|
||||||
if p_max_l > 0:
|
if p_max_l > 0:
|
||||||
_set_prefix_lengths(tok_str, tok_str_l, p_max_l, pref_l_buf)
|
_set_prefix_lengths(tok_str, tok_str_l, p_max_l, pref_l_buf)
|
||||||
|
@ -2055,13 +2064,13 @@ cdef void _set_prefix_lengths(
|
||||||
cdef int tok_str_idx = 1, pref_l_buf_idx = 0
|
cdef int tok_str_idx = 1, pref_l_buf_idx = 0
|
||||||
|
|
||||||
while pref_l_buf_idx < p_max_l:
|
while pref_l_buf_idx < p_max_l:
|
||||||
if (tok_str[tok_str_idx] == 0 # end of string
|
if (tok_str_idx >= tok_str_l
|
||||||
or
|
or
|
||||||
((tok_str[tok_str_idx] & 0xc0) != 0x80) # not a continuation character
|
((tok_str[tok_str_idx] & 0xc0) != 0x80) # not a continuation character
|
||||||
):
|
):
|
||||||
pref_l_buf[pref_l_buf_idx] = tok_str_idx
|
pref_l_buf[pref_l_buf_idx] = tok_str_idx
|
||||||
pref_l_buf_idx += 1
|
pref_l_buf_idx += 1
|
||||||
if tok_str[tok_str_idx] == 0: # end of string
|
if tok_str_idx >= tok_str_l:
|
||||||
break
|
break
|
||||||
tok_str_idx += 1
|
tok_str_idx += 1
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user