mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-02 19:30:19 +03:00
Fix wild pointer problem
This commit is contained in:
parent
54bdc11353
commit
5b29568fb7
|
@ -27,4 +27,4 @@ cdef class StringStore:
|
|||
|
||||
cdef const Utf8Str* intern_unicode(self, str py_string)
|
||||
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
|
||||
cdef const unsigned char* utf8_ptr(self, attr_t hash_val)
|
||||
cdef (const unsigned char*, int) utf8_ptr(self, const attr_t hash_val)
|
||||
|
|
|
@ -317,23 +317,17 @@ cdef class StringStore:
|
|||
return value
|
||||
|
||||
@cython.boundscheck(False) # Deactivate bounds checking
|
||||
cdef const unsigned char* utf8_ptr(self, const attr_t hash_val):
|
||||
if hash_val == 0:
|
||||
return b""
|
||||
elif hash_val < len(SYMBOLS_BY_INT):
|
||||
return SYMBOLS_BY_INT[hash_val].encode("utf-8")
|
||||
cdef (const unsigned char*, int) utf8_ptr(self, const attr_t hash_val):
|
||||
# Returns a pointer to the UTF-8 string together with its length in bytes.
|
||||
# This method presumes the calling code has already checked that *hash_val*
|
||||
# is not 0 and does not refer to a member of *SYMBOLS_BY_INT*.
|
||||
cdef Utf8Str* string = <Utf8Str*>self._map.get(hash_val)
|
||||
if string.s[0] < sizeof(string.s) and string.s[0] != 0:
|
||||
return string.s[1:string.s[0]+1]
|
||||
elif string.p[0] < 255:
|
||||
return string.p[1:string.p[0]+1]
|
||||
cdef int i, length
|
||||
i = 0
|
||||
length = 0
|
||||
return &string.s[1], string.s[0]
|
||||
cdef length=0, i=0
|
||||
while string.p[i] == 255:
|
||||
i += 1
|
||||
length += 255
|
||||
length += string.p[i]
|
||||
i += 1
|
||||
return string.p[i:length + i]
|
||||
return &string.p[i + 1], length
|
||||
|
||||
|
|
|
@ -21,6 +21,7 @@ from .span cimport Span
|
|||
from .token cimport MISSING_DEP
|
||||
from ._dict_proxies import SpanGroups
|
||||
from .token cimport Token
|
||||
from ..symbols import NAMES as SYMBOLS_BY_INT
|
||||
from ..lexeme cimport Lexeme, EMPTY_LEXEME
|
||||
from ..typedefs cimport attr_t, flags_t
|
||||
from ..attrs cimport attr_id_t
|
||||
|
@ -1820,16 +1821,24 @@ cdef class Doc:
|
|||
|
||||
# Define working variables
|
||||
cdef TokenC tok_c
|
||||
cdef int tok_i, tok_str_l
|
||||
cdef int tok_i, tok_str_l, working_store_i
|
||||
cdef attr_t num_tok_attr
|
||||
cdef bytes tok_str_bytes
|
||||
cdef const unsigned char* tok_str
|
||||
cdef np.uint64_t* w_hashes_ptr = hashes_ptr
|
||||
|
||||
for tok_i in range(doc_l):
|
||||
tok_c = self.c[tok_i]
|
||||
num_tok_attr = tok_c.lex.orth if cs else tok_c.lex.lower
|
||||
tok_str = self.vocab.strings.utf8_ptr(num_tok_attr)
|
||||
tok_str_l = strlen(<char*> tok_str)
|
||||
if num_tok_attr < len(SYMBOLS_BY_INT): # hardly ever happens
|
||||
if num_tok_attr == 0:
|
||||
tok_str_bytes = b""
|
||||
else:
|
||||
tok_str_bytes = SYMBOLS_BY_INT[num_tok_attr].encode("UTF-8")
|
||||
tok_str = tok_str_bytes
|
||||
tok_str_l = len(tok_str_bytes)
|
||||
else:
|
||||
tok_str, tok_str_l = self.vocab.strings.utf8_ptr(num_tok_attr)
|
||||
|
||||
if p_max_l > 0:
|
||||
_set_prefix_lengths(tok_str, tok_str_l, p_max_l, pref_l_buf)
|
||||
|
@ -2055,13 +2064,13 @@ cdef void _set_prefix_lengths(
|
|||
cdef int tok_str_idx = 1, pref_l_buf_idx = 0
|
||||
|
||||
while pref_l_buf_idx < p_max_l:
|
||||
if (tok_str[tok_str_idx] == 0 # end of string
|
||||
if (tok_str_idx >= tok_str_l
|
||||
or
|
||||
((tok_str[tok_str_idx] & 0xc0) != 0x80) # not a continuation character
|
||||
):
|
||||
pref_l_buf[pref_l_buf_idx] = tok_str_idx
|
||||
pref_l_buf_idx += 1
|
||||
if tok_str[tok_str_idx] == 0: # end of string
|
||||
if tok_str_idx >= tok_str_l:
|
||||
break
|
||||
tok_str_idx += 1
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user