Fix wild pointer problem

2025-08-02 19:30:19 +03:00 · 2022-11-10 11:37:03 +01:00 · 2022-11-10 11:37:03 +01:00 · 5b29568fb7
commit 5b29568fb7
parent 54bdc11353
3 changed files with 23 additions and 20 deletions
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@ -27,4 +27,4 @@ cdef class StringStore:

    cdef const Utf8Str* intern_unicode(self, str py_string)
    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
-    cdef const unsigned char* utf8_ptr(self, attr_t hash_val)
+    cdef (const unsigned char*, int) utf8_ptr(self, const attr_t hash_val)
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@ -317,23 +317,17 @@ cdef class StringStore:
        return value

    @cython.boundscheck(False)  # Deactivate bounds checking
-    cdef const unsigned char* utf8_ptr(self, const attr_t hash_val):
-        if hash_val == 0:
-            return b""
-        elif hash_val < len(SYMBOLS_BY_INT):
-            return SYMBOLS_BY_INT[hash_val].encode("utf-8")
+    cdef (const unsigned char*, int) utf8_ptr(self, const attr_t hash_val):
+        # Returns a pointer to the UTF-8 string together with its length in bytes.
+        # This method presumes the calling code has already checked that *hash_val*
+        # is not 0 and does not refer to a member of *SYMBOLS_BY_INT*.
        cdef Utf8Str* string = <Utf8Str*>self._map.get(hash_val)
        if string.s[0] < sizeof(string.s) and string.s[0] != 0:
-            return string.s[1:string.s[0]+1]
-        elif string.p[0] < 255:
-            return string.p[1:string.p[0]+1]
-        cdef int i, length
-        i = 0
-        length = 0
+            return &string.s[1], string.s[0]
+        cdef int length = 0, i = 0  # typed C ints: an untyped cdef would declare Python objects
        while string.p[i] == 255:
            i += 1
            length += 255
        length += string.p[i]
-        i += 1
-        return string.p[i:length + i]
+        return &string.p[i + 1], length
        
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -21,6 +21,7 @@ from .span cimport Span
 from .token cimport MISSING_DEP
 from ._dict_proxies import SpanGroups
 from .token cimport Token
+from ..symbols import NAMES as SYMBOLS_BY_INT
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
 from ..attrs cimport attr_id_t
@ -1820,16 +1821,24 @@ cdef class Doc:
         
        # Define working variables
        cdef TokenC tok_c
-        cdef int tok_i, tok_str_l
+        cdef int tok_i, tok_str_l, working_store_i
        cdef attr_t num_tok_attr
+        cdef bytes tok_str_bytes
        cdef const unsigned char* tok_str
        cdef np.uint64_t* w_hashes_ptr = hashes_ptr
        
        for tok_i in range(doc_l):
            tok_c = self.c[tok_i]
            num_tok_attr = tok_c.lex.orth if cs else tok_c.lex.lower
-            tok_str = self.vocab.strings.utf8_ptr(num_tok_attr)
-            tok_str_l = strlen(<char*> tok_str)
+            if num_tok_attr < len(SYMBOLS_BY_INT): # hardly ever happens
+                if num_tok_attr == 0:
+                    tok_str_bytes = b""
+                else:
+                    tok_str_bytes = SYMBOLS_BY_INT[num_tok_attr].encode("UTF-8")
+                tok_str = tok_str_bytes
+                tok_str_l = len(tok_str_bytes)
+            else:
+                tok_str, tok_str_l = self.vocab.strings.utf8_ptr(num_tok_attr)

            if p_max_l > 0:
                _set_prefix_lengths(tok_str, tok_str_l, p_max_l, pref_l_buf)
@ -2055,13 +2064,13 @@ cdef void _set_prefix_lengths(
    cdef int tok_str_idx = 1, pref_l_buf_idx = 0

    while pref_l_buf_idx < p_max_l:
-        if (tok_str[tok_str_idx] == 0 # end of string 
+        if (tok_str_idx >= tok_str_l 
            or 
            ((tok_str[tok_str_idx] & 0xc0) != 0x80)  # not a continuation character
        ):
            pref_l_buf[pref_l_buf_idx] = tok_str_idx
            pref_l_buf_idx += 1
-        if tok_str[tok_str_idx] == 0: # end of string
+        if tok_str_idx >= tok_str_l:
            break
        tok_str_idx += 1