* Fix short string optimization in strings.pyx. StringStore tests now all pass.

2025-11-07 19:37:38 +03:00 · 2015-07-20 12:05:23 +02:00 · 2015-07-20 12:05:23 +02:00 · 52d538ea42
commit 52d538ea42
parent 09a3055630
1 changed file with 25 additions and 20 deletions
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -18,46 +18,47 @@ cpdef hash_t hash_string(unicode string) except 0:
 cdef unicode _decode(const Utf8Str* string):
    cdef int i, length
    if string.s[0] < sizeof(string.s) and string.s[0] != 0:
        return string.s[1:string.s[0]+1].decode('utf8')
-    elif string.p[0] < 256:
+    elif string.p[0] < 255:
        return string.p[1:string.p[0]+1].decode('utf8')
    else:
-        raise Exception(string.p[0])
+        i = 0
-    cdef int i = 0
+        length = 0
-    cdef int length = 0
+        while string.p[i] == 255:
-    while string.p[i] == 255:
+            i += 1
            length += 255
        length += string.p[i]
        i += 1
-        length += 255
+        return string.p[i:length + i].decode('utf8')
    length += string.p[i]
    i += 1
    return string.p[i:length - i].decode('utf8')
 cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, int length) except *:
    cdef int n_length_bytes
    cdef int i
    cdef Utf8Str string
    assert length != 0
    if length < sizeof(string.s):
        string.s[0] = <unsigned char>length
        memcpy(&string.s[1], chars, length)
        return string
-    elif length < 256:
+    elif length < 255:
        string.p = <unsigned char*>mem.alloc(length + 1, sizeof(unsigned char))
        string.p[0] = length
        memcpy(&string.p[1], chars, length)
        assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0]
        return string
    else:
-        raise Exception(length)
+        i = 0
-    cdef int n_length_bytes = (length // 256) + ((length % 256) != 0)
+        n_length_bytes = (length // 255) + 1
-    cdef int i = 0
+        string.p = <unsigned char*>mem.alloc(length + n_length_bytes, sizeof(unsigned char))
-    while length >= 256:
+        for i in range(n_length_bytes-1):
-        string.p[i] = 255
+            string.p[i] = 255
-        length -= 255
+        string.p[n_length_bytes-1] = length % 255
-        i += 1
+        memcpy(&string.p[n_length_bytes], chars, length)
-    string.p[i] = length
+        assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0]
-    memcpy(&string.p[i+1], chars, length)
+        return string
    return string
 cdef class StringStore:
@@ -87,9 +88,13 @@ cdef class StringStore:
            utf8str = &self.c[<int>string_or_id]
            return _decode(utf8str)
        elif isinstance(string_or_id, bytes):
            if len(string_or_id) == 0:
                return 0
            utf8str = self.intern(<unsigned char*>string_or_id, len(string_or_id))
            return utf8str - self.c
        elif isinstance(string_or_id, unicode):
            if len(string_or_id) == 0:
                return 0
            byte_string = string_or_id.encode('utf8')
            utf8str = self.intern(<unsigned char*>byte_string, len(byte_string))
            return utf8str - self.c