* Fix short string optimization in strings.pyx. StringStore tests now all pass.

This commit is contained in:
Matthew Honnibal 2015-07-20 12:05:23 +02:00
parent 09a3055630
commit 52d538ea42

View File

@ -18,45 +18,46 @@ cpdef hash_t hash_string(unicode string) except 0:
# Decode the UTF-8 bytes held by a Utf8Str and return them as a unicode object.
#
# NOTE(review): this span is a rendered diff hunk whose +/- markers and
# indentation were stripped, so superseded and replacement lines appear back to
# back. Which line of each pair is current is inferred from context -- confirm
# against the real strings.pyx before relying on it.
cdef unicode _decode(const Utf8Str* string):
cdef int i, length
# Inline form: s[0] is the byte length and the bytes follow in s[1:s[0]+1].
# A zero in s[0] is excluded, so it never selects the inline form.
if string.s[0] < sizeof(string.s) and string.s[0] != 0:
return string.s[1:string.s[0]+1].decode('utf8')
# Single-length-byte form: p[0] is the length, the bytes follow in p[1:].
# Two bounds are shown (256 and 255), presumably before/after the fix. Since
# the loop below treats a 255 byte as "length continues", only `< 255` leaves
# 255 free for that purpose.
elif string.p[0] < 256:
elif string.p[0] < 255:
return string.p[1:string.p[0]+1].decode('utf8')
else:
# NOTE(review): unclear from this view whether this raise survives the commit;
# if it does, the rest of this branch is unreachable.
raise Exception(string.p[0])
# Duplicate declaration/initialisation of i and length: `cdef int i = 0` style
# next to plain assignments (the function already declares both at the top).
# Presumably the `cdef` lines are the superseded ones -- TODO confirm.
cdef int i = 0
cdef int length = 0
i = 0
length = 0
# Multi-byte length prefix: every leading 255 byte contributes 255 to the
# length; the first byte that is not 255 contributes its own value and ends
# the prefix.
while string.p[i] == 255:
i += 1
length += 255
length += string.p[i]
i += 1
# After the prefix, i is the offset of the first content byte. A slice of
# exactly `length` bytes from there ends at length + i; the `length - i`
# variant would stop 2*i bytes short.
return string.p[i:length - i].decode('utf8')
return string.p[i:length + i].decode('utf8')
# Build a Utf8Str holding `length` bytes copied from `chars`.
#
# Three storage forms are chosen by length: inline in string.s, a buffer from
# `mem` with one length byte, or a buffer from `mem` with a multi-byte length
# prefix. The struct is returned by value; `length` must be non-zero.
#
# NOTE(review): this span is a rendered diff hunk whose +/- markers and
# indentation were stripped, so superseded and replacement lines appear back to
# back. Which line of each pair is current is inferred from context -- confirm
# against the real strings.pyx before relying on it.
cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, int length) except *:
cdef int n_length_bytes
cdef int i
cdef Utf8Str string
# Empty input is rejected outright.
assert length != 0
# Inline form: the length goes in s[0] and the bytes are copied to s[1:].
if length < sizeof(string.s):
string.s[0] = <unsigned char>length
memcpy(&string.s[1], chars, length)
return string
# Single-length-byte form. Two bounds are shown (256 and 255), presumably
# before/after the fix; _decode reads a leading 255 as a continuation marker,
# so a lone length byte has to stay below 255.
elif length < 256:
elif length < 255:
# One extra byte for the length, stored ahead of the content.
string.p = <unsigned char*>mem.alloc(length + 1, sizeof(unsigned char))
string.p[0] = length
memcpy(&string.p[1], chars, length)
# Sanity check: s[0] must not look like a valid inline length (this is the
# exact negation of the inline test in _decode). Presumably s and p share
# storage, so s[0] here is the first byte of the pointer -- confirm against
# the Utf8Str declaration.
assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0]
return string
else:
# NOTE(review): unclear from this view whether this raise survives the commit;
# if it does, the rest of this branch is unreachable.
raise Exception(length)
# The next three lines look like the superseded long-string code (prefix size
# computed with 256, followed by a while loop) -- TODO confirm.
cdef int n_length_bytes = (length // 256) + ((length % 256) != 0)
cdef int i = 0
while length >= 256:
# Prefix size: one 255 byte per full 255 of length, plus one final byte for
# the remainder (which may be 0 when length is a multiple of 255).
i = 0
n_length_bytes = (length // 255) + 1
string.p = <unsigned char*>mem.alloc(length + n_length_bytes, sizeof(unsigned char))
# Fill all but the last prefix byte with the 255 continuation marker.
for i in range(n_length_bytes-1):
string.p[i] = 255
# The four lines below mutate `length` and `i` while writing the prefix and
# then copy only the reduced `length`; they look like the body of the
# superseded while loop -- TODO confirm.
length -= 255
i += 1
string.p[i] = length
memcpy(&string.p[i+1], chars, length)
# Final prefix byte is the remainder; content starts right after the prefix.
# `length` is left untouched here, so the full input is copied.
string.p[n_length_bytes-1] = length % 255
memcpy(&string.p[n_length_bytes], chars, length)
# Same sanity check as in the single-length-byte branch.
assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0]
return string
@ -87,9 +88,13 @@ cdef class StringStore:
utf8str = &self.c[<int>string_or_id]
return _decode(utf8str)
elif isinstance(string_or_id, bytes):
if len(string_or_id) == 0:
return 0
utf8str = self.intern(<unsigned char*>string_or_id, len(string_or_id))
return utf8str - self.c
elif isinstance(string_or_id, unicode):
if len(string_or_id) == 0:
return 0
byte_string = string_or_id.encode('utf8')
utf8str = self.intern(<unsigned char*>byte_string, len(byte_string))
return utf8str - self.c