* Fix short string optimization in strings.pyx. StringStore tests now all pass.

This commit is contained in:
Matthew Honnibal 2015-07-20 12:05:23 +02:00
parent 09a3055630
commit 52d538ea42

View File

@@ -18,46 +18,47 @@ cpdef hash_t hash_string(unicode string) except 0:
cdef unicode _decode(const Utf8Str* string): cdef unicode _decode(const Utf8Str* string):
cdef int i, length
if string.s[0] < sizeof(string.s) and string.s[0] != 0: if string.s[0] < sizeof(string.s) and string.s[0] != 0:
return string.s[1:string.s[0]+1].decode('utf8') return string.s[1:string.s[0]+1].decode('utf8')
elif string.p[0] < 256: elif string.p[0] < 255:
return string.p[1:string.p[0]+1].decode('utf8') return string.p[1:string.p[0]+1].decode('utf8')
else: else:
raise Exception(string.p[0]) i = 0
cdef int i = 0 length = 0
cdef int length = 0 while string.p[i] == 255:
while string.p[i] == 255: i += 1
length += 255
length += string.p[i]
i += 1 i += 1
length += 255 return string.p[i:length + i].decode('utf8')
length += string.p[i]
i += 1
return string.p[i:length - i].decode('utf8')
cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, int length) except *: cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, int length) except *:
cdef int n_length_bytes
cdef int i
cdef Utf8Str string cdef Utf8Str string
assert length != 0 assert length != 0
if length < sizeof(string.s): if length < sizeof(string.s):
string.s[0] = <unsigned char>length string.s[0] = <unsigned char>length
memcpy(&string.s[1], chars, length) memcpy(&string.s[1], chars, length)
return string return string
elif length < 256: elif length < 255:
string.p = <unsigned char*>mem.alloc(length + 1, sizeof(unsigned char)) string.p = <unsigned char*>mem.alloc(length + 1, sizeof(unsigned char))
string.p[0] = length string.p[0] = length
memcpy(&string.p[1], chars, length) memcpy(&string.p[1], chars, length)
assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0] assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0]
return string return string
else: else:
raise Exception(length) i = 0
cdef int n_length_bytes = (length // 256) + ((length % 256) != 0) n_length_bytes = (length // 255) + 1
cdef int i = 0 string.p = <unsigned char*>mem.alloc(length + n_length_bytes, sizeof(unsigned char))
while length >= 256: for i in range(n_length_bytes-1):
string.p[i] = 255 string.p[i] = 255
length -= 255 string.p[n_length_bytes-1] = length % 255
i += 1 memcpy(&string.p[n_length_bytes], chars, length)
string.p[i] = length assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0]
memcpy(&string.p[i+1], chars, length) return string
return string
cdef class StringStore: cdef class StringStore:
@@ -87,9 +88,13 @@ cdef class StringStore:
utf8str = &self.c[<int>string_or_id] utf8str = &self.c[<int>string_or_id]
return _decode(utf8str) return _decode(utf8str)
elif isinstance(string_or_id, bytes): elif isinstance(string_or_id, bytes):
if len(string_or_id) == 0:
return 0
utf8str = self.intern(<unsigned char*>string_or_id, len(string_or_id)) utf8str = self.intern(<unsigned char*>string_or_id, len(string_or_id))
return utf8str - self.c return utf8str - self.c
elif isinstance(string_or_id, unicode): elif isinstance(string_or_id, unicode):
if len(string_or_id) == 0:
return 0
byte_string = string_or_id.encode('utf8') byte_string = string_or_id.encode('utf8')
utf8str = self.intern(<unsigned char*>byte_string, len(byte_string)) utf8str = self.intern(<unsigned char*>byte_string, len(byte_string))
return utf8str - self.c return utf8str - self.c