From 52d538ea420c49e75a1eed95d88c2ecc90b20fb0 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 20 Jul 2015 12:05:23 +0200
Subject: [PATCH] * Fix short string optimization in strings.pyx. StringStore tests now all pass.

---
 spacy/strings.pyx | 45 +++++++++++++++++++++++++--------------------
 1 file changed, 25 insertions(+), 20 deletions(-)

diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index c311d04b0..ccc186e5f 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -18,46 +18,47 @@ cpdef hash_t hash_string(unicode string) except 0:
 
 
 cdef unicode _decode(const Utf8Str* string):
+    cdef int i, length
     if string.s[0] < sizeof(string.s) and string.s[0] != 0:
         return string.s[1:string.s[0]+1].decode('utf8')
-    elif string.p[0] < 256:
+    elif string.p[0] < 255:
         return string.p[1:string.p[0]+1].decode('utf8')
     else:
-        raise Exception(string.p[0])
-    cdef int i = 0
-    cdef int length = 0
-    while string.p[i] == 255:
+        i = 0
+        length = 0
+        while string.p[i] == 255:
+            i += 1
+            length += 255
+        length += string.p[i]
         i += 1
-        length += 255
-    length += string.p[i]
-    i += 1
-    return string.p[i:length - i].decode('utf8')
+        return string.p[i:length + i].decode('utf8')
 
 
 cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, int length) except *:
+    cdef int n_length_bytes
+    cdef int i
     cdef Utf8Str string
     assert length != 0
     if length < sizeof(string.s):
         string.s[0] = length
         memcpy(&string.s[1], chars, length)
         return string
-    elif length < 256:
+    elif length < 255:
         string.p = mem.alloc(length + 1, sizeof(unsigned char))
         string.p[0] = length
         memcpy(&string.p[1], chars, length)
         assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0]
         return string
     else:
-        raise Exception(length)
-    cdef int n_length_bytes = (length // 256) + ((length % 256) != 0)
-    cdef int i = 0
-    while length >= 256:
-        string.p[i] = 255
-        length -= 255
-        i += 1
-    string.p[i] = length
-    memcpy(&string.p[i+1], chars, length)
-    return string
+        i = 0
+        n_length_bytes = (length // 255) + 1
+        string.p = mem.alloc(length + n_length_bytes, sizeof(unsigned char))
+        for i in range(n_length_bytes-1):
+            string.p[i] = 255
+        string.p[n_length_bytes-1] = length % 255
+        memcpy(&string.p[n_length_bytes], chars, length)
+        assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0]
+        return string
 
 
 cdef class StringStore:
@@ -87,9 +88,13 @@ cdef class StringStore:
             utf8str = &self.c[string_or_id]
             return _decode(utf8str)
         elif isinstance(string_or_id, bytes):
+            if len(string_or_id) == 0:
+                return 0
             utf8str = self.intern(string_or_id, len(string_or_id))
             return utf8str - self.c
         elif isinstance(string_or_id, unicode):
+            if len(string_or_id) == 0:
+                return 0
             byte_string = string_or_id.encode('utf8')
             utf8str = self.intern(byte_string, len(byte_string))
             return utf8str - self.c