mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
* Fix short string optimization in strings.pyx. StringStore tests now all pass.
This commit is contained in:
parent
09a3055630
commit
52d538ea42
|
@ -18,46 +18,47 @@ cpdef hash_t hash_string(unicode string) except 0:
|
|||
|
||||
|
||||
cdef unicode _decode(const Utf8Str* string):
|
||||
cdef int i, length
|
||||
if string.s[0] < sizeof(string.s) and string.s[0] != 0:
|
||||
return string.s[1:string.s[0]+1].decode('utf8')
|
||||
elif string.p[0] < 256:
|
||||
elif string.p[0] < 255:
|
||||
return string.p[1:string.p[0]+1].decode('utf8')
|
||||
else:
|
||||
raise Exception(string.p[0])
|
||||
cdef int i = 0
|
||||
cdef int length = 0
|
||||
while string.p[i] == 255:
|
||||
i = 0
|
||||
length = 0
|
||||
while string.p[i] == 255:
|
||||
i += 1
|
||||
length += 255
|
||||
length += string.p[i]
|
||||
i += 1
|
||||
length += 255
|
||||
length += string.p[i]
|
||||
i += 1
|
||||
return string.p[i:length - i].decode('utf8')
|
||||
return string.p[i:length + i].decode('utf8')
|
||||
|
||||
|
||||
cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, int length) except *:
|
||||
cdef int n_length_bytes
|
||||
cdef int i
|
||||
cdef Utf8Str string
|
||||
assert length != 0
|
||||
if length < sizeof(string.s):
|
||||
string.s[0] = <unsigned char>length
|
||||
memcpy(&string.s[1], chars, length)
|
||||
return string
|
||||
elif length < 256:
|
||||
elif length < 255:
|
||||
string.p = <unsigned char*>mem.alloc(length + 1, sizeof(unsigned char))
|
||||
string.p[0] = length
|
||||
memcpy(&string.p[1], chars, length)
|
||||
assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0]
|
||||
return string
|
||||
else:
|
||||
raise Exception(length)
|
||||
cdef int n_length_bytes = (length // 256) + ((length % 256) != 0)
|
||||
cdef int i = 0
|
||||
while length >= 256:
|
||||
string.p[i] = 255
|
||||
length -= 255
|
||||
i += 1
|
||||
string.p[i] = length
|
||||
memcpy(&string.p[i+1], chars, length)
|
||||
return string
|
||||
i = 0
|
||||
n_length_bytes = (length // 255) + 1
|
||||
string.p = <unsigned char*>mem.alloc(length + n_length_bytes, sizeof(unsigned char))
|
||||
for i in range(n_length_bytes-1):
|
||||
string.p[i] = 255
|
||||
string.p[n_length_bytes-1] = length % 255
|
||||
memcpy(&string.p[n_length_bytes], chars, length)
|
||||
assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0]
|
||||
return string
|
||||
|
||||
|
||||
cdef class StringStore:
|
||||
|
@ -87,9 +88,13 @@ cdef class StringStore:
|
|||
utf8str = &self.c[<int>string_or_id]
|
||||
return _decode(utf8str)
|
||||
elif isinstance(string_or_id, bytes):
|
||||
if len(string_or_id) == 0:
|
||||
return 0
|
||||
utf8str = self.intern(<unsigned char*>string_or_id, len(string_or_id))
|
||||
return utf8str - self.c
|
||||
elif isinstance(string_or_id, unicode):
|
||||
if len(string_or_id) == 0:
|
||||
return 0
|
||||
byte_string = string_or_id.encode('utf8')
|
||||
utf8str = self.intern(<unsigned char*>byte_string, len(byte_string))
|
||||
return utf8str - self.c
|
||||
|
|
Loading…
Reference in New Issue
Block a user