mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-11 04:08:09 +03:00
* Fix short string optimization in strings.pyx. StringStore tests now all pass.
This commit is contained in:
parent
09a3055630
commit
52d538ea42
|
@ -18,45 +18,46 @@ cpdef hash_t hash_string(unicode string) except 0:
|
||||||
|
|
||||||
|
|
||||||
cdef unicode _decode(const Utf8Str* string):
|
cdef unicode _decode(const Utf8Str* string):
|
||||||
|
cdef int i, length
|
||||||
if string.s[0] < sizeof(string.s) and string.s[0] != 0:
|
if string.s[0] < sizeof(string.s) and string.s[0] != 0:
|
||||||
return string.s[1:string.s[0]+1].decode('utf8')
|
return string.s[1:string.s[0]+1].decode('utf8')
|
||||||
elif string.p[0] < 256:
|
elif string.p[0] < 255:
|
||||||
return string.p[1:string.p[0]+1].decode('utf8')
|
return string.p[1:string.p[0]+1].decode('utf8')
|
||||||
else:
|
else:
|
||||||
raise Exception(string.p[0])
|
i = 0
|
||||||
cdef int i = 0
|
length = 0
|
||||||
cdef int length = 0
|
|
||||||
while string.p[i] == 255:
|
while string.p[i] == 255:
|
||||||
i += 1
|
i += 1
|
||||||
length += 255
|
length += 255
|
||||||
length += string.p[i]
|
length += string.p[i]
|
||||||
i += 1
|
i += 1
|
||||||
return string.p[i:length - i].decode('utf8')
|
return string.p[i:length + i].decode('utf8')
|
||||||
|
|
||||||
|
|
||||||
cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, int length) except *:
|
cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, int length) except *:
|
||||||
|
cdef int n_length_bytes
|
||||||
|
cdef int i
|
||||||
cdef Utf8Str string
|
cdef Utf8Str string
|
||||||
assert length != 0
|
assert length != 0
|
||||||
if length < sizeof(string.s):
|
if length < sizeof(string.s):
|
||||||
string.s[0] = <unsigned char>length
|
string.s[0] = <unsigned char>length
|
||||||
memcpy(&string.s[1], chars, length)
|
memcpy(&string.s[1], chars, length)
|
||||||
return string
|
return string
|
||||||
elif length < 256:
|
elif length < 255:
|
||||||
string.p = <unsigned char*>mem.alloc(length + 1, sizeof(unsigned char))
|
string.p = <unsigned char*>mem.alloc(length + 1, sizeof(unsigned char))
|
||||||
string.p[0] = length
|
string.p[0] = length
|
||||||
memcpy(&string.p[1], chars, length)
|
memcpy(&string.p[1], chars, length)
|
||||||
assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0]
|
assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0]
|
||||||
return string
|
return string
|
||||||
else:
|
else:
|
||||||
raise Exception(length)
|
i = 0
|
||||||
cdef int n_length_bytes = (length // 256) + ((length % 256) != 0)
|
n_length_bytes = (length // 255) + 1
|
||||||
cdef int i = 0
|
string.p = <unsigned char*>mem.alloc(length + n_length_bytes, sizeof(unsigned char))
|
||||||
while length >= 256:
|
for i in range(n_length_bytes-1):
|
||||||
string.p[i] = 255
|
string.p[i] = 255
|
||||||
length -= 255
|
string.p[n_length_bytes-1] = length % 255
|
||||||
i += 1
|
memcpy(&string.p[n_length_bytes], chars, length)
|
||||||
string.p[i] = length
|
assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0]
|
||||||
memcpy(&string.p[i+1], chars, length)
|
|
||||||
return string
|
return string
|
||||||
|
|
||||||
|
|
||||||
|
@ -87,9 +88,13 @@ cdef class StringStore:
|
||||||
utf8str = &self.c[<int>string_or_id]
|
utf8str = &self.c[<int>string_or_id]
|
||||||
return _decode(utf8str)
|
return _decode(utf8str)
|
||||||
elif isinstance(string_or_id, bytes):
|
elif isinstance(string_or_id, bytes):
|
||||||
|
if len(string_or_id) == 0:
|
||||||
|
return 0
|
||||||
utf8str = self.intern(<unsigned char*>string_or_id, len(string_or_id))
|
utf8str = self.intern(<unsigned char*>string_or_id, len(string_or_id))
|
||||||
return utf8str - self.c
|
return utf8str - self.c
|
||||||
elif isinstance(string_or_id, unicode):
|
elif isinstance(string_or_id, unicode):
|
||||||
|
if len(string_or_id) == 0:
|
||||||
|
return 0
|
||||||
byte_string = string_or_id.encode('utf8')
|
byte_string = string_or_id.encode('utf8')
|
||||||
utf8str = self.intern(<unsigned char*>byte_string, len(byte_string))
|
utf8str = self.intern(<unsigned char*>byte_string, len(byte_string))
|
||||||
return utf8str - self.c
|
return utf8str - self.c
|
||||||
|
|
Loading…
Reference in New Issue
Block a user