hash_string() should not depend on Python's internal Unicode representation; this also fixes https://github.com/spacy-io/sense2vec/issues/5 for py2

This commit is contained in:
Henning Peters 2016-03-06 09:19:27 +01:00
parent 7adbd7a785
commit b740f20191

View File

@ -23,10 +23,8 @@ import ujson as json
cpdef hash_t hash_string(unicode string) except 0:
    """Return the hash of `string`, computed over its UTF-8 encoded bytes.

    The UTF-8 encoding is hashed instead of the interpreter's internal
    unicode buffer, so the result does not depend on how Python happens to
    store the string in memory.

    string: The text to hash.
    Returns: The result of hash64() over the encoded bytes, using seed 1.
    Raises: Any exception raised while encoding is propagated; the
        `except 0` clause reserves the return value 0 for that purpose.
    """
    # Bind the encoded bytes to a typed local so the buffer behind the
    # char* cast stays alive for the duration of the hash64() call.
    cdef bytes encoded = string.encode('utf8')
    # The length passed is the byte count of the encoding, not the number
    # of characters in the original string.
    return hash64(<char*>encoded, len(encoded), 1)
cdef unicode _decode(const Utf8Str* string):