Make floret murmurhash endian-neutral (#9735)

This commit is contained in:
Adriane Boyd 2021-12-20 17:11:31 +01:00 committed by GitHub
parent 1163073756
commit 837d241b68
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -1,5 +1,5 @@
cimport numpy as np cimport numpy as np
from libc.stdint cimport uint32_t from libc.stdint cimport uint32_t, uint64_t
from cython.operator cimport dereference as deref from cython.operator cimport dereference as deref
from libcpp.set cimport set as cppset from libcpp.set cimport set as cppset
from murmurhash.mrmr cimport hash128_x64 from murmurhash.mrmr cimport hash128_x64
@@ -353,12 +353,18 @@ cdef class Vectors:
key (str): The string key. key (str): The string key.
RETURNS: A list of the integer hashes. RETURNS: A list of the integer hashes.
""" """
cdef uint32_t[4] out # MurmurHash3_x64_128 returns an array of 2 uint64_t values.
cdef uint64_t[2] out
chars = s.encode("utf8") chars = s.encode("utf8")
cdef char* utf8_string = chars cdef char* utf8_string = chars
hash128_x64(utf8_string, len(chars), self.hash_seed, &out) hash128_x64(utf8_string, len(chars), self.hash_seed, &out)
rows = [out[i] for i in range(min(self.hash_count, 4))] rows = [
return rows out[0] & 0xffffffffu,
out[0] >> 32,
out[1] & 0xffffffffu,
out[1] >> 32,
]
return rows[:min(self.hash_count, 4)]
def _get_ngrams(self, unicode key): def _get_ngrams(self, unicode key):
"""Get all padded ngram strings using the ngram settings. """Get all padded ngram strings using the ngram settings.