mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Make floret murmurhash endian-neutral (#9735)
This commit is contained in:
parent
1163073756
commit
837d241b68
|
@@ -1,5 +1,5 @@
|
|||
cimport numpy as np
|
||||
from libc.stdint cimport uint32_t
|
||||
from libc.stdint cimport uint32_t, uint64_t
|
||||
from cython.operator cimport dereference as deref
|
||||
from libcpp.set cimport set as cppset
|
||||
from murmurhash.mrmr cimport hash128_x64
|
||||
|
@@ -353,12 +353,18 @@ cdef class Vectors:
|
|||
key (str): The string key.
|
||||
RETURNS: A list of the integer hashes.
|
||||
"""
|
||||
cdef uint32_t[4] out
|
||||
# MurmurHash3_x64_128 returns an array of 2 uint64_t values.
|
||||
cdef uint64_t[2] out
|
||||
chars = s.encode("utf8")
|
||||
cdef char* utf8_string = chars
|
||||
hash128_x64(utf8_string, len(chars), self.hash_seed, &out)
|
||||
rows = [out[i] for i in range(min(self.hash_count, 4))]
|
||||
return rows
|
||||
rows = [
|
||||
out[0] & 0xffffffffu,
|
||||
out[0] >> 32,
|
||||
out[1] & 0xffffffffu,
|
||||
out[1] >> 32,
|
||||
]
|
||||
return rows[:min(self.hash_count, 4)]
|
||||
|
||||
def _get_ngrams(self, unicode key):
|
||||
"""Get all padded ngram strings using the ngram settings.
|
||||
|
|
Loading…
Reference in New Issue
Block a user