mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
Make floret murmurhash endian-neutral (#9735)
This commit is contained in:
parent
1163073756
commit
837d241b68
|
@@ -1,5 +1,5 @@
|
||||||
cimport numpy as np
|
cimport numpy as np
|
||||||
from libc.stdint cimport uint32_t
|
from libc.stdint cimport uint32_t, uint64_t
|
||||||
from cython.operator cimport dereference as deref
|
from cython.operator cimport dereference as deref
|
||||||
from libcpp.set cimport set as cppset
|
from libcpp.set cimport set as cppset
|
||||||
from murmurhash.mrmr cimport hash128_x64
|
from murmurhash.mrmr cimport hash128_x64
|
||||||
|
@@ -353,12 +353,18 @@ cdef class Vectors:
|
||||||
key (str): The string key.
|
key (str): The string key.
|
||||||
RETURNS: A list of the integer hashes.
|
RETURNS: A list of the integer hashes.
|
||||||
"""
|
"""
|
||||||
cdef uint32_t[4] out
|
# MurmurHash3_x64_128 returns an array of 2 uint64_t values.
|
||||||
|
cdef uint64_t[2] out
|
||||||
chars = s.encode("utf8")
|
chars = s.encode("utf8")
|
||||||
cdef char* utf8_string = chars
|
cdef char* utf8_string = chars
|
||||||
hash128_x64(utf8_string, len(chars), self.hash_seed, &out)
|
hash128_x64(utf8_string, len(chars), self.hash_seed, &out)
|
||||||
rows = [out[i] for i in range(min(self.hash_count, 4))]
|
rows = [
|
||||||
return rows
|
out[0] & 0xffffffffu,
|
||||||
|
out[0] >> 32,
|
||||||
|
out[1] & 0xffffffffu,
|
||||||
|
out[1] >> 32,
|
||||||
|
]
|
||||||
|
return rows[:min(self.hash_count, 4)]
|
||||||
|
|
||||||
def _get_ngrams(self, unicode key):
|
def _get_ngrams(self, unicode key):
|
||||||
"""Get all padded ngram strings using the ngram settings.
|
"""Get all padded ngram strings using the ngram settings.
|
||||||
|
|
Loading…
Reference in New Issue
Block a user