Mirror of https://github.com/explosion/spaCy.git, synced 2025-08-02 19:30:19 +03:00
Switch to FNV1A hashing

Commit 557799358c, parent e7626f423a.
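The commit swaps the MurmurHash-32 values previously used for the character-combination hash features for FNV-1a, a tiny XOR-and-multiply hash that is easy to inline in Cython and produces the same unsigned 32-bit results on every platform. For orientation, a minimal pure-Python sketch of 32-bit FNV-1a (not part of the commit; fnv1a_32 is a hypothetical helper name, but the offset basis and prime are the published constants):

def fnv1a_32(data: bytes) -> int:
    """Reference 32-bit FNV-1a: XOR each byte in, then multiply by the prime."""
    h = 0x811C9DC5  # published FNV-1a offset basis
    for byte in data:
        h ^= byte
        h = (h * 0x01000193) & 0xFFFFFFFF  # FNV prime; the mask emulates uint32_t wraparound
    return h

# Published test vectors from http://www.isthe.com/chongo/src/fnv/test_fnv.c
assert fnv1a_32(b"") == 0x811C9DC5
assert fnv1a_32(b"a") == 0xE40C292C
assert fnv1a_32(b"foobar") == 0xBF9CF968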
spacy/tests/doc/test_doc_api.py

@@ -1,6 +1,7 @@
 import weakref
 
 import numpy
+import ctypes
 from numpy.testing import assert_array_equal
-from murmurhash.mrmr import hash
 import pytest
@@ -14,6 +15,7 @@ from spacy.lang.xx import MultiLanguage
 from spacy.language import Language
 from spacy.lexeme import Lexeme
 from spacy.tokens import Doc, Span, SpanGroup, Token
+from spacy.tokens.doc import get_fnv1a_hash
 from spacy.util import get_search_char_byte_arrays
 from spacy.vocab import Vocab
@@ -994,9 +996,8 @@ def test_doc_spans_setdefault(en_tokenizer):
     assert len(doc.spans["key3"]) == 2
 
 
-def _get_32_bit_hash(input: str) -> int:
-    working_hash = hash(input.encode("UTF-8"))
-    return working_hash
+def _encode_and_hash(input: str) -> int:
+    return get_fnv1a_hash(input.encode("UTF-8"))
 
 
 @pytest.mark.parametrize("case_sensitive", [True, False])
@@ -1051,62 +1052,60 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
         hashes_per_tok=10,
     )
 
     print(hashes)
-
-    assert hashes[0][0] == _get_32_bit_hash("s")
-    assert hashes[0][1] == _get_32_bit_hash("spa")
-    assert hashes[0][2] == _get_32_bit_hash("spaC" if case_sensitive else "spac")
-    assert hashes[0][3] == _get_32_bit_hash("Cy" if case_sensitive else "cy")
-    assert hashes[0][4] == _get_32_bit_hash("aCy" if case_sensitive else "acy")
-    assert hashes[0][5] == _get_32_bit_hash("paCy" if case_sensitive else "pacy")
-    assert hashes[0][6] == _get_32_bit_hash("spaCy" if case_sensitive else "spacy")
-
-    assert hashes[0][7] == _get_32_bit_hash("p")
-    assert hashes[0][8] == _get_32_bit_hash("p")
-    assert hashes[0][9] == _get_32_bit_hash("p")
-    assert hashes[1][0] == _get_32_bit_hash("✨")
-    assert hashes[1][1] == _get_32_bit_hash("✨")
-    assert hashes[1][2] == _get_32_bit_hash("✨")
-    assert hashes[1][3] == _get_32_bit_hash("✨")
-    assert hashes[1][4] == _get_32_bit_hash("✨")
-    assert hashes[1][5] == _get_32_bit_hash("✨")
-    assert hashes[1][6] == _get_32_bit_hash("✨")
+    assert hashes[0][0] == _encode_and_hash("s")
+    assert hashes[0][1] == _encode_and_hash("spa")
+    assert hashes[0][2] == _encode_and_hash("spaC" if case_sensitive else "spac")
+    assert hashes[0][3] == _encode_and_hash("Cy" if case_sensitive else "cy")
+    assert hashes[0][4] == _encode_and_hash("aCy" if case_sensitive else "acy")
+    assert hashes[0][5] == _encode_and_hash("paCy" if case_sensitive else "pacy")
+    assert hashes[0][6] == _encode_and_hash("spaCy" if case_sensitive else "spacy")
+    assert hashes[0][7] == _encode_and_hash("p")
+    assert hashes[0][8] == _encode_and_hash("p")
+    assert hashes[0][9] == _encode_and_hash("p")
+    assert hashes[1][0] == _encode_and_hash("✨")
+    assert hashes[1][1] == _encode_and_hash("✨")
+    assert hashes[1][2] == _encode_and_hash("✨")
+    assert hashes[1][3] == _encode_and_hash("✨")
+    assert hashes[1][4] == _encode_and_hash("✨")
+    assert hashes[1][5] == _encode_and_hash("✨")
+    assert hashes[1][6] == _encode_and_hash("✨")
     assert hashes[1][7] == 0
-    assert hashes[1][8] == _get_32_bit_hash("✨")
-    assert hashes[1][9] == _get_32_bit_hash("✨")
-    assert hashes[2][0] == _get_32_bit_hash("a")
-    assert hashes[2][1] == _get_32_bit_hash("and")
-    assert hashes[2][2] == _get_32_bit_hash("and")
-    assert hashes[2][3] == _get_32_bit_hash("nd")
-    assert hashes[2][4] == _get_32_bit_hash("and")
-    assert hashes[2][5] == _get_32_bit_hash("and")
-    assert hashes[2][6] == _get_32_bit_hash("and")
+    assert hashes[1][8] == _encode_and_hash("✨")
+    assert hashes[1][9] == _encode_and_hash("✨")
+    assert hashes[2][0] == _encode_and_hash("a")
+    assert hashes[2][1] == _encode_and_hash("and")
+    assert hashes[2][2] == _encode_and_hash("and")
+    assert hashes[2][3] == _encode_and_hash("nd")
+    assert hashes[2][4] == _encode_and_hash("and")
+    assert hashes[2][5] == _encode_and_hash("and")
+    assert hashes[2][6] == _encode_and_hash("and")
     assert hashes[2][7] == 0
     assert hashes[2][8] == 0
     assert hashes[2][9] == 0
-    assert hashes[3][0] == _get_32_bit_hash("P" if case_sensitive else "p")
-    assert hashes[3][1] == _get_32_bit_hash("Pro" if case_sensitive else "pro")
-    assert hashes[3][2] == _get_32_bit_hash("Prod" if case_sensitive else "prod")
-    assert hashes[3][3] == _get_32_bit_hash("gy")
-    assert hashes[3][4] == _get_32_bit_hash("igy")
-    assert hashes[3][5] == _get_32_bit_hash("digy")
-    assert hashes[3][6] == _get_32_bit_hash("odigy")
-    assert hashes[3][7] == 0 if case_sensitive else _get_32_bit_hash("pr")
-
-    assert hashes[3][8] == _get_32_bit_hash("r")
+    assert hashes[3][0] == _encode_and_hash("P" if case_sensitive else "p")
+    assert hashes[3][1] == _encode_and_hash("Pro" if case_sensitive else "pro")
+    assert hashes[3][2] == _encode_and_hash("Prod" if case_sensitive else "prod")
+    assert hashes[3][3] == _encode_and_hash("gy")
+    assert hashes[3][4] == _encode_and_hash("igy")
+    assert hashes[3][5] == _encode_and_hash("digy")
+    assert hashes[3][6] == _encode_and_hash("odigy")
+    assert hashes[3][7] == 0 if case_sensitive else _encode_and_hash("pr")
+    assert hashes[3][8] == _encode_and_hash("r")
 
     if case_sensitive:
-        assert hashes[3][9] == _get_32_bit_hash("r")
+        assert hashes[3][9] == _encode_and_hash("r")
     else:
-        assert hashes[3][9] == _get_32_bit_hash("rp")
+        assert hashes[3][9] == _encode_and_hash("rp")
 
     # check values are the same cross-platform
     if case_sensitive:
-        assert hashes[0][2] == -1253438126
+        assert hashes[0][2] == 1140960578
    else:
-        assert hashes[0][2] == -2095352600
-        assert hashes[1][3] == 910783208
-        assert hashes[3][8] == 1553167345
+        assert hashes[0][2] == 604076770
+        assert hashes[1][3] == 3384544169
+        assert hashes[3][8] == 4144776981
 
 
 def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
@@ -1148,26 +1147,26 @@ def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
         hashes_per_tok=5,
     )
 
-    assert hashes[0][0] == _get_32_bit_hash("cy")
-    assert hashes[0][1] == _get_32_bit_hash("acy")
-    assert hashes[0][2] == _get_32_bit_hash("pacy")
-    assert hashes[0][3] == _get_32_bit_hash("spacy")
-    assert hashes[0][4] == _get_32_bit_hash("p")
-    assert hashes[1][0] == _get_32_bit_hash("✨")
-    assert hashes[1][1] == _get_32_bit_hash("✨")
-    assert hashes[1][2] == _get_32_bit_hash("✨")
-    assert hashes[1][3] == _get_32_bit_hash("✨")
+    assert hashes[0][0] == _encode_and_hash("cy")
+    assert hashes[0][1] == _encode_and_hash("acy")
+    assert hashes[0][2] == _encode_and_hash("pacy")
+    assert hashes[0][3] == _encode_and_hash("spacy")
+    assert hashes[0][4] == _encode_and_hash("p")
+    assert hashes[1][0] == _encode_and_hash("✨")
+    assert hashes[1][1] == _encode_and_hash("✨")
+    assert hashes[1][2] == _encode_and_hash("✨")
+    assert hashes[1][3] == _encode_and_hash("✨")
     assert hashes[1][4] == 0
-    assert hashes[2][0] == _get_32_bit_hash("nd")
-    assert hashes[2][1] == _get_32_bit_hash("and")
-    assert hashes[2][2] == _get_32_bit_hash("and")
-    assert hashes[2][3] == _get_32_bit_hash("and")
+    assert hashes[2][0] == _encode_and_hash("nd")
+    assert hashes[2][1] == _encode_and_hash("and")
+    assert hashes[2][2] == _encode_and_hash("and")
+    assert hashes[2][3] == _encode_and_hash("and")
     assert hashes[2][4] == 0
-    assert hashes[3][0] == _get_32_bit_hash("gy")
-    assert hashes[3][1] == _get_32_bit_hash("igy")
-    assert hashes[3][2] == _get_32_bit_hash("digy")
-    assert hashes[3][3] == _get_32_bit_hash("odigy")
-    assert hashes[3][4] == _get_32_bit_hash("pr")
+    assert hashes[3][0] == _encode_and_hash("gy")
+    assert hashes[3][1] == _encode_and_hash("igy")
+    assert hashes[3][2] == _encode_and_hash("digy")
+    assert hashes[3][3] == _encode_and_hash("odigy")
+    assert hashes[3][4] == _encode_and_hash("pr")
 
 
 def test_get_character_combination_hashes_copying_in_middle(en_tokenizer):
@@ -1205,8 +1204,8 @@ def test_get_character_combination_hashes_copying_in_middle(en_tokenizer):
         hashes_per_tok=2,
    )
 
-    assert hashes[0][0] == _get_32_bit_hash("sp𐌞cé"[:p_length])
-    assert hashes[0][1] == _get_32_bit_hash("sp𐌞cé"[-s_length:])
+    assert hashes[0][0] == _encode_and_hash("sp𐌞cé"[:p_length])
+    assert hashes[0][1] == _encode_and_hash("sp𐌞cé"[-s_length:])
 
 
 @pytest.mark.parametrize("case_sensitive", [True, False])
@@ -1273,37 +1272,37 @@ def test_get_character_combination_hashes_turkish_i_with_dot(
     )
 
     COMBINING_DOT_ABOVE = b"\xcc\x87".decode("UTF-8")
-    assert hashes[0][0] == _get_32_bit_hash("i")
-    assert hashes[0][1] == _get_32_bit_hash("İ".lower())
+    assert hashes[0][0] == _encode_and_hash("i")
+    assert hashes[0][1] == _encode_and_hash("İ".lower())
     if case_sensitive:
-        assert hashes[0][2] == _get_32_bit_hash("İ".lower() + "İ")
-        assert hashes[0][3] == _get_32_bit_hash("İ".lower() + "İ")
-        assert hashes[0][4] == _get_32_bit_hash("İ")
-        assert hashes[0][5] == _get_32_bit_hash(COMBINING_DOT_ABOVE + "İ")
-        assert hashes[0][6] == _get_32_bit_hash("İ".lower() + "İ")
-        assert hashes[0][7] == _get_32_bit_hash("İ".lower() + "İ")
-        assert hashes[0][8] == _get_32_bit_hash("İ")
-        assert hashes[0][9] == _get_32_bit_hash("İ")
-        assert hashes[0][12] == _get_32_bit_hash("İ")
-        assert hashes[0][13] == _get_32_bit_hash("İ")
+        assert hashes[0][2] == _encode_and_hash("İ".lower() + "İ")
+        assert hashes[0][3] == _encode_and_hash("İ".lower() + "İ")
+        assert hashes[0][4] == _encode_and_hash("İ")
+        assert hashes[0][5] == _encode_and_hash(COMBINING_DOT_ABOVE + "İ")
+        assert hashes[0][6] == _encode_and_hash("İ".lower() + "İ")
+        assert hashes[0][7] == _encode_and_hash("İ".lower() + "İ")
+        assert hashes[0][8] == _encode_and_hash("İ")
+        assert hashes[0][9] == _encode_and_hash("İ")
+        assert hashes[0][12] == _encode_and_hash("İ")
+        assert hashes[0][13] == _encode_and_hash("İ")
 
     else:
-        assert hashes[0][2] == _get_32_bit_hash("İ".lower() + "i")
-        assert hashes[0][3] == _get_32_bit_hash("İ".lower() * 2)
-        assert hashes[0][4] == _get_32_bit_hash(COMBINING_DOT_ABOVE)
-        assert hashes[0][5] == _get_32_bit_hash("İ".lower())
-        assert hashes[0][6] == _get_32_bit_hash(COMBINING_DOT_ABOVE + "İ".lower())
-        assert hashes[0][7] == _get_32_bit_hash("İ".lower() * 2)
-        assert hashes[0][8] == _get_32_bit_hash("i")
-        assert hashes[0][9] == _get_32_bit_hash("İ".lower())
-        assert hashes[0][10] == _get_32_bit_hash("İ".lower() + "i")
-        assert hashes[0][11] == _get_32_bit_hash("İ".lower() * 2)
-        assert hashes[0][12] == _get_32_bit_hash(COMBINING_DOT_ABOVE)
-        assert hashes[0][13] == _get_32_bit_hash(COMBINING_DOT_ABOVE + "i")
-        assert hashes[0][14] == _get_32_bit_hash(
+        assert hashes[0][2] == _encode_and_hash("İ".lower() + "i")
+        assert hashes[0][3] == _encode_and_hash("İ".lower() * 2)
+        assert hashes[0][4] == _encode_and_hash(COMBINING_DOT_ABOVE)
+        assert hashes[0][5] == _encode_and_hash("İ".lower())
+        assert hashes[0][6] == _encode_and_hash(COMBINING_DOT_ABOVE + "İ".lower())
+        assert hashes[0][7] == _encode_and_hash("İ".lower() * 2)
+        assert hashes[0][8] == _encode_and_hash("i")
+        assert hashes[0][9] == _encode_and_hash("İ".lower())
+        assert hashes[0][10] == _encode_and_hash("İ".lower() + "i")
+        assert hashes[0][11] == _encode_and_hash("İ".lower() * 2)
+        assert hashes[0][12] == _encode_and_hash(COMBINING_DOT_ABOVE)
+        assert hashes[0][13] == _encode_and_hash(COMBINING_DOT_ABOVE + "i")
+        assert hashes[0][14] == _encode_and_hash(
             COMBINING_DOT_ABOVE + "i" + COMBINING_DOT_ABOVE
         )
-        assert hashes[0][15] == _get_32_bit_hash((COMBINING_DOT_ABOVE + "i") * 2)
+        assert hashes[0][15] == _encode_and_hash((COMBINING_DOT_ABOVE + "i") * 2)
 
 
 @pytest.mark.parametrize("case_sensitive", [True, False])
@@ -1346,21 +1345,21 @@ def test_get_character_combination_hashes_string_store_spec_cases(
         ss_max_l=0,
         hashes_per_tok=3,
     )
-    assert hashes[0][0] == _get_32_bit_hash("FL" if case_sensitive else "fl")
-    assert hashes[0][1] == _get_32_bit_hash("19")
+    assert hashes[0][0] == _encode_and_hash("FL" if case_sensitive else "fl")
+    assert hashes[0][1] == _encode_and_hash("19")
     assert hashes[0][2] == 0
-    assert hashes[1][0] == _get_32_bit_hash("be")
-    assert hashes[1][1] == _get_32_bit_hash("ee")
+    assert hashes[1][0] == _encode_and_hash("be")
+    assert hashes[1][1] == _encode_and_hash("ee")
     if case_sensitive:
        assert hashes[1][2] == 0
     else:
-        assert hashes[1][2] == _get_32_bit_hash("ee")
-    assert hashes[2][0] == hashes[3][0] == _get_32_bit_hash("se")
-    assert hashes[2][1] == hashes[3][1] == _get_32_bit_hash("ty")
+        assert hashes[1][2] == _encode_and_hash("ee")
+    assert hashes[2][0] == hashes[3][0] == _encode_and_hash("se")
+    assert hashes[2][1] == hashes[3][1] == _encode_and_hash("ty")
     if case_sensitive:
         assert hashes[2][2] == hashes[3][2] == 0
     else:
-        assert hashes[2][2] == hashes[3][2] == _get_32_bit_hash("ee")
+        assert hashes[2][2] == hashes[3][2] == _encode_and_hash("ee")
 
 
 def test_character_combination_hashes_empty_lengths(en_tokenizer):
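A side effect visible in the tests above: the murmur-based constants were signed (hence values like -1253438126), while the FNV-1a constants are unsigned 32-bit. The new cross-platform constants are reproducible as plain FNV-1a hashes of the UTF-8 bytes of the indicated substrings; a minimal check (fnv1a_32 is a hypothetical pure-Python reference, repeated here to stay self-contained):

def fnv1a_32(data: bytes) -> int:
    # Pure-Python 32-bit FNV-1a, masked to emulate uint32_t wraparound
    h = 0x811C9DC5
    for byte in data:
        h = ((h ^ byte) * 0x01000193) & 0xFFFFFFFF
    return h

# The case-insensitive constants asserted in test_get_character_combination_hashes_good_case:
assert fnv1a_32("spac".encode("UTF-8")) == 604076770   # hashes[0][2]
assert fnv1a_32("✨".encode("UTF-8")) == 3384544169    # hashes[1][3]
assert fnv1a_32("r".encode("UTF-8")) == 4144776981     # hashes[3][8]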
spacy/tokens/doc.pxd

@@ -6,6 +6,8 @@ from ..structs cimport TokenC, LexemeC, SpanC
 from ..typedefs cimport attr_t
 from ..attrs cimport attr_id_t
 
+from libc.stdint cimport uint32_t
+
 
 cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil
 cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name) nogil
@@ -72,12 +74,18 @@ cdef void _search_for_chars(
 ) nogil
 
 
+cdef uint32_t fnv1a_hash(
+    const unsigned char* ptr,
+    const int length
+) nogil
+
+
 cdef int _write_hashes(
     const unsigned char* res_buf,
     const unsigned char* aff_l_buf,
     const unsigned char* offset_buf,
     const int end_idx,
-    np.int64_t* hashes_ptr,
+    np.uint32_t* hashes_ptr,
 ) nogil
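The new cimport of uint32_t from libc.stdint is what makes the hash arithmetic well-defined: FNV-1a relies on the multiplication wrapping modulo 2^32, which C's uint32_t does implicitly. A small Python aside (not part of the commit) showing the wraparound the C type provides:

h = 0x811C9DC5 ^ 0x61                        # first FNV-1a step for the input b"a"
print(hex(h * 0x01000193))                   # Python int: full-width product, no wraparound
print(hex((h * 0x01000193) & 0xFFFFFFFF))    # 0xe40c292c: what uint32_t arithmetic yields in C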
spacy/tokens/doc.pyx

@@ -6,7 +6,7 @@ cimport numpy as np
 from cpython cimport array
 from libc.string cimport memcpy, memcmp, memset, strlen
 from libc.math cimport sqrt
-from libc.stdint cimport int32_t, uint64_t
+from libc.stdint cimport int32_t, uint64_t, uint32_t
 
 import copy
 from collections import Counter, defaultdict
@@ -17,7 +17,6 @@ import srsly
 from thinc.api import get_array_module, get_current_ops
 from thinc.util import copy_array
 import warnings
-from murmurhash.mrmr cimport hash32
 
 from .span cimport Span
 from .token cimport MISSING_DEP
@@ -1809,15 +1808,15 @@ cdef class Doc:
         cdef unsigned char* ss_res_buf = <unsigned char*> mem.alloc(ss_max_l, 4)
         cdef unsigned char* ss_l_buf = <unsigned char*> mem.alloc(ss_max_l, 1)
         cdef int doc_l = self.length, total_hashes = doc_l * hashes_per_tok
-        cdef np.int64_t* hashes_ptr = <np.int64_t*> mem.alloc(
-            total_hashes, sizeof(np.int64_t))
+        cdef np.uint32_t* hashes_ptr = <np.uint32_t*> mem.alloc(
+            total_hashes, sizeof(np.uint32_t))
 
         # Define working variables
         cdef TokenC tok_c
         cdef int hash_idx, tok_i, tok_str_l
         cdef attr_t num_tok_attr
         cdef const unsigned char* tok_str
-        cdef np.int64_t* w_hashes_ptr = hashes_ptr
+        cdef np.uint32_t* w_hashes_ptr = hashes_ptr
 
         for tok_i in range(doc_l):
             tok_c = self.c[tok_i]
@@ -1843,9 +1842,9 @@ cdef class Doc:
                 ss_3byte_ch, ss_3byte_ch_l, ss_4byte_ch, ss_4byte_ch_l, ss_res_buf, ss_max_l, ss_l_buf, True)
             w_hashes_ptr += _write_hashes(ss_res_buf, ss_lengths, ss_l_buf, 0, w_hashes_ptr)
 
-        cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty(
-            (doc_l, hashes_per_tok), dtype="int64")
-        memcpy(hashes.data, hashes_ptr, total_hashes * sizeof(np.int64_t))
+        cdef np.ndarray[np.uint32_t, ndim=2] hashes = numpy.empty(
+            (doc_l, hashes_per_tok), dtype="uint32")
+        memcpy(hashes.data, hashes_ptr, total_hashes * sizeof(np.uint32_t))
         return hashes
 
 
@@ -2029,6 +2028,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
             lca_matrix[k, j] = lca - start
     return lca_matrix
 
 
+
 @cython.boundscheck(False)  # Deactivate bounds checking
 cdef void _set_prefix_lengths(
     const unsigned char* tok_str,
@@ -2181,13 +2181,38 @@ cdef void _search_for_chars(
     memset(l_buf + l_buf_idx, res_buf_idx, max_res_l - l_buf_idx)
 
 
+@cython.boundscheck(False)  # Deactivate bounds checking
+cdef uint32_t fnv1a_hash(
+    const unsigned char* ptr,
+    const int length
+) nogil:
+    """ Returns the FNV-1a hash for a sequence of bytes.
+
+    The behaviour of this method has been verified against several pieces
+    of data from http://www.isthe.com/chongo/src/fnv/test_fnv.c.
+    """
+    cdef uint32_t hash_val = 0x811c9dc5
+    cdef int offset = 0
+
+    while offset < length:
+        hash_val ^= ptr[offset]
+        hash_val *= 0x01000193
+        offset += 1
+
+    return hash_val
+
+
+def get_fnv1a_hash(input: bytes):
+    """ Python method to facilitate testing *fnv1a_hash*. """
+    return fnv1a_hash(input, len(input))
+
+
 @cython.boundscheck(False)  # Deactivate bounds checking
 cdef int _write_hashes(
     const unsigned char* res_buf,
     const unsigned char* aff_l_buf,
     const unsigned char* offset_buf,
     const int end_idx,
-    np.int64_t* hashes_ptr,
+    np.uint32_t* hashes_ptr,
 ) nogil:
     """ Write hashes for a token/rich property group combination.
 
@@ -2208,9 +2233,9 @@ cdef int _write_hashes(
         offset = offset_buf[aff_l - 1]
         if offset > 0:
             if end_idx != 0:
-                hash_val = hash32(<void*> (res_buf + end_idx - offset), offset, 0)
+                hash_val = fnv1a_hash(res_buf + end_idx - offset, offset)
             else:
-                hash_val = hash32(<void*> res_buf, offset, 0)
+                hash_val = fnv1a_hash(res_buf, offset)
             hashes_ptr[hash_idx] = hash_val
             hash_idx += 1
 
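The module-level get_fnv1a_hash exists only as a testing hook around the nogil fnv1a_hash C function. On this branch it can be checked directly against the published FNV-1a constants; a minimal sketch:

from spacy.tokens.doc import get_fnv1a_hash  # only available on this branch

assert get_fnv1a_hash(b"") == 0x811C9DC5                  # empty input returns the offset basis
assert get_fnv1a_hash("a".encode("UTF-8")) == 0xE40C292C  # published test vector for "a"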