Switch to FNV1A hashing

richard@explosion.ai 2022-11-02 20:04:43 +01:00
parent e7626f423a
commit 557799358c
3 changed files with 144 additions and 112 deletions

spacy/tests/doc/test_doc_api.py

@@ -1,6 +1,7 @@
 import weakref
 import numpy
+import ctypes
 from numpy.testing import assert_array_equal
 from murmurhash.mrmr import hash
 import pytest
@@ -14,6 +15,7 @@ from spacy.lang.xx import MultiLanguage
 from spacy.language import Language
 from spacy.lexeme import Lexeme
 from spacy.tokens import Doc, Span, SpanGroup, Token
+from spacy.tokens.doc import get_fnv1a_hash
 from spacy.util import get_search_char_byte_arrays
 from spacy.vocab import Vocab
@@ -994,9 +996,8 @@ def test_doc_spans_setdefault(en_tokenizer):
     assert len(doc.spans["key3"]) == 2


-def _get_32_bit_hash(input: str) -> int:
-    working_hash = hash(input.encode("UTF-8"))
-    return working_hash
+def _encode_and_hash(input: str) -> int:
+    return get_fnv1a_hash(input.encode("UTF-8"))


 @pytest.mark.parametrize("case_sensitive", [True, False])
@@ -1051,62 +1052,60 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
         hashes_per_tok=10,
     )
-    print(hashes)
-    assert hashes[0][0] == _get_32_bit_hash("s")
-    assert hashes[0][1] == _get_32_bit_hash("spa")
-    assert hashes[0][2] == _get_32_bit_hash("spaC" if case_sensitive else "spac")
-    assert hashes[0][3] == _get_32_bit_hash("Cy" if case_sensitive else "cy")
-    assert hashes[0][4] == _get_32_bit_hash("aCy" if case_sensitive else "acy")
-    assert hashes[0][5] == _get_32_bit_hash("paCy" if case_sensitive else "pacy")
-    assert hashes[0][6] == _get_32_bit_hash("spaCy" if case_sensitive else "spacy")
-    assert hashes[0][7] == _get_32_bit_hash("p")
-    assert hashes[0][8] == _get_32_bit_hash("p")
-    assert hashes[0][9] == _get_32_bit_hash("p")
-    assert hashes[1][0] == _get_32_bit_hash("")
-    assert hashes[1][1] == _get_32_bit_hash("")
-    assert hashes[1][2] == _get_32_bit_hash("")
-    assert hashes[1][3] == _get_32_bit_hash("")
-    assert hashes[1][4] == _get_32_bit_hash("")
-    assert hashes[1][5] == _get_32_bit_hash("")
-    assert hashes[1][6] == _get_32_bit_hash("")
+    assert hashes[0][0] == _encode_and_hash("s")
+    assert hashes[0][1] == _encode_and_hash("spa")
+    assert hashes[0][2] == _encode_and_hash("spaC" if case_sensitive else "spac")
+    assert hashes[0][3] == _encode_and_hash("Cy" if case_sensitive else "cy")
+    assert hashes[0][4] == _encode_and_hash("aCy" if case_sensitive else "acy")
+    assert hashes[0][5] == _encode_and_hash("paCy" if case_sensitive else "pacy")
+    assert hashes[0][6] == _encode_and_hash("spaCy" if case_sensitive else "spacy")
+    assert hashes[0][7] == _encode_and_hash("p")
+    assert hashes[0][8] == _encode_and_hash("p")
+    assert hashes[0][9] == _encode_and_hash("p")
+    assert hashes[1][0] == _encode_and_hash("")
+    assert hashes[1][1] == _encode_and_hash("")
+    assert hashes[1][2] == _encode_and_hash("")
+    assert hashes[1][3] == _encode_and_hash("")
+    assert hashes[1][4] == _encode_and_hash("")
+    assert hashes[1][5] == _encode_and_hash("")
+    assert hashes[1][6] == _encode_and_hash("")
     assert hashes[1][7] == 0
-    assert hashes[1][8] == _get_32_bit_hash("")
-    assert hashes[1][9] == _get_32_bit_hash("")
-    assert hashes[2][0] == _get_32_bit_hash("a")
-    assert hashes[2][1] == _get_32_bit_hash("and")
-    assert hashes[2][2] == _get_32_bit_hash("and")
-    assert hashes[2][3] == _get_32_bit_hash("nd")
-    assert hashes[2][4] == _get_32_bit_hash("and")
-    assert hashes[2][5] == _get_32_bit_hash("and")
-    assert hashes[2][6] == _get_32_bit_hash("and")
+    assert hashes[1][8] == _encode_and_hash("")
+    assert hashes[1][9] == _encode_and_hash("")
+    assert hashes[2][0] == _encode_and_hash("a")
+    assert hashes[2][1] == _encode_and_hash("and")
+    assert hashes[2][2] == _encode_and_hash("and")
+    assert hashes[2][3] == _encode_and_hash("nd")
+    assert hashes[2][4] == _encode_and_hash("and")
+    assert hashes[2][5] == _encode_and_hash("and")
+    assert hashes[2][6] == _encode_and_hash("and")
     assert hashes[2][7] == 0
     assert hashes[2][8] == 0
     assert hashes[2][9] == 0
-    assert hashes[3][0] == _get_32_bit_hash("P" if case_sensitive else "p")
-    assert hashes[3][1] == _get_32_bit_hash("Pro" if case_sensitive else "pro")
-    assert hashes[3][2] == _get_32_bit_hash("Prod" if case_sensitive else "prod")
-    assert hashes[3][3] == _get_32_bit_hash("gy")
-    assert hashes[3][4] == _get_32_bit_hash("igy")
-    assert hashes[3][5] == _get_32_bit_hash("digy")
-    assert hashes[3][6] == _get_32_bit_hash("odigy")
-    assert hashes[3][7] == 0 if case_sensitive else _get_32_bit_hash("pr")
-    assert hashes[3][8] == _get_32_bit_hash("r")
+    assert hashes[3][0] == _encode_and_hash("P" if case_sensitive else "p")
+    assert hashes[3][1] == _encode_and_hash("Pro" if case_sensitive else "pro")
+    assert hashes[3][2] == _encode_and_hash("Prod" if case_sensitive else "prod")
+    assert hashes[3][3] == _encode_and_hash("gy")
+    assert hashes[3][4] == _encode_and_hash("igy")
+    assert hashes[3][5] == _encode_and_hash("digy")
+    assert hashes[3][6] == _encode_and_hash("odigy")
+    assert hashes[3][7] == 0 if case_sensitive else _encode_and_hash("pr")
+    assert hashes[3][8] == _encode_and_hash("r")
     if case_sensitive:
-        assert hashes[3][9] == _get_32_bit_hash("r")
+        assert hashes[3][9] == _encode_and_hash("r")
     else:
-        assert hashes[3][9] == _get_32_bit_hash("rp")
+        assert hashes[3][9] == _encode_and_hash("rp")

     # check values are the same cross-platform
     if case_sensitive:
-        assert hashes[0][2] == -1253438126
+        assert hashes[0][2] == 1140960578
     else:
-        assert hashes[0][2] == -2095352600
-    assert hashes[1][3] == 910783208
-    assert hashes[3][8] == 1553167345
+        assert hashes[0][2] == 604076770
+    assert hashes[1][3] == 3384544169
+    assert hashes[3][8] == 4144776981


 def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
@@ -1148,26 +1147,26 @@ def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
         hashes_per_tok=5,
     )
-    assert hashes[0][0] == _get_32_bit_hash("cy")
-    assert hashes[0][1] == _get_32_bit_hash("acy")
-    assert hashes[0][2] == _get_32_bit_hash("pacy")
-    assert hashes[0][3] == _get_32_bit_hash("spacy")
-    assert hashes[0][4] == _get_32_bit_hash("p")
-    assert hashes[1][0] == _get_32_bit_hash("")
-    assert hashes[1][1] == _get_32_bit_hash("")
-    assert hashes[1][2] == _get_32_bit_hash("")
-    assert hashes[1][3] == _get_32_bit_hash("")
+    assert hashes[0][0] == _encode_and_hash("cy")
+    assert hashes[0][1] == _encode_and_hash("acy")
+    assert hashes[0][2] == _encode_and_hash("pacy")
+    assert hashes[0][3] == _encode_and_hash("spacy")
+    assert hashes[0][4] == _encode_and_hash("p")
+    assert hashes[1][0] == _encode_and_hash("")
+    assert hashes[1][1] == _encode_and_hash("")
+    assert hashes[1][2] == _encode_and_hash("")
+    assert hashes[1][3] == _encode_and_hash("")
     assert hashes[1][4] == 0
-    assert hashes[2][0] == _get_32_bit_hash("nd")
-    assert hashes[2][1] == _get_32_bit_hash("and")
-    assert hashes[2][2] == _get_32_bit_hash("and")
-    assert hashes[2][3] == _get_32_bit_hash("and")
+    assert hashes[2][0] == _encode_and_hash("nd")
+    assert hashes[2][1] == _encode_and_hash("and")
+    assert hashes[2][2] == _encode_and_hash("and")
+    assert hashes[2][3] == _encode_and_hash("and")
     assert hashes[2][4] == 0
-    assert hashes[3][0] == _get_32_bit_hash("gy")
-    assert hashes[3][1] == _get_32_bit_hash("igy")
-    assert hashes[3][2] == _get_32_bit_hash("digy")
-    assert hashes[3][3] == _get_32_bit_hash("odigy")
-    assert hashes[3][4] == _get_32_bit_hash("pr")
+    assert hashes[3][0] == _encode_and_hash("gy")
+    assert hashes[3][1] == _encode_and_hash("igy")
+    assert hashes[3][2] == _encode_and_hash("digy")
+    assert hashes[3][3] == _encode_and_hash("odigy")
+    assert hashes[3][4] == _encode_and_hash("pr")


 def test_get_character_combination_hashes_copying_in_middle(en_tokenizer):
@@ -1205,8 +1204,8 @@ def test_get_character_combination_hashes_copying_in_middle(en_tokenizer):
         hashes_per_tok=2,
     )
-    assert hashes[0][0] == _get_32_bit_hash("sp𐌞cé"[:p_length])
-    assert hashes[0][1] == _get_32_bit_hash("sp𐌞cé"[-s_length:])
+    assert hashes[0][0] == _encode_and_hash("sp𐌞cé"[:p_length])
+    assert hashes[0][1] == _encode_and_hash("sp𐌞cé"[-s_length:])


 @pytest.mark.parametrize("case_sensitive", [True, False])
@@ -1273,37 +1272,37 @@ def test_get_character_combination_hashes_turkish_i_with_dot(
     )
     COMBINING_DOT_ABOVE = b"\xcc\x87".decode("UTF-8")
-    assert hashes[0][0] == _get_32_bit_hash("i")
-    assert hashes[0][1] == _get_32_bit_hash("İ".lower())
+    assert hashes[0][0] == _encode_and_hash("i")
+    assert hashes[0][1] == _encode_and_hash("İ".lower())
     if case_sensitive:
-        assert hashes[0][2] == _get_32_bit_hash("İ".lower() + "İ")
-        assert hashes[0][3] == _get_32_bit_hash("İ".lower() + "İ")
-        assert hashes[0][4] == _get_32_bit_hash("İ")
-        assert hashes[0][5] == _get_32_bit_hash(COMBINING_DOT_ABOVE + "İ")
-        assert hashes[0][6] == _get_32_bit_hash("İ".lower() + "İ")
-        assert hashes[0][7] == _get_32_bit_hash("İ".lower() + "İ")
-        assert hashes[0][8] == _get_32_bit_hash("İ")
-        assert hashes[0][9] == _get_32_bit_hash("İ")
-        assert hashes[0][12] == _get_32_bit_hash("İ")
-        assert hashes[0][13] == _get_32_bit_hash("İ")
+        assert hashes[0][2] == _encode_and_hash("İ".lower() + "İ")
+        assert hashes[0][3] == _encode_and_hash("İ".lower() + "İ")
+        assert hashes[0][4] == _encode_and_hash("İ")
+        assert hashes[0][5] == _encode_and_hash(COMBINING_DOT_ABOVE + "İ")
+        assert hashes[0][6] == _encode_and_hash("İ".lower() + "İ")
+        assert hashes[0][7] == _encode_and_hash("İ".lower() + "İ")
+        assert hashes[0][8] == _encode_and_hash("İ")
+        assert hashes[0][9] == _encode_and_hash("İ")
+        assert hashes[0][12] == _encode_and_hash("İ")
+        assert hashes[0][13] == _encode_and_hash("İ")
     else:
-        assert hashes[0][2] == _get_32_bit_hash("İ".lower() + "i")
-        assert hashes[0][3] == _get_32_bit_hash("İ".lower() * 2)
-        assert hashes[0][4] == _get_32_bit_hash(COMBINING_DOT_ABOVE)
-        assert hashes[0][5] == _get_32_bit_hash("İ".lower())
-        assert hashes[0][6] == _get_32_bit_hash(COMBINING_DOT_ABOVE + "İ".lower())
-        assert hashes[0][7] == _get_32_bit_hash("İ".lower() * 2)
-        assert hashes[0][8] == _get_32_bit_hash("i")
-        assert hashes[0][9] == _get_32_bit_hash("İ".lower())
-        assert hashes[0][10] == _get_32_bit_hash("İ".lower() + "i")
-        assert hashes[0][11] == _get_32_bit_hash("İ".lower() * 2)
-        assert hashes[0][12] == _get_32_bit_hash(COMBINING_DOT_ABOVE)
-        assert hashes[0][13] == _get_32_bit_hash(COMBINING_DOT_ABOVE + "i")
-        assert hashes[0][14] == _get_32_bit_hash(
+        assert hashes[0][2] == _encode_and_hash("İ".lower() + "i")
+        assert hashes[0][3] == _encode_and_hash("İ".lower() * 2)
+        assert hashes[0][4] == _encode_and_hash(COMBINING_DOT_ABOVE)
+        assert hashes[0][5] == _encode_and_hash("İ".lower())
+        assert hashes[0][6] == _encode_and_hash(COMBINING_DOT_ABOVE + "İ".lower())
+        assert hashes[0][7] == _encode_and_hash("İ".lower() * 2)
+        assert hashes[0][8] == _encode_and_hash("i")
+        assert hashes[0][9] == _encode_and_hash("İ".lower())
+        assert hashes[0][10] == _encode_and_hash("İ".lower() + "i")
+        assert hashes[0][11] == _encode_and_hash("İ".lower() * 2)
+        assert hashes[0][12] == _encode_and_hash(COMBINING_DOT_ABOVE)
+        assert hashes[0][13] == _encode_and_hash(COMBINING_DOT_ABOVE + "i")
+        assert hashes[0][14] == _encode_and_hash(
             COMBINING_DOT_ABOVE + "i" + COMBINING_DOT_ABOVE
         )
-        assert hashes[0][15] == _get_32_bit_hash((COMBINING_DOT_ABOVE + "i") * 2)
+        assert hashes[0][15] == _encode_and_hash((COMBINING_DOT_ABOVE + "i") * 2)


 @pytest.mark.parametrize("case_sensitive", [True, False])
@@ -1346,21 +1345,21 @@ def test_get_character_combination_hashes_string_store_spec_cases(
         ss_max_l=0,
         hashes_per_tok=3,
     )
-    assert hashes[0][0] == _get_32_bit_hash("FL" if case_sensitive else "fl")
-    assert hashes[0][1] == _get_32_bit_hash("19")
+    assert hashes[0][0] == _encode_and_hash("FL" if case_sensitive else "fl")
+    assert hashes[0][1] == _encode_and_hash("19")
     assert hashes[0][2] == 0
-    assert hashes[1][0] == _get_32_bit_hash("be")
-    assert hashes[1][1] == _get_32_bit_hash("ee")
+    assert hashes[1][0] == _encode_and_hash("be")
+    assert hashes[1][1] == _encode_and_hash("ee")
     if case_sensitive:
         assert hashes[1][2] == 0
     else:
-        assert hashes[1][2] == _get_32_bit_hash("ee")
-    assert hashes[2][0] == hashes[3][0] == _get_32_bit_hash("se")
-    assert hashes[2][1] == hashes[3][1] == _get_32_bit_hash("ty")
+        assert hashes[1][2] == _encode_and_hash("ee")
+    assert hashes[2][0] == hashes[3][0] == _encode_and_hash("se")
+    assert hashes[2][1] == hashes[3][1] == _encode_and_hash("ty")
     if case_sensitive:
         assert hashes[2][2] == hashes[3][2] == 0
     else:
-        assert hashes[2][2] == hashes[3][2] == _get_32_bit_hash("ee")
+        assert hashes[2][2] == hashes[3][2] == _encode_and_hash("ee")


 def test_character_combination_hashes_empty_lengths(en_tokenizer):
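
Two details make the updated expectations easier to read. First, the old murmurhash-based helper returned signed 32-bit values, which is why the previous cross-platform constants were negative; FNV-1a values are unsigned, so the new constants are not. Second, FNV-1a of an empty byte string is the offset basis rather than zero, so _encode_and_hash("") and a literal 0 assert genuinely different things (0 marks padding slots). A minimal pure-Python sketch of the same scheme (the helper name fnv1a_32 is ours, not part of the diff):

    def fnv1a_32(data: bytes) -> int:
        # 32-bit FNV-1a: start from the offset basis, then XOR in each
        # byte and multiply by the FNV prime, truncating to 32 bits.
        h = 0x811C9DC5
        for b in data:
            h = ((h ^ b) * 0x01000193) & 0xFFFFFFFF
        return h

    # Empty input never enters the loop, so the result is the offset
    # basis itself, not 0.
    assert fnv1a_32(b"") == 0x811C9DC5
    # Matches the case-sensitive cross-platform constant asserted above:
    assert fnv1a_32("spaC".encode("UTF-8")) == 1140960578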

spacy/tokens/doc.pxd

@@ -6,6 +6,8 @@ from ..structs cimport TokenC, LexemeC, SpanC
 from ..typedefs cimport attr_t
 from ..attrs cimport attr_id_t
+from libc.stdint cimport uint32_t
+

 cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil
 cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name) nogil
@@ -72,12 +74,18 @@ cdef void _search_for_chars(
 ) nogil


+cdef uint32_t fnv1a_hash(
+    const unsigned char* ptr,
+    const int length
+) nogil
+
+
 cdef int _write_hashes(
     const unsigned char* res_buf,
     const unsigned char* aff_l_buf,
     const unsigned char* offset_buf,
     const int end_idx,
-    np.int64_t* hashes_ptr,
+    np.uint32_t* hashes_ptr,
 ) nogil
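
The declarations above also switch the hash buffer element type from np.int64_t to np.uint32_t: a 32-bit FNV-1a value spans the full unsigned range, so it can exceed the int32 maximum but always fits in uint32, and an int64 slot would double the buffer width for no gain. A small numpy-only check of that fit, using a constant taken from the updated tests:

    import numpy as np

    v = 4144776981                     # expected hash for "r" in the tests above
    assert v > np.iinfo(np.int32).max  # too large for a signed 32-bit slot
    assert np.uint32(v) == v           # fits exactly in an unsigned 32-bit slot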

spacy/tokens/doc.pyx

@@ -6,7 +6,7 @@ cimport numpy as np
 from cpython cimport array
 from libc.string cimport memcpy, memcmp, memset, strlen
 from libc.math cimport sqrt
-from libc.stdint cimport int32_t, uint64_t
+from libc.stdint cimport int32_t, uint64_t, uint32_t
 import copy
 from collections import Counter, defaultdict
@@ -17,7 +17,6 @@ import srsly
 from thinc.api import get_array_module, get_current_ops
 from thinc.util import copy_array
 import warnings
-from murmurhash.mrmr cimport hash32

 from .span cimport Span
 from .token cimport MISSING_DEP
@@ -1809,15 +1808,15 @@ cdef class Doc:
         cdef unsigned char* ss_res_buf = <unsigned char*> mem.alloc(ss_max_l, 4)
         cdef unsigned char* ss_l_buf = <unsigned char*> mem.alloc(ss_max_l, 1)
         cdef int doc_l = self.length, total_hashes = doc_l * hashes_per_tok
-        cdef np.int64_t* hashes_ptr = <np.int64_t*> mem.alloc(
-            total_hashes, sizeof(np.int64_t))
+        cdef np.uint32_t* hashes_ptr = <np.uint32_t*> mem.alloc(
+            total_hashes, sizeof(np.uint32_t))

         # Define working variables
         cdef TokenC tok_c
         cdef int hash_idx, tok_i, tok_str_l
         cdef attr_t num_tok_attr
         cdef const unsigned char* tok_str
-        cdef np.int64_t* w_hashes_ptr = hashes_ptr
+        cdef np.uint32_t* w_hashes_ptr = hashes_ptr

         for tok_i in range(doc_l):
             tok_c = self.c[tok_i]
@@ -1843,9 +1842,9 @@ cdef class Doc:
                 ss_3byte_ch, ss_3byte_ch_l, ss_4byte_ch, ss_4byte_ch_l, ss_res_buf, ss_max_l, ss_l_buf, True)
             w_hashes_ptr += _write_hashes(ss_res_buf, ss_lengths, ss_l_buf, 0, w_hashes_ptr)

-        cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty(
-            (doc_l, hashes_per_tok), dtype="int64")
-        memcpy(hashes.data, hashes_ptr, total_hashes * sizeof(np.int64_t))
+        cdef np.ndarray[np.uint32_t, ndim=2] hashes = numpy.empty(
+            (doc_l, hashes_per_tok), dtype="uint32")
+        memcpy(hashes.data, hashes_ptr, total_hashes * sizeof(np.uint32_t))
         return hashes
@@ -2029,6 +2028,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
             lca_matrix[k, j] = lca - start
     return lca_matrix

+
 @cython.boundscheck(False)  # Deactivate bounds checking
 cdef void _set_prefix_lengths(
     const unsigned char* tok_str,
@@ -2181,13 +2181,38 @@ cdef void _search_for_chars(
     memset(l_buf + l_buf_idx, res_buf_idx, max_res_l - l_buf_idx)


+@cython.boundscheck(False)  # Deactivate bounds checking
+cdef uint32_t fnv1a_hash(
+    const unsigned char* ptr,
+    const int length
+) nogil:
+    """ Returns the FNV-1a hash for a sequence of bytes.
+
+    The behaviour of this method has been verified against several pieces
+    of data from http://www.isthe.com/chongo/src/fnv/test_fnv.c.
+    """
+    cdef uint32_t hash_val = 0x811c9dc5
+    cdef int offset = 0
+    while offset < length:
+        hash_val ^= ptr[offset]
+        hash_val *= 0x01000193
+        offset += 1
+    return hash_val
+
+
+def get_fnv1a_hash(input: bytes):
+    """ Python method to facilitate testing *fnv1a_hash*. """
+    return fnv1a_hash(input, len(input))
+
+
 @cython.boundscheck(False)  # Deactivate bounds checking
 cdef int _write_hashes(
     const unsigned char* res_buf,
     const unsigned char* aff_l_buf,
     const unsigned char* offset_buf,
     const int end_idx,
-    np.int64_t* hashes_ptr,
+    np.uint32_t* hashes_ptr,
 ) nogil:
     """ Write hashes for a token/rich property group combination.
@@ -2208,9 +2233,9 @@ cdef int _write_hashes(
         offset = offset_buf[aff_l - 1]
         if offset > 0:
             if end_idx != 0:
-                hash_val = hash32(<void*> (res_buf + end_idx - offset), offset, 0)
+                hash_val = fnv1a_hash(res_buf + end_idx - offset, offset)
             else:
-                hash_val = hash32(<void*> res_buf, offset, 0)
+                hash_val = fnv1a_hash(res_buf, offset)
             hashes_ptr[hash_idx] = hash_val
             hash_idx += 1
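
As the fnv1a_hash docstring notes, the implementation can be checked against the published vectors in test_fnv.c, and get_fnv1a_hash exists precisely to make that possible from Python. A usage sketch, assuming a build that includes this commit:

    # Hashes of b"" and b"a" are standard published FNV-1a 32-bit vectors
    # (the empty input returns the offset basis unchanged).
    from spacy.tokens.doc import get_fnv1a_hash

    assert get_fnv1a_hash(b"") == 0x811C9DC5
    assert get_fnv1a_hash(b"a") == 0xE40C292C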