diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 99d0b913e..cdbf9c007 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -1,6 +1,7 @@
 import weakref
 
 import numpy
+import ctypes
 from numpy.testing import assert_array_equal
 from murmurhash.mrmr import hash
 import pytest
@@ -14,6 +15,7 @@
 from spacy.lang.xx import MultiLanguage
 from spacy.language import Language
 from spacy.lexeme import Lexeme
 from spacy.tokens import Doc, Span, SpanGroup, Token
+from spacy.tokens.doc import get_fnv1a_hash
 from spacy.util import get_search_char_byte_arrays
 from spacy.vocab import Vocab
@@ -994,9 +996,8 @@ def test_doc_spans_setdefault(en_tokenizer):
     assert len(doc.spans["key3"]) == 2
 
 
-def _get_32_bit_hash(input: str) -> int:
-    working_hash = hash(input.encode("UTF-8"))
-    return working_hash
+def _encode_and_hash(input: str) -> int:
+    return get_fnv1a_hash(input.encode("UTF-8"))
 
 
 @pytest.mark.parametrize("case_sensitive", [True, False])
@@ -1051,62 +1052,60 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
         hashes_per_tok=10,
     )
 
-    print(hashes)
+    assert hashes[0][0] == _encode_and_hash("s")
+    assert hashes[0][1] == _encode_and_hash("spa")
+    assert hashes[0][2] == _encode_and_hash("spaC" if case_sensitive else "spac")
+    assert hashes[0][3] == _encode_and_hash("Cy" if case_sensitive else "cy")
+    assert hashes[0][4] == _encode_and_hash("aCy" if case_sensitive else "acy")
+    assert hashes[0][5] == _encode_and_hash("paCy" if case_sensitive else "pacy")
+    assert hashes[0][6] == _encode_and_hash("spaCy" if case_sensitive else "spacy")
 
-    assert hashes[0][0] == _get_32_bit_hash("s")
-    assert hashes[0][1] == _get_32_bit_hash("spa")
-    assert hashes[0][2] == _get_32_bit_hash("spaC" if case_sensitive else "spac")
-    assert hashes[0][3] == _get_32_bit_hash("Cy" if case_sensitive else "cy")
-    assert hashes[0][4] == _get_32_bit_hash("aCy" if case_sensitive else "acy")
-    assert hashes[0][5] == _get_32_bit_hash("paCy" if case_sensitive else "pacy")
-    assert hashes[0][6] == _get_32_bit_hash("spaCy" if case_sensitive else "spacy")
-
-    assert hashes[0][7] == _get_32_bit_hash("p")
-    assert hashes[0][8] == _get_32_bit_hash("p")
-    assert hashes[0][9] == _get_32_bit_hash("p")
-    assert hashes[1][0] == _get_32_bit_hash("✨")
-    assert hashes[1][1] == _get_32_bit_hash("✨")
-    assert hashes[1][2] == _get_32_bit_hash("✨")
-    assert hashes[1][3] == _get_32_bit_hash("✨")
-    assert hashes[1][4] == _get_32_bit_hash("✨")
-    assert hashes[1][5] == _get_32_bit_hash("✨")
-    assert hashes[1][6] == _get_32_bit_hash("✨")
+    assert hashes[0][7] == _encode_and_hash("p")
+    assert hashes[0][8] == _encode_and_hash("p")
+    assert hashes[0][9] == _encode_and_hash("p")
+    assert hashes[1][0] == _encode_and_hash("✨")
+    assert hashes[1][1] == _encode_and_hash("✨")
+    assert hashes[1][2] == _encode_and_hash("✨")
+    assert hashes[1][3] == _encode_and_hash("✨")
+    assert hashes[1][4] == _encode_and_hash("✨")
+    assert hashes[1][5] == _encode_and_hash("✨")
+    assert hashes[1][6] == _encode_and_hash("✨")
     assert hashes[1][7] == 0
-    assert hashes[1][8] == _get_32_bit_hash("✨")
-    assert hashes[1][9] == _get_32_bit_hash("✨")
-    assert hashes[2][0] == _get_32_bit_hash("a")
-    assert hashes[2][1] == _get_32_bit_hash("and")
-    assert hashes[2][2] == _get_32_bit_hash("and")
-    assert hashes[2][3] == _get_32_bit_hash("nd")
-    assert hashes[2][4] == _get_32_bit_hash("and")
-    assert hashes[2][5] == _get_32_bit_hash("and")
-    assert hashes[2][6] == _get_32_bit_hash("and")
+    assert hashes[1][8] == _encode_and_hash("✨")
+    assert hashes[1][9] == _encode_and_hash("✨")
+    assert hashes[2][0] == _encode_and_hash("a")
+    assert hashes[2][1] == _encode_and_hash("and")
+    assert hashes[2][2] == _encode_and_hash("and")
+    assert hashes[2][3] == _encode_and_hash("nd")
+    assert hashes[2][4] == _encode_and_hash("and")
+    assert hashes[2][5] == _encode_and_hash("and")
+    assert hashes[2][6] == _encode_and_hash("and")
     assert hashes[2][7] == 0
     assert hashes[2][8] == 0
     assert hashes[2][9] == 0
 
-    assert hashes[3][0] == _get_32_bit_hash("P" if case_sensitive else "p")
-    assert hashes[3][1] == _get_32_bit_hash("Pro" if case_sensitive else "pro")
-    assert hashes[3][2] == _get_32_bit_hash("Prod" if case_sensitive else "prod")
-    assert hashes[3][3] == _get_32_bit_hash("gy")
-    assert hashes[3][4] == _get_32_bit_hash("igy")
-    assert hashes[3][5] == _get_32_bit_hash("digy")
-    assert hashes[3][6] == _get_32_bit_hash("odigy")
-    assert hashes[3][7] == 0 if case_sensitive else _get_32_bit_hash("pr")
+    assert hashes[3][0] == _encode_and_hash("P" if case_sensitive else "p")
+    assert hashes[3][1] == _encode_and_hash("Pro" if case_sensitive else "pro")
+    assert hashes[3][2] == _encode_and_hash("Prod" if case_sensitive else "prod")
+    assert hashes[3][3] == _encode_and_hash("gy")
+    assert hashes[3][4] == _encode_and_hash("igy")
+    assert hashes[3][5] == _encode_and_hash("digy")
+    assert hashes[3][6] == _encode_and_hash("odigy")
+    assert hashes[3][7] == (0 if case_sensitive else _encode_and_hash("pr"))
 
-    assert hashes[3][8] == _get_32_bit_hash("r")
+    assert hashes[3][8] == _encode_and_hash("r")
     if case_sensitive:
-        assert hashes[3][9] == _get_32_bit_hash("r")
+        assert hashes[3][9] == _encode_and_hash("r")
     else:
-        assert hashes[3][9] == _get_32_bit_hash("rp")
+        assert hashes[3][9] == _encode_and_hash("rp")
 
     # check values are the same cross-platform
     if case_sensitive:
-        assert hashes[0][2] == -1253438126
+        assert hashes[0][2] == 1140960578
     else:
-        assert hashes[0][2] == -2095352600
-        assert hashes[1][3] == 910783208
-        assert hashes[3][8] == 1553167345
+        assert hashes[0][2] == 604076770
+        assert hashes[1][3] == 3384544169
+        assert hashes[3][8] == 4144776981
 
 
 def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
@@ -1148,26 +1147,26 @@
         hashes_per_tok=5,
     )
 
-    assert hashes[0][0] == _get_32_bit_hash("cy")
-    assert hashes[0][1] == _get_32_bit_hash("acy")
-    assert hashes[0][2] == _get_32_bit_hash("pacy")
-    assert hashes[0][3] == _get_32_bit_hash("spacy")
-    assert hashes[0][4] == _get_32_bit_hash("p")
-    assert hashes[1][0] == _get_32_bit_hash("✨")
-    assert hashes[1][1] == _get_32_bit_hash("✨")
-    assert hashes[1][2] == _get_32_bit_hash("✨")
-    assert hashes[1][3] == _get_32_bit_hash("✨")
+    assert hashes[0][0] == _encode_and_hash("cy")
+    assert hashes[0][1] == _encode_and_hash("acy")
+    assert hashes[0][2] == _encode_and_hash("pacy")
+    assert hashes[0][3] == _encode_and_hash("spacy")
+    assert hashes[0][4] == _encode_and_hash("p")
+    assert hashes[1][0] == _encode_and_hash("✨")
+    assert hashes[1][1] == _encode_and_hash("✨")
+    assert hashes[1][2] == _encode_and_hash("✨")
+    assert hashes[1][3] == _encode_and_hash("✨")
     assert hashes[1][4] == 0
-    assert hashes[2][0] == _get_32_bit_hash("nd")
-    assert hashes[2][1] == _get_32_bit_hash("and")
-    assert hashes[2][2] == _get_32_bit_hash("and")
-    assert hashes[2][3] == _get_32_bit_hash("and")
+    assert hashes[2][0] == _encode_and_hash("nd")
+    assert hashes[2][1] == _encode_and_hash("and")
+    assert hashes[2][2] == _encode_and_hash("and")
+    assert hashes[2][3] == _encode_and_hash("and")
     assert hashes[2][4] == 0
-    assert hashes[3][0] == _get_32_bit_hash("gy")
-    assert hashes[3][1] == _get_32_bit_hash("igy")
-    assert hashes[3][2] == _get_32_bit_hash("digy")
-    assert hashes[3][3] == _get_32_bit_hash("odigy")
-    assert hashes[3][4] == _get_32_bit_hash("pr")
+    assert hashes[3][0] == _encode_and_hash("gy")
+    assert hashes[3][1] == _encode_and_hash("igy")
+    assert hashes[3][2] == _encode_and_hash("digy")
+    assert hashes[3][3] == _encode_and_hash("odigy")
+    assert hashes[3][4] == _encode_and_hash("pr")
 
 
 def test_get_character_combination_hashes_copying_in_middle(en_tokenizer):
@@ -1205,8 +1204,8 @@
         hashes_per_tok=2,
     )
 
-    assert hashes[0][0] == _get_32_bit_hash("sp𐌞cé"[:p_length])
-    assert hashes[0][1] == _get_32_bit_hash("sp𐌞cé"[-s_length:])
+    assert hashes[0][0] == _encode_and_hash("sp𐌞cé"[:p_length])
+    assert hashes[0][1] == _encode_and_hash("sp𐌞cé"[-s_length:])
 
 
 @pytest.mark.parametrize("case_sensitive", [True, False])
@@ -1273,37 +1272,37 @@ def test_get_character_combination_hashes_turkish_i_with_dot(
     )
 
    COMBINING_DOT_ABOVE = b"\xcc\x87".decode("UTF-8")
-    assert hashes[0][0] == _get_32_bit_hash("i")
-    assert hashes[0][1] == _get_32_bit_hash("İ".lower())
+    assert hashes[0][0] == _encode_and_hash("i")
+    assert hashes[0][1] == _encode_and_hash("İ".lower())
 
     if case_sensitive:
-        assert hashes[0][2] == _get_32_bit_hash("İ".lower() + "İ")
-        assert hashes[0][3] == _get_32_bit_hash("İ".lower() + "İ")
-        assert hashes[0][4] == _get_32_bit_hash("İ")
-        assert hashes[0][5] == _get_32_bit_hash(COMBINING_DOT_ABOVE + "İ")
-        assert hashes[0][6] == _get_32_bit_hash("İ".lower() + "İ")
-        assert hashes[0][7] == _get_32_bit_hash("İ".lower() + "İ")
-        assert hashes[0][8] == _get_32_bit_hash("İ")
-        assert hashes[0][9] == _get_32_bit_hash("İ")
-        assert hashes[0][12] == _get_32_bit_hash("İ")
-        assert hashes[0][13] == _get_32_bit_hash("İ")
+        assert hashes[0][2] == _encode_and_hash("İ".lower() + "İ")
+        assert hashes[0][3] == _encode_and_hash("İ".lower() + "İ")
+        assert hashes[0][4] == _encode_and_hash("İ")
+        assert hashes[0][5] == _encode_and_hash(COMBINING_DOT_ABOVE + "İ")
+        assert hashes[0][6] == _encode_and_hash("İ".lower() + "İ")
+        assert hashes[0][7] == _encode_and_hash("İ".lower() + "İ")
+        assert hashes[0][8] == _encode_and_hash("İ")
+        assert hashes[0][9] == _encode_and_hash("İ")
+        assert hashes[0][12] == _encode_and_hash("İ")
+        assert hashes[0][13] == _encode_and_hash("İ")
     else:
-        assert hashes[0][2] == _get_32_bit_hash("İ".lower() + "i")
-        assert hashes[0][3] == _get_32_bit_hash("İ".lower() * 2)
-        assert hashes[0][4] == _get_32_bit_hash(COMBINING_DOT_ABOVE)
-        assert hashes[0][5] == _get_32_bit_hash("İ".lower())
-        assert hashes[0][6] == _get_32_bit_hash(COMBINING_DOT_ABOVE + "İ".lower())
-        assert hashes[0][7] == _get_32_bit_hash("İ".lower() * 2)
-        assert hashes[0][8] == _get_32_bit_hash("i")
-        assert hashes[0][9] == _get_32_bit_hash("İ".lower())
-        assert hashes[0][10] == _get_32_bit_hash("İ".lower() + "i")
-        assert hashes[0][11] == _get_32_bit_hash("İ".lower() * 2)
-        assert hashes[0][12] == _get_32_bit_hash(COMBINING_DOT_ABOVE)
-        assert hashes[0][13] == _get_32_bit_hash(COMBINING_DOT_ABOVE + "i")
-        assert hashes[0][14] == _get_32_bit_hash(
+        assert hashes[0][2] == _encode_and_hash("İ".lower() + "i")
+        assert hashes[0][3] == _encode_and_hash("İ".lower() * 2)
+        assert hashes[0][4] == _encode_and_hash(COMBINING_DOT_ABOVE)
+        assert hashes[0][5] == _encode_and_hash("İ".lower())
+        assert hashes[0][6] == _encode_and_hash(COMBINING_DOT_ABOVE + "İ".lower())
+        assert hashes[0][7] == _encode_and_hash("İ".lower() * 2)
+        assert hashes[0][8] == _encode_and_hash("i")
+        assert hashes[0][9] == _encode_and_hash("İ".lower())
+        assert hashes[0][10] == _encode_and_hash("İ".lower() + "i")
+        assert hashes[0][11] == _encode_and_hash("İ".lower() * 2)
+        assert hashes[0][12] == _encode_and_hash(COMBINING_DOT_ABOVE)
+        assert hashes[0][13] == _encode_and_hash(COMBINING_DOT_ABOVE + "i")
+        assert hashes[0][14] == _encode_and_hash(
             COMBINING_DOT_ABOVE + "i" + COMBINING_DOT_ABOVE
         )
-        assert hashes[0][15] == _get_32_bit_hash((COMBINING_DOT_ABOVE + "i") * 2)
+        assert hashes[0][15] == _encode_and_hash((COMBINING_DOT_ABOVE + "i") * 2)
 
 
 @pytest.mark.parametrize("case_sensitive", [True, False])
@@ -1346,21 +1345,21 @@ def test_get_character_combination_hashes_string_store_spec_cases(
         ss_max_l=0,
         hashes_per_tok=3,
     )
-    assert hashes[0][0] == _get_32_bit_hash("FL" if case_sensitive else "fl")
-    assert hashes[0][1] == _get_32_bit_hash("19")
+    assert hashes[0][0] == _encode_and_hash("FL" if case_sensitive else "fl")
+    assert hashes[0][1] == _encode_and_hash("19")
     assert hashes[0][2] == 0
-    assert hashes[1][0] == _get_32_bit_hash("be")
-    assert hashes[1][1] == _get_32_bit_hash("ee")
+    assert hashes[1][0] == _encode_and_hash("be")
+    assert hashes[1][1] == _encode_and_hash("ee")
     if case_sensitive:
         assert hashes[1][2] == 0
     else:
-        assert hashes[1][2] == _get_32_bit_hash("ee")
-    assert hashes[2][0] == hashes[3][0] == _get_32_bit_hash("se")
-    assert hashes[2][1] == hashes[3][1] == _get_32_bit_hash("ty")
+        assert hashes[1][2] == _encode_and_hash("ee")
+    assert hashes[2][0] == hashes[3][0] == _encode_and_hash("se")
+    assert hashes[2][1] == hashes[3][1] == _encode_and_hash("ty")
     if case_sensitive:
         assert hashes[2][2] == hashes[3][2] == 0
     else:
-        assert hashes[2][2] == hashes[3][2] == _get_32_bit_hash("ee")
+        assert hashes[2][2] == hashes[3][2] == _encode_and_hash("ee")
 
 
 def test_character_combination_hashes_empty_lengths(en_tokenizer):
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index 41d150bb0..12a0e03a4 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -6,6 +6,8 @@ from ..structs cimport TokenC, LexemeC, SpanC
 from ..typedefs cimport attr_t
 from ..attrs cimport attr_id_t
 
+from libc.stdint cimport uint32_t
+
 cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil
 
 cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name) nogil
@@ -72,12 +74,18 @@ cdef void _search_for_chars(
 ) nogil
 
 
+cdef uint32_t fnv1a_hash(
+    const unsigned char* ptr,
+    const int length
+) nogil
+
+
 cdef int _write_hashes(
     const unsigned char* res_buf,
     const unsigned char* aff_l_buf,
     const unsigned char* offset_buf,
     const int end_idx,
-    np.int64_t* hashes_ptr,
+    np.uint32_t* hashes_ptr,
 ) nogil
 
 
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 5c751d5a5..33d45a145 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -6,7 +6,7 @@ cimport numpy as np
 from cpython cimport array
 from libc.string cimport memcpy, memcmp, memset, strlen
 from libc.math cimport sqrt
-from libc.stdint cimport int32_t, uint64_t
+from libc.stdint cimport int32_t, uint64_t, uint32_t
 
 import copy
 from collections import Counter, defaultdict
@@ -17,7 +17,6 @@ import srsly
 from thinc.api import get_array_module, get_current_ops
 from thinc.util import copy_array
 import warnings
-from murmurhash.mrmr cimport hash32
 
 from .span cimport Span
 from .token cimport MISSING_DEP
@@ -1809,15 +1808,15 @@
         cdef unsigned char* ss_res_buf = <unsigned char*>mem.alloc(ss_max_l, 4)
         cdef unsigned char* ss_l_buf = <unsigned char*>mem.alloc(ss_max_l, 1)
         cdef int doc_l = self.length, total_hashes = doc_l * hashes_per_tok
-        cdef np.int64_t* hashes_ptr = <np.int64_t*>mem.alloc(
-            total_hashes, sizeof(np.int64_t))
+        cdef np.uint32_t* hashes_ptr = <np.uint32_t*>mem.alloc(
+            total_hashes, sizeof(np.uint32_t))
 
         # Define working variables
         cdef TokenC tok_c
         cdef int hash_idx, tok_i, tok_str_l
         cdef attr_t num_tok_attr
         cdef const unsigned char* tok_str
-        cdef np.int64_t* w_hashes_ptr = hashes_ptr
+        cdef np.uint32_t* w_hashes_ptr = hashes_ptr
 
         for tok_i in range(doc_l):
             tok_c = self.c[tok_i]
@@ -1843,9 +1842,9 @@
                     ss_3byte_ch, ss_3byte_ch_l, ss_4byte_ch, ss_4byte_ch_l,
                     ss_res_buf, ss_max_l, ss_l_buf, True)
                 w_hashes_ptr += _write_hashes(ss_res_buf, ss_lengths, ss_l_buf, 0, w_hashes_ptr)
-        cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty(
-            (doc_l, hashes_per_tok), dtype="int64")
-        memcpy(hashes.data, hashes_ptr, total_hashes * sizeof(np.int64_t))
+        cdef np.ndarray[np.uint32_t, ndim=2] hashes = numpy.empty(
+            (doc_l, hashes_per_tok), dtype="uint32")
+        memcpy(hashes.data, hashes_ptr, total_hashes * sizeof(np.uint32_t))
         return hashes
 
 
@@ -2029,6 +2028,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
                 lca_matrix[k, j] = lca - start
     return lca_matrix
 
+
 @cython.boundscheck(False) # Deactivate bounds checking
 cdef void _set_prefix_lengths(
     const unsigned char* tok_str,
@@ -2181,13 +2181,38 @@ cdef void _search_for_chars(
     memset(l_buf + l_buf_idx, res_buf_idx, max_res_l - l_buf_idx)
 
 
+@cython.boundscheck(False) # Deactivate bounds checking
+cdef uint32_t fnv1a_hash(
+    const unsigned char* ptr,
+    const int length
+) nogil:
+    """ Returns the FNV-1a hash for a sequence of bytes.
+    The behaviour of this method has been verified against several pieces
+    of data from http://www.isthe.com/chongo/src/fnv/test_fnv.c.
+    """
+    cdef uint32_t hash_val = 0x811c9dc5
+    cdef int offset = 0
+
+    while offset < length:
+        hash_val ^= ptr[offset]
+        hash_val *= 0x01000193
+        offset += 1
+
+    return hash_val
+
+
+def get_fnv1a_hash(input: bytes):
+    """ Python method to facilitate testing *fnv1a_hash*. """
+    return fnv1a_hash(input, len(input))
+
+
 @cython.boundscheck(False) # Deactivate bounds checking
 cdef int _write_hashes(
     const unsigned char* res_buf,
     const unsigned char* aff_l_buf,
     const unsigned char* offset_buf,
     const int end_idx,
-    np.int64_t* hashes_ptr,
+    np.uint32_t* hashes_ptr,
 ) nogil:
     """ Write hashes for a token/rich property group combination.
 
@@ -2208,9 +2233,9 @@
         offset = offset_buf[aff_l - 1]
         if offset > 0:
             if end_idx != 0:
-                hash_val = hash32(<void*>(res_buf + end_idx - offset), offset, 0)
+                hash_val = fnv1a_hash(res_buf + end_idx - offset, offset)
             else:
-                hash_val = hash32(<void*>res_buf, offset, 0)
+                hash_val = fnv1a_hash(res_buf, offset)
             hashes_ptr[hash_idx] = hash_val
             hash_idx += 1
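
Note on the hash change above: murmurhash's hash32 returns a signed 32-bit value (hence the old negative cross-platform constants such as -1253438126), while the new fnv1a_hash is unsigned, which is why both the expected constants and the result array move to uint32. The sketch below is a minimal pure-Python FNV-1a reference for sanity-checking the Cython implementation; the helper name fnv1a_32 is illustrative only and not part of the patch. The constant 3384544169 is the same value the patched tests assert for hashes[1][3].

def fnv1a_32(data: bytes) -> int:
    # 32-bit FNV-1a: start from the offset basis, then for each byte
    # XOR it in and multiply by the FNV prime, wrapping to 32 bits.
    h = 0x811C9DC5
    for byte in data:
        h ^= byte
        h = (h * 0x01000193) & 0xFFFFFFFF
    return h

assert fnv1a_32(b"") == 0x811C9DC5  # zero-length input yields the offset basis
assert fnv1a_32("✨".encode("UTF-8")) == 3384544169  # matches hashes[1][3] above

Once spacy.tokens.doc is built from this branch, the same values can be checked against the compiled helper, e.g. get_fnv1a_hash("✨".encode("UTF-8")) == fnv1a_32("✨".encode("UTF-8")).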