From e7626f423a8642fbeeb4709b4372c66dfb473b0f Mon Sep 17 00:00:00 2001 From: "richard@explosion.ai" Date: Wed, 2 Nov 2022 17:11:20 +0100 Subject: [PATCH] Generate Numpy array at end --- spacy/tests/doc/test_doc_api.py | 291 +++++++++++++++++++------------- spacy/tokens/doc.pxd | 9 +- spacy/tokens/doc.pyi | 30 +--- spacy/tokens/doc.pyx | 59 ++++--- 4 files changed, 211 insertions(+), 178 deletions(-) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 61eab311b..99d0b913e 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -1006,45 +1006,60 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive ss1, ss2, ss3, ss4 = get_search_char_byte_arrays("xx✨rp", case_sensitive) hashes = doc.get_character_combination_hashes( cs=case_sensitive, - p_lengths=bytes((1, 3, 4,)), - p_max_l = 4, - s_lengths=bytes((2, 3, 4, 5,)), - s_max_l = 5, + p_lengths=bytes( + ( + 1, + 3, + 4, + ) + ), + p_max_l=4, + s_lengths=bytes( + ( + 2, + 3, + 4, + 5, + ) + ), + s_max_l=5, ps_1byte_ch=ps1, - ps_1byte_ch_l = len(ps1), + ps_1byte_ch_l=len(ps1), ps_2byte_ch=ps2, - ps_2byte_ch_l = len(ps2), + ps_2byte_ch_l=len(ps2), ps_3byte_ch=ps3, - ps_3byte_ch_l = len(ps3), + ps_3byte_ch_l=len(ps3), ps_4byte_ch=ps4, - ps_4byte_ch_l = len(ps4), + ps_4byte_ch_l=len(ps4), ps_lengths=bytes((2,)), - ps_max_l = 2, + ps_max_l=2, ss_1byte_ch=ss1, - ss_1byte_ch_l = len(ss1), + ss_1byte_ch_l=len(ss1), ss_2byte_ch=ss2, - ss_2byte_ch_l = len(ss2), + ss_2byte_ch_l=len(ss2), ss_3byte_ch=ss3, - ss_3byte_ch_l = len(ss3), + ss_3byte_ch_l=len(ss3), ss_4byte_ch=ss4, - ss_4byte_ch_l = len(ss4), - ss_lengths=bytes((1, 2,)), - ss_max_l = 2, + ss_4byte_ch_l=len(ss4), + ss_lengths=bytes( + ( + 1, + 2, + ) + ), + ss_max_l=2, + hashes_per_tok=10, ) + print(hashes) + assert hashes[0][0] == _get_32_bit_hash("s") assert hashes[0][1] == _get_32_bit_hash("spa") - assert hashes[0][2] == _get_32_bit_hash( - "spaC" if case_sensitive else "spac" - ) + assert hashes[0][2] == _get_32_bit_hash("spaC" if case_sensitive else "spac") assert hashes[0][3] == _get_32_bit_hash("Cy" if case_sensitive else "cy") assert hashes[0][4] == _get_32_bit_hash("aCy" if case_sensitive else "acy") - assert hashes[0][5] == _get_32_bit_hash( - "paCy" if case_sensitive else "pacy" - ) - assert hashes[0][6] == _get_32_bit_hash( - "spaCy" if case_sensitive else "spacy" - ) + assert hashes[0][5] == _get_32_bit_hash("paCy" if case_sensitive else "pacy") + assert hashes[0][6] == _get_32_bit_hash("spaCy" if case_sensitive else "spacy") assert hashes[0][7] == _get_32_bit_hash("p") assert hashes[0][8] == _get_32_bit_hash("p") @@ -1071,9 +1086,7 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive assert hashes[2][9] == 0 assert hashes[3][0] == _get_32_bit_hash("P" if case_sensitive else "p") assert hashes[3][1] == _get_32_bit_hash("Pro" if case_sensitive else "pro") - assert hashes[3][2] == _get_32_bit_hash( - "Prod" if case_sensitive else "prod" - ) + assert hashes[3][2] == _get_32_bit_hash("Prod" if case_sensitive else "prod") assert hashes[3][3] == _get_32_bit_hash("gy") assert hashes[3][4] == _get_32_bit_hash("igy") assert hashes[3][5] == _get_32_bit_hash("digy") @@ -1102,32 +1115,39 @@ def test_get_character_combination_hashes_good_case_partial(en_tokenizer): hashes = doc.get_character_combination_hashes( cs=False, p_lengths=bytes(), - p_max_l = 0, - s_lengths=bytes((2,3,4,5,)), - s_max_l = 5, + p_max_l=0, + s_lengths=bytes( + ( + 2, + 3, + 4, + 5, + ) + ), + s_max_l=5, 
ps_1byte_ch=ps1, - ps_1byte_ch_l = len(ps1), + ps_1byte_ch_l=len(ps1), ps_2byte_ch=ps2, - ps_2byte_ch_l = len(ps2), + ps_2byte_ch_l=len(ps2), ps_3byte_ch=ps3, - ps_3byte_ch_l = len(ps3), + ps_3byte_ch_l=len(ps3), ps_4byte_ch=ps4, - ps_4byte_ch_l = len(ps4), + ps_4byte_ch_l=len(ps4), ps_lengths=bytes((2,)), - ps_max_l = 2, + ps_max_l=2, ss_1byte_ch=bytes(), - ss_1byte_ch_l = 0, + ss_1byte_ch_l=0, ss_2byte_ch=bytes(), - ss_2byte_ch_l = 0, + ss_2byte_ch_l=0, ss_3byte_ch=bytes(), - ss_3byte_ch_l = 0, + ss_3byte_ch_l=0, ss_4byte_ch=bytes(), - ss_4byte_ch_l = 0, + ss_4byte_ch_l=0, ss_lengths=bytes(), - ss_max_l = 0, + ss_max_l=0, + hashes_per_tok=5, ) - - + assert hashes[0][0] == _get_32_bit_hash("cy") assert hashes[0][1] == _get_32_bit_hash("acy") assert hashes[0][2] == _get_32_bit_hash("pacy") @@ -1155,33 +1175,34 @@ def test_get_character_combination_hashes_copying_in_middle(en_tokenizer): for p_length in range(1, 8): for s_length in range(1, 8): - + hashes = doc.get_character_combination_hashes( cs=False, p_lengths=bytes((p_length,)), - p_max_l = p_length, + p_max_l=p_length, s_lengths=bytes((s_length,)), - s_max_l = s_length, + s_max_l=s_length, ps_1byte_ch=bytes(), - ps_1byte_ch_l = 0, + ps_1byte_ch_l=0, ps_2byte_ch=bytes(), - ps_2byte_ch_l = 0, + ps_2byte_ch_l=0, ps_3byte_ch=bytes(), - ps_3byte_ch_l = 0, + ps_3byte_ch_l=0, ps_4byte_ch=bytes(), - ps_4byte_ch_l = 0, + ps_4byte_ch_l=0, ps_lengths=bytes(), - ps_max_l = 0, + ps_max_l=0, ss_1byte_ch=bytes(), - ss_1byte_ch_l = 0, + ss_1byte_ch_l=0, ss_2byte_ch=bytes(), - ss_2byte_ch_l = 0, + ss_2byte_ch_l=0, ss_3byte_ch=bytes(), - ss_3byte_ch_l = 0, + ss_3byte_ch_l=0, ss_4byte_ch=bytes(), - ss_4byte_ch_l = 0, + ss_4byte_ch_l=0, ss_lengths=bytes(), - ss_max_l = 0 + ss_max_l=0, + hashes_per_tok=2, ) assert hashes[0][0] == _get_32_bit_hash("sp𐌞cé"[:p_length]) @@ -1189,35 +1210,66 @@ def test_get_character_combination_hashes_copying_in_middle(en_tokenizer): @pytest.mark.parametrize("case_sensitive", [True, False]) -def test_get_character_combination_hashes_turkish_i_with_dot(en_tokenizer, case_sensitive): +def test_get_character_combination_hashes_turkish_i_with_dot( + en_tokenizer, case_sensitive +): doc = en_tokenizer("İ".lower() + "İ") s1, s2, s3, s4 = get_search_char_byte_arrays("İ", case_sensitive) hashes = doc.get_character_combination_hashes( cs=case_sensitive, - p_lengths=bytes((1,2,3,4,)), - p_max_l = 4, - s_lengths=bytes((1,2,3,4,)), - s_max_l = 4, + p_lengths=bytes( + ( + 1, + 2, + 3, + 4, + ) + ), + p_max_l=4, + s_lengths=bytes( + ( + 1, + 2, + 3, + 4, + ) + ), + s_max_l=4, ps_1byte_ch=s1, - ps_1byte_ch_l = len(s1), + ps_1byte_ch_l=len(s1), ps_2byte_ch=s2, - ps_2byte_ch_l = len(s2), + ps_2byte_ch_l=len(s2), ps_3byte_ch=s3, - ps_3byte_ch_l = len(s3), + ps_3byte_ch_l=len(s3), ps_4byte_ch=s4, - ps_4byte_ch_l = len(s4), - ps_lengths=bytes((1,2,3,4,)), - ps_max_l = 4, + ps_4byte_ch_l=len(s4), + ps_lengths=bytes( + ( + 1, + 2, + 3, + 4, + ) + ), + ps_max_l=4, ss_1byte_ch=s1, - ss_1byte_ch_l = len(s1), + ss_1byte_ch_l=len(s1), ss_2byte_ch=s2, - ss_2byte_ch_l = len(s2), + ss_2byte_ch_l=len(s2), ss_3byte_ch=s3, - ss_3byte_ch_l = len(s3), + ss_3byte_ch_l=len(s3), ss_4byte_ch=s4, - ss_4byte_ch_l = len(s4), - ss_lengths=bytes((1,2,3,4,)), - ss_max_l = 4 + ss_4byte_ch_l=len(s4), + ss_lengths=bytes( + ( + 1, + 2, + 3, + 4, + ) + ), + ss_max_l=4, + hashes_per_tok=16, ) COMBINING_DOT_ABOVE = b"\xcc\x87".decode("UTF-8") @@ -1248,46 +1300,51 @@ def test_get_character_combination_hashes_turkish_i_with_dot(en_tokenizer, case_ assert hashes[0][11] == 
_get_32_bit_hash("İ".lower() * 2) assert hashes[0][12] == _get_32_bit_hash(COMBINING_DOT_ABOVE) assert hashes[0][13] == _get_32_bit_hash(COMBINING_DOT_ABOVE + "i") - assert hashes[0][14] == _get_32_bit_hash(COMBINING_DOT_ABOVE + "i" + COMBINING_DOT_ABOVE) + assert hashes[0][14] == _get_32_bit_hash( + COMBINING_DOT_ABOVE + "i" + COMBINING_DOT_ABOVE + ) assert hashes[0][15] == _get_32_bit_hash((COMBINING_DOT_ABOVE + "i") * 2) - + @pytest.mark.parametrize("case_sensitive", [True, False]) -def test_get_character_combination_hashes_string_store_spec_cases(en_tokenizer, case_sensitive): +def test_get_character_combination_hashes_string_store_spec_cases( + en_tokenizer, case_sensitive +): symbol = "FLAG19" short_word = "bee" normal_word = "serendipity" long_word = "serendipity" * 50 assert len(long_word) > 255 - doc = en_tokenizer(' '.join((symbol, short_word, normal_word, long_word))) + doc = en_tokenizer(" ".join((symbol, short_word, normal_word, long_word))) assert len(doc) == 4 ps1, ps2, ps3, ps4 = get_search_char_byte_arrays("E", case_sensitive) hashes = doc.get_character_combination_hashes( cs=case_sensitive, p_lengths=bytes((2,)), - p_max_l = 2, + p_max_l=2, s_lengths=bytes((2,)), - s_max_l = 2, + s_max_l=2, ps_1byte_ch=ps1, - ps_1byte_ch_l = len(ps1), + ps_1byte_ch_l=len(ps1), ps_2byte_ch=ps2, - ps_2byte_ch_l = len(ps2), + ps_2byte_ch_l=len(ps2), ps_3byte_ch=ps3, - ps_3byte_ch_l = len(ps3), + ps_3byte_ch_l=len(ps3), ps_4byte_ch=ps4, - ps_4byte_ch_l = len(ps4), + ps_4byte_ch_l=len(ps4), ps_lengths=bytes((2,)), - ps_max_l = 2, + ps_max_l=2, ss_1byte_ch=bytes(), - ss_1byte_ch_l = 0, + ss_1byte_ch_l=0, ss_2byte_ch=bytes(), - ss_2byte_ch_l = 0, + ss_2byte_ch_l=0, ss_3byte_ch=bytes(), - ss_3byte_ch_l = 0, + ss_3byte_ch_l=0, ss_4byte_ch=bytes(), - ss_4byte_ch_l = 0, + ss_4byte_ch_l=0, ss_lengths=bytes(), - ss_max_l = 0 + ss_max_l=0, + hashes_per_tok=3, ) assert hashes[0][0] == _get_32_bit_hash("FL" if case_sensitive else "fl") assert hashes[0][1] == _get_32_bit_hash("19") @@ -1308,30 +1365,34 @@ def test_get_character_combination_hashes_string_store_spec_cases(en_tokenizer, def test_character_combination_hashes_empty_lengths(en_tokenizer): doc = en_tokenizer("and𐌞") - assert doc.get_character_combination_hashes( - cs=True, - p_lengths=bytes(), - p_max_l = 0, - s_lengths=bytes(), - s_max_l = 0, - ps_1byte_ch=bytes(), - ps_1byte_ch_l=0, - ps_2byte_ch=bytes(), - ps_2byte_ch_l=0, - ps_3byte_ch=bytes(), - ps_3byte_ch_l=0, - ps_4byte_ch=bytes(), - ps_4byte_ch_l=0, - ps_lengths=bytes(), - ps_max_l = 0, - ss_1byte_ch=bytes(), - ss_1byte_ch_l=0, - ss_2byte_ch=bytes(), - ss_2byte_ch_l=0, - ss_3byte_ch=bytes(), - ss_3byte_ch_l=0, - ss_4byte_ch=bytes(), - ss_4byte_ch_l=0, - ss_lengths=bytes(), - ss_max_l = 0, - ).shape == (1, 0) + assert ( + doc.get_character_combination_hashes( + cs=True, + p_lengths=bytes(), + p_max_l=0, + s_lengths=bytes(), + s_max_l=0, + ps_1byte_ch=bytes(), + ps_1byte_ch_l=0, + ps_2byte_ch=bytes(), + ps_2byte_ch_l=0, + ps_3byte_ch=bytes(), + ps_3byte_ch_l=0, + ps_4byte_ch=bytes(), + ps_4byte_ch_l=0, + ps_lengths=bytes(), + ps_max_l=0, + ss_1byte_ch=bytes(), + ss_1byte_ch_l=0, + ss_2byte_ch=bytes(), + ss_2byte_ch_l=0, + ss_3byte_ch=bytes(), + ss_3byte_ch_l=0, + ss_4byte_ch=bytes(), + ss_4byte_ch_l=0, + ss_lengths=bytes(), + ss_max_l=0, + hashes_per_tok=0, + ).shape + == (1, 0) + ) diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 8888939df..41d150bb0 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -51,7 +51,7 @@ cdef void _set_suffix_lengths( const int 
tok_str_l, unsigned char* suff_l_buf, const int s_max_l, -) +) nogil cdef void _search_for_chars( @@ -72,16 +72,13 @@ cdef void _search_for_chars( ) nogil - cdef int _write_hashes( const unsigned char* res_buf, const unsigned char* aff_l_buf, const unsigned char* offset_buf, const int end_idx, - np.ndarray[np.int64_t, ndim=2] hashes, - const int tok_i, - const int start_hash_idx, -) + np.int64_t* hashes_ptr, +) nogil cdef class Doc: diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index b27c68386..231b9b84d 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -202,35 +202,7 @@ class Doc: ss_4_byte_ch_l: int, ss_lengths: bytes, ss_max_l: int, + hashes_per_tok: int, ) -> Ints2d: ... @staticmethod def _get_array_attrs() -> Tuple[Any]: ... - -def get_character_combination_hashes(self, - *, - const bint cs, - const unsigned char* p_lengths, - const int p_max_l, - const unsigned char* s_lengths, - const int s_max_l, - const unsigned char* ps_1byte_ch, - const int ps_1_byte_ch_l, - const unsigned char* ps_2byte_ch, - const int ps_2_byte_ch_l, - const unsigned char* ps_3byte_ch, - const int ps_3_byte_ch_l, - const unsigned char* ps_4byte_ch, - const int ps_4_byte_ch_l, - const unsigned char* ps_lengths, - const int ps_max_l, - const unsigned char* ss_1byte_ch, - const int ss_1_byte_ch_l, - const unsigned char* ss_2byte_ch, - const int ss_2_byte_ch_l, - const unsigned char* ss_3byte_ch, - const int ss_3_byte_ch_l, - const unsigned char* ss_4byte_ch, - const int ss_4_byte_ch_l, - const unsigned char* ss_lengths, - const int ss_max_l, - ) \ No newline at end of file diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 91836e15e..5c751d5a5 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1735,7 +1735,7 @@ cdef class Doc: j += 1 return output - #@cython.boundscheck(False) # Deactivate bounds checking + @cython.boundscheck(False) # Deactivate bounds checking def get_character_combination_hashes(self, *, const bint cs, @@ -1763,6 +1763,7 @@ cdef class Doc: const int ss_4byte_ch_l, const unsigned char* ss_lengths, const int ss_max_l, + const int hashes_per_tok ): """ Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations @@ -1796,11 +1797,9 @@ cdef class Doc: in ascending order. For example, if *ss_lengths==[1, 2]*, *ss_search=="aC" and *cs==False*, the searched strings hashed for "spaCy" would be "c" and "ca". ss_max_l: the value of *ss_lengths[-1]*, or *0* if *ss_lengths==None*. Passed in for speed. + hashes_per_tok: the total number of hashes produced for each token. Passed in for speed. 
""" - cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty( - (self.length, p_max_l + s_max_l + ps_max_l + ss_max_l), dtype="int64") - # Define / allocate buffers cdef Pool mem = Pool() cdef unsigned char* pref_l_buf = mem.alloc(p_max_l, 1) @@ -1809,40 +1808,47 @@ cdef class Doc: cdef unsigned char* ps_l_buf = mem.alloc(ps_max_l, 1) cdef unsigned char* ss_res_buf = mem.alloc(ss_max_l, 4) cdef unsigned char* ss_l_buf = mem.alloc(ss_max_l, 1) - + cdef int doc_l = self.length, total_hashes = doc_l * hashes_per_tok + cdef np.int64_t* hashes_ptr = mem.alloc( + total_hashes, sizeof(np.int64_t)) + # Define working variables cdef TokenC tok_c cdef int hash_idx, tok_i, tok_str_l cdef attr_t num_tok_attr cdef const unsigned char* tok_str - - for tok_i in range(self.length): + cdef np.int64_t* w_hashes_ptr = hashes_ptr + + for tok_i in range(doc_l): tok_c = self.c[tok_i] num_tok_attr = tok_c.lex.orth if cs else tok_c.lex.lower tok_str = self.vocab.strings.utf8_ptr(num_tok_attr) tok_str_l = strlen( tok_str) - hash_idx = 0 if p_max_l > 0: _set_prefix_lengths(tok_str, tok_str_l, pref_l_buf, p_max_l) - hash_idx = _write_hashes(tok_str, p_lengths, pref_l_buf, 0, hashes, tok_i, 0) + w_hashes_ptr += _write_hashes(tok_str, p_lengths, pref_l_buf, 0, w_hashes_ptr) if s_max_l > 0: _set_suffix_lengths(tok_str, tok_str_l, suff_l_buf, s_max_l) - hash_idx = _write_hashes(tok_str, s_lengths, suff_l_buf, tok_str_l, hashes, tok_i, hash_idx) + w_hashes_ptr += _write_hashes(tok_str, s_lengths, suff_l_buf, tok_str_l, w_hashes_ptr) if ps_max_l > 0: _search_for_chars(tok_str, tok_str_l, ps_1byte_ch, ps_1byte_ch_l, ps_2byte_ch, ps_2byte_ch_l, ps_3byte_ch, ps_3byte_ch_l, ps_4byte_ch, ps_4byte_ch_l, ps_res_buf, ps_max_l, ps_l_buf, False) - hash_idx = _write_hashes(ps_res_buf, ps_lengths, ps_l_buf, 0, hashes, tok_i, hash_idx) + w_hashes_ptr += _write_hashes(ps_res_buf, ps_lengths, ps_l_buf, 0, w_hashes_ptr) if ss_max_l > 0: _search_for_chars(tok_str, tok_str_l, ss_1byte_ch, ss_1byte_ch_l, ss_2byte_ch, ss_2byte_ch_l, ss_3byte_ch, ss_3byte_ch_l, ss_4byte_ch, ss_4byte_ch_l, ss_res_buf, ss_max_l, ss_l_buf, True) - _write_hashes(ss_res_buf, ss_lengths, ss_l_buf, 0, hashes, tok_i, hash_idx) - + w_hashes_ptr += _write_hashes(ss_res_buf, ss_lengths, ss_l_buf, 0, w_hashes_ptr) + + cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty( + (doc_l, hashes_per_tok), dtype="int64") + memcpy(hashes.data, hashes_ptr, total_hashes * sizeof(np.int64_t)) return hashes + @staticmethod def _get_array_attrs(): attrs = [LENGTH, SPACY] @@ -2023,7 +2029,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end): lca_matrix[k, j] = lca - start return lca_matrix -#@cython.boundscheck(False) # Deactivate bounds checking +@cython.boundscheck(False) # Deactivate bounds checking cdef void _set_prefix_lengths( const unsigned char* tok_str, const int tok_str_l, @@ -2056,13 +2062,13 @@ cdef void _set_prefix_lengths( memset(pref_l_buf + pref_l_buf_idx, pref_l_buf[pref_l_buf_idx - 1], p_max_l - pref_l_buf_idx) -#@cython.boundscheck(False) # Deactivate bounds checking +@cython.boundscheck(False) # Deactivate bounds checking cdef void _set_suffix_lengths( const unsigned char* tok_str, const int tok_str_l, unsigned char* suff_l_buf, const int s_max_l, -): +) nogil: """ Populate *suff_l_buf*, which has length *suff_l*, with the byte lengths of the last *suff_l* characters within *tok_str*. Lengths that are greater than the character length of the whole word are populated with the byte length of the whole word. 
@@ -2086,7 +2092,7 @@ cdef void _set_suffix_lengths( memset(suff_l_buf + suff_l_buf_idx, suff_l_buf[suff_l_buf_idx - 1], s_max_l - suff_l_buf_idx) -#@cython.boundscheck(False) # Deactivate bounds checking +@cython.boundscheck(False) # Deactivate bounds checking cdef void _search_for_chars( const unsigned char* tok_str, const int tok_str_l, @@ -2175,15 +2181,14 @@ cdef void _search_for_chars( memset(l_buf + l_buf_idx, res_buf_idx, max_res_l - l_buf_idx) +@cython.boundscheck(False) # Deactivate bounds checking cdef int _write_hashes( const unsigned char* res_buf, const unsigned char* aff_l_buf, const unsigned char* offset_buf, const int end_idx, - np.ndarray[np.int64_t, ndim=2] hashes, - const int tok_i, - const int start_hash_idx, -): + np.int64_t* hashes_ptr, +) nogil: """ Write hashes for a token/rich property group combination. res_buf: the string from which to generate the hash values. @@ -2191,24 +2196,22 @@ cdef int _write_hashes( offset_buf: one-byte lengths specifying the byte offset of each character within *res_buf*. end_idx: if not *0*, the offset within *res_buf* that should end each affix being hashed; if *0*, affixes start at the beginning of *res_buf* rather than ending at the end. - hashes: the 2D Numpy array in which the hashes are stored. - tok_i: the index of axis 0 of *hashes* to write to. - start_hash_idx: the index of axis 1 of *hashes* at which to start writing. + hashes_ptr: a pointer starting from which the new hashes should be written. """ - cdef int offset, aff_l, hash_val = 0, hash_idx = start_hash_idx + cdef int offset, aff_l, hash_val = 0, hash_idx = 0 while True: - aff_l = aff_l_buf[hash_idx - start_hash_idx] + aff_l = aff_l_buf[hash_idx] if aff_l == 0: return hash_idx offset = offset_buf[aff_l - 1] if offset > 0: if end_idx != 0: - hash_val = hash32( res_buf + end_idx - offset, offset, 0) + hash_val = hash32( (res_buf + end_idx - offset), offset, 0) else: hash_val = hash32( res_buf, offset, 0) - hashes[tok_i, hash_idx] = hash_val + hashes_ptr[hash_idx] = hash_val hash_idx += 1
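
Note on the new hashes_per_tok argument (editorial sketch, not part of the patch): callers are expected to pass the total number of hashes requested per token, i.e. the combined number of entries in p_lengths, s_lengths, ps_lengths and ss_lengths, so that the Cython code can size its flat result buffer up front. A minimal caller-side illustration in Python, mirroring the first test above; any call details not shown in the tests are assumptions, and the commented-out call is only a placeholder for the full argument list used there:

    # Affix length specifications, as in test_get_character_combination_hashes_good_case
    p_lengths = bytes((1, 3, 4))      # prefixes of 1, 3 and 4 characters
    s_lengths = bytes((2, 3, 4, 5))   # suffixes of 2, 3, 4 and 5 characters
    ps_lengths = bytes((2,))          # prefix search-character result lengths
    ss_lengths = bytes((1, 2))        # suffix search-character result lengths

    # hashes_per_tok is simply the total number of requested lengths:
    # 3 + 4 + 1 + 2 == 10, matching the value passed in that test.
    hashes_per_tok = (
        len(p_lengths) + len(s_lengths) + len(ps_lengths) + len(ss_lengths)
    )
    assert hashes_per_tok == 10

    # hashes = doc.get_character_combination_hashes(
    #     ...,  # cs flag, length bytes, search-character byte arrays and max lengths as in the tests
    #     hashes_per_tok=hashes_per_tok,
    # )
    # The returned array then has shape (len(doc), hashes_per_tok); with no lengths
    # requested at all the shape is (len(doc), 0), as the empty-lengths test checks.

This is also why the argument exists at all: the patch changes get_character_combination_hashes to write each token's hashes sequentially into a flat C buffer (advancing w_hashes_ptr by the count returned from each _write_hashes call, which can now run nogil) and only materialises the 2-D NumPy array at the very end with a single memcpy, so the per-token hash count has to be known before the loop starts.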