From c7a960f19e175393f6024ed2ed57cabe75abeefe Mon Sep 17 00:00:00 2001
From: "richard@explosion.ai"
Date: Thu, 3 Nov 2022 11:17:07 +0100
Subject: [PATCH] Performance improvement: compute affix hashes incrementally

---
 spacy/tests/doc/test_doc_api.py | 122 ++++++++++++++++----------
 spacy/tokens/doc.pxd            |   8 +--
 spacy/tokens/doc.pyx            |  48 +++++--------
 3 files changed, 84 insertions(+), 94 deletions(-)

diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 5c761464f..632b4ab3b 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -996,6 +996,9 @@ def test_doc_spans_setdefault(en_tokenizer):
     assert len(doc.spans["key3"]) == 2


+EMPTY_HASH_VALUE = 0x811C9DC5
+
+
 def test_fnv1a_hash():
     """Checks the conformity of the FNV1A implementation with
     http://www.isthe.com/chongo/src/fnv/test_fnv.c.
@@ -1213,7 +1216,7 @@ def test_fnv1a_hash():
     ]

     OUTPUTS = [
-        0x811C9DC5,
+        EMPTY_HASH_VALUE,
         0xE40C292C,
         0xE70C2DE5,
         0xE60C2C52,
@@ -1423,8 +1426,11 @@ def test_fnv1a_hash():
         assert get_fnv1a_hash(INPUTS[i]) == OUTPUTS[i]


-def _encode_and_hash(input: str) -> int:
-    return get_fnv1a_hash(input.encode("UTF-8"))
+def _encode_and_hash(input: str, *, reverse: bool = False) -> int:
+    encoded_input = input.encode("UTF-8")
+    if reverse:
+        encoded_input = encoded_input[::-1]
+    return get_fnv1a_hash(encoded_input)


 @pytest.mark.parametrize("case_sensitive", [True, False])
@@ -1482,10 +1488,10 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive

     assert hashes[0][0] == _encode_and_hash("s")
     assert hashes[0][1] == _encode_and_hash("spa")
     assert hashes[0][2] == _encode_and_hash("spaC" if case_sensitive else "spac")
-    assert hashes[0][3] == _encode_and_hash("Cy" if case_sensitive else "cy")
-    assert hashes[0][4] == _encode_and_hash("aCy" if case_sensitive else "acy")
-    assert hashes[0][5] == _encode_and_hash("paCy" if case_sensitive else "pacy")
-    assert hashes[0][6] == _encode_and_hash("spaCy" if case_sensitive else "spacy")
+    assert hashes[0][3] == _encode_and_hash("yC" if case_sensitive else "yc")
+    assert hashes[0][4] == _encode_and_hash("yCa" if case_sensitive else "yca")
+    assert hashes[0][5] == _encode_and_hash("yCap" if case_sensitive else "ycap")
+    assert hashes[0][6] == _encode_and_hash("yCaps" if case_sensitive else "ycaps")
     assert hashes[0][7] == _encode_and_hash("p")
     assert hashes[0][8] == _encode_and_hash("p")
@@ -1493,31 +1499,33 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
     assert hashes[1][0] == _encode_and_hash("✨")
     assert hashes[1][1] == _encode_and_hash("✨")
     assert hashes[1][2] == _encode_and_hash("✨")
-    assert hashes[1][3] == _encode_and_hash("✨")
-    assert hashes[1][4] == _encode_and_hash("✨")
-    assert hashes[1][5] == _encode_and_hash("✨")
-    assert hashes[1][6] == _encode_and_hash("✨")
-    assert hashes[1][7] == 0
+    assert hashes[1][3] == _encode_and_hash("✨", reverse=True)
+    assert hashes[1][4] == _encode_and_hash("✨", reverse=True)
+    assert hashes[1][5] == _encode_and_hash("✨", reverse=True)
+    assert hashes[1][6] == _encode_and_hash("✨", reverse=True)
+    assert hashes[1][7] == EMPTY_HASH_VALUE
     assert hashes[1][8] == _encode_and_hash("✨")
     assert hashes[1][9] == _encode_and_hash("✨")

     assert hashes[2][0] == _encode_and_hash("a")
     assert hashes[2][1] == _encode_and_hash("and")
     assert hashes[2][2] == _encode_and_hash("and")
-    assert hashes[2][3] == _encode_and_hash("nd")
-    assert hashes[2][4] == _encode_and_hash("and")
-    assert hashes[2][5] == _encode_and_hash("and")
-    assert hashes[2][6] == _encode_and_hash("and")
-    assert hashes[2][7] == 0
-    assert hashes[2][8] == 0
-    assert hashes[2][9] == 0
+    assert hashes[2][3] == _encode_and_hash("dn")
+    assert hashes[2][4] == _encode_and_hash("dna")
+    assert hashes[2][5] == _encode_and_hash("dna")
+    assert hashes[2][6] == _encode_and_hash("dna")
+    assert hashes[2][7] == EMPTY_HASH_VALUE
+    assert hashes[2][8] == EMPTY_HASH_VALUE
+    assert hashes[2][9] == EMPTY_HASH_VALUE

     assert hashes[3][0] == _encode_and_hash("P" if case_sensitive else "p")
     assert hashes[3][1] == _encode_and_hash("Pro" if case_sensitive else "pro")
     assert hashes[3][2] == _encode_and_hash("Prod" if case_sensitive else "prod")
-    assert hashes[3][3] == _encode_and_hash("gy")
-    assert hashes[3][4] == _encode_and_hash("igy")
-    assert hashes[3][5] == _encode_and_hash("digy")
-    assert hashes[3][6] == _encode_and_hash("odigy")
-    assert hashes[3][7] == 0 if case_sensitive else _encode_and_hash("pr")
+    assert hashes[3][3] == _encode_and_hash("yg")
+    assert hashes[3][4] == _encode_and_hash("ygi")
+    assert hashes[3][5] == _encode_and_hash("ygid")
+    assert hashes[3][6] == _encode_and_hash("ygido")
+    assert hashes[3][7] == (
+        EMPTY_HASH_VALUE if case_sensitive else _encode_and_hash("pr")
+    )
     assert hashes[3][8] == _encode_and_hash("r")
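
Note: the test vectors above pin down the 32-bit FNV-1a parameters, so it may
help to see the whole algorithm in one place. This plain-Python sketch
(illustrative only, not part of the patch) behaves like the get_fnv1a_hash
wrapper the tests call, and shows why EMPTY_HASH_VALUE is 0x811C9DC5 rather
than 0: hashing zero bytes leaves the FNV offset basis untouched, and that is
now the expected value for an empty affix.

    def fnv1a_32(data: bytes) -> int:
        hash_val = 0x811C9DC5  # FNV-1a 32-bit offset basis: the hash of b""
        for byte in data:
            hash_val ^= byte
            hash_val = (hash_val * 0x01000193) & 0xFFFFFFFF  # FNV prime, mod 2**32
        return hash_val

    assert fnv1a_32(b"") == 0x811C9DC5   # the new EMPTY_HASH_VALUE
    assert fnv1a_32(b"a") == 0xE40C292C  # second entry in OUTPUTS above
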
@@ -1566,25 +1574,25 @@ def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
         hashes_per_tok=5,
     )

-    assert hashes[0][0] == _encode_and_hash("cy")
-    assert hashes[0][1] == _encode_and_hash("acy")
-    assert hashes[0][2] == _encode_and_hash("pacy")
-    assert hashes[0][3] == _encode_and_hash("spacy")
+    assert hashes[0][0] == _encode_and_hash("yc")
+    assert hashes[0][1] == _encode_and_hash("yca")
+    assert hashes[0][2] == _encode_and_hash("ycap")
+    assert hashes[0][3] == _encode_and_hash("ycaps")
     assert hashes[0][4] == _encode_and_hash("p")
-    assert hashes[1][0] == _encode_and_hash("✨")
-    assert hashes[1][1] == _encode_and_hash("✨")
-    assert hashes[1][2] == _encode_and_hash("✨")
-    assert hashes[1][3] == _encode_and_hash("✨")
-    assert hashes[1][4] == 0
-    assert hashes[2][0] == _encode_and_hash("nd")
-    assert hashes[2][1] == _encode_and_hash("and")
-    assert hashes[2][2] == _encode_and_hash("and")
-    assert hashes[2][3] == _encode_and_hash("and")
-    assert hashes[2][4] == 0
-    assert hashes[3][0] == _encode_and_hash("gy")
-    assert hashes[3][1] == _encode_and_hash("igy")
-    assert hashes[3][2] == _encode_and_hash("digy")
-    assert hashes[3][3] == _encode_and_hash("odigy")
+    assert hashes[1][0] == _encode_and_hash("✨", reverse=True)
+    assert hashes[1][1] == _encode_and_hash("✨", reverse=True)
+    assert hashes[1][2] == _encode_and_hash("✨", reverse=True)
+    assert hashes[1][3] == _encode_and_hash("✨", reverse=True)
+    assert hashes[1][4] == EMPTY_HASH_VALUE
+    assert hashes[2][0] == _encode_and_hash("dn")
+    assert hashes[2][1] == _encode_and_hash("dna")
+    assert hashes[2][2] == _encode_and_hash("dna")
+    assert hashes[2][3] == _encode_and_hash("dna")
+    assert hashes[2][4] == EMPTY_HASH_VALUE
+    assert hashes[3][0] == _encode_and_hash("yg")
+    assert hashes[3][1] == _encode_and_hash("ygi")
+    assert hashes[3][2] == _encode_and_hash("ygid")
+    assert hashes[3][3] == _encode_and_hash("ygido")
     assert hashes[3][4] == _encode_and_hash("pr")

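
Note: the expected suffix strings above are spelled backwards ("spacy" ->
"ycaps", "and" -> "dna") because suffix hashing now consumes bytes from the
end of the token towards the front. Feeding FNV-1a the bytes in that order is
the same as hashing the byte-reversed suffix forwards, which is what lets a
single running hash be extended from "y" to "yc" to "yca" without re-reading
any byte. A minimal sketch using the same constants as above (illustrative
only, not part of the patch):

    token = "spacy".encode("UTF-8")
    running, suffix_hashes = 0x811C9DC5, []
    for byte in reversed(token):  # walk back from the end of the token
        running = ((running ^ byte) * 0x01000193) & 0xFFFFFFFF
        suffix_hashes.append(running)

    # suffix_hashes[2] is the hash of b"yca", i.e. the 3-character suffix
    # "acy" read end-first - exactly what _encode_and_hash("yca") checks.
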
@@ -1624,7 +1632,7 @@ def test_get_character_combination_hashes_copying_in_middle(en_tokenizer):
     )

     assert hashes[0][0] == _encode_and_hash("sp𐌞cé"[:p_length])
-    assert hashes[0][1] == _encode_and_hash("sp𐌞cé"[-s_length:])
+    assert hashes[0][1] == _encode_and_hash("sp𐌞cé"[-s_length:], reverse=True)


 @pytest.mark.parametrize("case_sensitive", [True, False])
@@ -1696,10 +1704,10 @@ def test_get_character_combination_hashes_turkish_i_with_dot(
     if case_sensitive:
         assert hashes[0][2] == _encode_and_hash("İ".lower() + "İ")
         assert hashes[0][3] == _encode_and_hash("İ".lower() + "İ")
-        assert hashes[0][4] == _encode_and_hash("İ")
-        assert hashes[0][5] == _encode_and_hash(COMBINING_DOT_ABOVE + "İ")
-        assert hashes[0][6] == _encode_and_hash("İ".lower() + "İ")
-        assert hashes[0][7] == _encode_and_hash("İ".lower() + "İ")
+        assert hashes[0][4] == _encode_and_hash("İ", reverse=True)
+        assert hashes[0][5] == _encode_and_hash(COMBINING_DOT_ABOVE + "İ", reverse=True)
+        assert hashes[0][6] == _encode_and_hash("İ".lower() + "İ", reverse=True)
+        assert hashes[0][7] == _encode_and_hash("İ".lower() + "İ", reverse=True)
         assert hashes[0][8] == _encode_and_hash("İ")
         assert hashes[0][9] == _encode_and_hash("İ")
         assert hashes[0][12] == _encode_and_hash("İ")
@@ -1708,10 +1716,12 @@ def test_get_character_combination_hashes_turkish_i_with_dot(
     else:
         assert hashes[0][2] == _encode_and_hash("İ".lower() + "i")
         assert hashes[0][3] == _encode_and_hash("İ".lower() * 2)
-        assert hashes[0][4] == _encode_and_hash(COMBINING_DOT_ABOVE)
-        assert hashes[0][5] == _encode_and_hash("İ".lower())
-        assert hashes[0][6] == _encode_and_hash(COMBINING_DOT_ABOVE + "İ".lower())
-        assert hashes[0][7] == _encode_and_hash("İ".lower() * 2)
+        assert hashes[0][4] == _encode_and_hash(COMBINING_DOT_ABOVE, reverse=True)
+        assert hashes[0][5] == _encode_and_hash("İ".lower(), reverse=True)
+        assert hashes[0][6] == _encode_and_hash(
+            COMBINING_DOT_ABOVE + "İ".lower(), reverse=True
+        )
+        assert hashes[0][7] == _encode_and_hash("İ".lower() * 2, reverse=True)
         assert hashes[0][8] == _encode_and_hash("i")
         assert hashes[0][9] == _encode_and_hash("İ".lower())
         assert hashes[0][10] == _encode_and_hash("İ".lower() + "i")
@@ -1765,18 +1775,18 @@ def test_get_character_combination_hashes_string_store_spec_cases(
         hashes_per_tok=3,
     )
     assert hashes[0][0] == _encode_and_hash("FL" if case_sensitive else "fl")
-    assert hashes[0][1] == _encode_and_hash("19")
-    assert hashes[0][2] == 0
+    assert hashes[0][1] == _encode_and_hash("91")
+    assert hashes[0][2] == EMPTY_HASH_VALUE
     assert hashes[1][0] == _encode_and_hash("be")
     assert hashes[1][1] == _encode_and_hash("ee")
     if case_sensitive:
-        assert hashes[1][2] == 0
+        assert hashes[1][2] == EMPTY_HASH_VALUE
     else:
         assert hashes[1][2] == _encode_and_hash("ee")
     assert hashes[2][0] == hashes[3][0] == _encode_and_hash("se")
-    assert hashes[2][1] == hashes[3][1] == _encode_and_hash("ty")
+    assert hashes[2][1] == hashes[3][1] == _encode_and_hash("yt")
     if case_sensitive:
-        assert hashes[2][2] == hashes[3][2] == 0
+        assert hashes[2][2] == hashes[3][2] == EMPTY_HASH_VALUE
     else:
         assert hashes[2][2] == hashes[3][2] == _encode_and_hash("ee")

diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index 12a0e03a4..3e74e9827 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -74,17 +74,11 @@ cdef void _search_for_chars(
 ) nogil


-cdef uint32_t fnv1a_hash(
-    const unsigned char* ptr,
-    const int length
-) nogil
-
-
 cdef int _write_hashes(
     const unsigned char* res_buf,
     const unsigned char* aff_l_buf,
     const unsigned char* offset_buf,
-    const int end_idx,
+    const int res_buf_l,
     np.uint32_t* hashes_ptr,
 ) nogil
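
Note: the declaration changes above are the heart of the patch. The
standalone fnv1a_hash helper disappears because the hashing loop is inlined
into _write_hashes, where the running hash state can be reused as an affix
grows, and end_idx becomes res_buf_l because suffixes are now hashed
backwards from the end of the buffer. Hashing every affix of a token from
scratch costs 1 + 2 + ... + n byte operations; extending one running hash
costs n. A rough Python model of the difference (hypothetical helper names,
not part of the patch; fnv1a_32 as sketched earlier):

    def fnv1a_32(data: bytes) -> int:
        hash_val = 0x811C9DC5
        for byte in data:
            hash_val = ((hash_val ^ byte) * 0x01000193) & 0xFFFFFFFF
        return hash_val

    def hashes_recomputed(data: bytes, lengths: list) -> list:
        # old shape: one full hash computation per affix length
        return [fnv1a_32(data[:l]) for l in lengths]

    def hashes_incremental(data: bytes, lengths: list) -> list:
        # new shape: lengths are non-decreasing, so each byte is mixed in once
        out, hash_val, pos = [], 0x811C9DC5, 0
        for l in lengths:
            while pos < l:
                hash_val = ((hash_val ^ data[pos]) * 0x01000193) & 0xFFFFFFFF
                pos += 1
            out.append(hash_val)
        return out

    assert hashes_recomputed(b"spacy", [1, 3, 5]) == \
        hashes_incremental(b"spacy", [1, 3, 5])
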
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 9f1592caa..b3a528fcc 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -2115,14 +2115,15 @@ cdef void _search_for_chars(
     more is found, the remainder of *len_buf* is populated with the byte length
     from the last result, which may be *0* if the search was not successful.

-    tok_str: a memoryview of a UTF-8 representation of a string.
+    tok_str: a UTF-8 representation of a string.
     tok_str_l: the length of *tok_str*.
     s_byte_ch: a byte array containing in order n-byte-wide characters to search for.
     res_buf: the buffer in which to place the search results.
     max_res_l: the maximum number of found characters to place in *res_buf*.
     l_buf: a buffer of length *max_res_l* in which to store the byte lengths. The calling
         code ensures that lengths greater than 255 cannot occur.
-    suffs_not_prefs: if *True*, searching starts from the end of the word; if *False*, from the beginning.
+    suffs_not_prefs: if *True*, searching starts from the end of the word;
+        if *False*, from the beginning.
     """
     cdef int res_buf_idx = 0, l_buf_idx = 0, ch_wdth, tok_start_idx, search_char_idx
     cdef int search_chars_l
@@ -2179,26 +2180,6 @@ cdef void _search_for_chars(

     # fill in unused characters in the length buffer
     memset(l_buf + l_buf_idx, res_buf_idx, max_res_l - l_buf_idx)
-
-
-@cython.boundscheck(False) # Deactivate bounds checking
-cdef uint32_t fnv1a_hash(
-    const unsigned char* ptr,
-    const int length
-) nogil:
-    """ Returns the FNV-1a hash for a sequence of bytes.
-        The behaviour of this method has been verified against several pieces
-        of data from http://www.isthe.com/chongo/src/fnv/test_fnv.c.
-    """
-    cdef uint32_t hash_val = 0x811c9dc5
-    cdef int offset = 0
-
-    while offset < length:
-        hash_val ^= ptr[offset]
-        hash_val *= 0x01000193
-        offset += 1
-
-    return hash_val


 def get_fnv1a_hash(input: bytes):
@@ -2218,31 +2199,36 @@ cdef int _write_hashes(
     const unsigned char* res_buf,
     const unsigned char* aff_l_buf,
     const unsigned char* offset_buf,
-    const int end_idx,
+    const int res_buf_l,
     np.uint32_t* hashes_ptr,
 ) nogil:
-    """ Write hashes for a token/rich property group combination.
+    """ Write FNV1A hashes for a token/rich property group combination.

     res_buf: the string from which to generate the hash values.
     aff_l_buf: one-byte lengths describing how many characters to hash.
     offset_buf: one-byte lengths specifying the byte offset of each character within *res_buf*.
-    end_idx: if not *0*, the offset within *res_buf* that should end each affix being hashed;
-        if *0*, affixes start at the beginning of *res_buf* rather than ending at the end.
+    res_buf_l: if affixes should start at the end of *res_buf*, the length of *res_buf*;
+        if affixes should start at the beginning of *res_buf*, *0*.
     hashes_ptr: a pointer starting from which the new hashes should be written.
+
+    Returns: the number of hashes written.
     """
-    cdef int offset, aff_l, hash_val = 0, hash_idx = 0
+    cdef int last_offset = 0, hash_idx = 0, offset, aff_l
+    cdef uint32_t hash_val = 0x811c9dc5

     while True:
        aff_l = aff_l_buf[hash_idx]
         if aff_l == 0:
             return hash_idx
         offset = offset_buf[aff_l - 1]
-        if offset > 0:
-            if end_idx != 0:
-                hash_val = fnv1a_hash(res_buf + end_idx - offset, offset)
+        while last_offset < offset:
+            if res_buf_l > 0:
+                hash_val ^= res_buf[res_buf_l - 1 - last_offset]
             else:
-                hash_val = fnv1a_hash(res_buf, offset)
+                hash_val ^= res_buf[last_offset]
+            hash_val *= 0x01000193
+            last_offset += 1
         hashes_ptr[hash_idx] = hash_val
         hash_idx += 1
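
Note: for review purposes the rewritten loop can be mirrored in Python. The
model below follows the buffer conventions of the Cython signature but is
otherwise a sketch (hypothetical names, all-ASCII one-byte offsets for
simplicity; not part of the patch). res_buf_l selects the two modes: *0*
hashes prefixes forwards from the start of res_buf, while a non-zero buffer
length hashes suffixes backwards from the end, which matches the reversed
expected strings in the updated tests.

    def fnv1a_32(data: bytes) -> int:  # as sketched earlier
        hash_val = 0x811C9DC5
        for byte in data:
            hash_val = ((hash_val ^ byte) * 0x01000193) & 0xFFFFFFFF
        return hash_val

    def write_hashes_model(res_buf, aff_l_buf, offset_buf, res_buf_l):
        hashes, hash_val, last_offset, hash_idx = [], 0x811C9DC5, 0, 0
        while True:
            aff_l = aff_l_buf[hash_idx]
            if aff_l == 0:  # a zero affix length terminates the group
                return hashes
            offset = offset_buf[aff_l - 1]  # byte offset after aff_l characters
            while last_offset < offset:
                if res_buf_l > 0:  # suffix mode: walk back from the end
                    hash_val ^= res_buf[res_buf_l - 1 - last_offset]
                else:  # prefix mode: walk forwards from the start
                    hash_val ^= res_buf[last_offset]
                hash_val = (hash_val * 0x01000193) & 0xFFFFFFFF
                last_offset += 1
            hashes.append(hash_val)
            hash_idx += 1

    data = b"spacy"
    ascii_offsets = bytes([1, 2, 3, 4, 5])  # one byte per character
    assert write_hashes_model(data, bytes([1, 3, 0]), ascii_offsets, 0) == [
        fnv1a_32(b"s"), fnv1a_32(b"spa")]
    assert write_hashes_model(data, bytes([2, 4, 0]), ascii_offsets, len(data)) == [
        fnv1a_32(b"yc"), fnv1a_32(b"ycap")]
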