From 217ff3655972df02f875bfbacd9a5b399a76c721 Mon Sep 17 00:00:00 2001 From: richardpaulhudson Date: Fri, 28 Oct 2022 13:31:14 +0200 Subject: [PATCH] Tests passing again after refactoring --- spacy/strings.pyx | 2 +- spacy/tests/doc/test_doc_api.py | 251 ++++++++++++++++++++++---------- spacy/tokens/doc.pxd | 14 +- spacy/tokens/doc.pyx | 172 +++++++++------------- 4 files changed, 254 insertions(+), 185 deletions(-) diff --git a/spacy/strings.pyx b/spacy/strings.pyx index b64cbbed2..d86bf600b 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -320,7 +320,7 @@ cdef class StringStore: if hash_val == 0: return "" elif hash_val < len(SYMBOLS_BY_INT): - return SYMBOLS_BY_INT[hash_val] + return SYMBOLS_BY_INT[hash_val].encode("utf-8") cdef Utf8Str* string = self._map.get(hash_val) cdef int i, length if string.s[0] < sizeof(string.s) and string.s[0] != 0: diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 0a1838a3c..a9c974b8c 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -14,7 +14,7 @@ from spacy.lang.xx import MultiLanguage from spacy.language import Language from spacy.lexeme import Lexeme from spacy.tokens import Doc, Span, SpanGroup, Token -from spacy.util import get_arrays_for_search_chars +from spacy.util import get_search_char_byte_arrays from spacy.vocab import Vocab from .test_underscore import clean_underscore # noqa: F401 @@ -995,8 +995,7 @@ def test_doc_spans_setdefault(en_tokenizer): def _get_unsigned_32_bit_hash(input: str) -> int: - input = input.replace(" ", "\x00") - working_hash = hash(input.encode("UTF-32LE")) + working_hash = hash(input.encode("UTF-8")) if working_hash < 0: working_hash = working_hash + (2 << 31) return working_hash @@ -1006,27 +1005,29 @@ def _get_unsigned_32_bit_hash(input: str) -> int: def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive): doc = en_tokenizer("spaCy✨ and Prodigy") ops = get_current_ops() - pref_search, 
pref_lookup = get_arrays_for_search_chars("Rp", case_sensitive) - suff_search, suff_lookup = get_arrays_for_search_chars("xx✨rp", case_sensitive) + ps1, ps2, ps3, ps4 = get_search_char_byte_arrays("Rp", case_sensitive) + ss1, ss2, ss3, ss4 = get_search_char_byte_arrays("xx✨rp", case_sensitive) hashes = doc.get_character_combination_hashes( cs=case_sensitive, - p_lengths=ops.asarray1i([1, 4, 3]), + p_lengths=ops.asarray1i([1, 3, 4]), s_lengths=ops.asarray1i([2, 3, 4, 5]), - ps_search=pref_search, - ps_lookup=pref_lookup, - ps_l=2 if case_sensitive else 4, + ps_1byte_ch=ps1, + ps_2byte_ch=ps2, + ps_3byte_ch=ps3, + ps_4byte_ch=ps4, ps_lengths=ops.asarray1i([2]), - ss_search=suff_search, - ss_lookup=suff_lookup, - ss_l=5 if case_sensitive else 9, - ss_lengths=ops.asarray1i([2, 1]), + ss_1byte_ch=ss1, + ss_2byte_ch=ss2, + ss_3byte_ch=ss3, + ss_4byte_ch=ss4, + ss_lengths=ops.asarray1i([1, 2]), ) assert hashes[0][0] == _get_unsigned_32_bit_hash("s") - assert hashes[0][1] == _get_unsigned_32_bit_hash( + assert hashes[0][1] == _get_unsigned_32_bit_hash("spa") + assert hashes[0][2] == _get_unsigned_32_bit_hash( "spaC" if case_sensitive else "spac" ) - assert hashes[0][2] == _get_unsigned_32_bit_hash("spa") assert hashes[0][3] == _get_unsigned_32_bit_hash("Cy" if case_sensitive else "cy") assert hashes[0][4] == _get_unsigned_32_bit_hash("aCy" if case_sensitive else "acy") assert hashes[0][5] == _get_unsigned_32_bit_hash( @@ -1036,89 +1037,92 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive "spaCy" if case_sensitive else "spacy" ) - assert hashes[0][7] == _get_unsigned_32_bit_hash("p ") - assert hashes[0][8] == _get_unsigned_32_bit_hash("p ") + assert hashes[0][7] == _get_unsigned_32_bit_hash("p") + assert hashes[0][8] == _get_unsigned_32_bit_hash("p") assert hashes[0][9] == _get_unsigned_32_bit_hash("p") assert hashes[1][0] == _get_unsigned_32_bit_hash("✨") - assert hashes[1][1] == _get_unsigned_32_bit_hash("✨ ") - assert hashes[1][2] == 
_get_unsigned_32_bit_hash("✨ ") - assert hashes[1][3] == _get_unsigned_32_bit_hash(" ✨") - assert hashes[1][4] == _get_unsigned_32_bit_hash(" ✨") - assert hashes[1][5] == _get_unsigned_32_bit_hash(" ✨") - assert hashes[1][6] == _get_unsigned_32_bit_hash(" ✨") - assert hashes[1][7] == _get_unsigned_32_bit_hash(" ") - assert hashes[1][8] == _get_unsigned_32_bit_hash("✨ ") + assert hashes[1][1] == _get_unsigned_32_bit_hash("✨") + assert hashes[1][2] == _get_unsigned_32_bit_hash("✨") + assert hashes[1][3] == _get_unsigned_32_bit_hash("✨") + assert hashes[1][4] == _get_unsigned_32_bit_hash("✨") + assert hashes[1][5] == _get_unsigned_32_bit_hash("✨") + assert hashes[1][6] == _get_unsigned_32_bit_hash("✨") + assert hashes[1][7] == 0 + assert hashes[1][8] == _get_unsigned_32_bit_hash("✨") assert hashes[1][9] == _get_unsigned_32_bit_hash("✨") assert hashes[2][0] == _get_unsigned_32_bit_hash("a") - assert hashes[2][1] == _get_unsigned_32_bit_hash("and ") + assert hashes[2][1] == _get_unsigned_32_bit_hash("and") assert hashes[2][2] == _get_unsigned_32_bit_hash("and") assert hashes[2][3] == _get_unsigned_32_bit_hash("nd") assert hashes[2][4] == _get_unsigned_32_bit_hash("and") - assert hashes[2][5] == _get_unsigned_32_bit_hash(" and") - assert hashes[2][6] == _get_unsigned_32_bit_hash(" and") - assert hashes[2][7] == _get_unsigned_32_bit_hash(" ") - assert hashes[2][8] == _get_unsigned_32_bit_hash(" ") - assert hashes[2][9] == _get_unsigned_32_bit_hash(" ") + assert hashes[2][5] == _get_unsigned_32_bit_hash("and") + assert hashes[2][6] == _get_unsigned_32_bit_hash("and") + assert hashes[2][7] == 0 + assert hashes[2][8] == 0 + assert hashes[2][9] == 0 assert hashes[3][0] == _get_unsigned_32_bit_hash("P" if case_sensitive else "p") - assert hashes[3][1] == _get_unsigned_32_bit_hash( + assert hashes[3][1] == _get_unsigned_32_bit_hash("Pro" if case_sensitive else "pro") + assert hashes[3][2] == _get_unsigned_32_bit_hash( "Prod" if case_sensitive else "prod" ) - assert hashes[3][2] 
== _get_unsigned_32_bit_hash("Pro" if case_sensitive else "pro") assert hashes[3][3] == _get_unsigned_32_bit_hash("gy") assert hashes[3][4] == _get_unsigned_32_bit_hash("igy") assert hashes[3][5] == _get_unsigned_32_bit_hash("digy") assert hashes[3][6] == _get_unsigned_32_bit_hash("odigy") - assert hashes[3][7] == _get_unsigned_32_bit_hash(" " if case_sensitive else "pr") + assert hashes[3][7] == 0 if case_sensitive else _get_unsigned_32_bit_hash("pr") - assert hashes[3][9] == _get_unsigned_32_bit_hash("r") + assert hashes[3][8] == _get_unsigned_32_bit_hash("r") if case_sensitive: - assert hashes[3][8] == _get_unsigned_32_bit_hash("r ") + assert hashes[3][9] == _get_unsigned_32_bit_hash("r") else: - assert hashes[3][8] == _get_unsigned_32_bit_hash("rp") + assert hashes[3][9] == _get_unsigned_32_bit_hash("rp") # check values are the same cross-platform if case_sensitive: - assert hashes[0][1] == 3712103410 + assert hashes[0][2] == 3041529170 else: - assert hashes[0][1] == 307339932 - assert hashes[1][3] == 2414314354 - assert hashes[2][8] == 1669671676 + assert hashes[0][2] == 2199614696 + assert hashes[1][3] == 910783208 + assert hashes[3][8] == 1553167345 def test_get_character_combination_hashes_good_case_partial(en_tokenizer): doc = en_tokenizer("spaCy✨ and Prodigy") ops = get_current_ops() - pref_search, pref_lookup = get_arrays_for_search_chars("rp", False) + ps1, ps2, ps3, ps4 = get_search_char_byte_arrays("rp", False) hashes = doc.get_character_combination_hashes( cs=False, p_lengths=ops.asarray1i([]), s_lengths=ops.asarray1i([2, 3, 4, 5]), - ps_search=pref_search, - ps_lookup=pref_lookup, - ps_l=4, + ps_1byte_ch=ps1, + ps_2byte_ch=ps2, + ps_3byte_ch=ps3, + ps_4byte_ch=ps4, ps_lengths=ops.asarray1i([2]), - ss_search=bytes(), - ss_lookup=bytes(), - ss_l=0, + ss_1byte_ch=bytes(), + ss_2byte_ch=bytes(), + ss_3byte_ch=bytes(), + ss_4byte_ch=bytes(), ss_lengths=ops.asarray1i([]), ) - + + assert hashes[0][0] == _get_unsigned_32_bit_hash("cy") assert hashes[0][1] 
== _get_unsigned_32_bit_hash("acy") assert hashes[0][2] == _get_unsigned_32_bit_hash("pacy") assert hashes[0][3] == _get_unsigned_32_bit_hash("spacy") - assert hashes[0][4] == _get_unsigned_32_bit_hash("p ") - assert hashes[1][0] == _get_unsigned_32_bit_hash(" ✨") - assert hashes[1][1] == _get_unsigned_32_bit_hash(" ✨") - assert hashes[1][2] == _get_unsigned_32_bit_hash(" ✨") - assert hashes[1][3] == _get_unsigned_32_bit_hash(" ✨") - assert hashes[1][4] == _get_unsigned_32_bit_hash(" ") + assert hashes[0][4] == _get_unsigned_32_bit_hash("p") + assert hashes[1][0] == _get_unsigned_32_bit_hash("✨") + assert hashes[1][1] == _get_unsigned_32_bit_hash("✨") + assert hashes[1][2] == _get_unsigned_32_bit_hash("✨") + assert hashes[1][3] == _get_unsigned_32_bit_hash("✨") + assert hashes[1][4] == 0 assert hashes[2][0] == _get_unsigned_32_bit_hash("nd") assert hashes[2][1] == _get_unsigned_32_bit_hash("and") - assert hashes[2][2] == _get_unsigned_32_bit_hash(" and") - assert hashes[2][3] == _get_unsigned_32_bit_hash(" and") - assert hashes[2][4] == _get_unsigned_32_bit_hash(" ") + assert hashes[2][2] == _get_unsigned_32_bit_hash("and") + assert hashes[2][3] == _get_unsigned_32_bit_hash("and") + assert hashes[2][4] == 0 assert hashes[3][0] == _get_unsigned_32_bit_hash("gy") assert hashes[3][1] == _get_unsigned_32_bit_hash("igy") assert hashes[3][2] == _get_unsigned_32_bit_hash("digy") @@ -1126,30 +1130,127 @@ def test_get_character_combination_hashes_good_case_partial(en_tokenizer): assert hashes[3][4] == _get_unsigned_32_bit_hash("pr") - - def test_get_character_combination_hashes_copying_in_middle(en_tokenizer): doc = en_tokenizer("sp𐌞Cé") ops = get_current_ops() for p_length in range(1, 8): for s_length in range(1, 8): + hashes = doc.get_character_combination_hashes( cs=False, p_lengths=ops.asarray1i([p_length]), s_lengths=ops.asarray1i([s_length]), - ps_search=bytes(), - ps_lookup=bytes(), - ps_l=0, + ps_1byte_ch=bytes(), + ps_2byte_ch=bytes(), + ps_3byte_ch=bytes(), + 
ps_4byte_ch=bytes(), ps_lengths=ops.asarray1i([]), - ss_search=bytes(), - ss_lookup=bytes(), - ss_l=0, + ss_1byte_ch=bytes(), + ss_2byte_ch=bytes(), + ss_3byte_ch=bytes(), + ss_4byte_ch=bytes(), ss_lengths=ops.asarray1i([]), ) - assert hashes[0][0] == _get_unsigned_32_bit_hash("sp𐌞cé "[:p_length]) - assert hashes[0][1] == _get_unsigned_32_bit_hash(" sp𐌞cé"[8 - s_length :]) + assert hashes[0][0] == _get_unsigned_32_bit_hash("sp𐌞cé"[:p_length]) + assert hashes[0][1] == _get_unsigned_32_bit_hash("sp𐌞cé"[-s_length:]) + + +@pytest.mark.parametrize("case_sensitive", [True, False]) +def test_get_character_combination_hashes_turkish_i_with_dot(en_tokenizer, case_sensitive): + doc = en_tokenizer("İ".lower() + "İ") + ops = get_current_ops() + s1, s2, s3, s4 = get_search_char_byte_arrays("İ", case_sensitive) + hashes = doc.get_character_combination_hashes( + cs=case_sensitive, + p_lengths=ops.asarray1i([1, 2, 3, 4]), + s_lengths=ops.asarray1i([1, 2, 3, 4]), + ps_1byte_ch=s1, + ps_2byte_ch=s2, + ps_3byte_ch=s3, + ps_4byte_ch=s4, + ps_lengths=ops.asarray1i([1, 2, 3, 4]), + ss_1byte_ch=s1, + ss_2byte_ch=s2, + ss_3byte_ch=s3, + ss_4byte_ch=s4, + ss_lengths=ops.asarray1i([1, 2, 3, 4]), + ) + + COMBINING_DOT_ABOVE = b"\xcc\x87".decode("UTF-8") + assert hashes[0][0] == _get_unsigned_32_bit_hash("i") + assert hashes[0][1] == _get_unsigned_32_bit_hash("İ".lower()) + if case_sensitive: + assert hashes[0][2] == _get_unsigned_32_bit_hash("İ".lower() + "İ") + assert hashes[0][3] == _get_unsigned_32_bit_hash("İ".lower() + "İ") + assert hashes[0][4] == _get_unsigned_32_bit_hash("İ") + assert hashes[0][5] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE + "İ") + assert hashes[0][6] == _get_unsigned_32_bit_hash("İ".lower() + "İ") + assert hashes[0][7] == _get_unsigned_32_bit_hash("İ".lower() + "İ") + assert hashes[0][8] == _get_unsigned_32_bit_hash("İ") + assert hashes[0][9] == _get_unsigned_32_bit_hash("İ") + assert hashes[0][12] == _get_unsigned_32_bit_hash("İ") + assert hashes[0][13] == 
_get_unsigned_32_bit_hash("İ") + + else: + assert hashes[0][2] == _get_unsigned_32_bit_hash("İ".lower() + "i") + assert hashes[0][3] == _get_unsigned_32_bit_hash("İ".lower() * 2) + assert hashes[0][4] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE) + assert hashes[0][5] == _get_unsigned_32_bit_hash("İ".lower()) + assert hashes[0][6] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE + "İ".lower()) + assert hashes[0][7] == _get_unsigned_32_bit_hash("İ".lower() * 2) + assert hashes[0][8] == _get_unsigned_32_bit_hash("i") + assert hashes[0][9] == _get_unsigned_32_bit_hash("İ".lower()) + assert hashes[0][10] == _get_unsigned_32_bit_hash("İ".lower() + "i") + assert hashes[0][11] == _get_unsigned_32_bit_hash("İ".lower() * 2) + assert hashes[0][12] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE) + assert hashes[0][13] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE + "i") + assert hashes[0][14] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE + "i" + COMBINING_DOT_ABOVE) + assert hashes[0][15] == _get_unsigned_32_bit_hash((COMBINING_DOT_ABOVE + "i") * 2) + + +@pytest.mark.parametrize("case_sensitive", [True, False]) +def test_get_character_combination_hashes_string_store_spec_cases(en_tokenizer, case_sensitive): + symbol = "FLAG19" + short_word = "bee" + normal_word = "serendipity" + long_word = "serendipity" * 50 + assert len(long_word) > 255 + doc = en_tokenizer(' '.join((symbol, short_word, normal_word, long_word))) + assert len(doc) == 4 + ops = get_current_ops() + ps1, ps2, ps3, ps4 = get_search_char_byte_arrays("E", case_sensitive) + hashes = doc.get_character_combination_hashes( + cs=case_sensitive, + p_lengths=ops.asarray1i([2]), + s_lengths=ops.asarray1i([2]), + ps_1byte_ch=ps1, + ps_2byte_ch=ps2, + ps_3byte_ch=ps3, + ps_4byte_ch=ps4, + ps_lengths=ops.asarray1i([2]), + ss_1byte_ch=bytes(), + ss_2byte_ch=bytes(), + ss_3byte_ch=bytes(), + ss_4byte_ch=bytes(), + ss_lengths=ops.asarray1i([]), + ) + assert hashes[0][0] == _get_unsigned_32_bit_hash("FL" if 
case_sensitive else "fl") + assert hashes[0][1] == _get_unsigned_32_bit_hash("19") + assert hashes[0][2] == 0 + assert hashes[1][0] == _get_unsigned_32_bit_hash("be") + assert hashes[1][1] == _get_unsigned_32_bit_hash("ee") + if case_sensitive: + assert hashes[1][2] == 0 + else: + assert hashes[1][2] == _get_unsigned_32_bit_hash("ee") + assert hashes[2][0] == hashes[3][0] == _get_unsigned_32_bit_hash("se") + assert hashes[2][1] == hashes[3][1] == _get_unsigned_32_bit_hash("ty") + if case_sensitive: + assert hashes[2][2] == hashes[3][2] == 0 + else: + assert hashes[2][2] == hashes[3][2] == _get_unsigned_32_bit_hash("ee") def test_character_combination_hashes_empty_lengths(en_tokenizer): @@ -1159,12 +1260,14 @@ def test_character_combination_hashes_empty_lengths(en_tokenizer): cs=True, p_lengths=ops.asarray1i([]), s_lengths=ops.asarray1i([]), - ps_search=bytes(), - ps_lookup=bytes(), - ps_l=0, + ps_1byte_ch=bytes(), + ps_2byte_ch=bytes(), + ps_3byte_ch=bytes(), + ps_4byte_ch=bytes(), ps_lengths=ops.asarray1i([]), - ss_search=bytes(), - ss_lookup=bytes(), - ss_l=0, + ss_1byte_ch=bytes(), + ss_2byte_ch=bytes(), + ss_3byte_ch=bytes(), + ss_4byte_ch=bytes(), ss_lengths=ops.asarray1i([]), ).shape == (1, 0) diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 106731e5b..ec9e12731 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -40,13 +40,13 @@ cdef int [:,:] _get_lca_matrix(Doc, int start, int end) cdef void _set_affix_lengths( const unsigned char[:] tok_str, - unsigned char* aff_len_buf, - const int pref_len, - const int suff_len, -) nogil + unsigned char* aff_l_buf, + const int pref_l, + const int suff_l, +) -ccdef void _search_for_chars( +cdef void _search_for_chars( const unsigned char[:] tok_str, const unsigned char[:] s_1byte_ch, const unsigned char[:] s_2byte_ch, @@ -54,9 +54,9 @@ ccdef void _search_for_chars( const unsigned char[:] s_4byte_ch, unsigned char* res_buf, int max_res_l, - unsigned char* len_buf, + unsigned char* l_buf, bint 
suffs_not_prefs -) nogil +) cdef class Doc: diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 11a2ec1b5..407323236 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1736,7 +1736,7 @@ cdef class Doc: return output - def np.ndarray get_character_combination_hashes(self, + def get_character_combination_hashes(self, *, const bint cs, np.ndarray p_lengths, @@ -1751,7 +1751,7 @@ cdef class Doc: const unsigned char[:] ss_3byte_ch, const unsigned char[:] ss_4byte_ch, np.ndarray ss_lengths, - ) nogil: + ): """ Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations derived from the raw text of each token. @@ -1797,11 +1797,11 @@ cdef class Doc: # Define / allocate buffers cdef int aff_l = p_max_l + s_max_l - cdef char* aff_len_buf = self.mem.alloc(aff_l, 1) - cdef char* ps_res_buf = self.mem.alloc(ps_max_l, 4) - cdef char* ps_len_buf = self.mem.alloc(ps_max_l, 1) - cdef char* ss_res_buf = self.mem.alloc(ss_max_l, 4) - cdef char* ss_len_buf = self.mem.alloc(ss_max_l, 1) + cdef unsigned char* aff_l_buf = self.mem.alloc(aff_l, 1) + cdef unsigned char* ps_res_buf = self.mem.alloc(ps_max_l, 4) + cdef unsigned char* ps_l_buf = self.mem.alloc(ps_max_l, 1) + cdef unsigned char* ss_res_buf = self.mem.alloc(ss_max_l, 4) + cdef unsigned char* ss_l_buf = self.mem.alloc(ss_max_l, 1) # Define memory views on length arrays cdef int[:] p_lengths_v = p_lengths @@ -1812,7 +1812,7 @@ cdef class Doc: # Define working variables cdef TokenC tok_c cdef int tok_i, offset - cdef uint64_t hash_val + cdef uint64_t hash_val = 0 cdef attr_t num_tok_attr cdef const unsigned char[:] tok_str @@ -1822,43 +1822,44 @@ cdef class Doc: tok_str = self.vocab.strings.utf8_view(num_tok_attr) if aff_l > 0: - _set_affix_lengths(tok_str, aff_len_buf, p_max_l, s_max_l) + _set_affix_lengths(tok_str, aff_l_buf, p_max_l, s_max_l) + for hash_idx in range(p_h_num): - offset = aff_len_buf[p_lengths_v[hash_idx]] + offset = 
aff_l_buf[p_lengths_v[hash_idx] - 1] if offset > 0: hash_val = hash32( &tok_str[0], offset, 0) hashes[tok_i, hash_idx] = hash_val for hash_idx in range(p_h_num, s_h_end): - offset = s_lengths_v[hash_idx - p_h_num] + offset = aff_l_buf[s_lengths_v[hash_idx - p_h_num] + p_max_l - 1] if offset > 0: hash_val = hash32( &tok_str[len(tok_str) - offset], offset, 0) hashes[tok_i, hash_idx] = hash_val if ps_h_num > 0: - _search_for_chars(tok_str, ps_1byte_ch, ps_2byte_ch, ps_3byte_ch, ps_4byte_ch, ps_res_buf, ps_max_l, ps_res_len, False) + _search_for_chars(tok_str, ps_1byte_ch, ps_2byte_ch, ps_3byte_ch, ps_4byte_ch, ps_res_buf, ps_max_l, ps_l_buf, False) hash_val = 0 for hash_idx in range(s_h_end, ps_h_end): - offset = ps_lengths_v[hash_idx - s_h_end] + offset = ps_l_buf[ps_lengths_v[hash_idx - s_h_end] - 1] if offset > 0: hash_val = hash32(ps_res_buf, offset, 0) hashes[tok_i, hash_idx] = hash_val if ss_h_num > 0: - _search_for_chars(tok_str, ss_1byte_ch, ss_2byte_ch, ss_3byte_ch, ss_4byte_ch, ss_res_buf, ss_max_l, ss_res_len, True) + _search_for_chars(tok_str, ss_1byte_ch, ss_2byte_ch, ss_3byte_ch, ss_4byte_ch, ss_res_buf, ss_max_l, ss_l_buf, True) hash_val = 0 for hash_idx in range(ps_h_end, ss_h_end): - offset = ss_lengths_v[hash_idx - ps_h_end] + offset = ss_l_buf[ss_lengths_v[hash_idx - ps_h_end] - 1] if offset > 0: hash_val = hash32(ss_res_buf, offset, 0) hashes[tok_i, hash_idx] = hash_val - self.mem.free(aff_len_buf) + self.mem.free(aff_l_buf) self.mem.free(ps_res_buf) - self.mem.free(ps_len_buf) + self.mem.free(ps_l_buf) self.mem.free(ss_res_buf) - self.mem.free(ss_len_buf) + self.mem.free(ss_l_buf) return hashes @staticmethod @@ -2044,46 +2045,45 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end): cdef void _set_affix_lengths( const unsigned char[:] tok_str, - unsigned char* aff_len_buf, - const int pref_len, - const int suff_len, -) nogil: - """ TODO : Populate *len_buf*, which has length *pref_len+suff_len* with the byte lengths of the first 
*pref_len* and the last - *suff_len* characters within *tok_str*. If the word is shorter than pref and/or suff, the empty lengths in the middle are - filled with zeros. + unsigned char* aff_l_buf, + const int pref_l, + const int suff_l, +): + """ Populate *aff_l_buf*, which has length *pref_l+suff_l* with the byte lengths of the first *pref_l* and the last + *suff_l* characters within *tok_str*. Lengths that are greater than the character length of the whole word are + populated with the byte length of the whole word. tok_str: a memoryview of a UTF-8 representation of a string. - len_buf: a buffer of length *pref_len+suff_len* in which to store the lengths. The calling code ensures that lengths + aff_l_buf: a buffer of length *pref_l+suff_l* in which to store the lengths. The calling code ensures that lengths greater than 255 cannot occur. - pref_len: the number of characters to process at the beginning of the word. - suff_len: the number of characters to process at the end of the word. + pref_l: the number of characters to process at the beginning of the word. + suff_l: the number of characters to process at the end of the word. 
""" - cdef int tok_str_idx = 0, aff_len_buf_idx = 0, tok_str_len = len(tok_str) + cdef int tok_str_idx = 1, aff_l_buf_idx = 0, tok_str_l = len(tok_str) - while aff_len_buf_idx < pref_len: - if (tok_str[tok_str_idx] & 0xc0) != 0x80: # not a continuation character - aff_len_buf[aff_len_buf_idx] = tok_str_idx + 1 - aff_len_buf_idx += 1 + while aff_l_buf_idx < pref_l: + if tok_str_idx == len(tok_str) or ((tok_str[tok_str_idx] & 0xc0) != 0x80): # not a continuation character + aff_l_buf[aff_l_buf_idx] = tok_str_idx + aff_l_buf_idx += 1 tok_str_idx += 1 - if tok_str_idx == len(tok_str): + if tok_str_idx > len(tok_str): break - if aff_len_buf_idx < pref_len: - memset(aff_len_buf + aff_len_buf_idx, 0, pref_len - aff_len_buf_idx) - aff_len_buf_idx = pref_len + if aff_l_buf_idx < pref_l: + memset(aff_l_buf + aff_l_buf_idx, aff_l_buf[aff_l_buf_idx - 1], pref_l - aff_l_buf_idx) + aff_l_buf_idx = pref_l - tok_str_idx = 1 - while aff_len_buf_idx < pref_len + suff_len: + tok_str_idx = tok_str_l - 1 + while aff_l_buf_idx < pref_l + suff_l: if (tok_str[tok_str_idx] & 0xc0) != 0x80: # not a continuation character - aff_len_buf[aff_len_buf_idx] = tok_str_len - tok_str_idx - aff_len_buf_idx += 1 - tok_str_idx += 1 - if tok_str_idx > tok_str_len: + aff_l_buf[aff_l_buf_idx] = tok_str_l - tok_str_idx + aff_l_buf_idx += 1 + tok_str_idx -= 1 + if tok_str_idx < 0: break - if aff_len_buf_idx < pref_len + suff_len: - memset(aff_len_buf + aff_len_buf_idx, 0, suff_len - aff_len_buf_idx) - + if aff_l_buf_idx < pref_l + suff_l: + memset(aff_l_buf + aff_l_buf_idx, aff_l_buf[aff_l_buf_idx - 1], pref_l + suff_l - aff_l_buf_idx) cdef void _search_for_chars( const unsigned char[:] tok_str, @@ -2093,31 +2093,33 @@ cdef void _search_for_chars( const unsigned char[:] s_4byte_ch, unsigned char* res_buf, int max_res_l, - unsigned char* len_buf, + unsigned char* l_buf, bint suffs_not_prefs -) nogil: +): """ Search *tok_str* within a string for characters within the *s_byte_ch> buffers, starting at the 
beginning or end depending on the value of *suffs_not_prefs*. Wherever a character matches, - it is added to *res_buf* and the byte length up to that point is added to *len_buf*. + it is added to *res_buf* and the byte length up to that point is added to *l_buf*. When nothing + more is found, the remainder of *l_buf* is populated with the byte length from the last result, + which may be *0* if the search was not successful. tok_str: a memoryview of a UTF-8 representation of a string. s_byte_ch: a byte array containing in order n-byte-wide characters to search for. res_buf: the buffer in which to place the search results. max_res_l: the maximum number of found characters to place in *res_buf*. - len_buf: a buffer of length *max_res_l* in which to store the byte lengths. + l_buf: a buffer of length *max_res_l* in which to store the byte lengths. The calling code ensures that lengths greater than 255 cannot occur. suffs_not_prefs: if *True*, searching starts from the end of the word; if *False*, from the beginning. """ - cdef int tok_str_len = len(tok_str), search_char_idx = 0, res_buf_idx = 0, len_buf_idx = 0 - cdef int last_tok_str_idx = tok_str_len if suffs_not_prefs else 0 - cdef int this_tok_str_idx = tok_str_len - 1 if suffs_not_prefs else 1 - cdef int ch_wdth, tok_start_idx - cdef char[:] search_chars + cdef int tok_str_l = len(tok_str), search_char_idx = 0, res_buf_idx = 0, l_buf_idx = 0, ch_wdth, tok_start_idx + cdef const unsigned char[:] search_chars + + cdef int last_tok_str_idx = tok_str_l if suffs_not_prefs else 0 + cdef int this_tok_str_idx = tok_str_l - 1 if suffs_not_prefs else 1 while True: if ( - this_tok_str_idx == tok_str_len or - (tok_str[this_tok_str_idx] & 0xc0) != 0x80 # not continuation character + this_tok_str_idx == tok_str_l or + (tok_str[this_tok_str_idx] & 0xc0) != 0x80 # not continuation character, always applies to [0]. 
): ch_wdth = abs(this_tok_str_idx - last_tok_str_idx) if ch_wdth == 1: @@ -2129,16 +2131,17 @@ cdef void _search_for_chars( else: search_chars = s_4byte_ch tok_start_idx = this_tok_str_idx if suffs_not_prefs else last_tok_str_idx + for search_char_idx in range(0, len(search_chars), ch_wdth): - cmp_result = memcmp(tok_str + tok_start_idx, search_chars + search_char_idx, ch_wdth) + cmp_result = memcmp(&tok_str[tok_start_idx], &search_chars[search_char_idx], ch_wdth) if cmp_result == 0: - memcpy(res_buf + res_buf_idx, search_chars + search_char_idx, ch_wdth) + memcpy(res_buf + res_buf_idx, &search_chars[search_char_idx], ch_wdth) res_buf_idx += ch_wdth - len_buf[len_buf_idx] = res_buf_idx - len_buf_idx += 1 - if len_buf_idx == max_res_l: + l_buf[l_buf_idx] = res_buf_idx + l_buf_idx += 1 + if l_buf_idx == max_res_l: return - if cmp_result >= 0: + if cmp_result <= 0: break last_tok_str_idx = this_tok_str_idx if suffs_not_prefs: @@ -2147,48 +2150,11 @@ cdef void _search_for_chars( break else: this_tok_str_idx += 1 - if this_tok_str_idx >= tok_str_len: + if this_tok_str_idx > tok_str_l: break - # fill in unused characters in the length buffer with 0 - memset(res_buf + res_buf_idx, 0, max_res_l - res_buf_idx) - - - - - - - - - - - - cdef int result_buf_idx = 0, text_string_idx = tok_idx + (tok_len - 1) if suffs_not_prefs else tok_idx - cdef int search_buf_idx - cdef int cmp_result - - while result_buf_idx < result_buf_len: - for search_buf_idx in range (search_buf_len): - cmp_result = memcmp(search_buf + search_buf_idx, text_buf + text_string_idx, sizeof(Py_UCS4)) - if cmp_result == 0: - memcpy(result_buf + result_buf_idx, lookup_buf + search_buf_idx, sizeof(Py_UCS4)) - result_buf_idx += 1 - if cmp_result >= 0: - break - if suffs_not_prefs: - if text_string_idx <= tok_idx: - break - text_string_idx -= 1 - else: - text_string_idx += 1 - if text_string_idx >= tok_idx + tok_len: - break - - # fill in any unused characters in the result buffer with zeros - if result_buf_idx < 
result_buf_len: - memset(result_buf + result_buf_idx, 0, (result_buf_len - result_buf_idx) * sizeof(Py_UCS4)) - - return result_buf_idx > 0 + # fill in unused characters in the length buffer + memset(l_buf + l_buf_idx, res_buf_idx, max_res_l - l_buf_idx) def pickle_doc(doc):