From 217ff3655972df02f875bfbacd9a5b399a76c721 Mon Sep 17 00:00:00 2001
From: richardpaulhudson <richard@explosion.ai>
Date: Fri, 28 Oct 2022 13:31:14 +0200
Subject: [PATCH] Tests passing again after refactoring

---
 spacy/strings.pyx               |   2 +-
 spacy/tests/doc/test_doc_api.py | 251 ++++++++++++++++++++++----------
 spacy/tokens/doc.pxd            |  14 +-
 spacy/tokens/doc.pyx            | 172 +++++++++-------------
 4 files changed, 254 insertions(+), 185 deletions(-)

diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index b64cbbed2..d86bf600b 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -320,7 +320,7 @@ cdef class StringStore:
         if hash_val == 0:
             return ""
         elif hash_val < len(SYMBOLS_BY_INT):
-            return SYMBOLS_BY_INT[hash_val]
+            return SYMBOLS_BY_INT[hash_val].encode("utf-8")
         cdef Utf8Str* string = <Utf8Str*>self._map.get(hash_val)
         cdef int i, length
         if string.s[0] < sizeof(string.s) and string.s[0] != 0:
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 0a1838a3c..a9c974b8c 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -14,7 +14,7 @@ from spacy.lang.xx import MultiLanguage
 from spacy.language import Language
 from spacy.lexeme import Lexeme
 from spacy.tokens import Doc, Span, SpanGroup, Token
-from spacy.util import get_arrays_for_search_chars
+from spacy.util import get_search_char_byte_arrays
 from spacy.vocab import Vocab
 
 from .test_underscore import clean_underscore  # noqa: F401
@@ -995,8 +995,7 @@ def test_doc_spans_setdefault(en_tokenizer):
 
 
 def _get_unsigned_32_bit_hash(input: str) -> int:
-    input = input.replace(" ", "\x00")
-    working_hash = hash(input.encode("UTF-32LE"))
+    working_hash = hash(input.encode("UTF-8"))
     if working_hash < 0:
         working_hash = working_hash + (2 << 31)
     return working_hash
@@ -1006,27 +1005,29 @@ def _get_unsigned_32_bit_hash(input: str) -> int:
 def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive):
     doc = en_tokenizer("spaCy✨ and Prodigy")
     ops = get_current_ops()
-    pref_search, pref_lookup = get_arrays_for_search_chars("Rp", case_sensitive)
-    suff_search, suff_lookup = get_arrays_for_search_chars("xx✨rp", case_sensitive)
+    ps1, ps2, ps3, ps4 = get_search_char_byte_arrays("Rp", case_sensitive)
+    ss1, ss2, ss3, ss4 = get_search_char_byte_arrays("xx✨rp", case_sensitive)
     hashes = doc.get_character_combination_hashes(
         cs=case_sensitive,
-        p_lengths=ops.asarray1i([1, 4, 3]),
+        p_lengths=ops.asarray1i([1, 3, 4]),
         s_lengths=ops.asarray1i([2, 3, 4, 5]),
-        ps_search=pref_search,
-        ps_lookup=pref_lookup,
-        ps_l=2 if case_sensitive else 4,
+        ps_1byte_ch=ps1,
+        ps_2byte_ch=ps2,
+        ps_3byte_ch=ps3,
+        ps_4byte_ch=ps4,
         ps_lengths=ops.asarray1i([2]),
-        ss_search=suff_search,
-        ss_lookup=suff_lookup,
-        ss_l=5 if case_sensitive else 9,
-        ss_lengths=ops.asarray1i([2, 1]),
+        ss_1byte_ch=ss1,
+        ss_2byte_ch=ss2,
+        ss_3byte_ch=ss3,
+        ss_4byte_ch=ss4,
+        ss_lengths=ops.asarray1i([1, 2]),
     )
 
     assert hashes[0][0] == _get_unsigned_32_bit_hash("s")
-    assert hashes[0][1] == _get_unsigned_32_bit_hash(
+    assert hashes[0][1] == _get_unsigned_32_bit_hash("spa")
+    assert hashes[0][2] == _get_unsigned_32_bit_hash(
         "spaC" if case_sensitive else "spac"
     )
-    assert hashes[0][2] == _get_unsigned_32_bit_hash("spa")
     assert hashes[0][3] == _get_unsigned_32_bit_hash("Cy" if case_sensitive else "cy")
     assert hashes[0][4] == _get_unsigned_32_bit_hash("aCy" if case_sensitive else "acy")
     assert hashes[0][5] == _get_unsigned_32_bit_hash(
@@ -1036,89 +1037,92 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
         "spaCy" if case_sensitive else "spacy"
     )
 
-    assert hashes[0][7] == _get_unsigned_32_bit_hash("p ")
-    assert hashes[0][8] == _get_unsigned_32_bit_hash("p ")
+    assert hashes[0][7] == _get_unsigned_32_bit_hash("p")
+    assert hashes[0][8] == _get_unsigned_32_bit_hash("p")
     assert hashes[0][9] == _get_unsigned_32_bit_hash("p")
     assert hashes[1][0] == _get_unsigned_32_bit_hash("✨")
-    assert hashes[1][1] == _get_unsigned_32_bit_hash("✨   ")
-    assert hashes[1][2] == _get_unsigned_32_bit_hash("✨  ")
-    assert hashes[1][3] == _get_unsigned_32_bit_hash(" ✨")
-    assert hashes[1][4] == _get_unsigned_32_bit_hash("  ✨")
-    assert hashes[1][5] == _get_unsigned_32_bit_hash("   ✨")
-    assert hashes[1][6] == _get_unsigned_32_bit_hash("    ✨")
-    assert hashes[1][7] == _get_unsigned_32_bit_hash("  ")
-    assert hashes[1][8] == _get_unsigned_32_bit_hash("✨ ")
+    assert hashes[1][1] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[1][2] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[1][3] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[1][4] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[1][5] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[1][6] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[1][7] == 0
+    assert hashes[1][8] == _get_unsigned_32_bit_hash("✨")
     assert hashes[1][9] == _get_unsigned_32_bit_hash("✨")
     assert hashes[2][0] == _get_unsigned_32_bit_hash("a")
-    assert hashes[2][1] == _get_unsigned_32_bit_hash("and ")
+    assert hashes[2][1] == _get_unsigned_32_bit_hash("and")
     assert hashes[2][2] == _get_unsigned_32_bit_hash("and")
     assert hashes[2][3] == _get_unsigned_32_bit_hash("nd")
     assert hashes[2][4] == _get_unsigned_32_bit_hash("and")
-    assert hashes[2][5] == _get_unsigned_32_bit_hash(" and")
-    assert hashes[2][6] == _get_unsigned_32_bit_hash("  and")
-    assert hashes[2][7] == _get_unsigned_32_bit_hash("  ")
-    assert hashes[2][8] == _get_unsigned_32_bit_hash("  ")
-    assert hashes[2][9] == _get_unsigned_32_bit_hash(" ")
+    assert hashes[2][5] == _get_unsigned_32_bit_hash("and")
+    assert hashes[2][6] == _get_unsigned_32_bit_hash("and")
+    assert hashes[2][7] == 0
+    assert hashes[2][8] == 0
+    assert hashes[2][9] == 0
     assert hashes[3][0] == _get_unsigned_32_bit_hash("P" if case_sensitive else "p")
-    assert hashes[3][1] == _get_unsigned_32_bit_hash(
+    assert hashes[3][1] == _get_unsigned_32_bit_hash("Pro" if case_sensitive else "pro")
+    assert hashes[3][2] == _get_unsigned_32_bit_hash(
         "Prod" if case_sensitive else "prod"
     )
-    assert hashes[3][2] == _get_unsigned_32_bit_hash("Pro" if case_sensitive else "pro")
     assert hashes[3][3] == _get_unsigned_32_bit_hash("gy")
     assert hashes[3][4] == _get_unsigned_32_bit_hash("igy")
     assert hashes[3][5] == _get_unsigned_32_bit_hash("digy")
     assert hashes[3][6] == _get_unsigned_32_bit_hash("odigy")
-    assert hashes[3][7] == _get_unsigned_32_bit_hash("  " if case_sensitive else "pr")
+    assert hashes[3][7] == 0 if case_sensitive else _get_unsigned_32_bit_hash("pr")
 
-    assert hashes[3][9] == _get_unsigned_32_bit_hash("r")
+    assert hashes[3][8] == _get_unsigned_32_bit_hash("r")
 
     if case_sensitive:
-        assert hashes[3][8] == _get_unsigned_32_bit_hash("r ")
+        assert hashes[3][9] == _get_unsigned_32_bit_hash("r")
     else:
-        assert hashes[3][8] == _get_unsigned_32_bit_hash("rp")
+        assert hashes[3][9] == _get_unsigned_32_bit_hash("rp")
 
     # check values are the same cross-platform
     if case_sensitive:
-        assert hashes[0][1] == 3712103410
+        assert hashes[0][2] == 3041529170
     else:
-        assert hashes[0][1] == 307339932
-    assert hashes[1][3] == 2414314354
-    assert hashes[2][8] == 1669671676
+        assert hashes[0][2] == 2199614696
+    assert hashes[1][3] == 910783208
+    assert hashes[3][8] == 1553167345
 
 
 def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
     doc = en_tokenizer("spaCy✨ and Prodigy")
     ops = get_current_ops()
-    pref_search, pref_lookup = get_arrays_for_search_chars("rp", False)
+    ps1, ps2, ps3, ps4 = get_search_char_byte_arrays("rp", False)
     hashes = doc.get_character_combination_hashes(
         cs=False,
         p_lengths=ops.asarray1i([]),
         s_lengths=ops.asarray1i([2, 3, 4, 5]),
-        ps_search=pref_search,
-        ps_lookup=pref_lookup,
-        ps_l=4,
+        ps_1byte_ch=ps1,
+        ps_2byte_ch=ps2,
+        ps_3byte_ch=ps3,
+        ps_4byte_ch=ps4,
         ps_lengths=ops.asarray1i([2]),
-        ss_search=bytes(),
-        ss_lookup=bytes(),
-        ss_l=0,
+        ss_1byte_ch=bytes(),
+        ss_2byte_ch=bytes(),
+        ss_3byte_ch=bytes(),
+        ss_4byte_ch=bytes(),
         ss_lengths=ops.asarray1i([]),
     )
-
+    
+    
     assert hashes[0][0] == _get_unsigned_32_bit_hash("cy")
     assert hashes[0][1] == _get_unsigned_32_bit_hash("acy")
     assert hashes[0][2] == _get_unsigned_32_bit_hash("pacy")
     assert hashes[0][3] == _get_unsigned_32_bit_hash("spacy")
-    assert hashes[0][4] == _get_unsigned_32_bit_hash("p ")
-    assert hashes[1][0] == _get_unsigned_32_bit_hash(" ✨")
-    assert hashes[1][1] == _get_unsigned_32_bit_hash("  ✨")
-    assert hashes[1][2] == _get_unsigned_32_bit_hash("   ✨")
-    assert hashes[1][3] == _get_unsigned_32_bit_hash("    ✨")
-    assert hashes[1][4] == _get_unsigned_32_bit_hash("  ")
+    assert hashes[0][4] == _get_unsigned_32_bit_hash("p")
+    assert hashes[1][0] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[1][1] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[1][2] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[1][3] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[1][4] == 0
     assert hashes[2][0] == _get_unsigned_32_bit_hash("nd")
     assert hashes[2][1] == _get_unsigned_32_bit_hash("and")
-    assert hashes[2][2] == _get_unsigned_32_bit_hash(" and")
-    assert hashes[2][3] == _get_unsigned_32_bit_hash("  and")
-    assert hashes[2][4] == _get_unsigned_32_bit_hash("  ")
+    assert hashes[2][2] == _get_unsigned_32_bit_hash("and")
+    assert hashes[2][3] == _get_unsigned_32_bit_hash("and")
+    assert hashes[2][4] == 0
     assert hashes[3][0] == _get_unsigned_32_bit_hash("gy")
     assert hashes[3][1] == _get_unsigned_32_bit_hash("igy")
     assert hashes[3][2] == _get_unsigned_32_bit_hash("digy")
@@ -1126,30 +1130,127 @@ def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
     assert hashes[3][4] == _get_unsigned_32_bit_hash("pr")
 
 
-
-
 def test_get_character_combination_hashes_copying_in_middle(en_tokenizer):
     doc = en_tokenizer("sp𐌞Cé")
     ops = get_current_ops()
 
     for p_length in range(1, 8):
         for s_length in range(1, 8):
+            
             hashes = doc.get_character_combination_hashes(
                 cs=False,
                 p_lengths=ops.asarray1i([p_length]),
                 s_lengths=ops.asarray1i([s_length]),
-                ps_search=bytes(),
-                ps_lookup=bytes(),
-                ps_l=0,
+                ps_1byte_ch=bytes(),
+                ps_2byte_ch=bytes(),
+                ps_3byte_ch=bytes(),
+                ps_4byte_ch=bytes(),
                 ps_lengths=ops.asarray1i([]),
-                ss_search=bytes(),
-                ss_lookup=bytes(),
-                ss_l=0,
+                ss_1byte_ch=bytes(),
+                ss_2byte_ch=bytes(),
+                ss_3byte_ch=bytes(),
+                ss_4byte_ch=bytes(),
                 ss_lengths=ops.asarray1i([]),
             )
 
-            assert hashes[0][0] == _get_unsigned_32_bit_hash("sp𐌞cé   "[:p_length])
-            assert hashes[0][1] == _get_unsigned_32_bit_hash("   sp𐌞cé"[8 - s_length :])
+            assert hashes[0][0] == _get_unsigned_32_bit_hash("sp𐌞cé"[:p_length])
+            assert hashes[0][1] == _get_unsigned_32_bit_hash("sp𐌞cé"[-s_length:])
+
+
+@pytest.mark.parametrize("case_sensitive", [True, False])
+def test_get_character_combination_hashes_turkish_i_with_dot(en_tokenizer, case_sensitive):
+    doc = en_tokenizer("İ".lower() + "İ")
+    ops = get_current_ops()
+    s1, s2, s3, s4 = get_search_char_byte_arrays("İ", case_sensitive)
+    hashes = doc.get_character_combination_hashes(
+        cs=case_sensitive,
+        p_lengths=ops.asarray1i([1, 2, 3, 4]),
+        s_lengths=ops.asarray1i([1, 2, 3, 4]),
+        ps_1byte_ch=s1,
+        ps_2byte_ch=s2,
+        ps_3byte_ch=s3,
+        ps_4byte_ch=s4,
+        ps_lengths=ops.asarray1i([1, 2, 3, 4]),
+        ss_1byte_ch=s1,
+        ss_2byte_ch=s2,
+        ss_3byte_ch=s3,
+        ss_4byte_ch=s4,
+        ss_lengths=ops.asarray1i([1, 2, 3, 4]),
+    )
+
+    COMBINING_DOT_ABOVE = b"\xcc\x87".decode("UTF-8")
+    assert hashes[0][0] == _get_unsigned_32_bit_hash("i")
+    assert hashes[0][1] == _get_unsigned_32_bit_hash("İ".lower())
+    if case_sensitive:
+        assert hashes[0][2] == _get_unsigned_32_bit_hash("İ".lower() + "İ")
+        assert hashes[0][3] == _get_unsigned_32_bit_hash("İ".lower() + "İ")
+        assert hashes[0][4] == _get_unsigned_32_bit_hash("İ")
+        assert hashes[0][5] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE + "İ")
+        assert hashes[0][6] == _get_unsigned_32_bit_hash("İ".lower() + "İ")
+        assert hashes[0][7] == _get_unsigned_32_bit_hash("İ".lower() + "İ")
+        assert hashes[0][8] == _get_unsigned_32_bit_hash("İ")
+        assert hashes[0][9] == _get_unsigned_32_bit_hash("İ")
+        assert hashes[0][12] == _get_unsigned_32_bit_hash("İ")
+        assert hashes[0][13] == _get_unsigned_32_bit_hash("İ")
+
+    else:
+        assert hashes[0][2] == _get_unsigned_32_bit_hash("İ".lower() + "i")
+        assert hashes[0][3] == _get_unsigned_32_bit_hash("İ".lower() * 2)
+        assert hashes[0][4] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE)
+        assert hashes[0][5] == _get_unsigned_32_bit_hash("İ".lower())
+        assert hashes[0][6] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE + "İ".lower())
+        assert hashes[0][7] == _get_unsigned_32_bit_hash("İ".lower() * 2)
+        assert hashes[0][8] == _get_unsigned_32_bit_hash("i")
+        assert hashes[0][9] == _get_unsigned_32_bit_hash("İ".lower())
+        assert hashes[0][10] == _get_unsigned_32_bit_hash("İ".lower() + "i")
+        assert hashes[0][11] == _get_unsigned_32_bit_hash("İ".lower() * 2)
+        assert hashes[0][12] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE)
+        assert hashes[0][13] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE + "i")
+        assert hashes[0][14] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE + "i" + COMBINING_DOT_ABOVE)
+        assert hashes[0][15] == _get_unsigned_32_bit_hash((COMBINING_DOT_ABOVE + "i") * 2)
+        
+
+@pytest.mark.parametrize("case_sensitive", [True, False])
+def test_get_character_combination_hashes_string_store_spec_cases(en_tokenizer, case_sensitive):
+    symbol = "FLAG19"
+    short_word = "bee"
+    normal_word = "serendipity"
+    long_word = "serendipity" * 50
+    assert len(long_word) > 255
+    doc = en_tokenizer(' '.join((symbol, short_word, normal_word, long_word)))
+    assert len(doc) == 4
+    ops = get_current_ops()
+    ps1, ps2, ps3, ps4 = get_search_char_byte_arrays("E", case_sensitive)
+    hashes = doc.get_character_combination_hashes(
+        cs=case_sensitive,
+        p_lengths=ops.asarray1i([2]),
+        s_lengths=ops.asarray1i([2]),
+        ps_1byte_ch=ps1,
+        ps_2byte_ch=ps2,
+        ps_3byte_ch=ps3,
+        ps_4byte_ch=ps4,
+        ps_lengths=ops.asarray1i([2]),
+        ss_1byte_ch=bytes(),
+        ss_2byte_ch=bytes(),
+        ss_3byte_ch=bytes(),
+        ss_4byte_ch=bytes(),
+        ss_lengths=ops.asarray1i([]),
+    )
+    assert hashes[0][0] == _get_unsigned_32_bit_hash("FL" if case_sensitive else "fl")
+    assert hashes[0][1] == _get_unsigned_32_bit_hash("19")
+    assert hashes[0][2] == 0
+    assert hashes[1][0] == _get_unsigned_32_bit_hash("be")
+    assert hashes[1][1] == _get_unsigned_32_bit_hash("ee")
+    if case_sensitive:
+        assert hashes[1][2] == 0
+    else:
+        assert hashes[1][2] == _get_unsigned_32_bit_hash("ee")
+    assert hashes[2][0] == hashes[3][0] == _get_unsigned_32_bit_hash("se")
+    assert hashes[2][1] == hashes[3][1] == _get_unsigned_32_bit_hash("ty")
+    if case_sensitive:
+        assert hashes[2][2] == hashes[3][2] == 0
+    else:
+        assert hashes[2][2] == hashes[3][2] == _get_unsigned_32_bit_hash("ee")
 
 
 def test_character_combination_hashes_empty_lengths(en_tokenizer):
@@ -1159,12 +1260,14 @@ def test_character_combination_hashes_empty_lengths(en_tokenizer):
         cs=True,
         p_lengths=ops.asarray1i([]),
         s_lengths=ops.asarray1i([]),
-        ps_search=bytes(),
-        ps_lookup=bytes(),
-        ps_l=0,
+        ps_1byte_ch=bytes(),
+        ps_2byte_ch=bytes(),
+        ps_3byte_ch=bytes(),
+        ps_4byte_ch=bytes(),
         ps_lengths=ops.asarray1i([]),
-        ss_search=bytes(),
-        ss_lookup=bytes(),
-        ss_l=0,
+        ss_1byte_ch=bytes(),
+        ss_2byte_ch=bytes(),
+        ss_3byte_ch=bytes(),
+        ss_4byte_ch=bytes(),
         ss_lengths=ops.asarray1i([]),
     ).shape == (1, 0)
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index 106731e5b..ec9e12731 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -40,13 +40,13 @@ cdef int [:,:] _get_lca_matrix(Doc, int start, int end)
 
 cdef void _set_affix_lengths(
     const unsigned char[:] tok_str,
-    unsigned char* aff_len_buf, 
-    const int pref_len, 
-    const int suff_len,
-) nogil
+    unsigned char* aff_l_buf, 
+    const int pref_l, 
+    const int suff_l,
+)
 
 
-ccdef void _search_for_chars(
+cdef void _search_for_chars(
     const unsigned char[:] tok_str,
     const unsigned char[:] s_1byte_ch,
     const unsigned char[:] s_2byte_ch,
@@ -54,9 +54,9 @@ ccdef void _search_for_chars(
     const unsigned char[:] s_4byte_ch,
     unsigned char* res_buf,
     int max_res_l,
-    unsigned char* len_buf,
+    unsigned char* l_buf,
     bint suffs_not_prefs
-) nogil
+)
 
 
 cdef class Doc:
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 11a2ec1b5..407323236 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1736,7 +1736,7 @@ cdef class Doc:
         return output
 
 
-    def np.ndarray get_character_combination_hashes(self,
+    def get_character_combination_hashes(self,
         *,
         const bint cs, 
         np.ndarray p_lengths, 
@@ -1751,7 +1751,7 @@ cdef class Doc:
         const unsigned char[:] ss_3byte_ch,
         const unsigned char[:] ss_4byte_ch,
         np.ndarray ss_lengths,
-    ) nogil:
+    ):
         """
         Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations 
             derived from the raw text of each token.
@@ -1797,11 +1797,11 @@ cdef class Doc:
 
         # Define / allocate buffers
         cdef int aff_l  = p_max_l + s_max_l
-        cdef char* aff_len_buf = self.mem.alloc(aff_l, 1)
-        cdef char* ps_res_buf = self.mem.alloc(ps_max_l, 4)
-        cdef char* ps_len_buf = self.mem.alloc(ps_max_l, 1)
-        cdef char* ss_res_buf = self.mem.alloc(ss_max_l, 4)
-        cdef char* ss_len_buf = self.mem.alloc(ss_max_l, 1)
+        cdef unsigned char* aff_l_buf = <unsigned char*> self.mem.alloc(aff_l, 1)
+        cdef unsigned char* ps_res_buf = <unsigned char*> self.mem.alloc(ps_max_l, 4)
+        cdef unsigned char* ps_l_buf = <unsigned char*> self.mem.alloc(ps_max_l, 1)
+        cdef unsigned char* ss_res_buf = <unsigned char*> self.mem.alloc(ss_max_l, 4)
+        cdef unsigned char* ss_l_buf = <unsigned char*> self.mem.alloc(ss_max_l, 1)
         
         # Define memory views on length arrays
         cdef int[:] p_lengths_v = p_lengths
@@ -1812,7 +1812,7 @@ cdef class Doc:
         # Define working variables
         cdef TokenC tok_c
         cdef int tok_i, offset
-        cdef uint64_t hash_val
+        cdef uint64_t hash_val = 0
         cdef attr_t num_tok_attr
         cdef const unsigned char[:] tok_str
 
@@ -1822,43 +1822,44 @@ cdef class Doc:
             tok_str = self.vocab.strings.utf8_view(num_tok_attr)
             
             if aff_l > 0:
-                _set_affix_lengths(tok_str, aff_len_buf, p_max_l, s_max_l)
+                _set_affix_lengths(tok_str, aff_l_buf, p_max_l, s_max_l)
+
                 for hash_idx in range(p_h_num):
-                    offset = aff_len_buf[p_lengths_v[hash_idx]]
+                    offset = aff_l_buf[p_lengths_v[hash_idx] - 1]
                     if offset > 0:
                         hash_val = hash32(<void*> &tok_str[0], offset, 0)
                     hashes[tok_i, hash_idx] = hash_val
             
                 for hash_idx in range(p_h_num, s_h_end):
-                    offset = s_lengths_v[hash_idx - p_h_num]
+                    offset = aff_l_buf[s_lengths_v[hash_idx - p_h_num] + p_max_l - 1]
                     if offset > 0:
                         hash_val = hash32(<void*> &tok_str[len(tok_str) - offset], offset, 0)
                     hashes[tok_i, hash_idx] = hash_val
 
             if ps_h_num > 0:
-                _search_for_chars(tok_str, ps_1byte_ch, ps_2byte_ch, ps_3byte_ch, ps_4byte_ch, ps_res_buf, ps_max_l, ps_res_len, False)
+                _search_for_chars(tok_str, ps_1byte_ch, ps_2byte_ch, ps_3byte_ch, ps_4byte_ch, ps_res_buf, ps_max_l, ps_l_buf, False)
                 hash_val = 0
                 for hash_idx in range(s_h_end, ps_h_end):
-                    offset = ps_lengths_v[hash_idx - s_h_end]
+                    offset = ps_l_buf[ps_lengths_v[hash_idx - s_h_end] - 1]
                     if offset > 0:
                         hash_val = hash32(ps_res_buf, offset, 0)
                     hashes[tok_i, hash_idx] = hash_val
 
             if ss_h_num > 0:
-                _search_for_chars(tok_str, ss_1byte_ch, ss_2byte_ch, ss_3byte_ch, ss_4byte_ch, ss_res_buf, ss_max_l, ss_res_len, True)
+                _search_for_chars(tok_str, ss_1byte_ch, ss_2byte_ch, ss_3byte_ch, ss_4byte_ch, ss_res_buf, ss_max_l, ss_l_buf, True)
                 hash_val = 0
                 for hash_idx in range(ps_h_end, ss_h_end):
-                    offset = ss_lengths_v[hash_idx - ps_h_end]
+                    offset = ss_l_buf[ss_lengths_v[hash_idx - ps_h_end] - 1]
                     if offset > 0:
                         hash_val = hash32(ss_res_buf, offset, 0)
                     hashes[tok_i, hash_idx] = hash_val
             
 
-        self.mem.free(aff_len_buf)
+        self.mem.free(aff_l_buf)
         self.mem.free(ps_res_buf)
-        self.mem.free(ps_len_buf)
+        self.mem.free(ps_l_buf)
         self.mem.free(ss_res_buf)
-        self.mem.free(ss_len_buf)
+        self.mem.free(ss_l_buf)
         return hashes
 
     @staticmethod
@@ -2044,46 +2045,45 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
 
 cdef void _set_affix_lengths(
     const unsigned char[:] tok_str,
-    unsigned char* aff_len_buf, 
-    const int pref_len, 
-    const int suff_len,
-) nogil:
-    """ TODO : Populate *len_buf*, which has length *pref_len+suff_len* with the byte lengths of the first *pref_len* and the last 
-        *suff_len* characters within *tok_str*. If the word is shorter than pref and/or suff, the empty lengths in the middle are 
-        filled with zeros.
+    unsigned char* aff_l_buf, 
+    const int pref_l, 
+    const int suff_l,
+):
+    """ Populate *aff_l_buf*, which has length *pref_l+suff_l* with the byte lengths of the first *pref_l* and the last 
+        *suff_l* characters within *tok_str*. Lengths that are greater than the character length of the whole word are 
+        populated with the byte length of the whole word.
 
         tok_str: a memoryview of a UTF-8 representation of a string.
-        len_buf: a buffer of length *pref_len+suff_len* in which to store the lengths. The calling code ensures that lengths
+        aff_l_buf: a buffer of length *pref_l+suff_l* in which to store the lengths. The calling code ensures that lengths
             greater than 255 cannot occur.
-        pref_len: the number of characters to process at the beginning of the word.
-        suff_len: the number of characters to process at the end of the word.
+        pref_l: the number of characters to process at the beginning of the word.
+        suff_l: the number of characters to process at the end of the word.
     """
-    cdef int tok_str_idx = 0, aff_len_buf_idx = 0, tok_str_len = len(tok_str)
+    cdef int tok_str_idx = 1, aff_l_buf_idx = 0, tok_str_l = len(tok_str)
 
-    while aff_len_buf_idx < pref_len:
-        if (tok_str[tok_str_idx] & 0xc0) != 0x80: # not a continuation character
-            aff_len_buf[aff_len_buf_idx] = tok_str_idx + 1
-            aff_len_buf_idx += 1
+    while aff_l_buf_idx < pref_l:
+        if tok_str_idx == len(tok_str) or ((tok_str[tok_str_idx] & 0xc0) != 0x80): # not a continuation character
+            aff_l_buf[aff_l_buf_idx] = tok_str_idx
+            aff_l_buf_idx += 1
         tok_str_idx += 1
-        if tok_str_idx == len(tok_str):
+        if tok_str_idx > len(tok_str):
             break
 
-    if aff_len_buf_idx < pref_len:
-        memset(aff_len_buf + aff_len_buf_idx, 0, pref_len - aff_len_buf_idx)
-        aff_len_buf_idx = pref_len
+    if aff_l_buf_idx < pref_l:
+        memset(aff_l_buf + aff_l_buf_idx, aff_l_buf[aff_l_buf_idx - 1], pref_l - aff_l_buf_idx)
+        aff_l_buf_idx = pref_l
 
-    tok_str_idx = 1
-    while aff_len_buf_idx < pref_len + suff_len:
+    tok_str_idx = tok_str_l - 1
+    while aff_l_buf_idx < pref_l + suff_l:
         if (tok_str[tok_str_idx] & 0xc0) != 0x80: # not a continuation character
-            aff_len_buf[aff_len_buf_idx] = tok_str_len - tok_str_idx
-            aff_len_buf_idx += 1
-        tok_str_idx += 1
-        if tok_str_idx > tok_str_len:
+            aff_l_buf[aff_l_buf_idx] = tok_str_l - tok_str_idx
+            aff_l_buf_idx += 1
+        tok_str_idx -= 1
+        if tok_str_idx < 0:
             break
 
-    if aff_len_buf_idx < pref_len + suff_len:
-        memset(aff_len_buf + aff_len_buf_idx, 0, suff_len - aff_len_buf_idx)
-
+    if aff_l_buf_idx < pref_l + suff_l:
+        memset(aff_l_buf + aff_l_buf_idx, aff_l_buf[aff_l_buf_idx - 1], pref_l + suff_l - aff_l_buf_idx)
 
 cdef void _search_for_chars(
     const unsigned char[:] tok_str,
@@ -2093,31 +2093,33 @@ cdef void _search_for_chars(
     const unsigned char[:] s_4byte_ch,
     unsigned char* res_buf,
     int max_res_l,
-    unsigned char* len_buf,
+    unsigned char* l_buf,
     bint suffs_not_prefs
-) nogil:
+):
     """ Search *tok_str* within a string for characters within the *s_<n>byte_ch> buffers, starting at the 
         beginning or end depending on the value of *suffs_not_prefs*. Wherever a character matches,
-        it is added to *res_buf* and the byte length up to that point is added to *len_buf*.
+        it is added to *res_buf* and the byte length up to that point is added to *len_buf*. When nothing
+        more is found, the remainder of *len_buf* is populated wth the byte length from the last result,
+        which may be *0* if the search was not successful.
 
         tok_str: a memoryview of a UTF-8 representation of a string.
         s_<n>byte_ch: a byte array containing in order n-byte-wide characters to search for.
         res_buf: the buffer in which to place the search results.
         max_res_l: the maximum number of found characters to place in *res_buf*.
-        len_buf: a buffer of length *max_res_l* in which to store the byte lengths. 
+        l_buf: a buffer of length *max_res_l* in which to store the byte lengths.
             The calling code ensures that lengths greater than 255 cannot occur. 
         suffs_not_prefs: if *True*, searching starts from the end of the word; if *False*, from the beginning.
     """
-    cdef int tok_str_len = len(tok_str), search_char_idx = 0, res_buf_idx = 0, len_buf_idx = 0
-    cdef int last_tok_str_idx = tok_str_len if suffs_not_prefs else 0
-    cdef int this_tok_str_idx = tok_str_len - 1 if suffs_not_prefs else 1
-    cdef int ch_wdth, tok_start_idx 
-    cdef char[:] search_chars
+    cdef int tok_str_l = len(tok_str), search_char_idx = 0, res_buf_idx = 0, l_buf_idx = 0, ch_wdth, tok_start_idx
+    cdef const unsigned char[:] search_chars
+
+    cdef int last_tok_str_idx = tok_str_l if suffs_not_prefs else 0
+    cdef int this_tok_str_idx = tok_str_l - 1 if suffs_not_prefs else 1
     
     while True:
         if (
-            this_tok_str_idx == tok_str_len or 
-            (tok_str[this_tok_str_idx] & 0xc0) != 0x80 # not continuation character
+            this_tok_str_idx == tok_str_l or 
+            (tok_str[this_tok_str_idx] & 0xc0) != 0x80 # not continuation character, always applies to [0].
         ):
             ch_wdth = abs(this_tok_str_idx - last_tok_str_idx)
             if ch_wdth == 1:
@@ -2129,16 +2131,17 @@ cdef void _search_for_chars(
             else:
                 search_chars = s_4byte_ch
             tok_start_idx = this_tok_str_idx if suffs_not_prefs else last_tok_str_idx
+
             for search_char_idx in range(0, len(search_chars), ch_wdth):
-                cmp_result = memcmp(tok_str + tok_start_idx, search_chars + search_char_idx, ch_wdth)
+                cmp_result = memcmp(&tok_str[tok_start_idx], &search_chars[search_char_idx], ch_wdth)
                 if cmp_result == 0:
-                    memcpy(res_buf + res_buf_idx, search_chars + search_char_idx, ch_wdth)
+                    memcpy(res_buf + res_buf_idx, &search_chars[search_char_idx], ch_wdth)
                     res_buf_idx += ch_wdth
-                    len_buf[len_buf_idx] = res_buf_idx
-                    len_buf_idx += 1
-                    if len_buf_idx == max_res_l:
+                    l_buf[l_buf_idx] = res_buf_idx
+                    l_buf_idx += 1
+                    if l_buf_idx == max_res_l:
                         return
-                if cmp_result >= 0: 
+                if cmp_result <= 0: 
                     break
             last_tok_str_idx = this_tok_str_idx
         if suffs_not_prefs:
@@ -2147,48 +2150,11 @@ cdef void _search_for_chars(
                 break
         else:
             this_tok_str_idx += 1
-            if this_tok_str_idx >= tok_str_len:
+            if this_tok_str_idx > tok_str_l:
                 break    
 
-    # fill in unused characters in the length buffer with 0
-    memset(res_buf + res_buf_idx, 0, max_res_l - res_buf_idx)
-
-            
-            
-
-            
-
-
-
-
-
-
-    cdef int result_buf_idx = 0, text_string_idx = tok_idx + (tok_len - 1) if suffs_not_prefs else tok_idx
-    cdef int search_buf_idx
-    cdef int cmp_result
-
-    while result_buf_idx < result_buf_len:
-        for search_buf_idx in range (search_buf_len):
-            cmp_result = memcmp(search_buf + search_buf_idx, text_buf + text_string_idx, sizeof(Py_UCS4))
-            if cmp_result == 0:
-                memcpy(result_buf + result_buf_idx, lookup_buf + search_buf_idx, sizeof(Py_UCS4))
-                result_buf_idx += 1
-            if cmp_result >= 0: 
-                break
-        if suffs_not_prefs:
-            if text_string_idx <= tok_idx:
-                break
-            text_string_idx -= 1
-        else:
-            text_string_idx += 1
-            if text_string_idx >= tok_idx + tok_len:
-                break
-    
-    # fill in any unused characters in the result buffer with zeros
-    if result_buf_idx < result_buf_len:
-        memset(result_buf + result_buf_idx, 0, (result_buf_len - result_buf_idx) * sizeof(Py_UCS4))
-    
-    return result_buf_idx > 0
+    # fill in unused characters in the length buffer
+    memset(l_buf + l_buf_idx, res_buf_idx, max_res_l - l_buf_idx)
         
 
 def pickle_doc(doc):