Intermediate state

2025-08-02 11:20:19 +03:00 · 2022-11-01 20:46:55 +01:00 · 2022-11-01 20:46:55 +01:00 · bbf058029a
commit bbf058029a
parent 2552340fb8
4 changed files with 375 additions and 184 deletions
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@ -994,10 +994,8 @@ def test_doc_spans_setdefault(en_tokenizer):
    assert len(doc.spans["key3"]) == 2
-def _get_unsigned_32_bit_hash(input: str) -> int:
+def _get_32_bit_hash(input: str) -> int:
    working_hash = hash(input.encode("UTF-8"))
    if working_hash < 0:
        working_hash = working_hash + (2 << 31)
    return working_hash
@ -1009,79 +1007,91 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
    hashes = doc.get_character_combination_hashes(
        cs=case_sensitive,
        p_lengths=bytes((1, 3, 4,)),
        p_max_l = 4,
        s_lengths=bytes((2, 3, 4, 5,)),
        s_max_l = 5,
        ps_1byte_ch=ps1,
        ps_1byte_ch_l = len(ps1),
        ps_2byte_ch=ps2,
        ps_2byte_ch_l = len(ps2),
        ps_3byte_ch=ps3,
        ps_3byte_ch_l = len(ps3),
        ps_4byte_ch=ps4,
        ps_4byte_ch_l = len(ps4),
        ps_lengths=bytes((2,)),
        ps_max_l = 2,
        ss_1byte_ch=ss1,
        ss_1byte_ch_l = len(ss1),
        ss_2byte_ch=ss2,
        ss_2byte_ch_l = len(ss2),
        ss_3byte_ch=ss3,
        ss_3byte_ch_l = len(ss3),
        ss_4byte_ch=ss4,
        ss_4byte_ch_l = len(ss4),
        ss_lengths=bytes((1, 2,)),
        ss_max_l = 2,
    )
-    assert hashes[0][0] == _get_unsigned_32_bit_hash("s")
+    assert hashes[0][0] == _get_32_bit_hash("s")
-    assert hashes[0][1] == _get_unsigned_32_bit_hash("spa")
+    assert hashes[0][1] == _get_32_bit_hash("spa")
-    assert hashes[0][2] == _get_unsigned_32_bit_hash(
+    assert hashes[0][2] == _get_32_bit_hash(
        "spaC" if case_sensitive else "spac"
    )
-    assert hashes[0][3] == _get_unsigned_32_bit_hash("Cy" if case_sensitive else "cy")
+    assert hashes[0][3] == _get_32_bit_hash("Cy" if case_sensitive else "cy")
-    assert hashes[0][4] == _get_unsigned_32_bit_hash("aCy" if case_sensitive else "acy")
+    assert hashes[0][4] == _get_32_bit_hash("aCy" if case_sensitive else "acy")
-    assert hashes[0][5] == _get_unsigned_32_bit_hash(
+    assert hashes[0][5] == _get_32_bit_hash(
        "paCy" if case_sensitive else "pacy"
    )
-    assert hashes[0][6] == _get_unsigned_32_bit_hash(
+    assert hashes[0][6] == _get_32_bit_hash(
        "spaCy" if case_sensitive else "spacy"
    )
-    assert hashes[0][7] == _get_unsigned_32_bit_hash("p")
+    assert hashes[0][7] == _get_32_bit_hash("p")
-    assert hashes[0][8] == _get_unsigned_32_bit_hash("p")
+    assert hashes[0][8] == _get_32_bit_hash("p")
-    assert hashes[0][9] == _get_unsigned_32_bit_hash("p")
+    assert hashes[0][9] == _get_32_bit_hash("p")
-    assert hashes[1][0] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[1][0] == _get_32_bit_hash("✨")
-    assert hashes[1][1] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[1][1] == _get_32_bit_hash("✨")
-    assert hashes[1][2] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[1][2] == _get_32_bit_hash("✨")
-    assert hashes[1][3] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[1][3] == _get_32_bit_hash("✨")
-    assert hashes[1][4] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[1][4] == _get_32_bit_hash("✨")
-    assert hashes[1][5] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[1][5] == _get_32_bit_hash("✨")
-    assert hashes[1][6] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[1][6] == _get_32_bit_hash("✨")
    assert hashes[1][7] == 0
-    assert hashes[1][8] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[1][8] == _get_32_bit_hash("✨")
-    assert hashes[1][9] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[1][9] == _get_32_bit_hash("✨")
-    assert hashes[2][0] == _get_unsigned_32_bit_hash("a")
+    assert hashes[2][0] == _get_32_bit_hash("a")
-    assert hashes[2][1] == _get_unsigned_32_bit_hash("and")
+    assert hashes[2][1] == _get_32_bit_hash("and")
-    assert hashes[2][2] == _get_unsigned_32_bit_hash("and")
+    assert hashes[2][2] == _get_32_bit_hash("and")
-    assert hashes[2][3] == _get_unsigned_32_bit_hash("nd")
+    assert hashes[2][3] == _get_32_bit_hash("nd")
-    assert hashes[2][4] == _get_unsigned_32_bit_hash("and")
+    assert hashes[2][4] == _get_32_bit_hash("and")
-    assert hashes[2][5] == _get_unsigned_32_bit_hash("and")
+    assert hashes[2][5] == _get_32_bit_hash("and")
-    assert hashes[2][6] == _get_unsigned_32_bit_hash("and")
+    assert hashes[2][6] == _get_32_bit_hash("and")
    assert hashes[2][7] == 0
    assert hashes[2][8] == 0
    assert hashes[2][9] == 0
-    assert hashes[3][0] == _get_unsigned_32_bit_hash("P" if case_sensitive else "p")
+    assert hashes[3][0] == _get_32_bit_hash("P" if case_sensitive else "p")
-    assert hashes[3][1] == _get_unsigned_32_bit_hash("Pro" if case_sensitive else "pro")
+    assert hashes[3][1] == _get_32_bit_hash("Pro" if case_sensitive else "pro")
-    assert hashes[3][2] == _get_unsigned_32_bit_hash(
+    assert hashes[3][2] == _get_32_bit_hash(
        "Prod" if case_sensitive else "prod"
    )
-    assert hashes[3][3] == _get_unsigned_32_bit_hash("gy")
+    assert hashes[3][3] == _get_32_bit_hash("gy")
-    assert hashes[3][4] == _get_unsigned_32_bit_hash("igy")
+    assert hashes[3][4] == _get_32_bit_hash("igy")
-    assert hashes[3][5] == _get_unsigned_32_bit_hash("digy")
+    assert hashes[3][5] == _get_32_bit_hash("digy")
-    assert hashes[3][6] == _get_unsigned_32_bit_hash("odigy")
+    assert hashes[3][6] == _get_32_bit_hash("odigy")
-    assert hashes[3][7] == 0 if case_sensitive else _get_unsigned_32_bit_hash("pr")
+    assert hashes[3][7] == 0 if case_sensitive else _get_32_bit_hash("pr")
-    assert hashes[3][8] == _get_unsigned_32_bit_hash("r")
+    assert hashes[3][8] == _get_32_bit_hash("r")
    if case_sensitive:
-        assert hashes[3][9] == _get_unsigned_32_bit_hash("r")
+        assert hashes[3][9] == _get_32_bit_hash("r")
    else:
-        assert hashes[3][9] == _get_unsigned_32_bit_hash("rp")
+        assert hashes[3][9] == _get_32_bit_hash("rp")
    # check values are the same cross-platform
    if case_sensitive:
-        assert hashes[0][2] == 3041529170
+        assert hashes[0][2] == -1253438126
    else:
-        assert hashes[0][2] == 2199614696
+        assert hashes[0][2] == -2095352600
    assert hashes[1][3] == 910783208
    assert hashes[3][8] == 1553167345
@ -1092,40 +1102,52 @@ def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
    hashes = doc.get_character_combination_hashes(
        cs=False,
        p_lengths=bytes(),
        p_max_l = 0,
        s_lengths=bytes((2,3,4,5,)),
        s_max_l = 5,
        ps_1byte_ch=ps1,
        ps_1byte_ch_l = len(ps1),
        ps_2byte_ch=ps2,
        ps_2byte_ch_l = len(ps2),
        ps_3byte_ch=ps3,
        ps_3byte_ch_l = len(ps3),
        ps_4byte_ch=ps4,
        ps_4byte_ch_l = len(ps4),
        ps_lengths=bytes((2,)),
        ps_max_l = 2,
        ss_1byte_ch=bytes(),
        ss_1byte_ch_l = 0,
        ss_2byte_ch=bytes(),
        ss_2byte_ch_l = 0,
        ss_3byte_ch=bytes(),
        ss_3byte_ch_l = 0,
        ss_4byte_ch=bytes(),
        ss_4byte_ch_l = 0,
        ss_lengths=bytes(),
        ss_max_l = 0,
    )
-    assert hashes[0][0] == _get_unsigned_32_bit_hash("cy")
+    assert hashes[0][0] == _get_32_bit_hash("cy")
-    assert hashes[0][1] == _get_unsigned_32_bit_hash("acy")
+    assert hashes[0][1] == _get_32_bit_hash("acy")
-    assert hashes[0][2] == _get_unsigned_32_bit_hash("pacy")
+    assert hashes[0][2] == _get_32_bit_hash("pacy")
-    assert hashes[0][3] == _get_unsigned_32_bit_hash("spacy")
+    assert hashes[0][3] == _get_32_bit_hash("spacy")
-    assert hashes[0][4] == _get_unsigned_32_bit_hash("p")
+    assert hashes[0][4] == _get_32_bit_hash("p")
-    assert hashes[1][0] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[1][0] == _get_32_bit_hash("✨")
-    assert hashes[1][1] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[1][1] == _get_32_bit_hash("✨")
-    assert hashes[1][2] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[1][2] == _get_32_bit_hash("✨")
-    assert hashes[1][3] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[1][3] == _get_32_bit_hash("✨")
    assert hashes[1][4] == 0
-    assert hashes[2][0] == _get_unsigned_32_bit_hash("nd")
+    assert hashes[2][0] == _get_32_bit_hash("nd")
-    assert hashes[2][1] == _get_unsigned_32_bit_hash("and")
+    assert hashes[2][1] == _get_32_bit_hash("and")
-    assert hashes[2][2] == _get_unsigned_32_bit_hash("and")
+    assert hashes[2][2] == _get_32_bit_hash("and")
-    assert hashes[2][3] == _get_unsigned_32_bit_hash("and")
+    assert hashes[2][3] == _get_32_bit_hash("and")
    assert hashes[2][4] == 0
-    assert hashes[3][0] == _get_unsigned_32_bit_hash("gy")
+    assert hashes[3][0] == _get_32_bit_hash("gy")
-    assert hashes[3][1] == _get_unsigned_32_bit_hash("igy")
+    assert hashes[3][1] == _get_32_bit_hash("igy")
-    assert hashes[3][2] == _get_unsigned_32_bit_hash("digy")
+    assert hashes[3][2] == _get_32_bit_hash("digy")
-    assert hashes[3][3] == _get_unsigned_32_bit_hash("odigy")
+    assert hashes[3][3] == _get_32_bit_hash("odigy")
-    assert hashes[3][4] == _get_unsigned_32_bit_hash("pr")
+    assert hashes[3][4] == _get_32_bit_hash("pr")
 def test_get_character_combination_hashes_copying_in_middle(en_tokenizer):
@ -1137,21 +1159,33 @@ def test_get_character_combination_hashes_copying_in_middle(en_tokenizer):
            hashes = doc.get_character_combination_hashes(
                cs=False,
                p_lengths=bytes((p_length,)),
                p_max_l = p_length,
                s_lengths=bytes((s_length,)),
                s_max_l = s_length,
                ps_1byte_ch=bytes(),
                ps_1byte_ch_l = 0,
                ps_2byte_ch=bytes(),
                ps_2byte_ch_l = 0,
                ps_3byte_ch=bytes(),
                ps_3byte_ch_l = 0,
                ps_4byte_ch=bytes(),
                ps_4byte_ch_l = 0,
                ps_lengths=bytes(),
                ps_max_l = 0,
                ss_1byte_ch=bytes(),
                ss_1byte_ch_l = 0,
                ss_2byte_ch=bytes(),
                ss_2byte_ch_l = 0,
                ss_3byte_ch=bytes(),
                ss_3byte_ch_l = 0,
                ss_4byte_ch=bytes(),
                ss_4byte_ch_l = 0,
                ss_lengths=bytes(),
                ss_max_l = 0
            )
-            assert hashes[0][0] == _get_unsigned_32_bit_hash("sp𐌞cé"[:p_length])
+            assert hashes[0][0] == _get_32_bit_hash("sp𐌞cé"[:p_length])
-            assert hashes[0][1] == _get_unsigned_32_bit_hash("sp𐌞cé"[-s_length:])
+            assert hashes[0][1] == _get_32_bit_hash("sp𐌞cé"[-s_length:])
@pytest.mark.parametrize("case_sensitive", [True, False])
@ -1161,49 +1195,61 @@ def test_get_character_combination_hashes_turkish_i_with_dot(en_tokenizer, case_
    hashes = doc.get_character_combination_hashes(
        cs=case_sensitive,
        p_lengths=bytes((1,2,3,4,)),
        p_max_l = 4,
        s_lengths=bytes((1,2,3,4,)),
        s_max_l = 4,
        ps_1byte_ch=s1,
        ps_1byte_ch_l = len(s1),
        ps_2byte_ch=s2,
        ps_2byte_ch_l = len(s2),
        ps_3byte_ch=s3,
        ps_3byte_ch_l = len(s3),
        ps_4byte_ch=s4,
        ps_4byte_ch_l = len(s4),
        ps_lengths=bytes((1,2,3,4,)),
        ps_max_l = 4,
        ss_1byte_ch=s1,
        ss_1byte_ch_l = len(s1),
        ss_2byte_ch=s2,
        ss_2byte_ch_l = len(s2),
        ss_3byte_ch=s3,
        ss_3byte_ch_l = len(s3),
        ss_4byte_ch=s4,
        ss_4byte_ch_l = len(s4),
        ss_lengths=bytes((1,2,3,4,)),
        ss_max_l = 4
    )
    COMBINING_DOT_ABOVE = b"\xcc\x87".decode("UTF-8")
-    assert hashes[0][0] == _get_unsigned_32_bit_hash("i")
+    assert hashes[0][0] == _get_32_bit_hash("i")
-    assert hashes[0][1] == _get_unsigned_32_bit_hash("İ".lower())
+    assert hashes[0][1] == _get_32_bit_hash("İ".lower())
    if case_sensitive:
-        assert hashes[0][2] == _get_unsigned_32_bit_hash("İ".lower() + "İ")
+        assert hashes[0][2] == _get_32_bit_hash("İ".lower() + "İ")
-        assert hashes[0][3] == _get_unsigned_32_bit_hash("İ".lower() + "İ")
+        assert hashes[0][3] == _get_32_bit_hash("İ".lower() + "İ")
-        assert hashes[0][4] == _get_unsigned_32_bit_hash("İ")
+        assert hashes[0][4] == _get_32_bit_hash("İ")
-        assert hashes[0][5] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE + "İ")
+        assert hashes[0][5] == _get_32_bit_hash(COMBINING_DOT_ABOVE + "İ")
-        assert hashes[0][6] == _get_unsigned_32_bit_hash("İ".lower() + "İ")
+        assert hashes[0][6] == _get_32_bit_hash("İ".lower() + "İ")
-        assert hashes[0][7] == _get_unsigned_32_bit_hash("İ".lower() + "İ")
+        assert hashes[0][7] == _get_32_bit_hash("İ".lower() + "İ")
-        assert hashes[0][8] == _get_unsigned_32_bit_hash("İ")
+        assert hashes[0][8] == _get_32_bit_hash("İ")
-        assert hashes[0][9] == _get_unsigned_32_bit_hash("İ")
+        assert hashes[0][9] == _get_32_bit_hash("İ")
-        assert hashes[0][12] == _get_unsigned_32_bit_hash("İ")
+        assert hashes[0][12] == _get_32_bit_hash("İ")
-        assert hashes[0][13] == _get_unsigned_32_bit_hash("İ")
+        assert hashes[0][13] == _get_32_bit_hash("İ")
    else:
-        assert hashes[0][2] == _get_unsigned_32_bit_hash("İ".lower() + "i")
+        assert hashes[0][2] == _get_32_bit_hash("İ".lower() + "i")
-        assert hashes[0][3] == _get_unsigned_32_bit_hash("İ".lower() * 2)
+        assert hashes[0][3] == _get_32_bit_hash("İ".lower() * 2)
-        assert hashes[0][4] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE)
+        assert hashes[0][4] == _get_32_bit_hash(COMBINING_DOT_ABOVE)
-        assert hashes[0][5] == _get_unsigned_32_bit_hash("İ".lower())
+        assert hashes[0][5] == _get_32_bit_hash("İ".lower())
-        assert hashes[0][6] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE + "İ".lower())
+        assert hashes[0][6] == _get_32_bit_hash(COMBINING_DOT_ABOVE + "İ".lower())
-        assert hashes[0][7] == _get_unsigned_32_bit_hash("İ".lower() * 2)
+        assert hashes[0][7] == _get_32_bit_hash("İ".lower() * 2)
-        assert hashes[0][8] == _get_unsigned_32_bit_hash("i")
+        assert hashes[0][8] == _get_32_bit_hash("i")
-        assert hashes[0][9] == _get_unsigned_32_bit_hash("İ".lower())
+        assert hashes[0][9] == _get_32_bit_hash("İ".lower())
-        assert hashes[0][10] == _get_unsigned_32_bit_hash("İ".lower() + "i")
+        assert hashes[0][10] == _get_32_bit_hash("İ".lower() + "i")
-        assert hashes[0][11] == _get_unsigned_32_bit_hash("İ".lower() * 2)
+        assert hashes[0][11] == _get_32_bit_hash("İ".lower() * 2)
-        assert hashes[0][12] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE)
+        assert hashes[0][12] == _get_32_bit_hash(COMBINING_DOT_ABOVE)
-        assert hashes[0][13] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE + "i")
+        assert hashes[0][13] == _get_32_bit_hash(COMBINING_DOT_ABOVE + "i")
-        assert hashes[0][14] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE + "i" + COMBINING_DOT_ABOVE)
+        assert hashes[0][14] == _get_32_bit_hash(COMBINING_DOT_ABOVE + "i" + COMBINING_DOT_ABOVE)
-        assert hashes[0][15] == _get_unsigned_32_bit_hash((COMBINING_DOT_ABOVE + "i") * 2)
+        assert hashes[0][15] == _get_32_bit_hash((COMBINING_DOT_ABOVE + "i") * 2)
@pytest.mark.parametrize("case_sensitive", [True, False])
@ -1219,33 +1265,45 @@ def test_get_character_combination_hashes_string_store_spec_cases(en_tokenizer,
    hashes = doc.get_character_combination_hashes(
        cs=case_sensitive,
        p_lengths=bytes((2,)),
        p_max_l = 2,
        s_lengths=bytes((2,)),
        s_max_l = 2,
        ps_1byte_ch=ps1,
        ps_1byte_ch_l = len(ps1),
        ps_2byte_ch=ps2,
        ps_2byte_ch_l = len(ps2),
        ps_3byte_ch=ps3,
        ps_3byte_ch_l = len(ps3),
        ps_4byte_ch=ps4,
        ps_4byte_ch_l = len(ps4),
        ps_lengths=bytes((2,)),
        ps_max_l = 2,
        ss_1byte_ch=bytes(),
        ss_1byte_ch_l = 0,
        ss_2byte_ch=bytes(),
        ss_2byte_ch_l = 0,
        ss_3byte_ch=bytes(),
        ss_3byte_ch_l = 0,
        ss_4byte_ch=bytes(),
        ss_4byte_ch_l = 0,
        ss_lengths=bytes(),
        ss_max_l = 0
    )
-    assert hashes[0][0] == _get_unsigned_32_bit_hash("FL" if case_sensitive else "fl")
+    assert hashes[0][0] == _get_32_bit_hash("FL" if case_sensitive else "fl")
-    assert hashes[0][1] == _get_unsigned_32_bit_hash("19")
+    assert hashes[0][1] == _get_32_bit_hash("19")
    assert hashes[0][2] == 0
-    assert hashes[1][0] == _get_unsigned_32_bit_hash("be")
+    assert hashes[1][0] == _get_32_bit_hash("be")
-    assert hashes[1][1] == _get_unsigned_32_bit_hash("ee")
+    assert hashes[1][1] == _get_32_bit_hash("ee")
    if case_sensitive:
        assert hashes[1][2] == 0
    else:
-        assert hashes[1][2] == _get_unsigned_32_bit_hash("ee")
+        assert hashes[1][2] == _get_32_bit_hash("ee")
-    assert hashes[2][0] == hashes[3][0] == _get_unsigned_32_bit_hash("se")
+    assert hashes[2][0] == hashes[3][0] == _get_32_bit_hash("se")
-    assert hashes[2][1] == hashes[3][1] == _get_unsigned_32_bit_hash("ty")
+    assert hashes[2][1] == hashes[3][1] == _get_32_bit_hash("ty")
    if case_sensitive:
        assert hashes[2][2] == hashes[3][2] == 0
    else:
-        assert hashes[2][2] == hashes[3][2] == _get_unsigned_32_bit_hash("ee")
+        assert hashes[2][2] == hashes[3][2] == _get_32_bit_hash("ee")
 def test_character_combination_hashes_empty_lengths(en_tokenizer):
@ -1253,15 +1311,27 @@ def test_character_combination_hashes_empty_lengths(en_tokenizer):
    assert doc.get_character_combination_hashes(
        cs=True,
        p_lengths=bytes(),
        p_max_l = 0,
        s_lengths=bytes(),
        s_max_l = 0,
        ps_1byte_ch=bytes(),
        ps_1byte_ch_l=0,
        ps_2byte_ch=bytes(),
        ps_2byte_ch_l=0,
        ps_3byte_ch=bytes(),
        ps_3byte_ch_l=0,
        ps_4byte_ch=bytes(),
        ps_4byte_ch_l=0,
        ps_lengths=bytes(),
        ps_max_l = 0,
        ss_1byte_ch=bytes(),
        ss_1byte_ch_l=0,
        ss_2byte_ch=bytes(),
        ss_2byte_ch_l=0,
        ss_3byte_ch=bytes(),
        ss_3byte_ch_l=0,
        ss_4byte_ch=bytes(),
        ss_4byte_ch_l=0,
        ss_lengths=bytes(),
        ss_max_l = 0,
    ).shape == (1, 0)
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@ -38,20 +38,33 @@ cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
 cdef int [:,:] _get_lca_matrix(Doc, int start, int end)
-cdef void _set_affix_lengths(
+cdef void _set_prefix_lengths(
    const unsigned char* tok_str,
-    unsigned char* aff_l_buf, 
+    const int tok_str_l,
-    const int pref_l, 
+    unsigned char* pref_l_buf,
-    const int suff_l,
+    const int p_max_l, 
 ) nogil
 cdef void _set_suffix_lengths(
    const unsigned char* tok_str,
    const int tok_str_l,
    unsigned char* suff_l_buf,
    const int s_max_l, 
 )
 cdef void _search_for_chars(
    const unsigned char* tok_str,
    const int tok_str_l,
    const unsigned char* s_1byte_ch,
    const int s_1byte_ch_l,
    const unsigned char* s_2byte_ch,
    const int s_2byte_ch_l,
    const unsigned char* s_3byte_ch,
    const int s_3byte_ch_l,
    const unsigned char* s_4byte_ch,
    const int s_4byte_ch_l,
    unsigned char* res_buf,
    int max_res_l,
    unsigned char* l_buf,
@ -59,6 +72,18 @@ cdef void _search_for_chars(
 ) nogil
 cdef int _write_hashes(
    const unsigned char* res_buf,
    const unsigned char* aff_l_buf,
    const unsigned char* offset_buf,
    const int end_idx,
    np.ndarray[np.int64_t, ndim=2] hashes,
    const int tok_i,
    const int start_hash_idx,
 )  
 cdef class Doc:
    cdef readonly Pool mem
    cdef readonly Vocab vocab
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@ -179,17 +179,58 @@ class Doc:
        *,
        cs: bool,
        p_lengths: bytes,
        p_max_l: int,
        s_lengths: bytes,
        s_max_l: int,
        ps_1byte_ch: bytes,
        ps_1_byte_ch_l: int,
        ps_2byte_ch: bytes,
        ps_2_byte_ch_l: int,
        ps_3byte_ch: bytes,
        ps_3_byte_ch_l: int,
        ps_4byte_ch: bytes,
        ps_4_byte_ch_l: int,
        ps_lengths: bytes,
        ps_max_l: int,
        ss_1byte_ch: bytes,
        ss_1_byte_ch_l: int,
        ss_2byte_ch: bytes,
        ss_2_byte_ch_l: int,
        ss_3byte_ch: bytes,
        ss_3_byte_ch_l: int,
        ss_4byte_ch: bytes,
        ss_4_byte_ch_l: int,
        ss_lengths: bytes,
        ss_max_l: int,
    ) -> Ints2d: ...
    @staticmethod
    def _get_array_attrs() -> Tuple[Any]: ...
 def get_character_combination_hashes(self,
        *,
        const bint cs, 
        const unsigned char* p_lengths,
        const int p_max_l,
        const unsigned char* s_lengths,
        const int s_max_l,
        const unsigned char* ps_1byte_ch,
        const int ps_1_byte_ch_l,
        const unsigned char* ps_2byte_ch,
        const int ps_2_byte_ch_l,
        const unsigned char* ps_3byte_ch,
        const int ps_3_byte_ch_l,
        const unsigned char* ps_4byte_ch,
        const int ps_4_byte_ch_l,
        const unsigned char* ps_lengths,
        const int ps_max_l,
        const unsigned char* ss_1byte_ch,
        const int ss_1_byte_ch_l,
        const unsigned char* ss_2byte_ch,
        const int ss_2_byte_ch_l,
        const unsigned char* ss_3byte_ch,
        const int ss_3_byte_ch_l,
        const unsigned char* ss_4byte_ch,
        const int ss_4_byte_ch_l,
        const unsigned char* ss_lengths,
        const int ss_max_l,
    )
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -956,7 +956,7 @@ cdef class Doc:
        cdef int i, j
        cdef attr_id_t feature
        cdef np.ndarray[attr_t, ndim=2] output
-        # Handle scalar/list inputs of strings/ints for py_attr_ids
+        # Handle scalar/list inputs of strings/ints for py_attr_ids
        # See also #3064
        if isinstance(py_attr_ids, str):
            # Handle inputs like doc.to_array('ORTH')
@ -1735,22 +1735,34 @@ cdef class Doc:
                    j += 1
        return output
-    @cython.boundscheck(False)  # Deactivate bounds checking
+    #@cython.boundscheck(False)  # Deactivate bounds checking
    def get_character_combination_hashes(self,
        *,
        const bint cs, 
-        const unsigned char* p_lengths, 
+        const unsigned char* p_lengths,
        const int p_max_l,
        const unsigned char* s_lengths,
        const int s_max_l,
        const unsigned char* ps_1byte_ch,
        const int ps_1byte_ch_l,
        const unsigned char* ps_2byte_ch,
        const int ps_2byte_ch_l,
        const unsigned char* ps_3byte_ch,
        const int ps_3byte_ch_l,
        const unsigned char* ps_4byte_ch,
        const int ps_4byte_ch_l,
        const unsigned char* ps_lengths,
        const int ps_max_l,
        const unsigned char* ss_1byte_ch,
        const int ss_1byte_ch_l,
        const unsigned char* ss_2byte_ch,
        const int ss_2byte_ch_l,
        const unsigned char* ss_3byte_ch,
        const int ss_3byte_ch_l,
        const unsigned char* ss_4byte_ch,
        const int ss_4byte_ch_l,
        const unsigned char* ss_lengths,
        const int ss_max_l,
    ):
        """
        Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations 
@ -1766,39 +1778,33 @@ cdef class Doc:
        cs: if *False*, hashes are generated based on the lower-case version of each token.
        p_lengths: an array of single-byte values specifying the lengths of prefixes to be hashed in ascending order. 
            For example, if *p_lengths==[2, 3]*, the prefixes hashed for "spaCy" would be "sp" and "spa".
        p_max_l: the value of *p_lengths[-1]*, or *0* if *p_lengths==None*. Passed in for speed.
        s_lengths: an array of single-byte values specifying the lengths of suffixes to be hashed in ascending order. 
            For example, if *s_lengths==[2, 3]* and *cs == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy".
        s_max_l: the value of *s_lengths[-1]*, or *0* if *s_lengths==None*. Passed in for speed.
        ps_<n>byte_ch: a byte array containing in order n-byte-wide characters to search for within each token, 
            starting at the beginning.
        ps_<n>byte_ch_l: the length of *ps_<n>byte_ch*. Passed in for speed.
        ps_lengths: an array of single-byte values specifying the lengths of search results (from the beginning) to be hashed 
            in ascending order. For example, if *ps_lengths==[1, 2]*, *ps_search=="aC"* and *cs==False*, the searched strings 
            hashed for "spaCy" would be "a" and "ac".
        ps_max_l: the value of *ps_lengths[-1]*, or *0* if *ps_lengths==None*. Passed in for speed.
        ss_<n>byte_ch: a byte array containing in order n-byte-wide characters to search for within each token, 
            starting at the end.
        ss_<n>byte_ch_l: the length of *ss_<n>byte_ch*. Passed in for speed.
        ss_lengths: an array of single-byte values specifying the lengths of search results (from the end) to be hashed
             in ascending order. For example, if *ss_lengths==[1, 2]*, *ss_search=="aC"* and *cs==False*, the searched strings 
             hashed for "spaCy" would be "c" and "ca".
        ss_max_l: the value of *ss_lengths[-1]*, or *0* if *ss_lengths==None*. Passed in for speed.
        """
-        # Define the result array and work out what is used for what in axis 1
+        cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty(
-        cdef int num_toks = len(self)
+            (self.length, p_max_l + s_max_l + ps_max_l + ss_max_l), dtype="int64")
-        cdef int p_h_num = strlen(<char*> p_lengths)
+        
        cdef int s_h_num = strlen(<char*> s_lengths), s_h_end = p_h_num + s_h_num
        cdef int ps_h_num = strlen(<char*> ps_lengths), ps_h_end = s_h_end + ps_h_num
        cdef int ss_h_num = strlen(<char*> ss_lengths), ss_h_end = ps_h_end + ss_h_num
        cdef np.ndarray[np.int64_t, ndim=2] hashes
        hashes = numpy.empty((num_toks, ss_h_end), dtype="int64")
        # Determine the maximum possible lengths of the affixes to work out how big the buffers need to be
        cdef int p_max_l = p_lengths[-1] if p_h_num > 0 else 0
        cdef int s_max_l = s_lengths[-1] if s_h_num > 0 else 0
        cdef int ps_max_l = ps_lengths[-1] if ps_h_num > 0 else 0
        cdef int ss_max_l = ss_lengths[-1] if ss_h_num > 0 else 0
        # Define / allocate buffers
        cdef Pool mem = Pool()
-        cdef int aff_l  = p_max_l + s_max_l
+        cdef unsigned char* pref_l_buf = <unsigned char*> mem.alloc(p_max_l, 1)
-        cdef unsigned char* aff_l_buf = <unsigned char*> mem.alloc(aff_l, 1)
+        cdef unsigned char* suff_l_buf = <unsigned char*> mem.alloc(p_max_l, 1)
        cdef unsigned char* ps_res_buf = <unsigned char*> mem.alloc(ps_max_l, 4)
        cdef unsigned char* ps_l_buf = <unsigned char*> mem.alloc(ps_max_l, 1)
        cdef unsigned char* ss_res_buf = <unsigned char*> mem.alloc(ss_max_l, 4)
@ -1806,48 +1812,34 @@ cdef class Doc:
        # Define working variables
        cdef TokenC tok_c
-        cdef int tok_i, offset
+        cdef int hash_idx, tok_i, tok_str_l
        cdef uint64_t hash_val = 0
        cdef attr_t num_tok_attr
        cdef const unsigned char* tok_str
-        for tok_i in range(num_toks):
+        for tok_i in range(self.length):
            tok_c = self.c[tok_i]
            num_tok_attr = tok_c.lex.orth if cs else tok_c.lex.lower
            tok_str = self.vocab.strings.utf8_ptr(num_tok_attr)
            tok_str_l = strlen(<char*> tok_str)
            hash_idx = 0
-            if aff_l > 0:
+            if p_max_l > 0:
-                _set_affix_lengths(tok_str, aff_l_buf, p_max_l, s_max_l)
+                _set_prefix_lengths(tok_str, tok_str_l, pref_l_buf, p_max_l)
                hash_idx = _write_hashes(tok_str, p_lengths, pref_l_buf, 0, hashes, tok_i, 0)
-                for hash_idx in range(p_h_num):
+            if s_max_l > 0:
-                    offset = aff_l_buf[p_lengths[hash_idx] - 1]
+                _set_suffix_lengths(tok_str, tok_str_l, suff_l_buf, s_max_l)
-                    if offset > 0:
+                hash_idx = _write_hashes(tok_str, s_lengths, suff_l_buf, tok_str_l, hashes, tok_i, hash_idx)
                        hash_val = hash32(<void*> &tok_str[0], offset, 0)
                    hashes[tok_i, hash_idx] = hash_val
-                for hash_idx in range(p_h_num, s_h_end):
+            if ps_max_l > 0:
-                    offset = aff_l_buf[s_lengths[hash_idx - p_h_num] + p_max_l - 1]
+                _search_for_chars(tok_str, tok_str_l, ps_1byte_ch, ps_1byte_ch_l, ps_2byte_ch, ps_2byte_ch_l, 
-                    if offset > 0:
+                    ps_3byte_ch, ps_3byte_ch_l, ps_4byte_ch, ps_4byte_ch_l, ps_res_buf, ps_max_l, ps_l_buf, False)
-                        hash_val = hash32(<void*> &tok_str[len(tok_str) - offset], offset, 0)
+                hash_idx = _write_hashes(ps_res_buf, ps_lengths, ps_l_buf, 0, hashes, tok_i, hash_idx)
                    hashes[tok_i, hash_idx] = hash_val
-            if ps_h_num > 0:
+            if ss_max_l > 0:
-                _search_for_chars(tok_str, ps_1byte_ch, ps_2byte_ch, ps_3byte_ch, ps_4byte_ch, ps_res_buf, ps_max_l, ps_l_buf, False)
+                _search_for_chars(tok_str, tok_str_l, ss_1byte_ch, ss_1byte_ch_l, ss_2byte_ch, ss_2byte_ch_l, 
-                hash_val = 0
+                    ss_3byte_ch, ss_3byte_ch_l, ss_4byte_ch, ss_4byte_ch_l, ss_res_buf, ss_max_l, ss_l_buf, True)
-                for hash_idx in range(s_h_end, ps_h_end):
+                _write_hashes(ss_res_buf, ss_lengths, ss_l_buf, 0, hashes, tok_i, hash_idx)
                    offset = ps_l_buf[ps_lengths[hash_idx - s_h_end] - 1]
                    if offset > 0:
                        hash_val = hash32(ps_res_buf, offset, 0)
                    hashes[tok_i, hash_idx] = hash_val
            if ss_h_num > 0:
                _search_for_chars(tok_str, ss_1byte_ch, ss_2byte_ch, ss_3byte_ch, ss_4byte_ch, ss_res_buf, ss_max_l, ss_l_buf, True)
                hash_val = 0
                for hash_idx in range(ps_h_end, ss_h_end):
                    offset = ss_l_buf[ss_lengths[hash_idx - ps_h_end] - 1]
                    if offset > 0:
                        hash_val = hash32(ss_res_buf, offset, 0)
                    hashes[tok_i, hash_idx] = hash_val
        return hashes
@ -2031,59 +2023,81 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
                lca_matrix[k, j] = lca - start
    return lca_matrix
-@cython.boundscheck(False)  # Deactivate bounds checking
+#@cython.boundscheck(False)  # Deactivate bounds checking
-cdef void _set_affix_lengths(
+cdef void _set_prefix_lengths(
    const unsigned char* tok_str,
-    unsigned char* aff_l_buf, 
+    const int tok_str_l,
-    const int pref_l, 
+    unsigned char* pref_l_buf,
-    const int suff_l,
+    const int p_max_l, 
 ) nogil:
-    """ Populate *aff_l_buf*, which has length *pref_l+suff_l* with the byte lengths of the first *pref_l* and the last 
+    """ Populate *pref_l_buf*, which has length *p_max_l*, with the byte lengths of the first *p_max_l* characters within *tok_str*. 
-        *suff_l* characters within *tok_str*. Lengths that are greater than the character length of the whole word are 
+        Lengths that are greater than the character length of the whole word are populated with the byte length of the whole word.
        populated with the byte length of the whole word.
-        tok_str: a memoryview of a UTF-8 representation of a string.
+        tok_str: a UTF-8 representation of a string.
-        aff_l_buf: a buffer of length *pref_l+suff_l* in which to store the lengths. The calling code ensures that lengths
+        tok_str_l: the length of *tok_str*.
        pref_l_buf: a buffer of length *p_max_l* in which to store the lengths. The calling code ensures that lengths
            greater than 255 cannot occur.
-        pref_l: the number of characters to process at the beginning of the word.
+        p_max_l: the number of characters to process at the beginning of the word.
        suff_l: the number of characters to process at the end of the word.
    """
-    cdef int tok_str_idx = 1, aff_l_buf_idx = 0, tok_str_l = strlen(<char*> tok_str)
+    cdef int tok_str_idx = 1, pref_l_buf_idx = 0
-    while aff_l_buf_idx < pref_l:
+    while pref_l_buf_idx < p_max_l:
-        if (tok_str_idx == strlen(<char*> tok_str) 
+        if (tok_str[tok_str_idx] == 0 # end of string 
            or 
-            ((tok_str[tok_str_idx] & 0xc0) != 0x80  # not a continuation character
+            ((tok_str[tok_str_idx] & 0xc0) != 0x80)  # not a continuation character
        ):
-            aff_l_buf[aff_l_buf_idx] = tok_str_idx
+            pref_l_buf[pref_l_buf_idx] = tok_str_idx
-            aff_l_buf_idx += 1
+            pref_l_buf_idx += 1
-        tok_str_idx += 1
+        if tok_str[tok_str_idx] == 0: # end of string
        if tok_str_idx > tok_str_l:
            break
        tok_str_idx += 1
-    if aff_l_buf_idx < pref_l:
+    if pref_l_buf_idx < p_max_l:
-        memset(aff_l_buf + aff_l_buf_idx, aff_l_buf[aff_l_buf_idx - 1], pref_l - aff_l_buf_idx)
+        memset(pref_l_buf + pref_l_buf_idx, pref_l_buf[pref_l_buf_idx - 1], p_max_l - pref_l_buf_idx)
        aff_l_buf_idx = pref_l
-    tok_str_idx = tok_str_l - 1
+
-    while aff_l_buf_idx < pref_l + suff_l:
+#@cython.boundscheck(False)  # Deactivate bounds checking
 cdef void _set_suffix_lengths(
    const unsigned char* tok_str,
    const int tok_str_l,
    unsigned char* suff_l_buf,
    const int s_max_l, 
 ):
    """ Populate *suff_l_buf*, which has length *s_max_l*, with the byte lengths of the last *s_max_l* characters within
        *tok_str*. Entry *i* receives the byte length of the suffix made up of the last *i + 1* characters.
        Lengths that are greater than the character length of the whole word are populated with the byte length of the whole word.
        tok_str: a UTF-8 representation of a string.
        tok_str_l: the length of *tok_str* in bytes.
        suff_l_buf: a buffer of length *s_max_l* in which to store the lengths. The calling code ensures that lengths
            greater than 255 cannot occur.
        s_max_l: the number of characters to process at the end of the word.
    """
    # NOTE(review): when tok_str_l is 0 the first read below is tok_str[-1] and the final memset reads
    # suff_l_buf[-1]; presumably callers never pass an empty token -- confirm.
    cdef int tok_str_idx = tok_str_l - 1, suff_l_buf_idx = 0
    # Walk backwards from the last byte; every byte that starts a character closes one more suffix.
    while suff_l_buf_idx < s_max_l:
        if (tok_str[tok_str_idx] & 0xc0) != 0x80: # not a continuation character
-            aff_l_buf[aff_l_buf_idx] = tok_str_l - tok_str_idx
+            suff_l_buf[suff_l_buf_idx] = tok_str_l - tok_str_idx
-            aff_l_buf_idx += 1
+            suff_l_buf_idx += 1
        tok_str_idx -= 1
        if tok_str_idx < 0:
            break
    # The word ran out before *s_max_l* characters were seen: repeat the last recorded length in the remaining slots.
-    if aff_l_buf_idx < pref_l + suff_l:
+    if suff_l_buf_idx < s_max_l:
-        memset(aff_l_buf + aff_l_buf_idx, aff_l_buf[aff_l_buf_idx - 1], pref_l + suff_l - aff_l_buf_idx)
+        memset(suff_l_buf + suff_l_buf_idx, suff_l_buf[suff_l_buf_idx - 1], s_max_l - suff_l_buf_idx)
-@cython.boundscheck(False)  # Deactivate bounds checking
+
 #@cython.boundscheck(False)  # Deactivate bounds checking
 cdef void _search_for_chars(
    const unsigned char* tok_str,
    const int tok_str_l,
    const unsigned char* s_1byte_ch,
    const int s_1byte_ch_l,
    const unsigned char* s_2byte_ch,
    const int s_2byte_ch_l,
    const unsigned char* s_3byte_ch,
    const int s_3byte_ch_l,
    const unsigned char* s_4byte_ch,
    const int s_4byte_ch_l,
    unsigned char* res_buf,
    int max_res_l,
    unsigned char* l_buf,
@ -2096,6 +2110,7 @@ cdef void _search_for_chars(
        which may be *0* if the search was not successful.
        tok_str: a memoryview of a UTF-8 representation of a string.
        tok_str_l: the length of *tok_str*.
        s_<n>byte_ch: a byte array containing in order n-byte-wide characters to search for.
        res_buf: the buffer in which to place the search results.
        max_res_l: the maximum number of found characters to place in *res_buf*.
@ -2103,7 +2118,7 @@ cdef void _search_for_chars(
            The calling code ensures that lengths greater than 255 cannot occur. 
        suffs_not_prefs: if *True*, searching starts from the end of the word; if *False*, from the beginning.
    """
-    cdef int tok_str_l = strlen(<char*> tok_str), res_buf_idx = 0, l_buf_idx = 0, ch_wdth, tok_start_idx, search_char_idx
+    cdef int res_buf_idx = 0, l_buf_idx = 0, ch_wdth, tok_start_idx, search_char_idx
    cdef int search_chars_l
    cdef const unsigned char* search_chars
@ -2121,13 +2136,16 @@ cdef void _search_for_chars(
                ch_wdth = last_tok_str_idx - this_tok_str_idx
            if ch_wdth == 1:
                search_chars = s_1byte_ch
                search_chars_l = s_1byte_ch_l
            elif ch_wdth == 2:
                search_chars = s_2byte_ch
                search_chars_l = s_2byte_ch_l
            elif ch_wdth == 3:
                search_chars = s_3byte_ch
                search_chars_l = s_3byte_ch_l
            else:
                search_chars = s_4byte_ch
-            search_chars_l = strlen(<char*> search_chars)
+                search_chars_l = s_4byte_ch_l
            tok_start_idx = this_tok_str_idx if suffs_not_prefs else last_tok_str_idx
            search_char_idx = 0
@ -2157,6 +2175,43 @@ cdef void _search_for_chars(
    memset(l_buf + l_buf_idx, res_buf_idx, max_res_l - l_buf_idx)
 cdef int _write_hashes(
    const unsigned char* res_buf,
    const unsigned char* aff_l_buf,
    const unsigned char* offset_buf,
    const int end_idx,
    np.ndarray[np.int64_t, ndim=2] hashes,
    const int tok_i,
    const int start_hash_idx,
 ):
    """ Write hashes for a token/rich property group combination.

    Entries of *aff_l_buf* are consumed in order until a zero entry is reached, so the buffer
    must be zero-terminated. One hash is written to *hashes* for each non-zero entry.

    res_buf: the string from which to generate the hash values.
    aff_l_buf: zero-terminated one-byte lengths describing how many characters to hash.
    offset_buf: one-byte values, indexed by *(length - 1)*, giving the number of bytes of *res_buf*
        that make up an affix of that character length.
    end_idx: if not *0*, the offset within *res_buf* that should end each affix being hashed;
        if *0*, affixes start at the beginning of *res_buf* rather than ending at the end.
    hashes: the 2D Numpy array in which the hashes are stored.
    tok_i: the index of axis 0 of *hashes* to write to.
    start_hash_idx: the index of axis 1 of *hashes* at which to start writing.

    Returns the index of axis 1 of *hashes* immediately after the last hash written, i.e. the
    position at which the next property group should start writing.
    """
    cdef int offset, aff_l, hash_val = 0, hash_idx = start_hash_idx
    while True:
        aff_l = aff_l_buf[hash_idx - start_hash_idx]
        if aff_l == 0:
            # A zero length marks the end of the list of affix lengths.
            return hash_idx
        offset = offset_buf[aff_l - 1]
        if offset > 0:
            if end_idx != 0:
                # The cast binds tighter than '+', so the pointer arithmetic is parenthesised to
                # take place on the byte pointer rather than on a void pointer, where it is
                # only available as a compiler extension.
                hash_val = hash32(<void*> (res_buf + end_idx - offset), offset, 0)
            else:
                hash_val = hash32(<void*> res_buf, offset, 0)
        # When *offset* is 0, the value from the previous iteration (initially 0) is written.
        hashes[tok_i, hash_idx] = hash_val
        hash_idx += 1
 def pickle_doc(doc):
    bytes_data = doc.to_bytes(exclude=["vocab", "user_data", "user_hooks"])
    hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks,