From e7626f423a8642fbeeb4709b4372c66dfb473b0f Mon Sep 17 00:00:00 2001 From: "richard@explosion.ai" Date: Wed, 2 Nov 2022 17:11:20 +0100 Subject: [PATCH] Generate Numpy array at end --- spacy/tests/doc/test_doc_api.py | 291 +++++++++++++++++++------------- spacy/tokens/doc.pxd | 9 +- spacy/tokens/doc.pyi | 30 +--- spacy/tokens/doc.pyx | 59 ++++--- 4 files changed, 211 insertions(+), 178 deletions(-) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 61eab311b..99d0b913e 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -1006,45 +1006,60 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive ss1, ss2, ss3, ss4 = get_search_char_byte_arrays("xx✨rp", case_sensitive) hashes = doc.get_character_combination_hashes( cs=case_sensitive, - p_lengths=bytes((1, 3, 4,)), - p_max_l = 4, - s_lengths=bytes((2, 3, 4, 5,)), - s_max_l = 5, + p_lengths=bytes( + ( + 1, + 3, + 4, + ) + ), + p_max_l=4, + s_lengths=bytes( + ( + 2, + 3, + 4, + 5, + ) + ), + s_max_l=5, ps_1byte_ch=ps1, - ps_1byte_ch_l = len(ps1), + ps_1byte_ch_l=len(ps1), ps_2byte_ch=ps2, - ps_2byte_ch_l = len(ps2), + ps_2byte_ch_l=len(ps2), ps_3byte_ch=ps3, - ps_3byte_ch_l = len(ps3), + ps_3byte_ch_l=len(ps3), ps_4byte_ch=ps4, - ps_4byte_ch_l = len(ps4), + ps_4byte_ch_l=len(ps4), ps_lengths=bytes((2,)), - ps_max_l = 2, + ps_max_l=2, ss_1byte_ch=ss1, - ss_1byte_ch_l = len(ss1), + ss_1byte_ch_l=len(ss1), ss_2byte_ch=ss2, - ss_2byte_ch_l = len(ss2), + ss_2byte_ch_l=len(ss2), ss_3byte_ch=ss3, - ss_3byte_ch_l = len(ss3), + ss_3byte_ch_l=len(ss3), ss_4byte_ch=ss4, - ss_4byte_ch_l = len(ss4), - ss_lengths=bytes((1, 2,)), - ss_max_l = 2, + ss_4byte_ch_l=len(ss4), + ss_lengths=bytes( + ( + 1, + 2, + ) + ), + ss_max_l=2, + hashes_per_tok=10, ) + print(hashes) + assert hashes[0][0] == _get_32_bit_hash("s") assert hashes[0][1] == _get_32_bit_hash("spa") - assert hashes[0][2] == _get_32_bit_hash( - "spaC" if case_sensitive else "spac" - ) + assert hashes[0][2] == _get_32_bit_hash("spaC" if case_sensitive else "spac") assert hashes[0][3] == _get_32_bit_hash("Cy" if case_sensitive else "cy") assert hashes[0][4] == _get_32_bit_hash("aCy" if case_sensitive else "acy") - assert hashes[0][5] == _get_32_bit_hash( - "paCy" if case_sensitive else "pacy" - ) - assert hashes[0][6] == _get_32_bit_hash( - "spaCy" if case_sensitive else "spacy" - ) + assert hashes[0][5] == _get_32_bit_hash("paCy" if case_sensitive else "pacy") + assert hashes[0][6] == _get_32_bit_hash("spaCy" if case_sensitive else "spacy") assert hashes[0][7] == _get_32_bit_hash("p") assert hashes[0][8] == _get_32_bit_hash("p") @@ -1071,9 +1086,7 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive assert hashes[2][9] == 0 assert hashes[3][0] == _get_32_bit_hash("P" if case_sensitive else "p") assert hashes[3][1] == _get_32_bit_hash("Pro" if case_sensitive else "pro") - assert hashes[3][2] == _get_32_bit_hash( - "Prod" if case_sensitive else "prod" - ) + assert hashes[3][2] == _get_32_bit_hash("Prod" if case_sensitive else "prod") assert hashes[3][3] == _get_32_bit_hash("gy") assert hashes[3][4] == _get_32_bit_hash("igy") assert hashes[3][5] == _get_32_bit_hash("digy") @@ -1102,32 +1115,39 @@ def test_get_character_combination_hashes_good_case_partial(en_tokenizer): hashes = doc.get_character_combination_hashes( cs=False, p_lengths=bytes(), - p_max_l = 0, - s_lengths=bytes((2,3,4,5,)), - s_max_l = 5, + p_max_l=0, + s_lengths=bytes( + ( + 2, + 3, + 4, + 5, + ) + ), + s_max_l=5, 
ps_1byte_ch=ps1, - ps_1byte_ch_l = len(ps1), + ps_1byte_ch_l=len(ps1), ps_2byte_ch=ps2, - ps_2byte_ch_l = len(ps2), + ps_2byte_ch_l=len(ps2), ps_3byte_ch=ps3, - ps_3byte_ch_l = len(ps3), + ps_3byte_ch_l=len(ps3), ps_4byte_ch=ps4, - ps_4byte_ch_l = len(ps4), + ps_4byte_ch_l=len(ps4), ps_lengths=bytes((2,)), - ps_max_l = 2, + ps_max_l=2, ss_1byte_ch=bytes(), - ss_1byte_ch_l = 0, + ss_1byte_ch_l=0, ss_2byte_ch=bytes(), - ss_2byte_ch_l = 0, + ss_2byte_ch_l=0, ss_3byte_ch=bytes(), - ss_3byte_ch_l = 0, + ss_3byte_ch_l=0, ss_4byte_ch=bytes(), - ss_4byte_ch_l = 0, + ss_4byte_ch_l=0, ss_lengths=bytes(), - ss_max_l = 0, + ss_max_l=0, + hashes_per_tok=5, ) - - + assert hashes[0][0] == _get_32_bit_hash("cy") assert hashes[0][1] == _get_32_bit_hash("acy") assert hashes[0][2] == _get_32_bit_hash("pacy") @@ -1155,33 +1175,34 @@ def test_get_character_combination_hashes_copying_in_middle(en_tokenizer): for p_length in range(1, 8): for s_length in range(1, 8): - + hashes = doc.get_character_combination_hashes( cs=False, p_lengths=bytes((p_length,)), - p_max_l = p_length, + p_max_l=p_length, s_lengths=bytes((s_length,)), - s_max_l = s_length, + s_max_l=s_length, ps_1byte_ch=bytes(), - ps_1byte_ch_l = 0, + ps_1byte_ch_l=0, ps_2byte_ch=bytes(), - ps_2byte_ch_l = 0, + ps_2byte_ch_l=0, ps_3byte_ch=bytes(), - ps_3byte_ch_l = 0, + ps_3byte_ch_l=0, ps_4byte_ch=bytes(), - ps_4byte_ch_l = 0, + ps_4byte_ch_l=0, ps_lengths=bytes(), - ps_max_l = 0, + ps_max_l=0, ss_1byte_ch=bytes(), - ss_1byte_ch_l = 0, + ss_1byte_ch_l=0, ss_2byte_ch=bytes(), - ss_2byte_ch_l = 0, + ss_2byte_ch_l=0, ss_3byte_ch=bytes(), - ss_3byte_ch_l = 0, + ss_3byte_ch_l=0, ss_4byte_ch=bytes(), - ss_4byte_ch_l = 0, + ss_4byte_ch_l=0, ss_lengths=bytes(), - ss_max_l = 0 + ss_max_l=0, + hashes_per_tok=2, ) assert hashes[0][0] == _get_32_bit_hash("sp𐌞cé"[:p_length]) @@ -1189,35 +1210,66 @@ def test_get_character_combination_hashes_copying_in_middle(en_tokenizer): @pytest.mark.parametrize("case_sensitive", [True, False]) -def test_get_character_combination_hashes_turkish_i_with_dot(en_tokenizer, case_sensitive): +def test_get_character_combination_hashes_turkish_i_with_dot( + en_tokenizer, case_sensitive +): doc = en_tokenizer("İ".lower() + "İ") s1, s2, s3, s4 = get_search_char_byte_arrays("İ", case_sensitive) hashes = doc.get_character_combination_hashes( cs=case_sensitive, - p_lengths=bytes((1,2,3,4,)), - p_max_l = 4, - s_lengths=bytes((1,2,3,4,)), - s_max_l = 4, + p_lengths=bytes( + ( + 1, + 2, + 3, + 4, + ) + ), + p_max_l=4, + s_lengths=bytes( + ( + 1, + 2, + 3, + 4, + ) + ), + s_max_l=4, ps_1byte_ch=s1, - ps_1byte_ch_l = len(s1), + ps_1byte_ch_l=len(s1), ps_2byte_ch=s2, - ps_2byte_ch_l = len(s2), + ps_2byte_ch_l=len(s2), ps_3byte_ch=s3, - ps_3byte_ch_l = len(s3), + ps_3byte_ch_l=len(s3), ps_4byte_ch=s4, - ps_4byte_ch_l = len(s4), - ps_lengths=bytes((1,2,3,4,)), - ps_max_l = 4, + ps_4byte_ch_l=len(s4), + ps_lengths=bytes( + ( + 1, + 2, + 3, + 4, + ) + ), + ps_max_l=4, ss_1byte_ch=s1, - ss_1byte_ch_l = len(s1), + ss_1byte_ch_l=len(s1), ss_2byte_ch=s2, - ss_2byte_ch_l = len(s2), + ss_2byte_ch_l=len(s2), ss_3byte_ch=s3, - ss_3byte_ch_l = len(s3), + ss_3byte_ch_l=len(s3), ss_4byte_ch=s4, - ss_4byte_ch_l = len(s4), - ss_lengths=bytes((1,2,3,4,)), - ss_max_l = 4 + ss_4byte_ch_l=len(s4), + ss_lengths=bytes( + ( + 1, + 2, + 3, + 4, + ) + ), + ss_max_l=4, + hashes_per_tok=16, ) COMBINING_DOT_ABOVE = b"\xcc\x87".decode("UTF-8") @@ -1248,46 +1300,51 @@ def test_get_character_combination_hashes_turkish_i_with_dot(en_tokenizer, case_ assert hashes[0][11] == 
_get_32_bit_hash("İ".lower() * 2) assert hashes[0][12] == _get_32_bit_hash(COMBINING_DOT_ABOVE) assert hashes[0][13] == _get_32_bit_hash(COMBINING_DOT_ABOVE + "i") - assert hashes[0][14] == _get_32_bit_hash(COMBINING_DOT_ABOVE + "i" + COMBINING_DOT_ABOVE) + assert hashes[0][14] == _get_32_bit_hash( + COMBINING_DOT_ABOVE + "i" + COMBINING_DOT_ABOVE + ) assert hashes[0][15] == _get_32_bit_hash((COMBINING_DOT_ABOVE + "i") * 2) - + @pytest.mark.parametrize("case_sensitive", [True, False]) -def test_get_character_combination_hashes_string_store_spec_cases(en_tokenizer, case_sensitive): +def test_get_character_combination_hashes_string_store_spec_cases( + en_tokenizer, case_sensitive +): symbol = "FLAG19" short_word = "bee" normal_word = "serendipity" long_word = "serendipity" * 50 assert len(long_word) > 255 - doc = en_tokenizer(' '.join((symbol, short_word, normal_word, long_word))) + doc = en_tokenizer(" ".join((symbol, short_word, normal_word, long_word))) assert len(doc) == 4 ps1, ps2, ps3, ps4 = get_search_char_byte_arrays("E", case_sensitive) hashes = doc.get_character_combination_hashes( cs=case_sensitive, p_lengths=bytes((2,)), - p_max_l = 2, + p_max_l=2, s_lengths=bytes((2,)), - s_max_l = 2, + s_max_l=2, ps_1byte_ch=ps1, - ps_1byte_ch_l = len(ps1), + ps_1byte_ch_l=len(ps1), ps_2byte_ch=ps2, - ps_2byte_ch_l = len(ps2), + ps_2byte_ch_l=len(ps2), ps_3byte_ch=ps3, - ps_3byte_ch_l = len(ps3), + ps_3byte_ch_l=len(ps3), ps_4byte_ch=ps4, - ps_4byte_ch_l = len(ps4), + ps_4byte_ch_l=len(ps4), ps_lengths=bytes((2,)), - ps_max_l = 2, + ps_max_l=2, ss_1byte_ch=bytes(), - ss_1byte_ch_l = 0, + ss_1byte_ch_l=0, ss_2byte_ch=bytes(), - ss_2byte_ch_l = 0, + ss_2byte_ch_l=0, ss_3byte_ch=bytes(), - ss_3byte_ch_l = 0, + ss_3byte_ch_l=0, ss_4byte_ch=bytes(), - ss_4byte_ch_l = 0, + ss_4byte_ch_l=0, ss_lengths=bytes(), - ss_max_l = 0 + ss_max_l=0, + hashes_per_tok=3, ) assert hashes[0][0] == _get_32_bit_hash("FL" if case_sensitive else "fl") assert hashes[0][1] == _get_32_bit_hash("19") @@ -1308,30 +1365,34 @@ def test_get_character_combination_hashes_string_store_spec_cases(en_tokenizer, def test_character_combination_hashes_empty_lengths(en_tokenizer): doc = en_tokenizer("and𐌞") - assert doc.get_character_combination_hashes( - cs=True, - p_lengths=bytes(), - p_max_l = 0, - s_lengths=bytes(), - s_max_l = 0, - ps_1byte_ch=bytes(), - ps_1byte_ch_l=0, - ps_2byte_ch=bytes(), - ps_2byte_ch_l=0, - ps_3byte_ch=bytes(), - ps_3byte_ch_l=0, - ps_4byte_ch=bytes(), - ps_4byte_ch_l=0, - ps_lengths=bytes(), - ps_max_l = 0, - ss_1byte_ch=bytes(), - ss_1byte_ch_l=0, - ss_2byte_ch=bytes(), - ss_2byte_ch_l=0, - ss_3byte_ch=bytes(), - ss_3byte_ch_l=0, - ss_4byte_ch=bytes(), - ss_4byte_ch_l=0, - ss_lengths=bytes(), - ss_max_l = 0, - ).shape == (1, 0) + assert ( + doc.get_character_combination_hashes( + cs=True, + p_lengths=bytes(), + p_max_l=0, + s_lengths=bytes(), + s_max_l=0, + ps_1byte_ch=bytes(), + ps_1byte_ch_l=0, + ps_2byte_ch=bytes(), + ps_2byte_ch_l=0, + ps_3byte_ch=bytes(), + ps_3byte_ch_l=0, + ps_4byte_ch=bytes(), + ps_4byte_ch_l=0, + ps_lengths=bytes(), + ps_max_l=0, + ss_1byte_ch=bytes(), + ss_1byte_ch_l=0, + ss_2byte_ch=bytes(), + ss_2byte_ch_l=0, + ss_3byte_ch=bytes(), + ss_3byte_ch_l=0, + ss_4byte_ch=bytes(), + ss_4byte_ch_l=0, + ss_lengths=bytes(), + ss_max_l=0, + hashes_per_tok=0, + ).shape + == (1, 0) + ) diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 8888939df..41d150bb0 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -51,7 +51,7 @@ cdef void _set_suffix_lengths( const int 
tok_str_l, unsigned char* suff_l_buf, const int s_max_l, -) +) nogil cdef void _search_for_chars( @@ -72,16 +72,13 @@ cdef void _search_for_chars( ) nogil - cdef int _write_hashes( const unsigned char* res_buf, const unsigned char* aff_l_buf, const unsigned char* offset_buf, const int end_idx, - np.ndarray[np.int64_t, ndim=2] hashes, - const int tok_i, - const int start_hash_idx, -) + np.int64_t* hashes_ptr, +) nogil cdef class Doc: diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index b27c68386..231b9b84d 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -202,35 +202,7 @@ class Doc: ss_4_byte_ch_l: int, ss_lengths: bytes, ss_max_l: int, + hashes_per_tok: int, ) -> Ints2d: ... @staticmethod def _get_array_attrs() -> Tuple[Any]: ... - -def get_character_combination_hashes(self, - *, - const bint cs, - const unsigned char* p_lengths, - const int p_max_l, - const unsigned char* s_lengths, - const int s_max_l, - const unsigned char* ps_1byte_ch, - const int ps_1_byte_ch_l, - const unsigned char* ps_2byte_ch, - const int ps_2_byte_ch_l, - const unsigned char* ps_3byte_ch, - const int ps_3_byte_ch_l, - const unsigned char* ps_4byte_ch, - const int ps_4_byte_ch_l, - const unsigned char* ps_lengths, - const int ps_max_l, - const unsigned char* ss_1byte_ch, - const int ss_1_byte_ch_l, - const unsigned char* ss_2byte_ch, - const int ss_2_byte_ch_l, - const unsigned char* ss_3byte_ch, - const int ss_3_byte_ch_l, - const unsigned char* ss_4byte_ch, - const int ss_4_byte_ch_l, - const unsigned char* ss_lengths, - const int ss_max_l, - ) \ No newline at end of file diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 91836e15e..5c751d5a5 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1735,7 +1735,7 @@ cdef class Doc: j += 1 return output - #@cython.boundscheck(False) # Deactivate bounds checking + @cython.boundscheck(False) # Deactivate bounds checking def get_character_combination_hashes(self, *, const bint cs, @@ -1763,6 +1763,7 @@ cdef class Doc: const int ss_4byte_ch_l, const unsigned char* ss_lengths, const int ss_max_l, + const int hashes_per_tok ): """ Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations @@ -1796,11 +1797,9 @@ cdef class Doc: in ascending order. For example, if *ss_lengths==[1, 2]*, *ss_search=="aC" and *cs==False*, the searched strings hashed for "spaCy" would be "c" and "ca". ss_max_l: the value of *ss_lengths[-1]*, or *0* if *ss_lengths==None*. Passed in for speed. + hashes_per_tok: the total number of hashes produced for each token. Passed in for speed. 
""" - cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty( - (self.length, p_max_l + s_max_l + ps_max_l + ss_max_l), dtype="int64") - # Define / allocate buffers cdef Pool mem = Pool() cdef unsigned char* pref_l_buf = mem.alloc(p_max_l, 1) @@ -1809,40 +1808,47 @@ cdef class Doc: cdef unsigned char* ps_l_buf = mem.alloc(ps_max_l, 1) cdef unsigned char* ss_res_buf = mem.alloc(ss_max_l, 4) cdef unsigned char* ss_l_buf = mem.alloc(ss_max_l, 1) - + cdef int doc_l = self.length, total_hashes = doc_l * hashes_per_tok + cdef np.int64_t* hashes_ptr = mem.alloc( + total_hashes, sizeof(np.int64_t)) + # Define working variables cdef TokenC tok_c cdef int hash_idx, tok_i, tok_str_l cdef attr_t num_tok_attr cdef const unsigned char* tok_str - - for tok_i in range(self.length): + cdef np.int64_t* w_hashes_ptr = hashes_ptr + + for tok_i in range(doc_l): tok_c = self.c[tok_i] num_tok_attr = tok_c.lex.orth if cs else tok_c.lex.lower tok_str = self.vocab.strings.utf8_ptr(num_tok_attr) tok_str_l = strlen( tok_str) - hash_idx = 0 if p_max_l > 0: _set_prefix_lengths(tok_str, tok_str_l, pref_l_buf, p_max_l) - hash_idx = _write_hashes(tok_str, p_lengths, pref_l_buf, 0, hashes, tok_i, 0) + w_hashes_ptr += _write_hashes(tok_str, p_lengths, pref_l_buf, 0, w_hashes_ptr) if s_max_l > 0: _set_suffix_lengths(tok_str, tok_str_l, suff_l_buf, s_max_l) - hash_idx = _write_hashes(tok_str, s_lengths, suff_l_buf, tok_str_l, hashes, tok_i, hash_idx) + w_hashes_ptr += _write_hashes(tok_str, s_lengths, suff_l_buf, tok_str_l, w_hashes_ptr) if ps_max_l > 0: _search_for_chars(tok_str, tok_str_l, ps_1byte_ch, ps_1byte_ch_l, ps_2byte_ch, ps_2byte_ch_l, ps_3byte_ch, ps_3byte_ch_l, ps_4byte_ch, ps_4byte_ch_l, ps_res_buf, ps_max_l, ps_l_buf, False) - hash_idx = _write_hashes(ps_res_buf, ps_lengths, ps_l_buf, 0, hashes, tok_i, hash_idx) + w_hashes_ptr += _write_hashes(ps_res_buf, ps_lengths, ps_l_buf, 0, w_hashes_ptr) if ss_max_l > 0: _search_for_chars(tok_str, tok_str_l, ss_1byte_ch, ss_1byte_ch_l, ss_2byte_ch, ss_2byte_ch_l, ss_3byte_ch, ss_3byte_ch_l, ss_4byte_ch, ss_4byte_ch_l, ss_res_buf, ss_max_l, ss_l_buf, True) - _write_hashes(ss_res_buf, ss_lengths, ss_l_buf, 0, hashes, tok_i, hash_idx) - + w_hashes_ptr += _write_hashes(ss_res_buf, ss_lengths, ss_l_buf, 0, w_hashes_ptr) + + cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty( + (doc_l, hashes_per_tok), dtype="int64") + memcpy(hashes.data, hashes_ptr, total_hashes * sizeof(np.int64_t)) return hashes + @staticmethod def _get_array_attrs(): attrs = [LENGTH, SPACY] @@ -2023,7 +2029,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end): lca_matrix[k, j] = lca - start return lca_matrix -#@cython.boundscheck(False) # Deactivate bounds checking +@cython.boundscheck(False) # Deactivate bounds checking cdef void _set_prefix_lengths( const unsigned char* tok_str, const int tok_str_l, @@ -2056,13 +2062,13 @@ cdef void _set_prefix_lengths( memset(pref_l_buf + pref_l_buf_idx, pref_l_buf[pref_l_buf_idx - 1], p_max_l - pref_l_buf_idx) -#@cython.boundscheck(False) # Deactivate bounds checking +@cython.boundscheck(False) # Deactivate bounds checking cdef void _set_suffix_lengths( const unsigned char* tok_str, const int tok_str_l, unsigned char* suff_l_buf, const int s_max_l, -): +) nogil: """ Populate *suff_l_buf*, which has length *suff_l*, with the byte lengths of the last *suff_l* characters within *tok_str*. Lengths that are greater than the character length of the whole word are populated with the byte length of the whole word. 
@@ -2086,7 +2092,7 @@ cdef void _set_suffix_lengths( memset(suff_l_buf + suff_l_buf_idx, suff_l_buf[suff_l_buf_idx - 1], s_max_l - suff_l_buf_idx) -#@cython.boundscheck(False) # Deactivate bounds checking +@cython.boundscheck(False) # Deactivate bounds checking cdef void _search_for_chars( const unsigned char* tok_str, const int tok_str_l, @@ -2175,15 +2181,14 @@ cdef void _search_for_chars( memset(l_buf + l_buf_idx, res_buf_idx, max_res_l - l_buf_idx) +@cython.boundscheck(False) # Deactivate bounds checking cdef int _write_hashes( const unsigned char* res_buf, const unsigned char* aff_l_buf, const unsigned char* offset_buf, const int end_idx, - np.ndarray[np.int64_t, ndim=2] hashes, - const int tok_i, - const int start_hash_idx, -): + np.int64_t* hashes_ptr, +) nogil: """ Write hashes for a token/rich property group combination. res_buf: the string from which to generate the hash values. @@ -2191,24 +2196,22 @@ cdef int _write_hashes( offset_buf: one-byte lengths specifying the byte offset of each character within *res_buf*. end_idx: if not *0*, the offset within *res_buf* that should end each affix being hashed; if *0*, affixes start at the beginning of *res_buf* rather than ending at the end. - hashes: the 2D Numpy array in which the hashes are stored. - tok_i: the index of axis 0 of *hashes* to write to. - start_hash_idx: the index of axis 1 of *hashes* at which to start writing. + hashes_ptr: a pointer starting from which the new hashes should be written. """ - cdef int offset, aff_l, hash_val = 0, hash_idx = start_hash_idx + cdef int offset, aff_l, hash_val = 0, hash_idx = 0 while True: - aff_l = aff_l_buf[hash_idx - start_hash_idx] + aff_l = aff_l_buf[hash_idx] if aff_l == 0: return hash_idx offset = offset_buf[aff_l - 1] if offset > 0: if end_idx != 0: - hash_val = hash32( res_buf + end_idx - offset, offset, 0) + hash_val = hash32( (res_buf + end_idx - offset), offset, 0) else: hash_val = hash32( res_buf, offset, 0) - hashes[tok_i, hash_idx] = hash_val + hashes_ptr[hash_idx] = hash_val hash_idx += 1
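
Note on the new hashes_per_tok argument (editorial sketch, not part of the patch): callers are expected to pass the total number of hashes requested per token, i.e. the combined number of entries in p_lengths, s_lengths, ps_lengths and ss_lengths, so that the Cython code can size its flat result buffer up front. A minimal caller-side illustration in Python, mirroring the first test above; any call details not shown in the tests are assumptions, and the commented-out call is only a placeholder for the full argument list used there:

    # Affix length specifications, as in test_get_character_combination_hashes_good_case
    p_lengths = bytes((1, 3, 4))      # prefixes of 1, 3 and 4 characters
    s_lengths = bytes((2, 3, 4, 5))   # suffixes of 2, 3, 4 and 5 characters
    ps_lengths = bytes((2,))          # prefix search-character result lengths
    ss_lengths = bytes((1, 2))        # suffix search-character result lengths

    # hashes_per_tok is simply the total number of requested lengths:
    # 3 + 4 + 1 + 2 == 10, matching the value passed in that test.
    hashes_per_tok = (
        len(p_lengths) + len(s_lengths) + len(ps_lengths) + len(ss_lengths)
    )
    assert hashes_per_tok == 10

    # hashes = doc.get_character_combination_hashes(
    #     ...,  # cs flag, length bytes, search-character byte arrays and max lengths as in the tests
    #     hashes_per_tok=hashes_per_tok,
    # )
    # The returned array then has shape (len(doc), hashes_per_tok); with no lengths
    # requested at all the shape is (len(doc), 0), as the empty-lengths test checks.

This is also why the argument exists at all: the patch changes get_character_combination_hashes to write each token's hashes sequentially into a flat C buffer (advancing w_hashes_ptr by the count returned from each _write_hashes call, which can now run nogil) and only materialises the 2-D NumPy array at the very end with a single memcpy, so the per-token hash count has to be known before the loop starts.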