Simple changes based on review comments

richardpaulhudson 2022-12-12 11:10:10 +01:00
parent ec1426700e
commit 79b2843a3f
5 changed files with 36 additions and 35 deletions

View File

@@ -69,7 +69,7 @@ def forward(
features: List[Ints2d] = []
for doc in docs:
hashes = doc.get_character_combination_hashes(
cs=case_sensitive,
case_sensitive=case_sensitive,
p_lengths=p_lengths,
s_lengths=s_lengths,
ps_search_chars=ps_search_chars,

View File

@@ -316,7 +316,6 @@ cdef class StringStore:
self.keys.push_back(key)
return value
@cython.boundscheck(False) # Deactivate bounds checking
cdef (const unsigned char*, int) utf8_ptr(self, const attr_t hash_val):
# Returns a pointer to the UTF-8 string together with its length in bytes.
# This method presumes the calling code has already checked that *hash_val*

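At the Python level, the behaviour described here can be approximated as follows (an illustrative analogue only, not the Cython implementation; it assumes the hash is already known to be present in the store):

from spacy.strings import StringStore

store = StringStore(["spaCy"])
hash_val = store["spaCy"]               # 64-bit hash key of the stored string
utf8 = store[hash_val].encode("utf-8")  # the UTF-8 bytes the pointer refers to
utf8_len = len(utf8)                    # length in bytes, not in characters
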
View File

@@ -4,7 +4,6 @@ import weakref
import numpy
from time import time
from numpy.testing import assert_array_equal
from murmurhash.mrmr import hash
import pytest
import warnings
from thinc.api import NumpyOps, get_current_ops
@@ -1458,7 +1457,7 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
"xx✨rp", case_sensitive
)
hashes = doc.get_character_combination_hashes(
cs=case_sensitive,
case_sensitive=case_sensitive,
p_lengths=bytes(
(
1,
@@ -1539,7 +1538,7 @@ def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
doc = en_tokenizer("spaCy✨ and Prodigy")
ps_search_chars, ps_width_offsets = get_search_char_byte_arrays("rp", False)
hashes = doc.get_character_combination_hashes(
cs=False,
case_sensitive=False,
p_lengths=bytes(),
s_lengths=bytes(
(
@@ -1586,7 +1585,7 @@ def test_get_character_combination_hashes_various_lengths(en_tokenizer):
for s_length in range(1, 8):
hashes = doc.get_character_combination_hashes(
cs=False,
case_sensitive=False,
p_lengths=bytes((p_length,)),
s_lengths=bytes((s_length,)),
ps_search_chars=bytes(),
@@ -1608,7 +1607,7 @@ def test_get_character_combination_hashes_turkish_i_with_dot(
doc = en_tokenizer("İ".lower() + "İ")
search_chars, width_offsets = get_search_char_byte_arrays("İ", case_sensitive)
hashes = doc.get_character_combination_hashes(
cs=case_sensitive,
case_sensitive=case_sensitive,
p_lengths=bytes(
(
1,
@@ -1696,7 +1695,7 @@ def test_get_character_combination_hashes_string_store_spec_cases(
assert len(doc) == 4
ps_search_chars, ps_width_offsets = get_search_char_byte_arrays("E", case_sensitive)
hashes = doc.get_character_combination_hashes(
cs=case_sensitive,
case_sensitive=case_sensitive,
p_lengths=bytes((2,)),
s_lengths=bytes((2,)),
ps_search_chars=ps_search_chars,
@@ -1726,7 +1725,7 @@ def test_get_character_combination_hashes_string_store_spec_cases(
def test_character_combination_hashes_empty_lengths(en_tokenizer):
doc = en_tokenizer("and𐌞")
assert doc.get_character_combination_hashes(
cs=True,
case_sensitive=True,
p_lengths=bytes(),
s_lengths=bytes(),
ps_search_chars=bytes(),

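For reference, the length arguments exercised in these tests are plain bytes objects in which each entry is an affix length in characters; a minimal illustration with arbitrary values (not taken from the test suite):

p_lengths = bytes((1, 3, 4))   # hash the 1-, 3- and 4-character prefixes
s_lengths = bytes((2,))        # hash the 2-character suffixes
assert list(p_lengths) == [1, 3, 4]
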
View File

@@ -177,7 +177,7 @@ class Doc:
def get_character_combination_hashes(
self,
*,
cs: bool,
case_sensitive: bool,
p_lengths: bytes,
s_lengths: bytes,
ps_search_chars: bytes,

View File

@@ -41,6 +41,7 @@ from ._serialize import ALL_ATTRS as DOCBIN_ALL_ATTRS
from ..util import get_words_and_spaces
DEF PADDING = 5
MAX_UTF8_CHAR_BYTE_WIDTH = 4
cdef int bounds_check(int i, int length, int padding) except -1:
if (i + padding) < 0:
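The new MAX_UTF8_CHAR_BYTE_WIDTH constant reflects the fact that UTF-8 encodes any single code point in at most four bytes; a quick illustrative check in plain Python:

assert max(len(char.encode("utf-8")) for char in "aé✨𐌞") == 4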
@@ -1743,10 +1744,9 @@ cdef class Doc:
j += 1
return output
@cython.boundscheck(False) # Deactivate bounds checking
def get_character_combination_hashes(self,
*,
const bint cs,
const bint case_sensitive,
const unsigned char* p_lengths,
const unsigned char* s_lengths,
const unsigned char* ps_search_chars,
@@ -1789,8 +1789,8 @@
Many of the buffers passed into and used by this method contain single-byte numerical values. This takes advantage of
the fact that we are hashing short affixes and searching for small groups of characters. The calling code is responsible
for ensuring that lengths being passed in cannot exceed 63 and hence that resulting values with maximally four-byte
character widths can never exceed 255.
for ensuring that lengths being passed in cannot exceed 63 and hence, with maximally four-byte
character widths, that individual values within buffers can never exceed the capacity of a single byte (255).
Note that this method performs no data validation itself as it expects the calling code will already have done so, and
that the behaviour of the code may be erratic if the supplied parameters do not conform to expectations.
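Spelling out the arithmetic behind that guarantee (an illustrative sketch; MAX_AFFIX_LENGTH is just a label used here, not a constant from the code):

MAX_AFFIX_LENGTH = 63           # upper bound the calling code must enforce
MAX_UTF8_CHAR_BYTE_WIDTH = 4    # widest possible UTF-8 character
assert MAX_AFFIX_LENGTH * MAX_UTF8_CHAR_BYTE_WIDTH == 252  # still fits in one byte (max 255)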
@@ -1809,12 +1809,14 @@
# Define / allocate buffers
cdef Pool mem = Pool()
cdef unsigned char* pref_l_buf = <unsigned char*> mem.alloc(p_max_l, 1)
cdef unsigned char* suff_l_buf = <unsigned char*> mem.alloc(s_max_l, 1)
cdef unsigned char* ps_res_buf = <unsigned char*> mem.alloc(ps_max_l, 4)
cdef unsigned char* ps_l_buf = <unsigned char*> mem.alloc(ps_max_l, 1)
cdef unsigned char* ss_res_buf = <unsigned char*> mem.alloc(ss_max_l, 4)
cdef unsigned char* ss_l_buf = <unsigned char*> mem.alloc(ss_max_l, 1)
cdef unsigned char* pref_l_buf = <unsigned char*> mem.alloc(p_max_l, sizeof(char))
cdef unsigned char* suff_l_buf = <unsigned char*> mem.alloc(s_max_l, sizeof(char))
cdef unsigned char* ps_res_buf = <unsigned char*> mem.alloc(ps_max_l,
MAX_UTF8_CHAR_BYTE_WIDTH * sizeof(char))
cdef unsigned char* ps_l_buf = <unsigned char*> mem.alloc(ps_max_l, sizeof(char))
cdef unsigned char* ss_res_buf = <unsigned char*> mem.alloc(ss_max_l,
MAX_UTF8_CHAR_BYTE_WIDTH * sizeof(char))
cdef unsigned char* ss_l_buf = <unsigned char*> mem.alloc(ss_max_l, sizeof(char))
cdef int doc_l = self.length
cdef np.ndarray[np.uint64_t, ndim=2] hashes = numpy.empty(
(doc_l, hashes_per_tok), dtype="uint64")
@@ -1829,7 +1831,7 @@
for tok_i in range(doc_l):
tok_c = self.c[tok_i]
num_tok_attr = tok_c.lex.orth if cs else tok_c.lex.lower
num_tok_attr = tok_c.lex.orth if case_sensitive else tok_c.lex.lower
if num_tok_attr < len(SYMBOLS_BY_INT): # hardly ever happens
if num_tok_attr == 0:
tok_str_bytes = b""
@@ -2042,21 +2044,22 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
return lca_matrix
@cython.boundscheck(False) # Deactivate bounds checking
cdef void _set_prefix_lengths(
const unsigned char* tok_str,
const int tok_str_l,
const int p_max_l,
unsigned char* pref_l_buf,
) nogil:
""" Populate *pref_l_buf*, which has length *pref_l*, with the byte lengths of the first *pref_l* characters within *tok_str*.
Lengths that are greater than the character length of the whole word are populated with the byte length of the whole word.
""" Populate *pref_l_buf*, which has length *p_max_l*, with the byte lengths of each of the substrings terminated by the first *p_max_l*
characters within *tok_str*. Lengths that are greater than the character length of the whole word are populated with the byte length
of the whole word.
tok_str: a UTF-8 representation of a string.
tok_str_l: the length of *tok_str*.
p_max_l: the number of characters to process at the beginning of the word.
pref_l_buf: a buffer of length *p_max_l* in which to store the lengths. The calling code ensures that lengths
greater than 255 cannot occur.
pref_l_buf: a buffer of length *p_max_l* in which to store the lengths. The code calling *get_character_combination_hashes()* is
responsible for ensuring that *p_max_l* cannot exceed 63 and hence, with maximally four-byte character widths, that individual values
within the buffer can never exceed the capacity of a single byte (255).
"""
cdef int tok_str_idx = 1, pref_l_buf_idx = 0
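A rough Python rendering of what _set_prefix_lengths computes for a single token, following the docstring above (prefix_byte_lengths is a hypothetical helper used only for illustration):

def prefix_byte_lengths(word: str, p_max_l: int) -> bytes:
    # Byte length of each substring ending at one of the first p_max_l characters.
    lengths = [len(word[:i + 1].encode("utf-8")) for i in range(min(p_max_l, len(word)))]
    # Positions past the end of the word repeat the byte length of the whole word.
    lengths += [len(word.encode("utf-8"))] * (p_max_l - len(lengths))
    return bytes(lengths)

assert prefix_byte_lengths("spaCy✨", 4) == bytes((1, 2, 3, 4))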
@@ -2075,21 +2078,22 @@ cdef void _set_prefix_lengths(
memset(pref_l_buf + pref_l_buf_idx, pref_l_buf[pref_l_buf_idx - 1], p_max_l - pref_l_buf_idx)
@cython.boundscheck(False) # Deactivate bounds checking
cdef void _set_suffix_lengths(
const unsigned char* tok_str,
const int tok_str_l,
const int s_max_l,
unsigned char* suff_l_buf,
) nogil:
""" Populate *suff_l_buf*, which has length *suff_l*, with the byte lengths of the last *suff_l* characters within *tok_str*.
Lengths that are greater than the character length of the whole word are populated with the byte length of the whole word.
""" Populate *suff_l_buf*, which has length *s_max_l*, with the byte lengths of each of the substrings started by the last *s_max_l*
characters within *tok_str*. Lengths that are greater than the character length of the whole word are populated with the byte length
of the whole word.
tok_str: a UTF-8 representation of a string.
tok_str_l: the length of *tok_str*.
s_max_l: the number of characters to process at the end of the word.
suff_l_buf: a buffer of length *s_max_l* in which to store the lengths. The calling code ensures that lengths
greater than 255 cannot occur.
suff_l_buf: a buffer of length *s_max_l* in which to store the lengths. The code calling *get_character_combination_hashes()* is
responsible for ensuring that *s_max_l* cannot exceed 63 and hence, with maximally four-byte character widths, that individual values
within the buffer can never exceed the capacity of a single byte (255).
"""
cdef int tok_str_idx = tok_str_l - 1, suff_l_buf_idx = 0
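And the mirror-image sketch for _set_suffix_lengths, again with a hypothetical helper name and assumed semantics taken from the docstring above:

def suffix_byte_lengths(word: str, s_max_l: int) -> bytes:
    # Byte length of each substring starting at one of the last s_max_l characters.
    lengths = [len(word[len(word) - (i + 1):].encode("utf-8")) for i in range(min(s_max_l, len(word)))]
    # Positions past the start of the word repeat the byte length of the whole word.
    lengths += [len(word.encode("utf-8"))] * (s_max_l - len(lengths))
    return bytes(lengths)

assert suffix_byte_lengths("spaCy✨", 3) == bytes((3, 4, 5))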
@@ -2105,7 +2109,6 @@ cdef void _set_suffix_lengths(
memset(suff_l_buf + suff_l_buf_idx, suff_l_buf[suff_l_buf_idx - 1], s_max_l - suff_l_buf_idx)
@cython.boundscheck(False) # Deactivate bounds checking
cdef void _search_for_chars(
const unsigned char* tok_str,
const int tok_str_l,