diff --git a/spacy/ml/richfeatureextractor.py b/spacy/ml/richfeatureextractor.py
index c7766340a..50ccab6c2 100644
--- a/spacy/ml/richfeatureextractor.py
+++ b/spacy/ml/richfeatureextractor.py
@@ -69,7 +69,7 @@ def forward(
     features: List[Ints2d] = []
     for doc in docs:
         hashes = doc.get_character_combination_hashes(
-            cs=case_sensitive,
+            case_sensitive=case_sensitive,
             p_lengths=p_lengths,
             s_lengths=s_lengths,
             ps_search_chars=ps_search_chars,
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 9983f6b1f..45c97f90f 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -316,7 +316,6 @@ cdef class StringStore:
             self.keys.push_back(key)
         return value
 
-    @cython.boundscheck(False) # Deactivate bounds checking
     cdef (const unsigned char*, int) utf8_ptr(self, const attr_t hash_val):
         # Returns a pointer to the UTF-8 string together with its length in bytes.
         # This method presumes the calling code has already checked that *hash_val*
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index a0aac6d54..ac4ddcb01 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -4,7 +4,6 @@ import weakref
 import numpy
 from time import time
 from numpy.testing import assert_array_equal
-from murmurhash.mrmr import hash
 import pytest
 import warnings
 from thinc.api import NumpyOps, get_current_ops
@@ -1017,10 +1016,10 @@ def _get_fnv1a_hash(input: bytes) -> int:
 def test_fnv1a_hash():
     """Checks the conformity of the 64-bit FNV1A implementation with
     http://www.isthe.com/chongo/src/fnv/test_fnv.c.
-    The method called here, _get_fnv1a_hash(), is only used in testing; 
+    The method called here, _get_fnv1a_hash(), is only used in testing;
     in production code, the hashing is performed in a fashion that is interweaved 
-    with other logic. The conformity of the production code is demonstrated by the 
-    character combination hash tests, where hashes produced by the production code 
+    with other logic. The conformity of the production code is demonstrated by the
+    character combination hash tests, where hashes produced by the production code
     are tested for equality against hashes produced by _get_fnv1a_hash(). 
     """
     INPUTS = [
@@ -1458,7 +1457,7 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
         "xx✨rp", case_sensitive
     )
     hashes = doc.get_character_combination_hashes(
-        cs=case_sensitive,
+        case_sensitive=case_sensitive,
         p_lengths=bytes(
             (
                 1,
@@ -1539,7 +1538,7 @@ def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
     doc = en_tokenizer("spaCy✨ and Prodigy")
     ps_search_chars, ps_width_offsets = get_search_char_byte_arrays("rp", False)
     hashes = doc.get_character_combination_hashes(
-        cs=False,
+        case_sensitive=False,
         p_lengths=bytes(),
         s_lengths=bytes(
             (
@@ -1586,7 +1585,7 @@ def test_get_character_combination_hashes_various_lengths(en_tokenizer):
 
         for s_length in range(1, 8):
             hashes = doc.get_character_combination_hashes(
-                cs=False,
+                case_sensitive=False,
                 p_lengths=bytes((p_length,)),
                 s_lengths=bytes((s_length,)),
                 ps_search_chars=bytes(),
@@ -1608,7 +1607,7 @@ def test_get_character_combination_hashes_turkish_i_with_dot(
     doc = en_tokenizer("İ".lower() + "İ")
     search_chars, width_offsets = get_search_char_byte_arrays("İ", case_sensitive)
     hashes = doc.get_character_combination_hashes(
-        cs=case_sensitive,
+        case_sensitive=case_sensitive,
         p_lengths=bytes(
             (
                 1,
@@ -1696,7 +1695,7 @@ def test_get_character_combination_hashes_string_store_spec_cases(
     assert len(doc) == 4
     ps_search_chars, ps_width_offsets = get_search_char_byte_arrays("E", case_sensitive)
     hashes = doc.get_character_combination_hashes(
-        cs=case_sensitive,
+        case_sensitive=case_sensitive,
         p_lengths=bytes((2,)),
         s_lengths=bytes((2,)),
         ps_search_chars=ps_search_chars,
@@ -1726,7 +1725,7 @@ def test_character_combination_hashes_empty_lengths(en_tokenizer):
 
     doc = en_tokenizer("and𐌞")
     assert doc.get_character_combination_hashes(
-        cs=True,
+        case_sensitive=True,
         p_lengths=bytes(),
         s_lengths=bytes(),
         ps_search_chars=bytes(),
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index e7a2cd816..91c6b5479 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -177,7 +177,7 @@ class Doc:
     def get_character_combination_hashes(
         self,
         *,
-        cs: bool,
+        case_sensitive: bool,
         p_lengths: bytes,
         s_lengths: bytes,
         ps_search_chars: bytes,
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 9cca34a6b..94a4a320c 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -41,6 +41,7 @@ from ._serialize import ALL_ATTRS as DOCBIN_ALL_ATTRS
 from ..util import get_words_and_spaces
 
 DEF PADDING = 5
+MAX_UTF8_CHAR_BYTE_WIDTH = 4
 
 cdef int bounds_check(int i, int length, int padding) except -1:
     if (i + padding) < 0:
@@ -1743,10 +1744,9 @@ cdef class Doc:
                 j += 1
         return output
 
-    @cython.boundscheck(False) # Deactivate bounds checking
     def get_character_combination_hashes(self,
         *,
-        const bint cs,
+        const bint case_sensitive,
         const unsigned char* p_lengths,
         const unsigned char* s_lengths,
         const unsigned char* ps_search_chars,
@@ -1789,8 +1789,8 @@
         Many of the buffers passed into and used by this method contain single-byte numerical values. This
         takes advantage of the fact that we are hashing short affixes and searching for small groups of
         characters. The calling code is responsible
-        for ensuring that lengths being passed in cannot exceed 63 and hence that resulting values with maximally four-byte
-        character widths can never exceed 255. 
+        for ensuring that lengths being passed in cannot exceed 63 and hence, with maximally four-byte
+        character widths, that individual values within buffers can never exceed the capacity of a single byte (255).
         Note that this method performs no data validation itself as it expects the calling code will already
         have done so, and that the behaviour of the code may be erratic if the supplied parameters do not
         conform to expectations.
@@ -1809,12 +1809,14 @@
 
         # Define / allocate buffers
         cdef Pool mem = Pool()
-        cdef unsigned char* pref_l_buf = <unsigned char*>mem.alloc(p_max_l, 1)
-        cdef unsigned char* suff_l_buf = <unsigned char*>mem.alloc(s_max_l, 1)
-        cdef unsigned char* ps_res_buf = <unsigned char*>mem.alloc(ps_max_l, 4)
-        cdef unsigned char* ps_l_buf = <unsigned char*>mem.alloc(ps_max_l, 1)
-        cdef unsigned char* ss_res_buf = <unsigned char*>mem.alloc(ss_max_l, 4)
-        cdef unsigned char* ss_l_buf = <unsigned char*>mem.alloc(ss_max_l, 1)
+        cdef unsigned char* pref_l_buf = <unsigned char*>mem.alloc(p_max_l, sizeof(char))
+        cdef unsigned char* suff_l_buf = <unsigned char*>mem.alloc(s_max_l, sizeof(char))
+        cdef unsigned char* ps_res_buf = <unsigned char*>mem.alloc(ps_max_l,
+            MAX_UTF8_CHAR_BYTE_WIDTH * sizeof(char))
+        cdef unsigned char* ps_l_buf = <unsigned char*>mem.alloc(ps_max_l, sizeof(char))
+        cdef unsigned char* ss_res_buf = <unsigned char*>mem.alloc(ss_max_l,
+            MAX_UTF8_CHAR_BYTE_WIDTH * sizeof(char))
+        cdef unsigned char* ss_l_buf = <unsigned char*>mem.alloc(ss_max_l, sizeof(char))
         cdef int doc_l = self.length
         cdef np.ndarray[np.uint64_t, ndim=2] hashes = numpy.empty(
             (doc_l, hashes_per_tok), dtype="uint64")
@@ -1829,7 +1831,7 @@
 
         for tok_i in range(doc_l):
             tok_c = self.c[tok_i]
-            num_tok_attr = tok_c.lex.orth if cs else tok_c.lex.lower
+            num_tok_attr = tok_c.lex.orth if case_sensitive else tok_c.lex.lower
             if num_tok_attr < len(SYMBOLS_BY_INT):  # hardly ever happens
                 if num_tok_attr == 0:
                     tok_str_bytes = b""
@@ -2042,21 +2044,22 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
     return lca_matrix
 
 
-@cython.boundscheck(False) # Deactivate bounds checking
 cdef void _set_prefix_lengths(
     const unsigned char* tok_str,
     const int tok_str_l,
     const int p_max_l,
     unsigned char* pref_l_buf,
 ) nogil:
-    """ Populate *pref_l_buf*, which has length *pref_l*, with the byte lengths of the first *pref_l* characters within *tok_str*.
-    Lengths that are greater than the character length of the whole word are populated with the byte length of the whole word.
+    """ Populate *pref_l_buf*, which has length *p_max_l*, with the byte lengths of each of the substrings terminated by the first *p_max_l*
+    characters within *tok_str*. Lengths that are greater than the character length of the whole word are populated with the byte length
+    of the whole word.
 
     tok_str: a UTF-8 representation of a string.
     tok_str_l: the length of *tok_str*.
     p_max_l: the number of characters to process at the beginning of the word.
-    pref_l_buf: a buffer of length *p_max_l* in which to store the lengths. The calling code ensures that lengths
-        greater than 255 cannot occur.
+    pref_l_buf: a buffer of length *p_max_l* in which to store the lengths. The code calling *get_character_combination_hashes()* is
+        responsible for ensuring that *p_max_l* cannot exceed 63 and hence, with maximally four-byte character widths, that individual values
+        within the buffer can never exceed the capacity of a single byte (255).
     """
 
     cdef int tok_str_idx = 1, pref_l_buf_idx = 0
@@ -2075,21 +2078,22 @@ cdef void _set_prefix_lengths(
     memset(pref_l_buf + pref_l_buf_idx, pref_l_buf[pref_l_buf_idx - 1], p_max_l - pref_l_buf_idx)
 
 
-@cython.boundscheck(False) # Deactivate bounds checking
 cdef void _set_suffix_lengths(
     const unsigned char* tok_str,
     const int tok_str_l,
     const int s_max_l,
     unsigned char* suff_l_buf,
 ) nogil:
-    """ Populate *suff_l_buf*, which has length *suff_l*, with the byte lengths of the last *suff_l* characters within *tok_str*.
-    Lengths that are greater than the character length of the whole word are populated with the byte length of the whole word.
+    """ Populate *suff_l_buf*, which has length *s_max_l*, with the byte lengths of each of the substrings started by the last *s_max_l*
+    characters within *tok_str*. Lengths that are greater than the character length of the whole word are populated with the byte length
+    of the whole word.
 
     tok_str: a UTF-8 representation of a string.
     tok_str_l: the length of *tok_str*.
     s_max_l: the number of characters to process at the end of the word.
-    suff_l_buf: a buffer of length *s_max_l* in which to store the lengths. The calling code ensures that lengths
-        greater than 255 cannot occur.
+    suff_l_buf: a buffer of length *s_max_l* in which to store the lengths. The code calling *get_character_combination_hashes()* is
+        responsible for ensuring that *s_max_l* cannot exceed 63 and hence, with maximally four-byte character widths, that individual values
+        within the buffer can never exceed the capacity of a single byte (255).
     """
 
     cdef int tok_str_idx = tok_str_l - 1, suff_l_buf_idx = 0
@@ -2105,7 +2109,6 @@ cdef void _set_suffix_lengths(
     memset(suff_l_buf + suff_l_buf_idx, suff_l_buf[suff_l_buf_idx - 1], s_max_l - suff_l_buf_idx)
 
 
-@cython.boundscheck(False) # Deactivate bounds checking
 cdef void _search_for_chars(
     const unsigned char* tok_str,
     const int tok_str_l,