Intermediate state

richardpaulhudson 2022-10-13 20:50:25 +02:00
parent fc99b97e3c
commit 1e9176f9c5
2 changed files with 101 additions and 9 deletions

View File

@@ -33,6 +33,17 @@ cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
cdef int [:,:] _get_lca_matrix(Doc, int start, int end)
cdef void _populate_affix_buf(
const void* str_data_ptr,
const unsigned int unicode_byte_width,
const int word_idx,
const int word_len,
Py_UCS4* affix_buf,
const int pref_length,
const int suff_length,
const bint to_lower
)
cdef const unsigned char[:] _get_utf16_memoryview(str unicode_string, const bint check_2_bytes)

View File

@@ -42,6 +42,11 @@ from ..util import get_words_and_spaces
DEF PADDING = 5
cdef extern from *:
Py_UCS4 PyUnicode_READ(int kind, void *data, int index)
void* PyUnicode_DATA(void* o)
int PyUnicode_KIND(void *data)
Py_UCS4 Py_UNICODE_TOLOWER(Py_UCS4 ch)
cdef int bounds_check(int i, int length, int padding) except -1:
    if (i + padding) < 0:
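The extern block added in this hunk pulls in CPython's PEP 393 string APIs: PyUnicode_KIND gives the per-character byte width of a str's canonical storage, PyUnicode_DATA exposes the raw character array, and PyUnicode_READ(kind, data, i) is the C-level equivalent of ord(s[i]). A rough pure-Python model of the kind/width logic (the helper name is hypothetical, not part of the commit):

def pep393_kind_width(s: str) -> int:
    # Bytes per character under PEP 393 canonical storage: 1 for pure
    # Latin-1 text, 2 for other BMP text, 4 when astral code points occur.
    highest = max(map(ord, s), default=0)
    return 1 if highest < 0x100 else 2 if highest < 0x10000 else 4

print(pep393_kind_width("spaCy"))   # 1
print(pep393_kind_width("héllo⚡"))  # 2
print(pep393_kind_width("😀"))       # 4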
@@ -1751,7 +1756,7 @@ cdef class Doc:
Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations
derived from the string (text/orth) of each token.
case_sensitive: if *False*, the lower-case version of each token string is used as the basis for generating hashes. Note that
if *case_sensitive==False*, upper-case characters in *search_chars* will not be found in token strings.
pref_lengths: an integer list specifying the lengths of prefixes to be hashed. For example, if *pref_lengths==[2, 3]*,
the prefixes hashed for "spaCy" would be "sp" and "spa".
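As a quick pure-Python illustration of the *pref_lengths* semantics just described (Python's built-in hash() merely stands in for the hashing the Cython code performs):

token = "spaCy"
pref_lengths = [2, 3]
prefixes = [token[:n] for n in pref_lengths]
print(prefixes)                     # ['sp', 'spa']
print([hash(p) for p in prefixes])  # the two prefix hashes for this token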
@@ -1772,14 +1777,36 @@ cdef class Doc:
[[hash("sp"), [hash("Cy"), hash("paCy"), hash("spaCy"), hash("y"), hash("yC")], [[hash("sp"), [hash("Cy"), hash("paCy"), hash("spaCy"), hash("y"), hash("yC")],
[hash("an"), hash("nd"), hash("and", hash("and"), hash(" "), hash(" "))], [hash("an"), hash("nd"), hash("and", hash("and"), hash(" "), hash(" "))],
[hash("Pr") ,hash("gy"), hash("digy"), hash("rodigy"), hash("y"), hash("y ")]] [hash("Pr") ,hash("gy"), hash("digy"), hash("rodigy"), hash("y"), hash("y ")]]
UTF-16 is used to encode the token texts, as this results in two-byte representations for all characters that are realistically
interesting when learning features from words. UTF-16 can also contain four-byte representations, but neither of the byte pairs in
a four-byte representation is ever valid in its own right as a two-byte representation. In the rare case that a four-byte
representation occurs in a string being analysed, each of its two two-byte units is treated as a separate character. A four-byte
representation in *search_chars*, on the other hand, is not supported and results in a ValueError(E1046).
""" """
cdef int longest_pref = max(pref_lengths) if len(pref_lengths) > 0 else 0
cdef int longest_suff = max(suff_lengths) if len(suff_lengths) > 0 else 0
cdef Py_UCS4* affix_buf = <Py_UCS4*>self.mem.alloc(longest_pref + longest_suff, sizeof(Py_UCS4))
cdef void* text_ptr = <void*> self.text
cdef void* text_data_ptr = <void*> PyUnicode_DATA(text_ptr) # TODO: change to const void*
cdef unsigned int unicode_byte_width = PyUnicode_KIND(text_ptr), num_toks = len(self), tok_idx, token_idx, token_len
cdef TokenC token_c
cdef str working_str
for tok_idx in range(num_toks):
token_c = self.c[tok_idx]
token_idx = token_c.idx
token_len = token_c.lex.length
_populate_affix_buf(
text_data_ptr,
unicode_byte_width,
token_idx,
token_len,
affix_buf,
longest_pref,
longest_suff,
not case_sensitive
)
cdef const unsigned char[:] pref_search_chars_v = _get_utf16_memoryview(pref_search_chars, True)
cdef const unsigned char[:] suff_search_chars_v = _get_utf16_memoryview(suff_search_chars, True)
cdef unsigned int longest_search_length = max(pref_search_lengths + suff_search_lengths) if len(pref_search_lengths + suff_search_lengths) > 0 else 0
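The UTF-16 property the docstring above relies on, namely that characters of practical interest occupy a single two-byte unit while astral characters become a surrogate pair whose halves never denote characters on their own, can be checked from pure Python:

# Each element shows the UTF-16 code units for one character. BMP characters
# yield one unit; "😀" (U+1F600) yields two units that both lie in the
# reserved surrogate range 0xD800-0xDFFF.
for ch in "aé😀":
    data = ch.encode("utf-16-le")
    units = [hex(int.from_bytes(data[i:i + 2], "little")) for i in range(0, len(data), 2)]
    print(ch, units)
# a ['0x61']
# é ['0xe9']
# 😀 ['0xd83d', '0xde00']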
@@ -1788,13 +1815,13 @@ cdef class Doc:
cdef unsigned int pref_search_chars_v_len = len(pref_search_chars_v), suff_search_chars_v_len = len(suff_search_chars_v)
cdef unsigned int found_char_buf_len = len(found_char_buf_bytes)
cdef unsigned int num_pref_norm_hashes = len(pref_lengths), num_suff_norm_hashes = len(suff_lengths)
cdef unsigned int num_pref_search_hashes = len(pref_search_lengths)
cdef unsigned int num_suff_search_hashes = len(suff_search_lengths)
cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, num_pref_norm_hashes + num_suff_norm_hashes + num_pref_search_hashes + num_suff_search_hashes), dtype="int64")
cdef const unsigned char[:] tok_str_v
cdef unsigned int tok_str_v_len, hash_idx, affix_start, char_comb_len
cdef attr_t num_tok_attr
cdef str str_tok_attr
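For concreteness, the width of the *hashes* array is the total number of requested hashes across the four groups. A sketch with hypothetical parameter values:

import numpy
pref_lengths, suff_lengths = [2], [2, 4, 5]          # hypothetical values
pref_search_lengths, suff_search_lengths = [1], [2]  # hypothetical values
num_toks = 3
width = (len(pref_lengths) + len(suff_lengths)
         + len(pref_search_lengths) + len(suff_search_lengths))
hashes = numpy.empty((num_toks, width), dtype="int64")
print(hashes.shape)  # (3, 6)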
@@ -2028,6 +2055,60 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
return lca_matrix
cdef void _populate_affix_buf(
const void* str_data_ptr,
const unsigned int unicode_byte_width,
const int word_idx,
const int word_len,
Py_UCS4* affix_buf,
const int pref_length,
const int suff_length,
const bint to_lower
):
""" Populate a buffer of length p+s with the first p and the last s characters of a word within a string.
If the word is shorter than p and/or s, the empty character positions in the middle are filled with zeros.
str_data_ptr: a pointer to the raw data in the containing string, which must be in canonical
Unicode form (see PEP 393).
unicode_byte_width: the number of bytes occupied by each character in the containing string.
word_idx: the index of the first character of the word within the containing string.
word_len: the length of the word.
affix_buf: the buffer to populate.
pref_length: the length of the prefix.
suff_length: the length of the suffix.
to_lower: if *True*, any upper case characters in either affix are converted to lower case.
"""
cdef int affix_buf_idx = 0, buf_size = pref_length + suff_length, in_word_idx
cdef Py_UCS4 working_wchar
while affix_buf_idx < pref_length and affix_buf_idx < word_len:
working_wchar = PyUnicode_READ(unicode_byte_width, str_data_ptr, word_idx + affix_buf_idx)
if to_lower:
working_wchar = Py_UNICODE_TOLOWER(working_wchar)
memcpy(affix_buf + affix_buf_idx, &working_wchar, 4)
affix_buf_idx += 1
while (affix_buf_idx < buf_size - suff_length) or (affix_buf_idx < buf_size - word_len):
# fill out the empty middle part of the buffer with zeros
affix_buf[affix_buf_idx] = 0
affix_buf_idx += 1
while affix_buf_idx < buf_size:
in_word_idx = affix_buf_idx + word_len - buf_size
# for suffixes we have to track the in-word index separately from the in-buffer index
if in_word_idx < pref_length:
# we've already retrieved this character as part of the prefix, so copy it from there
# as that's quicker than retrieving it from the input string a second time
memcpy(affix_buf + affix_buf_idx, affix_buf + in_word_idx, 4)
else:
working_wchar = PyUnicode_READ(unicode_byte_width, str_data_ptr, word_idx + in_word_idx)
if to_lower:
working_wchar = Py_UNICODE_TOLOWER(working_wchar)
memcpy(affix_buf + affix_buf_idx, &working_wchar, 4)
affix_buf_idx += 1
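A pure-Python model of the buffer layout this function produces, useful for sanity-checking the three loops above (the Cython version writes Py_UCS4 values into a raw C buffer instead of building a list):

def populate_affix_buf(word: str, pref_length: int, suff_length: int,
                       to_lower: bool) -> list:
    if to_lower:
        word = word.lower()
    buf_size = pref_length + suff_length
    buf = [0] * buf_size
    # First p characters of the word (or fewer, if the word is shorter).
    for i in range(min(pref_length, len(word))):
        buf[i] = ord(word[i])
    # Last s characters of the word, right-aligned in the buffer; any
    # positions left in the middle stay zero.
    for i in range(min(suff_length, len(word))):
        buf[buf_size - 1 - i] = ord(word[len(word) - 1 - i])
    return buf

print([chr(c) if c else "·" for c in populate_affix_buf("spaCy", 3, 3, True)])
# ['s', 'p', 'a', 'a', 'c', 'y']  -- word long enough, no zero padding
print([chr(c) if c else "·" for c in populate_affix_buf("an", 3, 3, False)])
# ['a', 'n', '·', '·', 'a', 'n']  -- short word, zero-filled middle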
cdef const unsigned char[:] _get_utf16_memoryview(str unicode_string, const bint check_2_bytes):
    """
Return a memory view of the UTF-16 representation of a string with the default endianness of the platform.
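A pure-Python approximation of the view this helper exposes, assuming "default endianness" means the platform's native byte order without a BOM:

import sys

def utf16_view(s: str) -> memoryview:
    # Encode with the platform's native byte order and no BOM.
    codec = "utf-16-le" if sys.byteorder == "little" else "utf-16-be"
    return memoryview(s.encode(codec))

print(len(utf16_view("spaCy")))  # 10: two bytes per BMP character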