Intermediate state

richardpaulhudson 2022-10-13 20:50:25 +02:00
parent fc99b97e3c
commit 1e9176f9c5
2 changed files with 101 additions and 9 deletions

View File

@@ -33,6 +33,17 @@ cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
cdef int [:,:] _get_lca_matrix(Doc, int start, int end)
cdef void _populate_affix_buf(
const void* str_data_ptr,
const unsigned int unicode_byte_width,
const int word_idx,
const int word_len,
Py_UCS4* affix_buf,
const int pref_length,
const int suff_length,
const bint to_lower
)
cdef const unsigned char[:] _get_utf16_memoryview(str unicode_string, const bint check_2_bytes)

View File

@@ -42,6 +42,11 @@ from ..util import get_words_and_spaces
DEF PADDING = 5
cdef extern from *:
Py_UCS4 PyUnicode_READ(int kind, void *data, int index)
void* PyUnicode_DATA(void* o)
int PyUnicode_KIND(void *data)
Py_UCS4 Py_UNICODE_TOLOWER(Py_UCS4 ch)
cdef int bounds_check(int i, int length, int padding) except -1:
    if (i + padding) < 0:
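The extern block added in this hunk pulls in CPython's PEP 393 string APIs: PyUnicode_KIND gives the per-character byte width of a str's canonical storage, PyUnicode_DATA exposes the raw character array, and PyUnicode_READ(kind, data, i) is the C-level equivalent of ord(s[i]). A rough pure-Python model of the kind/width logic (the helper name is hypothetical, not part of the commit):

def pep393_kind_width(s: str) -> int:
    # Bytes per character under PEP 393 canonical storage: 1 for pure
    # Latin-1 text, 2 for other BMP text, 4 when astral code points occur.
    highest = max(map(ord, s), default=0)
    return 1 if highest < 0x100 else 2 if highest < 0x10000 else 4

print(pep393_kind_width("spaCy"))   # 1
print(pep393_kind_width("héllo⚡"))  # 2
print(pep393_kind_width("😀"))       # 4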
@@ -1751,7 +1756,7 @@ cdef class Doc:
Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations
derived from the string (text/orth) of each token.
case_sensitive: if *False*, the lower-case version of each token string is used as the basis for generating hashes. Note that
if *case_sensitive==False*, upper-case characters in *search_chars* will not be found in token strings.
pref_lengths: an integer list specifying the lengths of prefixes to be hashed. For example, if *pref_lengths==[2, 3]*,
the prefixes hashed for "spaCy" would be "sp" and "spa".
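As a quick pure-Python illustration of the *pref_lengths* semantics just described (Python's built-in hash() merely stands in for the hashing the Cython code performs):

token = "spaCy"
pref_lengths = [2, 3]
prefixes = [token[:n] for n in pref_lengths]
print(prefixes)                     # ['sp', 'spa']
print([hash(p) for p in prefixes])  # the two prefix hashes for this token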
@@ -1772,14 +1777,36 @@ cdef class Doc:
[[hash("sp"), [hash("Cy"), hash("paCy"), hash("spaCy"), hash("y"), hash("yC")], [[hash("sp"), [hash("Cy"), hash("paCy"), hash("spaCy"), hash("y"), hash("yC")],
[hash("an"), hash("nd"), hash("and", hash("and"), hash(" "), hash(" "))], [hash("an"), hash("nd"), hash("and", hash("and"), hash(" "), hash(" "))],
[hash("Pr") ,hash("gy"), hash("digy"), hash("rodigy"), hash("y"), hash("y ")]] [hash("Pr") ,hash("gy"), hash("digy"), hash("rodigy"), hash("y"), hash("y ")]]
UTF-16 is used to encode the token texts, as this results in two-byte representations for all characters that are realistically
interesting when learning features from words. UTF-16 can also contain four-byte representations, but neither of the byte pairs in
a four-byte representation is ever valid in its own right as a two-byte representation. In the rare case that a four-byte
representation occurs in a string being analysed, each of its two two-byte units is treated as a separate character. A four-byte
representation in *search_chars*, on the other hand, is not supported and results in a ValueError(E1046).
""" """
cdef int longest_pref = max(pref_lengths) if len(pref_lengths) > 0 else 0
cdef int longest_suff = max(suff_lengths) if len(suff_lengths) > 0 else 0
cdef Py_UCS4* affix_buf = <Py_UCS4*>self.mem.alloc(longest_pref + longest_suff, sizeof(Py_UCS4))
cdef void* text_ptr = <void*> self.text
cdef void* text_data_ptr = <void*> PyUnicode_DATA(text_ptr) # TODO: change to const void*
cdef unsigned int unicode_byte_width = PyUnicode_KIND(text_ptr), num_toks = len(self), tok_idx, token_idx, token_len
cdef TokenC token_c
cdef str working_str
for tok_idx in range(num_toks):
token_c = self.c[tok_idx]
token_idx = token_c.idx
token_len = token_c.lex.length
_populate_affix_buf(
text_data_ptr,
unicode_byte_width,
token_idx,
token_len,
affix_buf,
longest_pref,
longest_suff,
not case_sensitive
)
cdef const unsigned char[:] pref_search_chars_v = _get_utf16_memoryview(pref_search_chars, True)
cdef const unsigned char[:] suff_search_chars_v = _get_utf16_memoryview(suff_search_chars, True)
cdef unsigned int longest_search_length = max(pref_search_lengths + suff_search_lengths) if len(pref_search_lengths + suff_search_lengths) > 0 else 0
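The UTF-16 property the docstring above relies on, namely that characters of practical interest occupy a single two-byte unit while astral characters become a surrogate pair whose halves never denote characters on their own, can be checked from pure Python:

# Each element shows the UTF-16 code units for one character. BMP characters
# yield one unit; "😀" (U+1F600) yields two units that both lie in the
# reserved surrogate range 0xD800-0xDFFF.
for ch in "aé😀":
    data = ch.encode("utf-16-le")
    units = [hex(int.from_bytes(data[i:i + 2], "little")) for i in range(0, len(data), 2)]
    print(ch, units)
# a ['0x61']
# é ['0xe9']
# 😀 ['0xd83d', '0xde00']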
@@ -1788,13 +1815,13 @@ cdef class Doc:
cdef unsigned int pref_search_chars_v_len = len(pref_search_chars_v), suff_search_chars_v_len = len(suff_search_chars_v)
cdef unsigned int found_char_buf_len = len(found_char_buf_bytes)
cdef unsigned int num_pref_norm_hashes = len(pref_lengths), num_suff_norm_hashes = len(suff_lengths)
cdef unsigned int num_pref_search_hashes = len(pref_search_lengths)
cdef unsigned int num_suff_search_hashes = len(suff_search_lengths)
cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, num_pref_norm_hashes + num_suff_norm_hashes + num_pref_search_hashes + num_suff_search_hashes), dtype="int64")
cdef const unsigned char[:] tok_str_v
cdef unsigned int tok_str_v_len, hash_idx, affix_start, char_comb_len
cdef attr_t num_tok_attr
cdef str str_tok_attr
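For concreteness, the width of the *hashes* array is the total number of requested hashes across the four groups. A sketch with hypothetical parameter values:

import numpy
pref_lengths, suff_lengths = [2], [2, 4, 5]          # hypothetical values
pref_search_lengths, suff_search_lengths = [1], [2]  # hypothetical values
num_toks = 3
width = (len(pref_lengths) + len(suff_lengths)
         + len(pref_search_lengths) + len(suff_search_lengths))
hashes = numpy.empty((num_toks, width), dtype="int64")
print(hashes.shape)  # (3, 6)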
@@ -2028,6 +2055,60 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
return lca_matrix
cdef void _populate_affix_buf(
const void* str_data_ptr,
const unsigned int unicode_byte_width,
const int word_idx,
const int word_len,
Py_UCS4* affix_buf,
const int pref_length,
const int suff_length,
const bint to_lower
):
""" Populate a buffer of length p+s with the first p and the last s characters of a word within a string.
If the word is shorter than p and/or s, the empty character positions in the middle are filled with zeros.
str_data_ptr: a pointer to the raw data in the containing string, which must be in canonical
Unicode form (see PEP 393).
unicode_byte_width: the number of bytes occupied by each character in the containing string.
word_idx: the index of the first character of the word within the containing string.
word_len: the length of the word.
affix_buf: the buffer to populate.
pref_length: the length of the prefix.
suff_length: the length of the suffix.
to_lower: if *True*, any upper case characters in either affix are converted to lower case.
"""
cdef int affix_buf_idx = 0, buf_size = pref_length + suff_length, in_word_idx
cdef Py_UCS4 working_wchar
while affix_buf_idx < pref_length and affix_buf_idx < word_len:
working_wchar = PyUnicode_READ(unicode_byte_width, str_data_ptr, word_idx + affix_buf_idx)
if to_lower:
working_wchar = Py_UNICODE_TOLOWER(working_wchar)
memcpy(affix_buf + affix_buf_idx, &working_wchar, 4)
affix_buf_idx += 1
while (affix_buf_idx < buf_size - suff_length) or (affix_buf_idx < buf_size - word_len):
# fill out the empty middle part of the buffer with zeros
affix_buf[affix_buf_idx] = 0
affix_buf_idx += 1
while affix_buf_idx < buf_size:
in_word_idx = affix_buf_idx + word_len - buf_size
# for suffixes we have to track the in-word index separately from the in-buffer index
if in_word_idx < pref_length:
# we've already retrieved this character as part of the prefix, so copy it from there
# as that's quicker than retrieving it from the input string a second time
memcpy(affix_buf + affix_buf_idx, affix_buf + in_word_idx, 4)
else:
working_wchar = PyUnicode_READ(unicode_byte_width, str_data_ptr, word_idx + in_word_idx)
if to_lower:
working_wchar = Py_UNICODE_TOLOWER(working_wchar)
memcpy(affix_buf + affix_buf_idx, &working_wchar, 4)
affix_buf_idx += 1
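A pure-Python model of the buffer layout this function produces, useful for sanity-checking the three loops above (the Cython version writes Py_UCS4 values into a raw C buffer instead of building a list):

def populate_affix_buf(word: str, pref_length: int, suff_length: int,
                       to_lower: bool) -> list:
    if to_lower:
        word = word.lower()
    buf_size = pref_length + suff_length
    buf = [0] * buf_size
    # First p characters of the word (or fewer, if the word is shorter).
    for i in range(min(pref_length, len(word))):
        buf[i] = ord(word[i])
    # Last s characters of the word, right-aligned in the buffer; any
    # positions left in the middle stay zero.
    for i in range(min(suff_length, len(word))):
        buf[buf_size - 1 - i] = ord(word[len(word) - 1 - i])
    return buf

print([chr(c) if c else "·" for c in populate_affix_buf("spaCy", 3, 3, True)])
# ['s', 'p', 'a', 'a', 'c', 'y']  -- word long enough, no zero padding
print([chr(c) if c else "·" for c in populate_affix_buf("an", 3, 3, False)])
# ['a', 'n', '·', '·', 'a', 'n']  -- short word, zero-filled middle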
cdef const unsigned char[:] _get_utf16_memoryview(str unicode_string, const bint check_2_bytes):
    """
Return a memory view of the UTF-16 representation of a string with the default endianness of the platform.
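A pure-Python approximation of the view this helper exposes, assuming "default endianness" means the platform's native byte order without a BOM:

import sys

def utf16_view(s: str) -> memoryview:
    # Encode with the platform's native byte order and no BOM.
    codec = "utf-16-le" if sys.byteorder == "little" else "utf-16-be"
    return memoryview(s.encode(codec))

print(len(utf16_view("spaCy")))  # 10: two bytes per BMP character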