mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-02 19:30:19 +03:00
Intermediate state
This commit is contained in:
parent
c140bd6083
commit
13e417e8d1
|
@ -956,7 +956,8 @@ class Errors(metaclass=ErrorsWithCodes):
|
||||||
"knowledge base, use `InMemoryLookupKB`.")
|
"knowledge base, use `InMemoryLookupKB`.")
|
||||||
E1047 = ("Invalid rich group config '{label}'.")
|
E1047 = ("Invalid rich group config '{label}'.")
|
||||||
E1048 = ("Length > 31 in rich group config '{label}.")
|
E1048 = ("Length > 31 in rich group config '{label}.")
|
||||||
E1049 = ("Error splitting UTF-8 byte string into separate characters.")
|
E1049 = ("Rich group config {label} specifies lengths that are not in ascending order.")
|
||||||
|
E1050 = ("Error splitting UTF-8 byte string into separate characters.")
|
||||||
|
|
||||||
|
|
||||||
# Deprecated model shortcuts, only used in errors and warnings
|
# Deprecated model shortcuts, only used in errors and warnings
|
||||||
|
|
|
@ -195,7 +195,6 @@ def _verify_rich_config_group(
|
||||||
rows: Optional[List[int]],
|
rows: Optional[List[int]],
|
||||||
search_chars: Optional[str],
|
search_chars: Optional[str],
|
||||||
is_search_char_group: bool,
|
is_search_char_group: bool,
|
||||||
case_sensitive: bool,
|
|
||||||
) -> None:
|
) -> None:
|
||||||
if lengths is not None or rows is not None:
|
if lengths is not None or rows is not None:
|
||||||
if is_search_char_group and (search_chars is None or len(search_chars) == 0):
|
if is_search_char_group and (search_chars is None or len(search_chars) == 0):
|
||||||
|
@ -208,8 +207,11 @@ def _verify_rich_config_group(
|
||||||
raise ValueError(Errors.E1047.format(label=label))
|
raise ValueError(Errors.E1047.format(label=label))
|
||||||
elif search_chars is not None:
|
elif search_chars is not None:
|
||||||
raise ValueError(Errors.E1047.format(label=label))
|
raise ValueError(Errors.E1047.format(label=label))
|
||||||
if lengths is not None and max(lengths) > 31:
|
if lengths is not None:
|
||||||
raise ValueError(Errors.E1048.format(label=label))
|
if lengths[-1] > 31:
|
||||||
|
raise ValueError(Errors.E1048.format(label=label))
|
||||||
|
if len(lengths) != len(set(lengths)) or lengths != sorted(lengths):
|
||||||
|
raise ValueError(Errors.E1049.format(label=label))
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures("spacy.RichMultiHashEmbed.v1")
|
@registry.architectures("spacy.RichMultiHashEmbed.v1")
|
||||||
|
@ -258,6 +260,8 @@ def RichMultiHashEmbed(
|
||||||
plural noun does not become `a` if it is the third or fourth vowel from the
|
plural noun does not become `a` if it is the third or fourth vowel from the
|
||||||
end of the word.
|
end of the word.
|
||||||
|
|
||||||
|
All lengths must be specified in ascending order.
|
||||||
|
|
||||||
width (int): The output width. Also used as the width of the embedding tables.
|
width (int): The output width. Also used as the width of the embedding tables.
|
||||||
Recommended values are between 64 and 300.
|
Recommended values are between 64 and 300.
|
||||||
attrs (list of attr IDs): The token attributes to embed. A separate
|
attrs (list of attr IDs): The token attributes to embed. A separate
|
||||||
|
@ -293,19 +297,14 @@ def RichMultiHashEmbed(
|
||||||
if len(rows) != len(attrs):
|
if len(rows) != len(attrs):
|
||||||
raise ValueError(f"Mismatched lengths: {len(rows)} vs {len(attrs)}")
|
raise ValueError(f"Mismatched lengths: {len(rows)} vs {len(attrs)}")
|
||||||
|
|
||||||
_verify_rich_config_group(
|
_verify_rich_config_group("prefix", pref_lengths, pref_rows, None, False)
|
||||||
"prefix", pref_lengths, pref_rows, None, False, case_sensitive
|
_verify_rich_config_group("suffix", suff_lengths, suff_rows, None, False)
|
||||||
)
|
|
||||||
_verify_rich_config_group(
|
|
||||||
"suffix", suff_lengths, suff_rows, None, False, case_sensitive
|
|
||||||
)
|
|
||||||
_verify_rich_config_group(
|
_verify_rich_config_group(
|
||||||
"prefix search",
|
"prefix search",
|
||||||
pref_search_lengths,
|
pref_search_lengths,
|
||||||
pref_search_rows,
|
pref_search_rows,
|
||||||
pref_search_chars,
|
pref_search_chars,
|
||||||
True,
|
True,
|
||||||
case_sensitive,
|
|
||||||
)
|
)
|
||||||
_verify_rich_config_group(
|
_verify_rich_config_group(
|
||||||
"suffix search",
|
"suffix search",
|
||||||
|
@ -313,7 +312,6 @@ def RichMultiHashEmbed(
|
||||||
suff_search_rows,
|
suff_search_rows,
|
||||||
suff_search_chars,
|
suff_search_chars,
|
||||||
True,
|
True,
|
||||||
case_sensitive,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if "PREFIX" in attrs or "SUFFIX" in attrs:
|
if "PREFIX" in attrs or "SUFFIX" in attrs:
|
||||||
|
|
|
@ -39,20 +39,21 @@ cdef int [:,:] _get_lca_matrix(Doc, int start, int end)
|
||||||
|
|
||||||
|
|
||||||
cdef void _set_affix_lengths(
|
cdef void _set_affix_lengths(
|
||||||
const unsigned char[:] text_buf,
|
const unsigned char[:] tok_str,
|
||||||
char* aff_len_buf,
|
unsigned char* aff_len_buf,
|
||||||
const int pref_len,
|
const int pref_len,
|
||||||
const int suff_len,
|
const int suff_len,
|
||||||
) nogil
|
) nogil
|
||||||
|
|
||||||
|
|
||||||
cdef bint _search_for_chars(
|
ccdef void _search_for_chars(
|
||||||
const unsigned char[:] tok_str,
|
const unsigned char[:] tok_str,
|
||||||
const unsigned char[:] s_1byte_ch,
|
const unsigned char[:] s_1byte_ch,
|
||||||
const unsigned char[:] s_2byte_ch,
|
const unsigned char[:] s_2byte_ch,
|
||||||
const unsigned char[:] s_3byte_ch,
|
const unsigned char[:] s_3byte_ch,
|
||||||
const unsigned char[:] s_4byte_ch,
|
const unsigned char[:] s_4byte_ch,
|
||||||
unsigned char* res_buf,
|
unsigned char* res_buf,
|
||||||
|
int max_res_l,
|
||||||
unsigned char* len_buf,
|
unsigned char* len_buf,
|
||||||
bint suffs_not_prefs
|
bint suffs_not_prefs
|
||||||
) nogil
|
) nogil
|
||||||
|
|
|
@ -1751,7 +1751,7 @@ cdef class Doc:
|
||||||
const unsigned char[:] ss_3byte_ch,
|
const unsigned char[:] ss_3byte_ch,
|
||||||
const unsigned char[:] ss_4byte_ch,
|
const unsigned char[:] ss_4byte_ch,
|
||||||
np.ndarray ss_lengths,
|
np.ndarray ss_lengths,
|
||||||
):
|
) nogil:
|
||||||
"""
|
"""
|
||||||
Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations
|
Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations
|
||||||
derived from the raw text of each token.
|
derived from the raw text of each token.
|
||||||
|
@ -1764,19 +1764,19 @@ cdef class Doc:
|
||||||
ss_ variables relate to searches starting at the end of the word
|
ss_ variables relate to searches starting at the end of the word
|
||||||
|
|
||||||
cs: if *False*, hashes are generated based on the lower-case version of each token.
|
cs: if *False*, hashes are generated based on the lower-case version of each token.
|
||||||
p_lengths: an Ints1d specifying the lengths of prefixes to be hashed. For example, if *p_lengths==[2, 3]*,
|
p_lengths: an Ints1d specifying the lengths of prefixes to be hashed in ascending order. For example,
|
||||||
the prefixes hashed for "spaCy" would be "sp" and "spa".
|
if *p_lengths==[2, 3]*, the prefixes hashed for "spaCy" would be "sp" and "spa".
|
||||||
s_lengths: an Ints1d specifying the lengths of suffixes to be hashed. For example, if *s_lengths==[2, 3]* and
|
s_lengths: an Ints1d specifying the lengths of suffixes to be hashed in ascending order. For example,
|
||||||
*cs == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy".
|
if *s_lengths==[2, 3]* and *cs == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy".
|
||||||
ps_<n>byte_ch: a byte array containing in order n-byte-wide characters to search for within each token,
|
ps_<n>byte_ch: a byte array containing in order n-byte-wide characters to search for within each token,
|
||||||
starting at the beginning.
|
starting at the beginning.
|
||||||
ps_lengths: an Ints1d specifying the lengths of search results (from the beginning) to be hashed. For example, if
|
ps_lengths: an Ints1d specifying the lengths of search results (from the beginning) to be hashed in ascending order.
|
||||||
*ps_lengths==[1, 2]*, *ps_search=="aC" and *cs==False*, the searched strings hashed for
|
For example, if *ps_lengths==[1, 2]*, *ps_search=="aC" and *cs==False*, the searched strings hashed for
|
||||||
"spaCy" would be "a" and "ac".
|
"spaCy" would be "a" and "ac".
|
||||||
ss_<n>byte_ch: a byte array containing in order n-byte-wide characters to search for within each token,
|
ss_<n>byte_ch: a byte array containing in order n-byte-wide characters to search for within each token,
|
||||||
starting at the end.
|
starting at the end.
|
||||||
ss_lengths: an integer list specifying the lengths of search results (from the end) to be hashed. For example, if
|
ss_lengths: an integer list specifying the lengths of search results (from the end) to be hashed in ascending order.
|
||||||
*ss_lengths==[1, 2]*, *ss_search=="aC" and *cs==False*, the searched strings hashed for
|
For example, if *ss_lengths==[1, 2]*, *ss_search=="aC" and *cs==False*, the searched strings hashed for
|
||||||
"spaCy" would be "c" and "ca".
|
"spaCy" would be "c" and "ca".
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@ -1826,30 +1826,33 @@ cdef class Doc:
|
||||||
for hash_idx in range(p_h_num):
|
for hash_idx in range(p_h_num):
|
||||||
offset = aff_len_buf[p_lengths_v[hash_idx]]
|
offset = aff_len_buf[p_lengths_v[hash_idx]]
|
||||||
if offset > 0:
|
if offset > 0:
|
||||||
hash_val = hash32(<void*> &qcktest2[0], offset, 0)
|
hash_val = hash32(<void*> &tok_str[0], offset, 0)
|
||||||
hashes[tok_i, hash_idx] = hash_val
|
hashes[tok_i, hash_idx] = hash_val
|
||||||
|
|
||||||
for hash_idx in range(p_h_num, s_h_end):
|
for hash_idx in range(p_h_num, s_h_end):
|
||||||
offset = s_lengths_v[hash_idx - p_h_num]
|
offset = s_lengths_v[hash_idx - p_h_num]
|
||||||
if offset > 0:
|
if offset > 0:
|
||||||
hash_val = hash32(<void*> &qcktest2[len(qcktest2) - offset], offset, 0)
|
hash_val = hash32(<void*> &tok_str[len(tok_str) - offset], offset, 0)
|
||||||
hashes[tok_i, hash_idx] = hash_val
|
hashes[tok_i, hash_idx] = hash_val
|
||||||
|
|
||||||
if (
|
|
||||||
ps_h_num > 0 and
|
|
||||||
_search_for_chars(tok_str, ps_1byte_ch, ps_2byte_ch, ps_3byte_ch, ps_4byte_ch, ps_res_buf, ps_res_len, False)
|
|
||||||
):
|
|
||||||
for hash_idx in range(s_h_end, ps_h_end):
|
|
||||||
aff_len = ps_lengths_v[hash_idx - s_h_end]
|
|
||||||
hashes[tok_i, hash_idx] = hash32(ps_r_buf, aff_len * sizeof(Py_UCS4), 0)
|
|
||||||
|
|
||||||
if (
|
if ps_h_num > 0:
|
||||||
ss_h_num > 0 and
|
_search_for_chars(tok_str, ps_1byte_ch, ps_2byte_ch, ps_3byte_ch, ps_4byte_ch, ps_res_buf, ps_max_l, ps_res_len, False)
|
||||||
_search_for_chars(tok_str, ss_1byte_ch, ss_2byte_ch, ss_3byte_ch, ss_4byte_ch, ss_res_buf, ss_res_len, True)
|
hash_val = 0
|
||||||
):
|
for hash_idx in range(s_h_end, ps_h_end):
|
||||||
|
offset = ps_lengths_v[hash_idx - s_h_end]
|
||||||
|
if offset > 0:
|
||||||
|
hash_val = hash32(ps_res_buf, offset, 0)
|
||||||
|
hashes[tok_i, hash_idx] = hash_val
|
||||||
|
|
||||||
|
if ss_h_num > 0:
|
||||||
|
_search_for_chars(tok_str, ss_1byte_ch, ss_2byte_ch, ss_3byte_ch, ss_4byte_ch, ss_res_buf, ss_max_l, ss_res_len, True)
|
||||||
|
hash_val = 0
|
||||||
for hash_idx in range(ps_h_end, ss_h_end):
|
for hash_idx in range(ps_h_end, ss_h_end):
|
||||||
aff_len = ss_lengths_v[hash_idx - ps_h_end]
|
offset = ss_lengths_v[hash_idx - ps_h_end]
|
||||||
hashes[tok_i, hash_idx] = hash32(ss_r_buf, aff_len * sizeof(Py_UCS4), 0)
|
if offset > 0:
|
||||||
|
hash_val = hash32(ss_res_buf, offset, 0)
|
||||||
|
hashes[tok_i, hash_idx] = hash_val
|
||||||
|
|
||||||
|
|
||||||
self.mem.free(aff_len_buf)
|
self.mem.free(aff_len_buf)
|
||||||
self.mem.free(ps_res_buf)
|
self.mem.free(ps_res_buf)
|
||||||
|
@ -2040,76 +2043,126 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
|
||||||
|
|
||||||
|
|
||||||
cdef void _set_affix_lengths(
|
cdef void _set_affix_lengths(
|
||||||
const unsigned char[:] text_buf,
|
const unsigned char[:] tok_str,
|
||||||
char* aff_len_buf,
|
unsigned char* aff_len_buf,
|
||||||
const int pref_len,
|
const int pref_len,
|
||||||
const int suff_len,
|
const int suff_len,
|
||||||
) nogil:
|
) nogil:
|
||||||
""" TODO : Populate a buffer of length pref+suff with the first pref and the last suff characters of a word within a string.
|
""" TODO : Populate *len_buf*, which has length *pref_len+suff_len* with the byte lengths of the first *pref_len* and the last
|
||||||
If the word is shorter than pref and/or suff, the empty character positions in the middle are filled with zeros.
|
*suff_len* characters within *tok_str*. If the word is shorter than pref and/or suff, the empty lengths in the middle are
|
||||||
|
filled with zeros.
|
||||||
|
|
||||||
text_buf: a pointer to a UTF-32LE representation of the containing string.
|
tok_str: a memoryview of a UTF-8 representation of a string.
|
||||||
tok_idx: the index of the first character of the word within the containing string.
|
len_buf: a buffer of length *pref_len+suff_len* in which to store the lengths. The calling code ensures that lengths
|
||||||
tok_len: the length of the word.
|
greater than 255 cannot occur.
|
||||||
aff_buf: the buffer to populate.
|
pref_len: the number of characters to process at the beginning of the word.
|
||||||
pref_len: the length of the prefix.
|
suff_len: the number of characters to process at the end of the word.
|
||||||
suff_len: the length of the suffix.
|
|
||||||
to_lower: if *True*, any upper case characters in either affix are converted to lower case.
|
|
||||||
"""
|
"""
|
||||||
cdef int text_buf_idx = 0, aff_len_buf_idx = 0, text_buf_len = len(text_buf)
|
cdef int tok_str_idx = 0, aff_len_buf_idx = 0, tok_str_len = len(tok_str)
|
||||||
|
|
||||||
while aff_len_buf_idx < pref_len:
|
while aff_len_buf_idx < pref_len:
|
||||||
if (text_buf[text_buf_idx] >> 6) ^ 2 != 0: # not a continuation character
|
if (tok_str[tok_str_idx] & 0xc0) != 0x80: # not a continuation character
|
||||||
aff_len_buf[aff_len_buf_idx] = text_buf_idx + 1
|
aff_len_buf[aff_len_buf_idx] = tok_str_idx + 1
|
||||||
aff_len_buf_idx += 1
|
aff_len_buf_idx += 1
|
||||||
text_buf_idx += 1
|
tok_str_idx += 1
|
||||||
if text_buf_idx == len(text_buf):
|
if tok_str_idx == len(tok_str):
|
||||||
break
|
break
|
||||||
|
|
||||||
if aff_len_buf_idx < pref_len:
|
if aff_len_buf_idx < pref_len:
|
||||||
memset(aff_len_buf + aff_len_buf_idx, 0, pref_len - aff_len_buf_idx)
|
memset(aff_len_buf + aff_len_buf_idx, 0, pref_len - aff_len_buf_idx)
|
||||||
aff_len_buf_idx = pref_len
|
aff_len_buf_idx = pref_len
|
||||||
|
|
||||||
text_buf_idx = 1
|
tok_str_idx = 1
|
||||||
while aff_len_buf_idx < pref_len + suff_len:
|
while aff_len_buf_idx < pref_len + suff_len:
|
||||||
if (text_buf[text_buf_len - text_buf_idx] >> 6) ^ 2 != 0: # not a continuation character
|
if (tok_str[tok_str_idx] & 0xc0) != 0x80: # not a continuation character
|
||||||
aff_len_buf[aff_len_buf_idx] = text_buf_len - text_buf_idx
|
aff_len_buf[aff_len_buf_idx] = tok_str_len - tok_str_idx
|
||||||
aff_len_buf_idx += 1
|
aff_len_buf_idx += 1
|
||||||
text_buf_idx += 1
|
tok_str_idx += 1
|
||||||
if text_buf_idx > text_buf_len:
|
if tok_str_idx > tok_str_len:
|
||||||
break
|
break
|
||||||
|
|
||||||
if aff_len_buf_idx < pref_len + suff_len:
|
if aff_len_buf_idx < pref_len + suff_len:
|
||||||
memset(aff_len_buf + aff_len_buf_idx, 0, suff_len - aff_len_buf_idx)
|
memset(aff_len_buf + aff_len_buf_idx, 0, suff_len - aff_len_buf_idx)
|
||||||
|
|
||||||
|
|
||||||
cdef bint _search_for_chars(
|
cdef void _search_for_chars(
|
||||||
const unsigned char[:] tok_str,
|
const unsigned char[:] tok_str,
|
||||||
const unsigned char[:] s_1byte_ch,
|
const unsigned char[:] s_1byte_ch,
|
||||||
const unsigned char[:] s_2byte_ch,
|
const unsigned char[:] s_2byte_ch,
|
||||||
const unsigned char[:] s_3byte_ch,
|
const unsigned char[:] s_3byte_ch,
|
||||||
const unsigned char[:] s_4byte_ch,
|
const unsigned char[:] s_4byte_ch,
|
||||||
unsigned char* res_buf,
|
unsigned char* res_buf,
|
||||||
|
int max_res_l,
|
||||||
unsigned char* len_buf,
|
unsigned char* len_buf,
|
||||||
bint suffs_not_prefs
|
bint suffs_not_prefs
|
||||||
) nogil:
|
) nogil:
|
||||||
""" Search a word within a string for characters within *search_buf*, starting at the beginning or
|
""" Search *tok_str* within a string for characters within the *s_<n>byte_ch> buffers, starting at the
|
||||||
end depending on the value of *suffs_not_prefs*. Wherever a character from *search_buf* matches,
|
beginning or end depending on the value of *suffs_not_prefs*. Wherever a character matches,
|
||||||
the corresponding character from *lookup_buf* is added to *result_buf*.
|
it is added to *res_buf* and the byte length up to that point is added to *len_buf*.
|
||||||
|
|
||||||
text_buf: a pointer to a UTF-32LE representation of the containing string.
|
tok_str: a memoryview of a UTF-8 representation of a string.
|
||||||
tok_idx: the index of the first character of the word within the containing string.
|
s_<n>byte_ch: a byte array containing in order n-byte-wide characters to search for.
|
||||||
tok_len: the length of the word.
|
res_buf: the buffer in which to place the search results.
|
||||||
search_buf: the characters to search for (ordered).
|
max_res_l: the maximum number of found characters to place in *res_buf*.
|
||||||
lookup_buf: characters corresponding to *search_buf* to add to *result_buf* in the case of a match.
|
len_buf: a buffer of length *max_res_l* in which to store the byte lengths.
|
||||||
Having separate search and lookup arrays enables case-insensitivity to be handled efficiently.
|
The calling code ensures that lengths greater than 255 cannot occur.
|
||||||
search_buf_len: the length of *search_buf* and hence also of *lookup_buf*.
|
|
||||||
result_buf: the buffer in which to place the results.
|
|
||||||
result_buf_len: the length of *result_buf*.
|
|
||||||
suffs_not_prefs: if *True*, searching starts from the end of the word; if *False*, from the beginning.
|
suffs_not_prefs: if *True*, searching starts from the end of the word; if *False*, from the beginning.
|
||||||
|
|
||||||
Returns *True* if at least one character from *search_buf* was found in the word, otherwise *False*.
|
|
||||||
"""
|
"""
|
||||||
|
cdef int tok_str_len = len(tok_str), search_char_idx = 0, res_buf_idx = 0, len_buf_idx = 0
|
||||||
|
cdef int last_tok_str_idx = tok_str_len if suffs_not_prefs else 0
|
||||||
|
cdef int this_tok_str_idx = tok_str_len - 1 if suffs_not_prefs else 1
|
||||||
|
cdef int ch_wdth, tok_start_idx
|
||||||
|
cdef char[:] search_chars
|
||||||
|
|
||||||
|
while True:
|
||||||
|
if (
|
||||||
|
this_tok_str_idx == tok_str_len or
|
||||||
|
(tok_str[this_tok_str_idx] & 0xc0) != 0x80 # not continuation character
|
||||||
|
):
|
||||||
|
ch_wdth = abs(this_tok_str_idx - last_tok_str_idx)
|
||||||
|
if ch_wdth == 1:
|
||||||
|
search_chars = s_1byte_ch
|
||||||
|
elif ch_wdth == 2:
|
||||||
|
search_chars = s_2byte_ch
|
||||||
|
elif ch_wdth == 3:
|
||||||
|
search_chars = s_3byte_ch
|
||||||
|
else:
|
||||||
|
search_chars = s_4byte_ch
|
||||||
|
tok_start_idx = this_tok_str_idx if suffs_not_prefs else last_tok_str_idx
|
||||||
|
for search_char_idx in range(0, len(search_chars), ch_wdth):
|
||||||
|
cmp_result = memcmp(tok_str + tok_start_idx, search_chars + search_char_idx, ch_wdth)
|
||||||
|
if cmp_result == 0:
|
||||||
|
memcpy(res_buf + res_buf_idx, search_chars + search_char_idx, ch_wdth)
|
||||||
|
res_buf_idx += ch_wdth
|
||||||
|
len_buf[len_buf_idx] = res_buf_idx
|
||||||
|
len_buf_idx += 1
|
||||||
|
if len_buf_idx == max_res_l:
|
||||||
|
return
|
||||||
|
if cmp_result >= 0:
|
||||||
|
break
|
||||||
|
last_tok_str_idx = this_tok_str_idx
|
||||||
|
if suffs_not_prefs:
|
||||||
|
this_tok_str_idx -= 1
|
||||||
|
if this_tok_str_idx < 0:
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
this_tok_str_idx += 1
|
||||||
|
if this_tok_str_idx >= tok_str_len:
|
||||||
|
break
|
||||||
|
|
||||||
|
# fill in unused characters in the length buffer with 0
|
||||||
|
memset(res_buf + res_buf_idx, 0, max_res_l - res_buf_idx)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
cdef int result_buf_idx = 0, text_string_idx = tok_idx + (tok_len - 1) if suffs_not_prefs else tok_idx
|
cdef int result_buf_idx = 0, text_string_idx = tok_idx + (tok_len - 1) if suffs_not_prefs else tok_idx
|
||||||
cdef int search_buf_idx
|
cdef int search_buf_idx
|
||||||
cdef int cmp_result
|
cdef int cmp_result
|
||||||
|
|
|
@ -1773,6 +1773,6 @@ def get_search_char_byte_arrays(
|
||||||
elif char_length == 4:
|
elif char_length == 4:
|
||||||
sc4.extend(encoded_search_char_bytes[working_start:idx])
|
sc4.extend(encoded_search_char_bytes[working_start:idx])
|
||||||
else:
|
else:
|
||||||
raise RuntimeError(Errors.E1049)
|
raise RuntimeError(Errors.E1050)
|
||||||
working_start = idx
|
working_start = idx
|
||||||
return bytes(sc1), bytes(sc2), bytes(sc3), bytes(sc4)
|
return bytes(sc1), bytes(sc2), bytes(sc3), bytes(sc4)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user