Get rid of memory views

This commit is contained in:
richardpaulhudson 2022-11-01 14:05:35 +01:00
parent 749da9d348
commit 2552340fb8
7 changed files with 98 additions and 107 deletions

View File

@ -43,26 +43,26 @@ def RichFeatureExtractor(
forward, forward,
attrs={ attrs={
"case_sensitive": case_sensitive, "case_sensitive": case_sensitive,
"pref_lengths": ops.asarray1i(pref_lengths) "pref_lengths": bytes(pref_lengths)
if pref_lengths is not None if pref_lengths is not None
else ops.asarray1i([]), else bytes(),
"suff_lengths": ops.asarray1i(suff_lengths) "suff_lengths": bytes(suff_lengths)
if suff_lengths is not None if suff_lengths is not None
else ops.asarray1i([]), else bytes(),
"pref_search_1_byte": ps_1byte_ch, "pref_search_1_byte": ps_1byte_ch,
"pref_search_2_bytes": ps_2byte_ch, "pref_search_2_bytes": ps_2byte_ch,
"pref_search_3_bytes": ps_3byte_ch, "pref_search_3_bytes": ps_3byte_ch,
"pref_search_4_bytes": ps_4byte_ch, "pref_search_4_bytes": ps_4byte_ch,
"pref_search_lengths": ops.asarray1i(pref_search_lengths) "pref_search_lengths": bytes(pref_search_lengths)
if pref_search_lengths is not None if pref_search_lengths is not None
else ops.asarray1i([]), else bytes(),
"suff_search_1_byte": ss_1byte_ch, "suff_search_1_byte": ss_1byte_ch,
"suff_search_2_bytes": ss_2byte_ch, "suff_search_2_bytes": ss_2byte_ch,
"suff_search_3_bytes": ss_3byte_ch, "suff_search_3_bytes": ss_3byte_ch,
"suff_search_4_bytes": ss_4byte_ch, "suff_search_4_bytes": ss_4byte_ch,
"suff_search_lengths": ops.asarray1i(suff_search_lengths) "suff_search_lengths": bytes(suff_search_lengths)
if suff_search_lengths is not None if suff_search_lengths is not None
else ops.asarray1i([]), else bytes(),
}, },
) )
@ -72,18 +72,18 @@ def forward(
) -> Tuple[List[Ints2d], Callable]: ) -> Tuple[List[Ints2d], Callable]:
ops = model.ops ops = model.ops
case_sensitive: bool = model.attrs["case_sensitive"] case_sensitive: bool = model.attrs["case_sensitive"]
pref_lengths: Ints1d = model.attrs["pref_lengths"] pref_lengths: bytes = model.attrs["pref_lengths"]
suff_lengths: Ints1d = model.attrs["suff_lengths"] suff_lengths: bytes = model.attrs["suff_lengths"]
ps_1byte_ch: bytes = model.attrs["pref_search_1_byte"] ps_1byte_ch: bytes = model.attrs["pref_search_1_byte"]
ps_2byte_ch: bytes = model.attrs["pref_search_2_bytes"] ps_2byte_ch: bytes = model.attrs["pref_search_2_bytes"]
ps_3byte_ch: bytes = model.attrs["pref_search_3_bytes"] ps_3byte_ch: bytes = model.attrs["pref_search_3_bytes"]
ps_4byte_ch: bytes = model.attrs["pref_search_4_bytes"] ps_4byte_ch: bytes = model.attrs["pref_search_4_bytes"]
pref_search_lengths: Ints1d = model.attrs["pref_search_lengths"] pref_search_lengths: bytes = model.attrs["pref_search_lengths"]
ss_1byte_ch: bytes = model.attrs["pref_search_1_byte"] ss_1byte_ch: bytes = model.attrs["pref_search_1_byte"]
ss_2byte_ch: bytes = model.attrs["pref_search_2_bytes"] ss_2byte_ch: bytes = model.attrs["pref_search_2_bytes"]
ss_3byte_ch: bytes = model.attrs["pref_search_3_bytes"] ss_3byte_ch: bytes = model.attrs["pref_search_3_bytes"]
ss_4byte_ch: bytes = model.attrs["pref_search_4_bytes"] ss_4byte_ch: bytes = model.attrs["pref_search_4_bytes"]
suff_search_lengths: Ints1d = model.attrs["suff_search_lengths"] suff_search_lengths: bytes = model.attrs["suff_search_lengths"]
features: List[Ints2d] = [] features: List[Ints2d] = []
for doc in docs: for doc in docs:
hashes = doc.get_character_combination_hashes( hashes = doc.get_character_combination_hashes(

View File

@ -27,4 +27,4 @@ cdef class StringStore:
cdef const Utf8Str* intern_unicode(self, str py_string) cdef const Utf8Str* intern_unicode(self, str py_string)
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash) cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
cdef const unsigned char[:] utf8_view(self, attr_t hash_val) cdef const unsigned char* utf8_ptr(self, attr_t hash_val)

View File

@ -316,9 +316,9 @@ cdef class StringStore:
self.keys.push_back(key) self.keys.push_back(key)
return value return value
cdef const unsigned char[:] utf8_view(self, attr_t hash_val): cdef const unsigned char* utf8_ptr(self, attr_t hash_val):
if hash_val == 0: if hash_val == 0:
return "" return "".encode("utf-8")
elif hash_val < len(SYMBOLS_BY_INT): elif hash_val < len(SYMBOLS_BY_INT):
return SYMBOLS_BY_INT[hash_val].encode("utf-8") return SYMBOLS_BY_INT[hash_val].encode("utf-8")
cdef Utf8Str* string = <Utf8Str*>self._map.get(hash_val) cdef Utf8Str* string = <Utf8Str*>self._map.get(hash_val)

View File

@ -1004,23 +1004,22 @@ def _get_unsigned_32_bit_hash(input: str) -> int:
@pytest.mark.parametrize("case_sensitive", [True, False]) @pytest.mark.parametrize("case_sensitive", [True, False])
def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive): def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive):
doc = en_tokenizer("spaCy✨ and Prodigy") doc = en_tokenizer("spaCy✨ and Prodigy")
ops = get_current_ops()
ps1, ps2, ps3, ps4 = get_search_char_byte_arrays("Rp", case_sensitive) ps1, ps2, ps3, ps4 = get_search_char_byte_arrays("Rp", case_sensitive)
ss1, ss2, ss3, ss4 = get_search_char_byte_arrays("xx✨rp", case_sensitive) ss1, ss2, ss3, ss4 = get_search_char_byte_arrays("xx✨rp", case_sensitive)
hashes = doc.get_character_combination_hashes( hashes = doc.get_character_combination_hashes(
cs=case_sensitive, cs=case_sensitive,
p_lengths=ops.asarray1i([1, 3, 4]), p_lengths=bytes((1, 3, 4,)),
s_lengths=ops.asarray1i([2, 3, 4, 5]), s_lengths=bytes((2, 3, 4, 5,)),
ps_1byte_ch=ps1, ps_1byte_ch=ps1,
ps_2byte_ch=ps2, ps_2byte_ch=ps2,
ps_3byte_ch=ps3, ps_3byte_ch=ps3,
ps_4byte_ch=ps4, ps_4byte_ch=ps4,
ps_lengths=ops.asarray1i([2]), ps_lengths=bytes((2,)),
ss_1byte_ch=ss1, ss_1byte_ch=ss1,
ss_2byte_ch=ss2, ss_2byte_ch=ss2,
ss_3byte_ch=ss3, ss_3byte_ch=ss3,
ss_4byte_ch=ss4, ss_4byte_ch=ss4,
ss_lengths=ops.asarray1i([1, 2]), ss_lengths=bytes((1, 2,)),
) )
assert hashes[0][0] == _get_unsigned_32_bit_hash("s") assert hashes[0][0] == _get_unsigned_32_bit_hash("s")
@ -1089,22 +1088,21 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
def test_get_character_combination_hashes_good_case_partial(en_tokenizer): def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
doc = en_tokenizer("spaCy✨ and Prodigy") doc = en_tokenizer("spaCy✨ and Prodigy")
ops = get_current_ops()
ps1, ps2, ps3, ps4 = get_search_char_byte_arrays("rp", False) ps1, ps2, ps3, ps4 = get_search_char_byte_arrays("rp", False)
hashes = doc.get_character_combination_hashes( hashes = doc.get_character_combination_hashes(
cs=False, cs=False,
p_lengths=ops.asarray1i([]), p_lengths=bytes(),
s_lengths=ops.asarray1i([2, 3, 4, 5]), s_lengths=bytes((2,3,4,5,)),
ps_1byte_ch=ps1, ps_1byte_ch=ps1,
ps_2byte_ch=ps2, ps_2byte_ch=ps2,
ps_3byte_ch=ps3, ps_3byte_ch=ps3,
ps_4byte_ch=ps4, ps_4byte_ch=ps4,
ps_lengths=ops.asarray1i([2]), ps_lengths=bytes((2,)),
ss_1byte_ch=bytes(), ss_1byte_ch=bytes(),
ss_2byte_ch=bytes(), ss_2byte_ch=bytes(),
ss_3byte_ch=bytes(), ss_3byte_ch=bytes(),
ss_4byte_ch=bytes(), ss_4byte_ch=bytes(),
ss_lengths=ops.asarray1i([]), ss_lengths=bytes(),
) )
@ -1132,25 +1130,24 @@ def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
def test_get_character_combination_hashes_copying_in_middle(en_tokenizer): def test_get_character_combination_hashes_copying_in_middle(en_tokenizer):
doc = en_tokenizer("sp𐌞Cé") doc = en_tokenizer("sp𐌞Cé")
ops = get_current_ops()
for p_length in range(1, 8): for p_length in range(1, 8):
for s_length in range(1, 8): for s_length in range(1, 8):
hashes = doc.get_character_combination_hashes( hashes = doc.get_character_combination_hashes(
cs=False, cs=False,
p_lengths=ops.asarray1i([p_length]), p_lengths=bytes((p_length,)),
s_lengths=ops.asarray1i([s_length]), s_lengths=bytes((s_length,)),
ps_1byte_ch=bytes(), ps_1byte_ch=bytes(),
ps_2byte_ch=bytes(), ps_2byte_ch=bytes(),
ps_3byte_ch=bytes(), ps_3byte_ch=bytes(),
ps_4byte_ch=bytes(), ps_4byte_ch=bytes(),
ps_lengths=ops.asarray1i([]), ps_lengths=bytes(),
ss_1byte_ch=bytes(), ss_1byte_ch=bytes(),
ss_2byte_ch=bytes(), ss_2byte_ch=bytes(),
ss_3byte_ch=bytes(), ss_3byte_ch=bytes(),
ss_4byte_ch=bytes(), ss_4byte_ch=bytes(),
ss_lengths=ops.asarray1i([]), ss_lengths=bytes(),
) )
assert hashes[0][0] == _get_unsigned_32_bit_hash("sp𐌞cé"[:p_length]) assert hashes[0][0] == _get_unsigned_32_bit_hash("sp𐌞cé"[:p_length])
@ -1160,22 +1157,21 @@ def test_get_character_combination_hashes_copying_in_middle(en_tokenizer):
@pytest.mark.parametrize("case_sensitive", [True, False]) @pytest.mark.parametrize("case_sensitive", [True, False])
def test_get_character_combination_hashes_turkish_i_with_dot(en_tokenizer, case_sensitive): def test_get_character_combination_hashes_turkish_i_with_dot(en_tokenizer, case_sensitive):
doc = en_tokenizer("İ".lower() + "İ") doc = en_tokenizer("İ".lower() + "İ")
ops = get_current_ops()
s1, s2, s3, s4 = get_search_char_byte_arrays("İ", case_sensitive) s1, s2, s3, s4 = get_search_char_byte_arrays("İ", case_sensitive)
hashes = doc.get_character_combination_hashes( hashes = doc.get_character_combination_hashes(
cs=case_sensitive, cs=case_sensitive,
p_lengths=ops.asarray1i([1, 2, 3, 4]), p_lengths=bytes((1,2,3,4,)),
s_lengths=ops.asarray1i([1, 2, 3, 4]), s_lengths=bytes((1,2,3,4,)),
ps_1byte_ch=s1, ps_1byte_ch=s1,
ps_2byte_ch=s2, ps_2byte_ch=s2,
ps_3byte_ch=s3, ps_3byte_ch=s3,
ps_4byte_ch=s4, ps_4byte_ch=s4,
ps_lengths=ops.asarray1i([1, 2, 3, 4]), ps_lengths=bytes((1,2,3,4,)),
ss_1byte_ch=s1, ss_1byte_ch=s1,
ss_2byte_ch=s2, ss_2byte_ch=s2,
ss_3byte_ch=s3, ss_3byte_ch=s3,
ss_4byte_ch=s4, ss_4byte_ch=s4,
ss_lengths=ops.asarray1i([1, 2, 3, 4]), ss_lengths=bytes((1,2,3,4,)),
) )
COMBINING_DOT_ABOVE = b"\xcc\x87".decode("UTF-8") COMBINING_DOT_ABOVE = b"\xcc\x87".decode("UTF-8")
@ -1219,22 +1215,21 @@ def test_get_character_combination_hashes_string_store_spec_cases(en_tokenizer,
assert len(long_word) > 255 assert len(long_word) > 255
doc = en_tokenizer(' '.join((symbol, short_word, normal_word, long_word))) doc = en_tokenizer(' '.join((symbol, short_word, normal_word, long_word)))
assert len(doc) == 4 assert len(doc) == 4
ops = get_current_ops()
ps1, ps2, ps3, ps4 = get_search_char_byte_arrays("E", case_sensitive) ps1, ps2, ps3, ps4 = get_search_char_byte_arrays("E", case_sensitive)
hashes = doc.get_character_combination_hashes( hashes = doc.get_character_combination_hashes(
cs=case_sensitive, cs=case_sensitive,
p_lengths=ops.asarray1i([2]), p_lengths=bytes((2,)),
s_lengths=ops.asarray1i([2]), s_lengths=bytes((2,)),
ps_1byte_ch=ps1, ps_1byte_ch=ps1,
ps_2byte_ch=ps2, ps_2byte_ch=ps2,
ps_3byte_ch=ps3, ps_3byte_ch=ps3,
ps_4byte_ch=ps4, ps_4byte_ch=ps4,
ps_lengths=ops.asarray1i([2]), ps_lengths=bytes((2,)),
ss_1byte_ch=bytes(), ss_1byte_ch=bytes(),
ss_2byte_ch=bytes(), ss_2byte_ch=bytes(),
ss_3byte_ch=bytes(), ss_3byte_ch=bytes(),
ss_4byte_ch=bytes(), ss_4byte_ch=bytes(),
ss_lengths=ops.asarray1i([]), ss_lengths=bytes(),
) )
assert hashes[0][0] == _get_unsigned_32_bit_hash("FL" if case_sensitive else "fl") assert hashes[0][0] == _get_unsigned_32_bit_hash("FL" if case_sensitive else "fl")
assert hashes[0][1] == _get_unsigned_32_bit_hash("19") assert hashes[0][1] == _get_unsigned_32_bit_hash("19")
@ -1255,19 +1250,18 @@ def test_get_character_combination_hashes_string_store_spec_cases(en_tokenizer,
def test_character_combination_hashes_empty_lengths(en_tokenizer): def test_character_combination_hashes_empty_lengths(en_tokenizer):
doc = en_tokenizer("and𐌞") doc = en_tokenizer("and𐌞")
ops = get_current_ops() assert doc.get_character_combination_hashes(
hashes = doc.get_character_combination_hashes(
cs=True, cs=True,
p_lengths=ops.asarray1i([]), p_lengths=bytes(),
s_lengths=ops.asarray1i([]), s_lengths=bytes(),
ps_1byte_ch=bytes(), ps_1byte_ch=bytes(),
ps_2byte_ch=bytes(), ps_2byte_ch=bytes(),
ps_3byte_ch=bytes(), ps_3byte_ch=bytes(),
ps_4byte_ch=bytes(), ps_4byte_ch=bytes(),
ps_lengths=ops.asarray1i([]), ps_lengths=bytes(),
ss_1byte_ch=bytes(), ss_1byte_ch=bytes(),
ss_2byte_ch=bytes(), ss_2byte_ch=bytes(),
ss_3byte_ch=bytes(), ss_3byte_ch=bytes(),
ss_4byte_ch=bytes(), ss_4byte_ch=bytes(),
ss_lengths=ops.asarray1i([]), ss_lengths=bytes(),
).shape == (1, 0) ).shape == (1, 0)

View File

@ -39,7 +39,7 @@ cdef int [:,:] _get_lca_matrix(Doc, int start, int end)
cdef void _set_affix_lengths( cdef void _set_affix_lengths(
const unsigned char[:] tok_str, const unsigned char* tok_str,
unsigned char* aff_l_buf, unsigned char* aff_l_buf,
const int pref_l, const int pref_l,
const int suff_l, const int suff_l,
@ -47,11 +47,11 @@ cdef void _set_affix_lengths(
cdef void _search_for_chars( cdef void _search_for_chars(
const unsigned char[:] tok_str, const unsigned char* tok_str,
const unsigned char[:] s_1byte_ch, const unsigned char* s_1byte_ch,
const unsigned char[:] s_2byte_ch, const unsigned char* s_2byte_ch,
const unsigned char[:] s_3byte_ch, const unsigned char* s_3byte_ch,
const unsigned char[:] s_4byte_ch, const unsigned char* s_4byte_ch,
unsigned char* res_buf, unsigned char* res_buf,
int max_res_l, int max_res_l,
unsigned char* l_buf, unsigned char* l_buf,

View File

@ -178,18 +178,18 @@ class Doc:
self, self,
*, *,
cs: bool, cs: bool,
p_lengths: Ints1d, p_lengths: bytes,
s_lengths: Ints1d, s_lengths: bytes,
ps_1byte_ch: bytes, ps_1byte_ch: bytes,
ps_2byte_ch: bytes, ps_2byte_ch: bytes,
ps_3byte_ch: bytes, ps_3byte_ch: bytes,
ps_4byte_ch: bytes, ps_4byte_ch: bytes,
ps_lengths: Ints1d, ps_lengths: bytes,
ss_1byte_ch: bytes, ss_1byte_ch: bytes,
ss_2byte_ch: bytes, ss_2byte_ch: bytes,
ss_3byte_ch: bytes, ss_3byte_ch: bytes,
ss_4byte_ch: bytes, ss_4byte_ch: bytes,
ss_lengths: Ints1d, ss_lengths: bytes,
) -> Ints2d: ... ) -> Ints2d: ...
@staticmethod @staticmethod
def _get_array_attrs() -> Tuple[Any]: ... def _get_array_attrs() -> Tuple[Any]: ...

View File

@ -4,7 +4,7 @@ from typing import Set, List
cimport cython cimport cython
cimport numpy as np cimport numpy as np
from cpython cimport array from cpython cimport array
from libc.string cimport memcpy, memcmp, memset from libc.string cimport memcpy, memcmp, memset, strlen
from libc.math cimport sqrt from libc.math cimport sqrt
from libc.stdint cimport int32_t, uint64_t from libc.stdint cimport int32_t, uint64_t
@ -1739,18 +1739,18 @@ cdef class Doc:
def get_character_combination_hashes(self, def get_character_combination_hashes(self,
*, *,
const bint cs, const bint cs,
int[:] p_lengths, const unsigned char* p_lengths,
int[:] s_lengths, const unsigned char* s_lengths,
const unsigned char[:] ps_1byte_ch, const unsigned char* ps_1byte_ch,
const unsigned char[:] ps_2byte_ch, const unsigned char* ps_2byte_ch,
const unsigned char[:] ps_3byte_ch, const unsigned char* ps_3byte_ch,
const unsigned char[:] ps_4byte_ch, const unsigned char* ps_4byte_ch,
int[:] ps_lengths, const unsigned char* ps_lengths,
const unsigned char[:] ss_1byte_ch, const unsigned char* ss_1byte_ch,
const unsigned char[:] ss_2byte_ch, const unsigned char* ss_2byte_ch,
const unsigned char[:] ss_3byte_ch, const unsigned char* ss_3byte_ch,
const unsigned char[:] ss_4byte_ch, const unsigned char* ss_4byte_ch,
int[:] ss_lengths, const unsigned char* ss_lengths,
): ):
""" """
Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations
@ -1764,28 +1764,28 @@ cdef class Doc:
ss_ variables relate to searches starting at the end of the word ss_ variables relate to searches starting at the end of the word
cs: if *False*, hashes are generated based on the lower-case version of each token. cs: if *False*, hashes are generated based on the lower-case version of each token.
p_lengths: an Ints1d specifying the lengths of prefixes to be hashed in ascending order. For example, p_lengths: an array of single-byte values specifying the lengths of prefixes to be hashed in ascending order.
if *p_lengths==[2, 3]*, the prefixes hashed for "spaCy" would be "sp" and "spa". For example, if *p_lengths==[2, 3]*, the prefixes hashed for "spaCy" would be "sp" and "spa".
s_lengths: an Ints1d specifying the lengths of suffixes to be hashed in ascending order. For example, s_lengths: an array of single-byte values specifying the lengths of suffixes to be hashed in ascending order.
if *s_lengths==[2, 3]* and *cs == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy". For example, if *s_lengths==[2, 3]* and *cs == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy".
ps_<n>byte_ch: a byte array containing in order n-byte-wide characters to search for within each token, ps_<n>byte_ch: a byte array containing in order n-byte-wide characters to search for within each token,
starting at the beginning. starting at the beginning.
ps_lengths: an Ints1d specifying the lengths of search results (from the beginning) to be hashed in ascending order. ps_lengths: an array of single-byte values specifying the lengths of search results (from the beginning) to be hashed
For example, if *ps_lengths==[1, 2]*, *ps_search=="aC"* and *cs==False*, the searched strings hashed for in ascending order. For example, if *ps_lengths==[1, 2]*, *ps_search=="aC"* and *cs==False*, the searched strings
"spaCy" would be "a" and "ac". hashed for "spaCy" would be "a" and "ac".
ss_<n>byte_ch: a byte array containing in order n-byte-wide characters to search for within each token, ss_<n>byte_ch: a byte array containing in order n-byte-wide characters to search for within each token,
starting at the end. starting at the end.
ss_lengths: an integer list specifying the lengths of search results (from the end) to be hashed in ascending order. ss_lengths: an array of single-byte values specifying the lengths of search results (from the end) to be hashed
For example, if *ss_lengths==[1, 2]*, *ss_search=="aC"* and *cs==False*, the searched strings hashed for in ascending order. For example, if *ss_lengths==[1, 2]*, *ss_search=="aC"* and *cs==False*, the searched strings
"spaCy" would be "c" and "ca". hashed for "spaCy" would be "c" and "ca".
""" """
# Define the result array and work out what is used for what in axis 1 # Define the result array and work out what is used for what in axis 1
cdef int num_toks = len(self) cdef int num_toks = len(self)
cdef int p_h_num = len(p_lengths) cdef int p_h_num = strlen(<char*> p_lengths)
cdef int s_h_num = len(s_lengths), s_h_end = p_h_num + s_h_num cdef int s_h_num = strlen(<char*> s_lengths), s_h_end = p_h_num + s_h_num
cdef int ps_h_num = len(ps_lengths), ps_h_end = s_h_end + ps_h_num cdef int ps_h_num = strlen(<char*> ps_lengths), ps_h_end = s_h_end + ps_h_num
cdef int ss_h_num = len(ss_lengths), ss_h_end = ps_h_end + ss_h_num cdef int ss_h_num = strlen(<char*> ss_lengths), ss_h_end = ps_h_end + ss_h_num
cdef np.ndarray[np.int64_t, ndim=2] hashes cdef np.ndarray[np.int64_t, ndim=2] hashes
hashes = numpy.empty((num_toks, ss_h_end), dtype="int64") hashes = numpy.empty((num_toks, ss_h_end), dtype="int64")
@ -1804,35 +1804,29 @@ cdef class Doc:
cdef unsigned char* ss_res_buf = <unsigned char*> mem.alloc(ss_max_l, 4) cdef unsigned char* ss_res_buf = <unsigned char*> mem.alloc(ss_max_l, 4)
cdef unsigned char* ss_l_buf = <unsigned char*> mem.alloc(ss_max_l, 1) cdef unsigned char* ss_l_buf = <unsigned char*> mem.alloc(ss_max_l, 1)
# Define memory views on length arrays
cdef int[:] p_lengths_v = p_lengths
cdef int[:] s_lengths_v = s_lengths
cdef int[:] ps_lengths_v = ps_lengths
cdef int[:] ss_lengths_v = ss_lengths
# Define working variables # Define working variables
cdef TokenC tok_c cdef TokenC tok_c
cdef int tok_i, offset cdef int tok_i, offset
cdef uint64_t hash_val = 0 cdef uint64_t hash_val = 0
cdef attr_t num_tok_attr cdef attr_t num_tok_attr
cdef const unsigned char[:] tok_str cdef const unsigned char* tok_str
for tok_i in range(num_toks): for tok_i in range(num_toks):
tok_c = self.c[tok_i] tok_c = self.c[tok_i]
num_tok_attr = tok_c.lex.orth if cs else tok_c.lex.lower num_tok_attr = tok_c.lex.orth if cs else tok_c.lex.lower
tok_str = self.vocab.strings.utf8_view(num_tok_attr) tok_str = self.vocab.strings.utf8_ptr(num_tok_attr)
if aff_l > 0: if aff_l > 0:
_set_affix_lengths(tok_str, aff_l_buf, p_max_l, s_max_l) _set_affix_lengths(tok_str, aff_l_buf, p_max_l, s_max_l)
for hash_idx in range(p_h_num): for hash_idx in range(p_h_num):
offset = aff_l_buf[p_lengths_v[hash_idx] - 1] offset = aff_l_buf[p_lengths[hash_idx] - 1]
if offset > 0: if offset > 0:
hash_val = hash32(<void*> &tok_str[0], offset, 0) hash_val = hash32(<void*> &tok_str[0], offset, 0)
hashes[tok_i, hash_idx] = hash_val hashes[tok_i, hash_idx] = hash_val
for hash_idx in range(p_h_num, s_h_end): for hash_idx in range(p_h_num, s_h_end):
offset = aff_l_buf[s_lengths_v[hash_idx - p_h_num] + p_max_l - 1] offset = aff_l_buf[s_lengths[hash_idx - p_h_num] + p_max_l - 1]
if offset > 0: if offset > 0:
hash_val = hash32(<void*> &tok_str[len(tok_str) - offset], offset, 0) hash_val = hash32(<void*> &tok_str[len(tok_str) - offset], offset, 0)
hashes[tok_i, hash_idx] = hash_val hashes[tok_i, hash_idx] = hash_val
@ -1841,7 +1835,7 @@ cdef class Doc:
_search_for_chars(tok_str, ps_1byte_ch, ps_2byte_ch, ps_3byte_ch, ps_4byte_ch, ps_res_buf, ps_max_l, ps_l_buf, False) _search_for_chars(tok_str, ps_1byte_ch, ps_2byte_ch, ps_3byte_ch, ps_4byte_ch, ps_res_buf, ps_max_l, ps_l_buf, False)
hash_val = 0 hash_val = 0
for hash_idx in range(s_h_end, ps_h_end): for hash_idx in range(s_h_end, ps_h_end):
offset = ps_l_buf[ps_lengths_v[hash_idx - s_h_end] - 1] offset = ps_l_buf[ps_lengths[hash_idx - s_h_end] - 1]
if offset > 0: if offset > 0:
hash_val = hash32(ps_res_buf, offset, 0) hash_val = hash32(ps_res_buf, offset, 0)
hashes[tok_i, hash_idx] = hash_val hashes[tok_i, hash_idx] = hash_val
@ -1850,7 +1844,7 @@ cdef class Doc:
_search_for_chars(tok_str, ss_1byte_ch, ss_2byte_ch, ss_3byte_ch, ss_4byte_ch, ss_res_buf, ss_max_l, ss_l_buf, True) _search_for_chars(tok_str, ss_1byte_ch, ss_2byte_ch, ss_3byte_ch, ss_4byte_ch, ss_res_buf, ss_max_l, ss_l_buf, True)
hash_val = 0 hash_val = 0
for hash_idx in range(ps_h_end, ss_h_end): for hash_idx in range(ps_h_end, ss_h_end):
offset = ss_l_buf[ss_lengths_v[hash_idx - ps_h_end] - 1] offset = ss_l_buf[ss_lengths[hash_idx - ps_h_end] - 1]
if offset > 0: if offset > 0:
hash_val = hash32(ss_res_buf, offset, 0) hash_val = hash32(ss_res_buf, offset, 0)
hashes[tok_i, hash_idx] = hash_val hashes[tok_i, hash_idx] = hash_val
@ -2039,7 +2033,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
@cython.boundscheck(False) # Deactivate bounds checking @cython.boundscheck(False) # Deactivate bounds checking
cdef void _set_affix_lengths( cdef void _set_affix_lengths(
const unsigned char[:] tok_str, const unsigned char* tok_str,
unsigned char* aff_l_buf, unsigned char* aff_l_buf,
const int pref_l, const int pref_l,
const int suff_l, const int suff_l,
@ -2054,14 +2048,17 @@ cdef void _set_affix_lengths(
pref_l: the number of characters to process at the beginning of the word. pref_l: the number of characters to process at the beginning of the word.
suff_l: the number of characters to process at the end of the word. suff_l: the number of characters to process at the end of the word.
""" """
cdef int tok_str_idx = 1, aff_l_buf_idx = 0, tok_str_l = len(tok_str) cdef int tok_str_idx = 1, aff_l_buf_idx = 0, tok_str_l = strlen(<char*> tok_str)
while aff_l_buf_idx < pref_l: while aff_l_buf_idx < pref_l:
if tok_str_idx == len(tok_str) or ((tok_str[tok_str_idx] & 0xc0) != 0x80): # not a continuation character if (tok_str_idx == strlen(<char*> tok_str)
or
((tok_str[tok_str_idx] & 0xc0) != 0x80 # not a continuation character
):
aff_l_buf[aff_l_buf_idx] = tok_str_idx aff_l_buf[aff_l_buf_idx] = tok_str_idx
aff_l_buf_idx += 1 aff_l_buf_idx += 1
tok_str_idx += 1 tok_str_idx += 1
if tok_str_idx > len(tok_str): if tok_str_idx > tok_str_l:
break break
if aff_l_buf_idx < pref_l: if aff_l_buf_idx < pref_l:
@ -2082,11 +2079,11 @@ cdef void _set_affix_lengths(
@cython.boundscheck(False) # Deactivate bounds checking @cython.boundscheck(False) # Deactivate bounds checking
cdef void _search_for_chars( cdef void _search_for_chars(
const unsigned char[:] tok_str, const unsigned char* tok_str,
const unsigned char[:] s_1byte_ch, const unsigned char* s_1byte_ch,
const unsigned char[:] s_2byte_ch, const unsigned char* s_2byte_ch,
const unsigned char[:] s_3byte_ch, const unsigned char* s_3byte_ch,
const unsigned char[:] s_4byte_ch, const unsigned char* s_4byte_ch,
unsigned char* res_buf, unsigned char* res_buf,
int max_res_l, int max_res_l,
unsigned char* l_buf, unsigned char* l_buf,
@ -2106,9 +2103,9 @@ cdef void _search_for_chars(
The calling code ensures that lengths greater than 255 cannot occur. The calling code ensures that lengths greater than 255 cannot occur.
suffs_not_prefs: if *True*, searching starts from the end of the word; if *False*, from the beginning. suffs_not_prefs: if *True*, searching starts from the end of the word; if *False*, from the beginning.
""" """
cdef int tok_str_l = len(tok_str), res_buf_idx = 0, l_buf_idx = 0, ch_wdth, tok_start_idx, search_char_idx cdef int tok_str_l = strlen(<char*> tok_str), res_buf_idx = 0, l_buf_idx = 0, ch_wdth, tok_start_idx, search_char_idx
cdef int search_chars_l cdef int search_chars_l
cdef const unsigned char[:] search_chars cdef const unsigned char* search_chars
cdef int last_tok_str_idx = tok_str_l if suffs_not_prefs else 0 cdef int last_tok_str_idx = tok_str_l if suffs_not_prefs else 0
cdef int this_tok_str_idx = tok_str_l - 1 if suffs_not_prefs else 1 cdef int this_tok_str_idx = tok_str_l - 1 if suffs_not_prefs else 1
@ -2130,7 +2127,7 @@ cdef void _search_for_chars(
search_chars = s_3byte_ch search_chars = s_3byte_ch
else: else:
search_chars = s_4byte_ch search_chars = s_4byte_ch
search_chars_l = len(search_chars) search_chars_l = strlen(<char*> search_chars)
tok_start_idx = this_tok_str_idx if suffs_not_prefs else last_tok_str_idx tok_start_idx = this_tok_str_idx if suffs_not_prefs else last_tok_str_idx
search_char_idx = 0 search_char_idx = 0