Generate Numpy array at end

richard@explosion.ai 2022-11-02 17:11:20 +01:00
parent bbf058029a
commit e7626f423a
4 changed files with 211 additions and 178 deletions

spacy/tests/doc/test_doc_api.py

@@ -1006,45 +1006,60 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
     ss1, ss2, ss3, ss4 = get_search_char_byte_arrays("xx✨rp", case_sensitive)
     hashes = doc.get_character_combination_hashes(
         cs=case_sensitive,
-        p_lengths=bytes((1, 3, 4,)),
-        p_max_l = 4,
-        s_lengths=bytes((2, 3, 4, 5,)),
-        s_max_l = 5,
+        p_lengths=bytes(
+            (
+                1,
+                3,
+                4,
+            )
+        ),
+        p_max_l=4,
+        s_lengths=bytes(
+            (
+                2,
+                3,
+                4,
+                5,
+            )
+        ),
+        s_max_l=5,
         ps_1byte_ch=ps1,
-        ps_1byte_ch_l = len(ps1),
+        ps_1byte_ch_l=len(ps1),
         ps_2byte_ch=ps2,
-        ps_2byte_ch_l = len(ps2),
+        ps_2byte_ch_l=len(ps2),
         ps_3byte_ch=ps3,
-        ps_3byte_ch_l = len(ps3),
+        ps_3byte_ch_l=len(ps3),
         ps_4byte_ch=ps4,
-        ps_4byte_ch_l = len(ps4),
+        ps_4byte_ch_l=len(ps4),
         ps_lengths=bytes((2,)),
-        ps_max_l = 2,
+        ps_max_l=2,
         ss_1byte_ch=ss1,
-        ss_1byte_ch_l = len(ss1),
+        ss_1byte_ch_l=len(ss1),
         ss_2byte_ch=ss2,
-        ss_2byte_ch_l = len(ss2),
+        ss_2byte_ch_l=len(ss2),
         ss_3byte_ch=ss3,
-        ss_3byte_ch_l = len(ss3),
+        ss_3byte_ch_l=len(ss3),
         ss_4byte_ch=ss4,
-        ss_4byte_ch_l = len(ss4),
-        ss_lengths=bytes((1, 2,)),
-        ss_max_l = 2,
+        ss_4byte_ch_l=len(ss4),
+        ss_lengths=bytes(
+            (
+                1,
+                2,
+            )
+        ),
+        ss_max_l=2,
+        hashes_per_tok=10,
     )
+    print(hashes)
     assert hashes[0][0] == _get_32_bit_hash("s")
     assert hashes[0][1] == _get_32_bit_hash("spa")
-    assert hashes[0][2] == _get_32_bit_hash(
-        "spaC" if case_sensitive else "spac"
-    )
+    assert hashes[0][2] == _get_32_bit_hash("spaC" if case_sensitive else "spac")
     assert hashes[0][3] == _get_32_bit_hash("Cy" if case_sensitive else "cy")
     assert hashes[0][4] == _get_32_bit_hash("aCy" if case_sensitive else "acy")
-    assert hashes[0][5] == _get_32_bit_hash(
-        "paCy" if case_sensitive else "pacy"
-    )
-    assert hashes[0][6] == _get_32_bit_hash(
-        "spaCy" if case_sensitive else "spacy"
-    )
+    assert hashes[0][5] == _get_32_bit_hash("paCy" if case_sensitive else "pacy")
+    assert hashes[0][6] == _get_32_bit_hash("spaCy" if case_sensitive else "spacy")
     assert hashes[0][7] == _get_32_bit_hash("p")
     assert hashes[0][8] == _get_32_bit_hash("p")
@@ -1071,9 +1086,7 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
     assert hashes[2][9] == 0
     assert hashes[3][0] == _get_32_bit_hash("P" if case_sensitive else "p")
     assert hashes[3][1] == _get_32_bit_hash("Pro" if case_sensitive else "pro")
-    assert hashes[3][2] == _get_32_bit_hash(
-        "Prod" if case_sensitive else "prod"
-    )
+    assert hashes[3][2] == _get_32_bit_hash("Prod" if case_sensitive else "prod")
     assert hashes[3][3] == _get_32_bit_hash("gy")
     assert hashes[3][4] == _get_32_bit_hash("igy")
     assert hashes[3][5] == _get_32_bit_hash("digy")
@@ -1102,32 +1115,39 @@ def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
     hashes = doc.get_character_combination_hashes(
         cs=False,
         p_lengths=bytes(),
-        p_max_l = 0,
-        s_lengths=bytes((2,3,4,5,)),
-        s_max_l = 5,
+        p_max_l=0,
+        s_lengths=bytes(
+            (
+                2,
+                3,
+                4,
+                5,
+            )
+        ),
+        s_max_l=5,
         ps_1byte_ch=ps1,
-        ps_1byte_ch_l = len(ps1),
+        ps_1byte_ch_l=len(ps1),
         ps_2byte_ch=ps2,
-        ps_2byte_ch_l = len(ps2),
+        ps_2byte_ch_l=len(ps2),
         ps_3byte_ch=ps3,
-        ps_3byte_ch_l = len(ps3),
+        ps_3byte_ch_l=len(ps3),
         ps_4byte_ch=ps4,
-        ps_4byte_ch_l = len(ps4),
+        ps_4byte_ch_l=len(ps4),
         ps_lengths=bytes((2,)),
-        ps_max_l = 2,
+        ps_max_l=2,
         ss_1byte_ch=bytes(),
-        ss_1byte_ch_l = 0,
+        ss_1byte_ch_l=0,
         ss_2byte_ch=bytes(),
-        ss_2byte_ch_l = 0,
+        ss_2byte_ch_l=0,
         ss_3byte_ch=bytes(),
-        ss_3byte_ch_l = 0,
+        ss_3byte_ch_l=0,
         ss_4byte_ch=bytes(),
-        ss_4byte_ch_l = 0,
+        ss_4byte_ch_l=0,
         ss_lengths=bytes(),
-        ss_max_l = 0,
+        ss_max_l=0,
+        hashes_per_tok=5,
     )
     assert hashes[0][0] == _get_32_bit_hash("cy")
     assert hashes[0][1] == _get_32_bit_hash("acy")
     assert hashes[0][2] == _get_32_bit_hash("pacy")
@@ -1155,33 +1175,34 @@ def test_get_character_combination_hashes_copying_in_middle(en_tokenizer):
     for p_length in range(1, 8):
         for s_length in range(1, 8):
             hashes = doc.get_character_combination_hashes(
                 cs=False,
                 p_lengths=bytes((p_length,)),
-                p_max_l = p_length,
+                p_max_l=p_length,
                 s_lengths=bytes((s_length,)),
-                s_max_l = s_length,
+                s_max_l=s_length,
                 ps_1byte_ch=bytes(),
-                ps_1byte_ch_l = 0,
+                ps_1byte_ch_l=0,
                 ps_2byte_ch=bytes(),
-                ps_2byte_ch_l = 0,
+                ps_2byte_ch_l=0,
                 ps_3byte_ch=bytes(),
-                ps_3byte_ch_l = 0,
+                ps_3byte_ch_l=0,
                 ps_4byte_ch=bytes(),
-                ps_4byte_ch_l = 0,
+                ps_4byte_ch_l=0,
                 ps_lengths=bytes(),
-                ps_max_l = 0,
+                ps_max_l=0,
                 ss_1byte_ch=bytes(),
-                ss_1byte_ch_l = 0,
+                ss_1byte_ch_l=0,
                 ss_2byte_ch=bytes(),
-                ss_2byte_ch_l = 0,
+                ss_2byte_ch_l=0,
                 ss_3byte_ch=bytes(),
-                ss_3byte_ch_l = 0,
+                ss_3byte_ch_l=0,
                 ss_4byte_ch=bytes(),
-                ss_4byte_ch_l = 0,
+                ss_4byte_ch_l=0,
                 ss_lengths=bytes(),
-                ss_max_l = 0
+                ss_max_l=0,
+                hashes_per_tok=2,
             )
             assert hashes[0][0] == _get_32_bit_hash("sp𐌞cé"[:p_length])
@@ -1189,35 +1210,66 @@ def test_get_character_combination_hashes_copying_in_middle(en_tokenizer):
 @pytest.mark.parametrize("case_sensitive", [True, False])
-def test_get_character_combination_hashes_turkish_i_with_dot(en_tokenizer, case_sensitive):
+def test_get_character_combination_hashes_turkish_i_with_dot(
+    en_tokenizer, case_sensitive
+):
     doc = en_tokenizer("İ".lower() + "İ")
     s1, s2, s3, s4 = get_search_char_byte_arrays("İ", case_sensitive)
     hashes = doc.get_character_combination_hashes(
         cs=case_sensitive,
-        p_lengths=bytes((1,2,3,4,)),
-        p_max_l = 4,
-        s_lengths=bytes((1,2,3,4,)),
-        s_max_l = 4,
+        p_lengths=bytes(
+            (
+                1,
+                2,
+                3,
+                4,
+            )
+        ),
+        p_max_l=4,
+        s_lengths=bytes(
+            (
+                1,
+                2,
+                3,
+                4,
+            )
+        ),
+        s_max_l=4,
         ps_1byte_ch=s1,
-        ps_1byte_ch_l = len(s1),
+        ps_1byte_ch_l=len(s1),
         ps_2byte_ch=s2,
-        ps_2byte_ch_l = len(s2),
+        ps_2byte_ch_l=len(s2),
         ps_3byte_ch=s3,
-        ps_3byte_ch_l = len(s3),
+        ps_3byte_ch_l=len(s3),
         ps_4byte_ch=s4,
-        ps_4byte_ch_l = len(s4),
-        ps_lengths=bytes((1,2,3,4,)),
-        ps_max_l = 4,
+        ps_4byte_ch_l=len(s4),
+        ps_lengths=bytes(
+            (
+                1,
+                2,
+                3,
+                4,
+            )
+        ),
+        ps_max_l=4,
         ss_1byte_ch=s1,
-        ss_1byte_ch_l = len(s1),
+        ss_1byte_ch_l=len(s1),
         ss_2byte_ch=s2,
-        ss_2byte_ch_l = len(s2),
+        ss_2byte_ch_l=len(s2),
         ss_3byte_ch=s3,
-        ss_3byte_ch_l = len(s3),
+        ss_3byte_ch_l=len(s3),
         ss_4byte_ch=s4,
-        ss_4byte_ch_l = len(s4),
-        ss_lengths=bytes((1,2,3,4,)),
-        ss_max_l = 4
+        ss_4byte_ch_l=len(s4),
+        ss_lengths=bytes(
+            (
+                1,
+                2,
+                3,
+                4,
+            )
+        ),
+        ss_max_l=4,
+        hashes_per_tok=16,
     )
     COMBINING_DOT_ABOVE = b"\xcc\x87".decode("UTF-8")
@@ -1248,46 +1300,51 @@ def test_get_character_combination_hashes_turkish_i_with_dot(en_tokenizer, case_
     assert hashes[0][11] == _get_32_bit_hash("İ".lower() * 2)
     assert hashes[0][12] == _get_32_bit_hash(COMBINING_DOT_ABOVE)
     assert hashes[0][13] == _get_32_bit_hash(COMBINING_DOT_ABOVE + "i")
-    assert hashes[0][14] == _get_32_bit_hash(COMBINING_DOT_ABOVE + "i" + COMBINING_DOT_ABOVE)
+    assert hashes[0][14] == _get_32_bit_hash(
+        COMBINING_DOT_ABOVE + "i" + COMBINING_DOT_ABOVE
+    )
     assert hashes[0][15] == _get_32_bit_hash((COMBINING_DOT_ABOVE + "i") * 2)
 
 
 @pytest.mark.parametrize("case_sensitive", [True, False])
-def test_get_character_combination_hashes_string_store_spec_cases(en_tokenizer, case_sensitive):
+def test_get_character_combination_hashes_string_store_spec_cases(
+    en_tokenizer, case_sensitive
+):
     symbol = "FLAG19"
     short_word = "bee"
     normal_word = "serendipity"
     long_word = "serendipity" * 50
     assert len(long_word) > 255
-    doc = en_tokenizer(' '.join((symbol, short_word, normal_word, long_word)))
+    doc = en_tokenizer(" ".join((symbol, short_word, normal_word, long_word)))
     assert len(doc) == 4
     ps1, ps2, ps3, ps4 = get_search_char_byte_arrays("E", case_sensitive)
     hashes = doc.get_character_combination_hashes(
         cs=case_sensitive,
         p_lengths=bytes((2,)),
-        p_max_l = 2,
+        p_max_l=2,
         s_lengths=bytes((2,)),
-        s_max_l = 2,
+        s_max_l=2,
         ps_1byte_ch=ps1,
-        ps_1byte_ch_l = len(ps1),
+        ps_1byte_ch_l=len(ps1),
         ps_2byte_ch=ps2,
-        ps_2byte_ch_l = len(ps2),
+        ps_2byte_ch_l=len(ps2),
         ps_3byte_ch=ps3,
-        ps_3byte_ch_l = len(ps3),
+        ps_3byte_ch_l=len(ps3),
         ps_4byte_ch=ps4,
-        ps_4byte_ch_l = len(ps4),
+        ps_4byte_ch_l=len(ps4),
         ps_lengths=bytes((2,)),
-        ps_max_l = 2,
+        ps_max_l=2,
         ss_1byte_ch=bytes(),
-        ss_1byte_ch_l = 0,
+        ss_1byte_ch_l=0,
         ss_2byte_ch=bytes(),
-        ss_2byte_ch_l = 0,
+        ss_2byte_ch_l=0,
         ss_3byte_ch=bytes(),
-        ss_3byte_ch_l = 0,
+        ss_3byte_ch_l=0,
         ss_4byte_ch=bytes(),
-        ss_4byte_ch_l = 0,
+        ss_4byte_ch_l=0,
         ss_lengths=bytes(),
-        ss_max_l = 0
+        ss_max_l=0,
+        hashes_per_tok=3,
     )
     assert hashes[0][0] == _get_32_bit_hash("FL" if case_sensitive else "fl")
     assert hashes[0][1] == _get_32_bit_hash("19")
@@ -1308,30 +1365,34 @@ def test_get_character_combination_hashes_string_store_spec_cases(en_tokenizer,
 def test_character_combination_hashes_empty_lengths(en_tokenizer):
     doc = en_tokenizer("and𐌞")
-    assert doc.get_character_combination_hashes(
-        cs=True,
-        p_lengths=bytes(),
-        p_max_l = 0,
-        s_lengths=bytes(),
-        s_max_l = 0,
-        ps_1byte_ch=bytes(),
-        ps_1byte_ch_l=0,
-        ps_2byte_ch=bytes(),
-        ps_2byte_ch_l=0,
-        ps_3byte_ch=bytes(),
-        ps_3byte_ch_l=0,
-        ps_4byte_ch=bytes(),
-        ps_4byte_ch_l=0,
-        ps_lengths=bytes(),
-        ps_max_l = 0,
-        ss_1byte_ch=bytes(),
-        ss_1byte_ch_l=0,
-        ss_2byte_ch=bytes(),
-        ss_2byte_ch_l=0,
-        ss_3byte_ch=bytes(),
-        ss_3byte_ch_l=0,
-        ss_4byte_ch=bytes(),
-        ss_4byte_ch_l=0,
-        ss_lengths=bytes(),
-        ss_max_l = 0,
-    ).shape == (1, 0)
+    assert (
+        doc.get_character_combination_hashes(
+            cs=True,
+            p_lengths=bytes(),
+            p_max_l=0,
+            s_lengths=bytes(),
+            s_max_l=0,
+            ps_1byte_ch=bytes(),
+            ps_1byte_ch_l=0,
+            ps_2byte_ch=bytes(),
+            ps_2byte_ch_l=0,
+            ps_3byte_ch=bytes(),
+            ps_3byte_ch_l=0,
+            ps_4byte_ch=bytes(),
+            ps_4byte_ch_l=0,
+            ps_lengths=bytes(),
+            ps_max_l=0,
+            ss_1byte_ch=bytes(),
+            ss_1byte_ch_l=0,
+            ss_2byte_ch=bytes(),
+            ss_2byte_ch_l=0,
+            ss_3byte_ch=bytes(),
+            ss_3byte_ch_l=0,
+            ss_4byte_ch=bytes(),
+            ss_4byte_ch_l=0,
+            ss_lengths=bytes(),
+            ss_max_l=0,
+            hashes_per_tok=0,
+        ).shape
+        == (1, 0)
+    )

spacy/tokens/doc.pxd

@@ -51,7 +51,7 @@ cdef void _set_suffix_lengths(
     const int tok_str_l,
     unsigned char* suff_l_buf,
     const int s_max_l,
-)
+) nogil
 
 cdef void _search_for_chars(
@@ -72,16 +72,13 @@ cdef void _search_for_chars(
 ) nogil
 
 cdef int _write_hashes(
     const unsigned char* res_buf,
     const unsigned char* aff_l_buf,
     const unsigned char* offset_buf,
     const int end_idx,
-    np.ndarray[np.int64_t, ndim=2] hashes,
-    const int tok_i,
-    const int start_hash_idx,
-)
+    np.int64_t* hashes_ptr,
+) nogil
 
 cdef class Doc:

spacy/tokens/doc.pyi

@@ -202,35 +202,7 @@ class Doc:
         ss_4_byte_ch_l: int,
         ss_lengths: bytes,
         ss_max_l: int,
+        hashes_per_tok: int,
     ) -> Ints2d: ...
     @staticmethod
     def _get_array_attrs() -> Tuple[Any]: ...
-    def get_character_combination_hashes(self,
-        *,
-        const bint cs,
-        const unsigned char* p_lengths,
-        const int p_max_l,
-        const unsigned char* s_lengths,
-        const int s_max_l,
-        const unsigned char* ps_1byte_ch,
-        const int ps_1_byte_ch_l,
-        const unsigned char* ps_2byte_ch,
-        const int ps_2_byte_ch_l,
-        const unsigned char* ps_3byte_ch,
-        const int ps_3_byte_ch_l,
-        const unsigned char* ps_4byte_ch,
-        const int ps_4_byte_ch_l,
-        const unsigned char* ps_lengths,
-        const int ps_max_l,
-        const unsigned char* ss_1byte_ch,
-        const int ss_1_byte_ch_l,
-        const unsigned char* ss_2byte_ch,
-        const int ss_2_byte_ch_l,
-        const unsigned char* ss_3byte_ch,
-        const int ss_3_byte_ch_l,
-        const unsigned char* ss_4byte_ch,
-        const int ss_4_byte_ch_l,
-        const unsigned char* ss_lengths,
-        const int ss_max_l,
-    )

spacy/tokens/doc.pyx

@@ -1735,7 +1735,7 @@ cdef class Doc:
                 j += 1
         return output
 
-    #@cython.boundscheck(False) # Deactivate bounds checking
+    @cython.boundscheck(False) # Deactivate bounds checking
     def get_character_combination_hashes(self,
         *,
         const bint cs,
@@ -1763,6 +1763,7 @@ cdef class Doc:
         const unsigned char* ss_4byte_ch,
         const int ss_4byte_ch_l,
         const unsigned char* ss_lengths,
         const int ss_max_l,
+        const int hashes_per_tok
     ):
         """
         Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations
@@ -1796,11 +1797,9 @@ cdef class Doc:
             in ascending order. For example, if *ss_lengths==[1, 2]*, *ss_search=="aC"* and *cs==False*, the searched strings
             hashed for "spaCy" would be "c" and "ca".
         ss_max_l: the value of *ss_lengths[-1]*, or *0* if *ss_lengths==None*. Passed in for speed.
+        hashes_per_tok: the total number of hashes produced for each token. Passed in for speed.
         """
-        cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty(
-            (self.length, p_max_l + s_max_l + ps_max_l + ss_max_l), dtype="int64")
 
         # Define / allocate buffers
         cdef Pool mem = Pool()
         cdef unsigned char* pref_l_buf = <unsigned char*> mem.alloc(p_max_l, 1)
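
To make the *ss_lengths* example in the docstring concrete, here is a rough pure-Python model of the suffix-search semantics it describes (an illustrative sketch, not the spaCy implementation; the real work happens in *_search_for_chars* and *_write_hashes* below):

def suffix_search_strings(token, search_chars, lengths, cs):
    if not cs:
        token = token.lower()
        search_chars = search_chars.lower()
    # collect matching characters walking backwards from the end of the token
    found = [ch for ch in reversed(token) if ch in search_chars]
    return ["".join(found[:length]) for length in lengths]

assert suffix_search_strings("spaCy", "aC", [1, 2], cs=False) == ["c", "ca"]
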
@@ -1809,40 +1808,47 @@ cdef class Doc:
         cdef unsigned char* ps_l_buf = <unsigned char*> mem.alloc(ps_max_l, 1)
         cdef unsigned char* ss_res_buf = <unsigned char*> mem.alloc(ss_max_l, 4)
         cdef unsigned char* ss_l_buf = <unsigned char*> mem.alloc(ss_max_l, 1)
+        cdef int doc_l = self.length, total_hashes = doc_l * hashes_per_tok
+        cdef np.int64_t* hashes_ptr = <np.int64_t*> mem.alloc(
+            total_hashes, sizeof(np.int64_t))
 
         # Define working variables
         cdef TokenC tok_c
         cdef int hash_idx, tok_i, tok_str_l
         cdef attr_t num_tok_attr
         cdef const unsigned char* tok_str
+        cdef np.int64_t* w_hashes_ptr = hashes_ptr
 
-        for tok_i in range(self.length):
+        for tok_i in range(doc_l):
             tok_c = self.c[tok_i]
             num_tok_attr = tok_c.lex.orth if cs else tok_c.lex.lower
             tok_str = self.vocab.strings.utf8_ptr(num_tok_attr)
             tok_str_l = strlen(<char*> tok_str)
-            hash_idx = 0
 
             if p_max_l > 0:
                 _set_prefix_lengths(tok_str, tok_str_l, pref_l_buf, p_max_l)
-                hash_idx = _write_hashes(tok_str, p_lengths, pref_l_buf, 0, hashes, tok_i, 0)
+                w_hashes_ptr += _write_hashes(tok_str, p_lengths, pref_l_buf, 0, w_hashes_ptr)
 
             if s_max_l > 0:
                 _set_suffix_lengths(tok_str, tok_str_l, suff_l_buf, s_max_l)
-                hash_idx = _write_hashes(tok_str, s_lengths, suff_l_buf, tok_str_l, hashes, tok_i, hash_idx)
+                w_hashes_ptr += _write_hashes(tok_str, s_lengths, suff_l_buf, tok_str_l, w_hashes_ptr)
 
             if ps_max_l > 0:
                 _search_for_chars(tok_str, tok_str_l, ps_1byte_ch, ps_1byte_ch_l, ps_2byte_ch, ps_2byte_ch_l,
                     ps_3byte_ch, ps_3byte_ch_l, ps_4byte_ch, ps_4byte_ch_l, ps_res_buf, ps_max_l, ps_l_buf, False)
-                hash_idx = _write_hashes(ps_res_buf, ps_lengths, ps_l_buf, 0, hashes, tok_i, hash_idx)
+                w_hashes_ptr += _write_hashes(ps_res_buf, ps_lengths, ps_l_buf, 0, w_hashes_ptr)
 
             if ss_max_l > 0:
                 _search_for_chars(tok_str, tok_str_l, ss_1byte_ch, ss_1byte_ch_l, ss_2byte_ch, ss_2byte_ch_l,
                     ss_3byte_ch, ss_3byte_ch_l, ss_4byte_ch, ss_4byte_ch_l, ss_res_buf, ss_max_l, ss_l_buf, True)
-                _write_hashes(ss_res_buf, ss_lengths, ss_l_buf, 0, hashes, tok_i, hash_idx)
+                w_hashes_ptr += _write_hashes(ss_res_buf, ss_lengths, ss_l_buf, 0, w_hashes_ptr)
 
+        cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty(
+            (doc_l, hashes_per_tok), dtype="int64")
+        memcpy(hashes.data, hashes_ptr, total_hashes * sizeof(np.int64_t))
         return hashes
 
     @staticmethod
     def _get_array_attrs():
         attrs = [LENGTH, SPACY]
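
This hunk is the heart of the commit: instead of indexing into the NumPy array token by token, the loop now fills a flat C buffer through the moving *w_hashes_ptr* cursor, and the ndarray is only allocated and populated (via *memcpy*) once all hashes are known. A minimal NumPy sketch of the same pattern, with illustrative names:

import numpy

def build_hashes(hash_rows, doc_l, hashes_per_tok):
    # flat scratch buffer, filled through a moving write cursor
    flat = numpy.empty(doc_l * hashes_per_tok, dtype="int64")
    w = 0  # plays the role of w_hashes_ptr in the Cython code
    for row in hash_rows:  # one iterable of hash values per token
        for h in row:
            flat[w] = h
            w += 1
    # materialise the 2D array in one step at the end
    return flat.reshape(doc_l, hashes_per_tok)

Deferring the array construction also lets the per-token helpers run as plain C functions (hence the *nogil* annotations added elsewhere in this commit), since no Python objects are touched inside the loop.
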
@@ -2023,7 +2029,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
                 lca_matrix[k, j] = lca - start
     return lca_matrix
 
-#@cython.boundscheck(False) # Deactivate bounds checking
+@cython.boundscheck(False) # Deactivate bounds checking
 cdef void _set_prefix_lengths(
     const unsigned char* tok_str,
     const int tok_str_l,
@@ -2056,13 +2062,13 @@ cdef void _set_prefix_lengths(
         memset(pref_l_buf + pref_l_buf_idx, pref_l_buf[pref_l_buf_idx - 1], p_max_l - pref_l_buf_idx)
 
-#@cython.boundscheck(False) # Deactivate bounds checking
+@cython.boundscheck(False) # Deactivate bounds checking
 cdef void _set_suffix_lengths(
     const unsigned char* tok_str,
     const int tok_str_l,
     unsigned char* suff_l_buf,
     const int s_max_l,
-):
+) nogil:
     """ Populate *suff_l_buf*, which has length *suff_l*, with the byte lengths of the last *suff_l* characters within *tok_str*.
     Lengths that are greater than the character length of the whole word are populated with the byte length of the whole word.
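
For reference, the behaviour the *_set_suffix_lengths* docstring describes can be modelled in pure Python roughly as follows (an assumed restatement of the docstring, not the shipped code):

def suffix_byte_lengths(token, s_max_l):
    whole = len(token.encode("utf8"))
    lengths = []
    for n in range(1, s_max_l + 1):
        if n <= len(token):
            # byte length of the last n characters of the token
            lengths.append(len(token[-n:].encode("utf8")))
        else:
            # lengths beyond the word's character length fall back to the whole word
            lengths.append(whole)
    return lengths

assert suffix_byte_lengths("spé", 4) == [2, 3, 4, 4]  # "é" takes two UTF-8 bytes
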
@@ -2086,7 +2092,7 @@ cdef void _set_suffix_lengths(
         memset(suff_l_buf + suff_l_buf_idx, suff_l_buf[suff_l_buf_idx - 1], s_max_l - suff_l_buf_idx)
 
-#@cython.boundscheck(False) # Deactivate bounds checking
+@cython.boundscheck(False) # Deactivate bounds checking
 cdef void _search_for_chars(
     const unsigned char* tok_str,
     const int tok_str_l,
@@ -2175,15 +2181,14 @@ cdef void _search_for_chars(
     memset(l_buf + l_buf_idx, res_buf_idx, max_res_l - l_buf_idx)
 
-@cython.boundscheck(False) # Deactivate bounds checking
 cdef int _write_hashes(
     const unsigned char* res_buf,
     const unsigned char* aff_l_buf,
     const unsigned char* offset_buf,
     const int end_idx,
-    np.ndarray[np.int64_t, ndim=2] hashes,
-    const int tok_i,
-    const int start_hash_idx,
-):
+    np.int64_t* hashes_ptr,
+) nogil:
     """ Write hashes for a token/rich property group combination.
 
     res_buf: the string from which to generate the hash values.
@@ -2191,24 +2196,22 @@ cdef int _write_hashes(
     offset_buf: one-byte lengths specifying the byte offset of each character within *res_buf*.
     end_idx: if not *0*, the offset within *res_buf* that should end each affix being hashed;
         if *0*, affixes start at the beginning of *res_buf* rather than ending at the end.
-    hashes: the 2D Numpy array in which the hashes are stored.
-    tok_i: the index of axis 0 of *hashes* to write to.
-    start_hash_idx: the index of axis 1 of *hashes* at which to start writing.
+    hashes_ptr: a pointer starting from which the new hashes should be written.
     """
-    cdef int offset, aff_l, hash_val = 0, hash_idx = start_hash_idx
+    cdef int offset, aff_l, hash_val = 0, hash_idx = 0
 
     while True:
-        aff_l = aff_l_buf[hash_idx - start_hash_idx]
+        aff_l = aff_l_buf[hash_idx]
         if aff_l == 0:
             return hash_idx
         offset = offset_buf[aff_l - 1]
         if offset > 0:
             if end_idx != 0:
-                hash_val = hash32(<void*> res_buf + end_idx - offset, offset, 0)
+                hash_val = hash32(<void*> (res_buf + end_idx - offset), offset, 0)
             else:
                 hash_val = hash32(<void*> res_buf, offset, 0)
-        hashes[tok_i, hash_idx] = hash_val
+        hashes_ptr[hash_idx] = hash_val
         hash_idx += 1
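
In plain Python terms, the control flow of the rewritten *_write_hashes* looks roughly like this (zlib.crc32 stands in for spaCy's MurmurHash-based *hash32*, and *aff_l_buf* is assumed to end with a zero byte, as the null terminator of a Python bytes object guarantees in the Cython code):

import zlib

def write_hashes(res_buf, aff_l_buf, offset_buf, end_idx, out):
    hash_val = 0
    hash_idx = 0
    while True:
        aff_l = aff_l_buf[hash_idx]
        if aff_l == 0:
            # a zero length terminates the list of requested lengths; the
            # caller advances its write cursor by the returned count
            return hash_idx
        offset = offset_buf[aff_l - 1]
        if offset > 0:
            # hash the affix ending at end_idx, or the one starting at the buffer head
            data = res_buf[end_idx - offset:end_idx] if end_idx != 0 else res_buf[:offset]
            hash_val = zlib.crc32(data)
        out.append(hash_val)
        hash_idx += 1

Returning the count of hashes written (rather than an absolute column index, as before) is what lets the caller simply advance *w_hashes_ptr* after each group.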