Intermediate state

This commit is contained in:
richardpaulhudson 2022-11-01 20:46:55 +01:00
parent 2552340fb8
commit bbf058029a
4 changed files with 375 additions and 184 deletions

View File

@ -994,10 +994,8 @@ def test_doc_spans_setdefault(en_tokenizer):
assert len(doc.spans["key3"]) == 2 assert len(doc.spans["key3"]) == 2
def _get_unsigned_32_bit_hash(input: str) -> int: def _get_32_bit_hash(input: str) -> int:
working_hash = hash(input.encode("UTF-8")) working_hash = hash(input.encode("UTF-8"))
if working_hash < 0:
working_hash = working_hash + (2 << 31)
return working_hash return working_hash
@ -1009,79 +1007,91 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
hashes = doc.get_character_combination_hashes( hashes = doc.get_character_combination_hashes(
cs=case_sensitive, cs=case_sensitive,
p_lengths=bytes((1, 3, 4,)), p_lengths=bytes((1, 3, 4,)),
p_max_l = 4,
s_lengths=bytes((2, 3, 4, 5,)), s_lengths=bytes((2, 3, 4, 5,)),
s_max_l = 5,
ps_1byte_ch=ps1, ps_1byte_ch=ps1,
ps_1byte_ch_l = len(ps1),
ps_2byte_ch=ps2, ps_2byte_ch=ps2,
ps_2byte_ch_l = len(ps2),
ps_3byte_ch=ps3, ps_3byte_ch=ps3,
ps_3byte_ch_l = len(ps3),
ps_4byte_ch=ps4, ps_4byte_ch=ps4,
ps_4byte_ch_l = len(ps4),
ps_lengths=bytes((2,)), ps_lengths=bytes((2,)),
ps_max_l = 2,
ss_1byte_ch=ss1, ss_1byte_ch=ss1,
ss_1byte_ch_l = len(ss1),
ss_2byte_ch=ss2, ss_2byte_ch=ss2,
ss_2byte_ch_l = len(ss2),
ss_3byte_ch=ss3, ss_3byte_ch=ss3,
ss_3byte_ch_l = len(ss3),
ss_4byte_ch=ss4, ss_4byte_ch=ss4,
ss_4byte_ch_l = len(ss4),
ss_lengths=bytes((1, 2,)), ss_lengths=bytes((1, 2,)),
ss_max_l = 2,
) )
assert hashes[0][0] == _get_unsigned_32_bit_hash("s") assert hashes[0][0] == _get_32_bit_hash("s")
assert hashes[0][1] == _get_unsigned_32_bit_hash("spa") assert hashes[0][1] == _get_32_bit_hash("spa")
assert hashes[0][2] == _get_unsigned_32_bit_hash( assert hashes[0][2] == _get_32_bit_hash(
"spaC" if case_sensitive else "spac" "spaC" if case_sensitive else "spac"
) )
assert hashes[0][3] == _get_unsigned_32_bit_hash("Cy" if case_sensitive else "cy") assert hashes[0][3] == _get_32_bit_hash("Cy" if case_sensitive else "cy")
assert hashes[0][4] == _get_unsigned_32_bit_hash("aCy" if case_sensitive else "acy") assert hashes[0][4] == _get_32_bit_hash("aCy" if case_sensitive else "acy")
assert hashes[0][5] == _get_unsigned_32_bit_hash( assert hashes[0][5] == _get_32_bit_hash(
"paCy" if case_sensitive else "pacy" "paCy" if case_sensitive else "pacy"
) )
assert hashes[0][6] == _get_unsigned_32_bit_hash( assert hashes[0][6] == _get_32_bit_hash(
"spaCy" if case_sensitive else "spacy" "spaCy" if case_sensitive else "spacy"
) )
assert hashes[0][7] == _get_unsigned_32_bit_hash("p") assert hashes[0][7] == _get_32_bit_hash("p")
assert hashes[0][8] == _get_unsigned_32_bit_hash("p") assert hashes[0][8] == _get_32_bit_hash("p")
assert hashes[0][9] == _get_unsigned_32_bit_hash("p") assert hashes[0][9] == _get_32_bit_hash("p")
assert hashes[1][0] == _get_unsigned_32_bit_hash("") assert hashes[1][0] == _get_32_bit_hash("")
assert hashes[1][1] == _get_unsigned_32_bit_hash("") assert hashes[1][1] == _get_32_bit_hash("")
assert hashes[1][2] == _get_unsigned_32_bit_hash("") assert hashes[1][2] == _get_32_bit_hash("")
assert hashes[1][3] == _get_unsigned_32_bit_hash("") assert hashes[1][3] == _get_32_bit_hash("")
assert hashes[1][4] == _get_unsigned_32_bit_hash("") assert hashes[1][4] == _get_32_bit_hash("")
assert hashes[1][5] == _get_unsigned_32_bit_hash("") assert hashes[1][5] == _get_32_bit_hash("")
assert hashes[1][6] == _get_unsigned_32_bit_hash("") assert hashes[1][6] == _get_32_bit_hash("")
assert hashes[1][7] == 0 assert hashes[1][7] == 0
assert hashes[1][8] == _get_unsigned_32_bit_hash("") assert hashes[1][8] == _get_32_bit_hash("")
assert hashes[1][9] == _get_unsigned_32_bit_hash("") assert hashes[1][9] == _get_32_bit_hash("")
assert hashes[2][0] == _get_unsigned_32_bit_hash("a") assert hashes[2][0] == _get_32_bit_hash("a")
assert hashes[2][1] == _get_unsigned_32_bit_hash("and") assert hashes[2][1] == _get_32_bit_hash("and")
assert hashes[2][2] == _get_unsigned_32_bit_hash("and") assert hashes[2][2] == _get_32_bit_hash("and")
assert hashes[2][3] == _get_unsigned_32_bit_hash("nd") assert hashes[2][3] == _get_32_bit_hash("nd")
assert hashes[2][4] == _get_unsigned_32_bit_hash("and") assert hashes[2][4] == _get_32_bit_hash("and")
assert hashes[2][5] == _get_unsigned_32_bit_hash("and") assert hashes[2][5] == _get_32_bit_hash("and")
assert hashes[2][6] == _get_unsigned_32_bit_hash("and") assert hashes[2][6] == _get_32_bit_hash("and")
assert hashes[2][7] == 0 assert hashes[2][7] == 0
assert hashes[2][8] == 0 assert hashes[2][8] == 0
assert hashes[2][9] == 0 assert hashes[2][9] == 0
assert hashes[3][0] == _get_unsigned_32_bit_hash("P" if case_sensitive else "p") assert hashes[3][0] == _get_32_bit_hash("P" if case_sensitive else "p")
assert hashes[3][1] == _get_unsigned_32_bit_hash("Pro" if case_sensitive else "pro") assert hashes[3][1] == _get_32_bit_hash("Pro" if case_sensitive else "pro")
assert hashes[3][2] == _get_unsigned_32_bit_hash( assert hashes[3][2] == _get_32_bit_hash(
"Prod" if case_sensitive else "prod" "Prod" if case_sensitive else "prod"
) )
assert hashes[3][3] == _get_unsigned_32_bit_hash("gy") assert hashes[3][3] == _get_32_bit_hash("gy")
assert hashes[3][4] == _get_unsigned_32_bit_hash("igy") assert hashes[3][4] == _get_32_bit_hash("igy")
assert hashes[3][5] == _get_unsigned_32_bit_hash("digy") assert hashes[3][5] == _get_32_bit_hash("digy")
assert hashes[3][6] == _get_unsigned_32_bit_hash("odigy") assert hashes[3][6] == _get_32_bit_hash("odigy")
assert hashes[3][7] == 0 if case_sensitive else _get_unsigned_32_bit_hash("pr") assert hashes[3][7] == 0 if case_sensitive else _get_32_bit_hash("pr")
assert hashes[3][8] == _get_unsigned_32_bit_hash("r") assert hashes[3][8] == _get_32_bit_hash("r")
if case_sensitive: if case_sensitive:
assert hashes[3][9] == _get_unsigned_32_bit_hash("r") assert hashes[3][9] == _get_32_bit_hash("r")
else: else:
assert hashes[3][9] == _get_unsigned_32_bit_hash("rp") assert hashes[3][9] == _get_32_bit_hash("rp")
# check values are the same cross-platform # check values are the same cross-platform
if case_sensitive: if case_sensitive:
assert hashes[0][2] == 3041529170 assert hashes[0][2] == -1253438126
else: else:
assert hashes[0][2] == 2199614696 assert hashes[0][2] == -2095352600
assert hashes[1][3] == 910783208 assert hashes[1][3] == 910783208
assert hashes[3][8] == 1553167345 assert hashes[3][8] == 1553167345
@ -1092,40 +1102,52 @@ def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
hashes = doc.get_character_combination_hashes( hashes = doc.get_character_combination_hashes(
cs=False, cs=False,
p_lengths=bytes(), p_lengths=bytes(),
p_max_l = 0,
s_lengths=bytes((2,3,4,5,)), s_lengths=bytes((2,3,4,5,)),
s_max_l = 5,
ps_1byte_ch=ps1, ps_1byte_ch=ps1,
ps_1byte_ch_l = len(ps1),
ps_2byte_ch=ps2, ps_2byte_ch=ps2,
ps_2byte_ch_l = len(ps2),
ps_3byte_ch=ps3, ps_3byte_ch=ps3,
ps_3byte_ch_l = len(ps3),
ps_4byte_ch=ps4, ps_4byte_ch=ps4,
ps_4byte_ch_l = len(ps4),
ps_lengths=bytes((2,)), ps_lengths=bytes((2,)),
ps_max_l = 2,
ss_1byte_ch=bytes(), ss_1byte_ch=bytes(),
ss_1byte_ch_l = 0,
ss_2byte_ch=bytes(), ss_2byte_ch=bytes(),
ss_2byte_ch_l = 0,
ss_3byte_ch=bytes(), ss_3byte_ch=bytes(),
ss_3byte_ch_l = 0,
ss_4byte_ch=bytes(), ss_4byte_ch=bytes(),
ss_4byte_ch_l = 0,
ss_lengths=bytes(), ss_lengths=bytes(),
ss_max_l = 0,
) )
assert hashes[0][0] == _get_unsigned_32_bit_hash("cy") assert hashes[0][0] == _get_32_bit_hash("cy")
assert hashes[0][1] == _get_unsigned_32_bit_hash("acy") assert hashes[0][1] == _get_32_bit_hash("acy")
assert hashes[0][2] == _get_unsigned_32_bit_hash("pacy") assert hashes[0][2] == _get_32_bit_hash("pacy")
assert hashes[0][3] == _get_unsigned_32_bit_hash("spacy") assert hashes[0][3] == _get_32_bit_hash("spacy")
assert hashes[0][4] == _get_unsigned_32_bit_hash("p") assert hashes[0][4] == _get_32_bit_hash("p")
assert hashes[1][0] == _get_unsigned_32_bit_hash("") assert hashes[1][0] == _get_32_bit_hash("")
assert hashes[1][1] == _get_unsigned_32_bit_hash("") assert hashes[1][1] == _get_32_bit_hash("")
assert hashes[1][2] == _get_unsigned_32_bit_hash("") assert hashes[1][2] == _get_32_bit_hash("")
assert hashes[1][3] == _get_unsigned_32_bit_hash("") assert hashes[1][3] == _get_32_bit_hash("")
assert hashes[1][4] == 0 assert hashes[1][4] == 0
assert hashes[2][0] == _get_unsigned_32_bit_hash("nd") assert hashes[2][0] == _get_32_bit_hash("nd")
assert hashes[2][1] == _get_unsigned_32_bit_hash("and") assert hashes[2][1] == _get_32_bit_hash("and")
assert hashes[2][2] == _get_unsigned_32_bit_hash("and") assert hashes[2][2] == _get_32_bit_hash("and")
assert hashes[2][3] == _get_unsigned_32_bit_hash("and") assert hashes[2][3] == _get_32_bit_hash("and")
assert hashes[2][4] == 0 assert hashes[2][4] == 0
assert hashes[3][0] == _get_unsigned_32_bit_hash("gy") assert hashes[3][0] == _get_32_bit_hash("gy")
assert hashes[3][1] == _get_unsigned_32_bit_hash("igy") assert hashes[3][1] == _get_32_bit_hash("igy")
assert hashes[3][2] == _get_unsigned_32_bit_hash("digy") assert hashes[3][2] == _get_32_bit_hash("digy")
assert hashes[3][3] == _get_unsigned_32_bit_hash("odigy") assert hashes[3][3] == _get_32_bit_hash("odigy")
assert hashes[3][4] == _get_unsigned_32_bit_hash("pr") assert hashes[3][4] == _get_32_bit_hash("pr")
def test_get_character_combination_hashes_copying_in_middle(en_tokenizer): def test_get_character_combination_hashes_copying_in_middle(en_tokenizer):
@ -1137,21 +1159,33 @@ def test_get_character_combination_hashes_copying_in_middle(en_tokenizer):
hashes = doc.get_character_combination_hashes( hashes = doc.get_character_combination_hashes(
cs=False, cs=False,
p_lengths=bytes((p_length,)), p_lengths=bytes((p_length,)),
p_max_l = p_length,
s_lengths=bytes((s_length,)), s_lengths=bytes((s_length,)),
s_max_l = s_length,
ps_1byte_ch=bytes(), ps_1byte_ch=bytes(),
ps_1byte_ch_l = 0,
ps_2byte_ch=bytes(), ps_2byte_ch=bytes(),
ps_2byte_ch_l = 0,
ps_3byte_ch=bytes(), ps_3byte_ch=bytes(),
ps_3byte_ch_l = 0,
ps_4byte_ch=bytes(), ps_4byte_ch=bytes(),
ps_4byte_ch_l = 0,
ps_lengths=bytes(), ps_lengths=bytes(),
ps_max_l = 0,
ss_1byte_ch=bytes(), ss_1byte_ch=bytes(),
ss_1byte_ch_l = 0,
ss_2byte_ch=bytes(), ss_2byte_ch=bytes(),
ss_2byte_ch_l = 0,
ss_3byte_ch=bytes(), ss_3byte_ch=bytes(),
ss_3byte_ch_l = 0,
ss_4byte_ch=bytes(), ss_4byte_ch=bytes(),
ss_4byte_ch_l = 0,
ss_lengths=bytes(), ss_lengths=bytes(),
ss_max_l = 0
) )
assert hashes[0][0] == _get_unsigned_32_bit_hash("sp𐌞cé"[:p_length]) assert hashes[0][0] == _get_32_bit_hash("sp𐌞cé"[:p_length])
assert hashes[0][1] == _get_unsigned_32_bit_hash("sp𐌞cé"[-s_length:]) assert hashes[0][1] == _get_32_bit_hash("sp𐌞cé"[-s_length:])
@pytest.mark.parametrize("case_sensitive", [True, False]) @pytest.mark.parametrize("case_sensitive", [True, False])
@ -1161,49 +1195,61 @@ def test_get_character_combination_hashes_turkish_i_with_dot(en_tokenizer, case_
hashes = doc.get_character_combination_hashes( hashes = doc.get_character_combination_hashes(
cs=case_sensitive, cs=case_sensitive,
p_lengths=bytes((1,2,3,4,)), p_lengths=bytes((1,2,3,4,)),
p_max_l = 4,
s_lengths=bytes((1,2,3,4,)), s_lengths=bytes((1,2,3,4,)),
s_max_l = 4,
ps_1byte_ch=s1, ps_1byte_ch=s1,
ps_1byte_ch_l = len(s1),
ps_2byte_ch=s2, ps_2byte_ch=s2,
ps_2byte_ch_l = len(s2),
ps_3byte_ch=s3, ps_3byte_ch=s3,
ps_3byte_ch_l = len(s3),
ps_4byte_ch=s4, ps_4byte_ch=s4,
ps_4byte_ch_l = len(s4),
ps_lengths=bytes((1,2,3,4,)), ps_lengths=bytes((1,2,3,4,)),
ps_max_l = 4,
ss_1byte_ch=s1, ss_1byte_ch=s1,
ss_1byte_ch_l = len(s1),
ss_2byte_ch=s2, ss_2byte_ch=s2,
ss_2byte_ch_l = len(s2),
ss_3byte_ch=s3, ss_3byte_ch=s3,
ss_3byte_ch_l = len(s3),
ss_4byte_ch=s4, ss_4byte_ch=s4,
ss_4byte_ch_l = len(s4),
ss_lengths=bytes((1,2,3,4,)), ss_lengths=bytes((1,2,3,4,)),
ss_max_l = 4
) )
COMBINING_DOT_ABOVE = b"\xcc\x87".decode("UTF-8") COMBINING_DOT_ABOVE = b"\xcc\x87".decode("UTF-8")
assert hashes[0][0] == _get_unsigned_32_bit_hash("i") assert hashes[0][0] == _get_32_bit_hash("i")
assert hashes[0][1] == _get_unsigned_32_bit_hash("İ".lower()) assert hashes[0][1] == _get_32_bit_hash("İ".lower())
if case_sensitive: if case_sensitive:
assert hashes[0][2] == _get_unsigned_32_bit_hash("İ".lower() + "İ") assert hashes[0][2] == _get_32_bit_hash("İ".lower() + "İ")
assert hashes[0][3] == _get_unsigned_32_bit_hash("İ".lower() + "İ") assert hashes[0][3] == _get_32_bit_hash("İ".lower() + "İ")
assert hashes[0][4] == _get_unsigned_32_bit_hash("İ") assert hashes[0][4] == _get_32_bit_hash("İ")
assert hashes[0][5] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE + "İ") assert hashes[0][5] == _get_32_bit_hash(COMBINING_DOT_ABOVE + "İ")
assert hashes[0][6] == _get_unsigned_32_bit_hash("İ".lower() + "İ") assert hashes[0][6] == _get_32_bit_hash("İ".lower() + "İ")
assert hashes[0][7] == _get_unsigned_32_bit_hash("İ".lower() + "İ") assert hashes[0][7] == _get_32_bit_hash("İ".lower() + "İ")
assert hashes[0][8] == _get_unsigned_32_bit_hash("İ") assert hashes[0][8] == _get_32_bit_hash("İ")
assert hashes[0][9] == _get_unsigned_32_bit_hash("İ") assert hashes[0][9] == _get_32_bit_hash("İ")
assert hashes[0][12] == _get_unsigned_32_bit_hash("İ") assert hashes[0][12] == _get_32_bit_hash("İ")
assert hashes[0][13] == _get_unsigned_32_bit_hash("İ") assert hashes[0][13] == _get_32_bit_hash("İ")
else: else:
assert hashes[0][2] == _get_unsigned_32_bit_hash("İ".lower() + "i") assert hashes[0][2] == _get_32_bit_hash("İ".lower() + "i")
assert hashes[0][3] == _get_unsigned_32_bit_hash("İ".lower() * 2) assert hashes[0][3] == _get_32_bit_hash("İ".lower() * 2)
assert hashes[0][4] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE) assert hashes[0][4] == _get_32_bit_hash(COMBINING_DOT_ABOVE)
assert hashes[0][5] == _get_unsigned_32_bit_hash("İ".lower()) assert hashes[0][5] == _get_32_bit_hash("İ".lower())
assert hashes[0][6] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE + "İ".lower()) assert hashes[0][6] == _get_32_bit_hash(COMBINING_DOT_ABOVE + "İ".lower())
assert hashes[0][7] == _get_unsigned_32_bit_hash("İ".lower() * 2) assert hashes[0][7] == _get_32_bit_hash("İ".lower() * 2)
assert hashes[0][8] == _get_unsigned_32_bit_hash("i") assert hashes[0][8] == _get_32_bit_hash("i")
assert hashes[0][9] == _get_unsigned_32_bit_hash("İ".lower()) assert hashes[0][9] == _get_32_bit_hash("İ".lower())
assert hashes[0][10] == _get_unsigned_32_bit_hash("İ".lower() + "i") assert hashes[0][10] == _get_32_bit_hash("İ".lower() + "i")
assert hashes[0][11] == _get_unsigned_32_bit_hash("İ".lower() * 2) assert hashes[0][11] == _get_32_bit_hash("İ".lower() * 2)
assert hashes[0][12] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE) assert hashes[0][12] == _get_32_bit_hash(COMBINING_DOT_ABOVE)
assert hashes[0][13] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE + "i") assert hashes[0][13] == _get_32_bit_hash(COMBINING_DOT_ABOVE + "i")
assert hashes[0][14] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE + "i" + COMBINING_DOT_ABOVE) assert hashes[0][14] == _get_32_bit_hash(COMBINING_DOT_ABOVE + "i" + COMBINING_DOT_ABOVE)
assert hashes[0][15] == _get_unsigned_32_bit_hash((COMBINING_DOT_ABOVE + "i") * 2) assert hashes[0][15] == _get_32_bit_hash((COMBINING_DOT_ABOVE + "i") * 2)
@pytest.mark.parametrize("case_sensitive", [True, False]) @pytest.mark.parametrize("case_sensitive", [True, False])
@ -1219,33 +1265,45 @@ def test_get_character_combination_hashes_string_store_spec_cases(en_tokenizer,
hashes = doc.get_character_combination_hashes( hashes = doc.get_character_combination_hashes(
cs=case_sensitive, cs=case_sensitive,
p_lengths=bytes((2,)), p_lengths=bytes((2,)),
p_max_l = 2,
s_lengths=bytes((2,)), s_lengths=bytes((2,)),
s_max_l = 2,
ps_1byte_ch=ps1, ps_1byte_ch=ps1,
ps_1byte_ch_l = len(ps1),
ps_2byte_ch=ps2, ps_2byte_ch=ps2,
ps_2byte_ch_l = len(ps2),
ps_3byte_ch=ps3, ps_3byte_ch=ps3,
ps_3byte_ch_l = len(ps3),
ps_4byte_ch=ps4, ps_4byte_ch=ps4,
ps_4byte_ch_l = len(ps4),
ps_lengths=bytes((2,)), ps_lengths=bytes((2,)),
ps_max_l = 2,
ss_1byte_ch=bytes(), ss_1byte_ch=bytes(),
ss_1byte_ch_l = 0,
ss_2byte_ch=bytes(), ss_2byte_ch=bytes(),
ss_2byte_ch_l = 0,
ss_3byte_ch=bytes(), ss_3byte_ch=bytes(),
ss_3byte_ch_l = 0,
ss_4byte_ch=bytes(), ss_4byte_ch=bytes(),
ss_4byte_ch_l = 0,
ss_lengths=bytes(), ss_lengths=bytes(),
ss_max_l = 0
) )
assert hashes[0][0] == _get_unsigned_32_bit_hash("FL" if case_sensitive else "fl") assert hashes[0][0] == _get_32_bit_hash("FL" if case_sensitive else "fl")
assert hashes[0][1] == _get_unsigned_32_bit_hash("19") assert hashes[0][1] == _get_32_bit_hash("19")
assert hashes[0][2] == 0 assert hashes[0][2] == 0
assert hashes[1][0] == _get_unsigned_32_bit_hash("be") assert hashes[1][0] == _get_32_bit_hash("be")
assert hashes[1][1] == _get_unsigned_32_bit_hash("ee") assert hashes[1][1] == _get_32_bit_hash("ee")
if case_sensitive: if case_sensitive:
assert hashes[1][2] == 0 assert hashes[1][2] == 0
else: else:
assert hashes[1][2] == _get_unsigned_32_bit_hash("ee") assert hashes[1][2] == _get_32_bit_hash("ee")
assert hashes[2][0] == hashes[3][0] == _get_unsigned_32_bit_hash("se") assert hashes[2][0] == hashes[3][0] == _get_32_bit_hash("se")
assert hashes[2][1] == hashes[3][1] == _get_unsigned_32_bit_hash("ty") assert hashes[2][1] == hashes[3][1] == _get_32_bit_hash("ty")
if case_sensitive: if case_sensitive:
assert hashes[2][2] == hashes[3][2] == 0 assert hashes[2][2] == hashes[3][2] == 0
else: else:
assert hashes[2][2] == hashes[3][2] == _get_unsigned_32_bit_hash("ee") assert hashes[2][2] == hashes[3][2] == _get_32_bit_hash("ee")
def test_character_combination_hashes_empty_lengths(en_tokenizer): def test_character_combination_hashes_empty_lengths(en_tokenizer):
@ -1253,15 +1311,27 @@ def test_character_combination_hashes_empty_lengths(en_tokenizer):
assert doc.get_character_combination_hashes( assert doc.get_character_combination_hashes(
cs=True, cs=True,
p_lengths=bytes(), p_lengths=bytes(),
p_max_l = 0,
s_lengths=bytes(), s_lengths=bytes(),
s_max_l = 0,
ps_1byte_ch=bytes(), ps_1byte_ch=bytes(),
ps_1byte_ch_l=0,
ps_2byte_ch=bytes(), ps_2byte_ch=bytes(),
ps_2byte_ch_l=0,
ps_3byte_ch=bytes(), ps_3byte_ch=bytes(),
ps_3byte_ch_l=0,
ps_4byte_ch=bytes(), ps_4byte_ch=bytes(),
ps_4byte_ch_l=0,
ps_lengths=bytes(), ps_lengths=bytes(),
ps_max_l = 0,
ss_1byte_ch=bytes(), ss_1byte_ch=bytes(),
ss_1byte_ch_l=0,
ss_2byte_ch=bytes(), ss_2byte_ch=bytes(),
ss_2byte_ch_l=0,
ss_3byte_ch=bytes(), ss_3byte_ch=bytes(),
ss_3byte_ch_l=0,
ss_4byte_ch=bytes(), ss_4byte_ch=bytes(),
ss_4byte_ch_l=0,
ss_lengths=bytes(), ss_lengths=bytes(),
ss_max_l = 0,
).shape == (1, 0) ).shape == (1, 0)

View File

@ -38,20 +38,33 @@ cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
cdef int [:,:] _get_lca_matrix(Doc, int start, int end) cdef int [:,:] _get_lca_matrix(Doc, int start, int end)
cdef void _set_affix_lengths( cdef void _set_prefix_lengths(
const unsigned char* tok_str, const unsigned char* tok_str,
unsigned char* aff_l_buf, const int tok_str_l,
const int pref_l, unsigned char* pref_l_buf,
const int suff_l, const int p_max_l,
) nogil ) nogil
cdef void _set_suffix_lengths(
const unsigned char* tok_str,
const int tok_str_l,
unsigned char* suff_l_buf,
const int s_max_l,
)
cdef void _search_for_chars( cdef void _search_for_chars(
const unsigned char* tok_str, const unsigned char* tok_str,
const int tok_str_l,
const unsigned char* s_1byte_ch, const unsigned char* s_1byte_ch,
const int s_1byte_ch_l,
const unsigned char* s_2byte_ch, const unsigned char* s_2byte_ch,
const int s_2byte_ch_l,
const unsigned char* s_3byte_ch, const unsigned char* s_3byte_ch,
const int s_3byte_ch_l,
const unsigned char* s_4byte_ch, const unsigned char* s_4byte_ch,
const int s_4byte_ch_l,
unsigned char* res_buf, unsigned char* res_buf,
int max_res_l, int max_res_l,
unsigned char* l_buf, unsigned char* l_buf,
@ -59,6 +72,18 @@ cdef void _search_for_chars(
) nogil ) nogil
cdef int _write_hashes(
const unsigned char* res_buf,
const unsigned char* aff_l_buf,
const unsigned char* offset_buf,
const int end_idx,
np.ndarray[np.int64_t, ndim=2] hashes,
const int tok_i,
const int start_hash_idx,
)
cdef class Doc: cdef class Doc:
cdef readonly Pool mem cdef readonly Pool mem
cdef readonly Vocab vocab cdef readonly Vocab vocab

View File

@ -179,17 +179,58 @@ class Doc:
*, *,
cs: bool, cs: bool,
p_lengths: bytes, p_lengths: bytes,
p_max_l: int,
s_lengths: bytes, s_lengths: bytes,
s_max_l: int,
ps_1byte_ch: bytes, ps_1byte_ch: bytes,
ps_1_byte_ch_l: int,
ps_2byte_ch: bytes, ps_2byte_ch: bytes,
ps_2_byte_ch_l: int,
ps_3byte_ch: bytes, ps_3byte_ch: bytes,
ps_3_byte_ch_l: int,
ps_4byte_ch: bytes, ps_4byte_ch: bytes,
ps_4_byte_ch_l: int,
ps_lengths: bytes, ps_lengths: bytes,
ps_max_l: int,
ss_1byte_ch: bytes, ss_1byte_ch: bytes,
ss_1_byte_ch_l: int,
ss_2byte_ch: bytes, ss_2byte_ch: bytes,
ss_2_byte_ch_l: int,
ss_3byte_ch: bytes, ss_3byte_ch: bytes,
ss_3_byte_ch_l: int,
ss_4byte_ch: bytes, ss_4byte_ch: bytes,
ss_4_byte_ch_l: int,
ss_lengths: bytes, ss_lengths: bytes,
ss_max_l: int,
) -> Ints2d: ... ) -> Ints2d: ...
@staticmethod @staticmethod
def _get_array_attrs() -> Tuple[Any]: ... def _get_array_attrs() -> Tuple[Any]: ...
def get_character_combination_hashes(self,
*,
const bint cs,
const unsigned char* p_lengths,
const int p_max_l,
const unsigned char* s_lengths,
const int s_max_l,
const unsigned char* ps_1byte_ch,
const int ps_1_byte_ch_l,
const unsigned char* ps_2byte_ch,
const int ps_2_byte_ch_l,
const unsigned char* ps_3byte_ch,
const int ps_3_byte_ch_l,
const unsigned char* ps_4byte_ch,
const int ps_4_byte_ch_l,
const unsigned char* ps_lengths,
const int ps_max_l,
const unsigned char* ss_1byte_ch,
const int ss_1_byte_ch_l,
const unsigned char* ss_2byte_ch,
const int ss_2_byte_ch_l,
const unsigned char* ss_3byte_ch,
const int ss_3_byte_ch_l,
const unsigned char* ss_4byte_ch,
const int ss_4_byte_ch_l,
const unsigned char* ss_lengths,
const int ss_max_l,
)

View File

@ -956,7 +956,7 @@ cdef class Doc:
cdef int i, j cdef int i, j
cdef attr_id_t feature cdef attr_id_t feature
cdef np.ndarray[attr_t, ndim=2] output cdef np.ndarray[attr_t, ndim=2] output
# Handle scalar/list inputs of strings/ints for py_attr_ids # Handle scalar/list inputs of strings/ints for py_attr_ids
# See also #3064 # See also #3064
if isinstance(py_attr_ids, str): if isinstance(py_attr_ids, str):
# Handle inputs like doc.to_array('ORTH') # Handle inputs like doc.to_array('ORTH')
@ -1735,22 +1735,34 @@ cdef class Doc:
j += 1 j += 1
return output return output
@cython.boundscheck(False) # Deactivate bounds checking #@cython.boundscheck(False) # Deactivate bounds checking
def get_character_combination_hashes(self, def get_character_combination_hashes(self,
*, *,
const bint cs, const bint cs,
const unsigned char* p_lengths, const unsigned char* p_lengths,
const int p_max_l,
const unsigned char* s_lengths, const unsigned char* s_lengths,
const int s_max_l,
const unsigned char* ps_1byte_ch, const unsigned char* ps_1byte_ch,
const int ps_1byte_ch_l,
const unsigned char* ps_2byte_ch, const unsigned char* ps_2byte_ch,
const int ps_2byte_ch_l,
const unsigned char* ps_3byte_ch, const unsigned char* ps_3byte_ch,
const int ps_3byte_ch_l,
const unsigned char* ps_4byte_ch, const unsigned char* ps_4byte_ch,
const int ps_4byte_ch_l,
const unsigned char* ps_lengths, const unsigned char* ps_lengths,
const int ps_max_l,
const unsigned char* ss_1byte_ch, const unsigned char* ss_1byte_ch,
const int ss_1byte_ch_l,
const unsigned char* ss_2byte_ch, const unsigned char* ss_2byte_ch,
const int ss_2byte_ch_l,
const unsigned char* ss_3byte_ch, const unsigned char* ss_3byte_ch,
const int ss_3byte_ch_l,
const unsigned char* ss_4byte_ch, const unsigned char* ss_4byte_ch,
const int ss_4byte_ch_l,
const unsigned char* ss_lengths, const unsigned char* ss_lengths,
const int ss_max_l,
): ):
""" """
Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations
@ -1766,39 +1778,33 @@ cdef class Doc:
cs: if *False*, hashes are generated based on the lower-case version of each token. cs: if *False*, hashes are generated based on the lower-case version of each token.
p_lengths: an array of single-byte values specifying the lengths of prefixes to be hashed in ascending order. p_lengths: an array of single-byte values specifying the lengths of prefixes to be hashed in ascending order.
For example, if *p_lengths==[2, 3]*, the prefixes hashed for "spaCy" would be "sp" and "spa". For example, if *p_lengths==[2, 3]*, the prefixes hashed for "spaCy" would be "sp" and "spa".
p_max_l: the value of *p_lengths[-1]*, or *0* if *p_lengths* is empty. Passed in for speed.
s_lengths: an array of single-byte values specifying the lengths of suffixes to be hashed in ascending order. s_lengths: an array of single-byte values specifying the lengths of suffixes to be hashed in ascending order.
For example, if *s_lengths==[2, 3]* and *cs == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy". For example, if *s_lengths==[2, 3]* and *cs == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy".
s_max_l: the value of *s_lengths[-1]*, or *0* if *s_lengths* is empty. Passed in for speed.
ps_<n>byte_ch: a byte array containing in order n-byte-wide characters to search for within each token, ps_<n>byte_ch: a byte array containing in order n-byte-wide characters to search for within each token,
starting at the beginning. starting at the beginning.
ps_<n>byte_ch_l: the length of *ps_<n>byte_ch*. Passed in for speed.
ps_lengths: an array of single-byte values specifying the lengths of search results (from the beginning) to be hashed ps_lengths: an array of single-byte values specifying the lengths of search results (from the beginning) to be hashed
in ascending order. For example, if *ps_lengths==[1, 2]*, *ps_search=="aC"* and *cs==False*, the searched strings in ascending order. For example, if *ps_lengths==[1, 2]*, *ps_search=="aC"* and *cs==False*, the searched strings
hashed for "spaCy" would be "a" and "ac". hashed for "spaCy" would be "a" and "ac".
ps_max_l: the value of *ps_lengths[-1]*, or *0* if *ps_lengths* is empty. Passed in for speed.
ss_<n>byte_ch: a byte array containing in order n-byte-wide characters to search for within each token, ss_<n>byte_ch: a byte array containing in order n-byte-wide characters to search for within each token,
starting at the end. starting at the end.
ss_<n>byte_ch_l: the length of *ss_<n>byte_ch*. Passed in for speed.
ss_lengths: an array of single-byte values specifying the lengths of search results (from the end) to be hashed ss_lengths: an array of single-byte values specifying the lengths of search results (from the end) to be hashed
in ascending order. For example, if *ss_lengths==[1, 2]*, *ss_search=="aC"* and *cs==False*, the searched strings in ascending order. For example, if *ss_lengths==[1, 2]*, *ss_search=="aC"* and *cs==False*, the searched strings
hashed for "spaCy" would be "c" and "ca". hashed for "spaCy" would be "c" and "ca".
ss_max_l: the value of *ss_lengths[-1]*, or *0* if *ss_lengths* is empty. Passed in for speed.
""" """
# Define the result array and work out what is used for what in axis 1 cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty(
cdef int num_toks = len(self) (self.length, p_max_l + s_max_l + ps_max_l + ss_max_l), dtype="int64")
cdef int p_h_num = strlen(<char*> p_lengths)
cdef int s_h_num = strlen(<char*> s_lengths), s_h_end = p_h_num + s_h_num
cdef int ps_h_num = strlen(<char*> ps_lengths), ps_h_end = s_h_end + ps_h_num
cdef int ss_h_num = strlen(<char*> ss_lengths), ss_h_end = ps_h_end + ss_h_num
cdef np.ndarray[np.int64_t, ndim=2] hashes
hashes = numpy.empty((num_toks, ss_h_end), dtype="int64")
# Determine the maximum possible lengths of the affixes to work out how big the buffers need to be
cdef int p_max_l = p_lengths[-1] if p_h_num > 0 else 0
cdef int s_max_l = s_lengths[-1] if s_h_num > 0 else 0
cdef int ps_max_l = ps_lengths[-1] if ps_h_num > 0 else 0
cdef int ss_max_l = ss_lengths[-1] if ss_h_num > 0 else 0
# Define / allocate buffers # Define / allocate buffers
cdef Pool mem = Pool() cdef Pool mem = Pool()
cdef int aff_l = p_max_l + s_max_l cdef unsigned char* pref_l_buf = <unsigned char*> mem.alloc(p_max_l, 1)
cdef unsigned char* aff_l_buf = <unsigned char*> mem.alloc(aff_l, 1) cdef unsigned char* suff_l_buf = <unsigned char*> mem.alloc(p_max_l, 1)
cdef unsigned char* ps_res_buf = <unsigned char*> mem.alloc(ps_max_l, 4) cdef unsigned char* ps_res_buf = <unsigned char*> mem.alloc(ps_max_l, 4)
cdef unsigned char* ps_l_buf = <unsigned char*> mem.alloc(ps_max_l, 1) cdef unsigned char* ps_l_buf = <unsigned char*> mem.alloc(ps_max_l, 1)
cdef unsigned char* ss_res_buf = <unsigned char*> mem.alloc(ss_max_l, 4) cdef unsigned char* ss_res_buf = <unsigned char*> mem.alloc(ss_max_l, 4)
@ -1806,48 +1812,34 @@ cdef class Doc:
# Define working variables # Define working variables
cdef TokenC tok_c cdef TokenC tok_c
cdef int tok_i, offset cdef int hash_idx, tok_i, tok_str_l
cdef uint64_t hash_val = 0
cdef attr_t num_tok_attr cdef attr_t num_tok_attr
cdef const unsigned char* tok_str cdef const unsigned char* tok_str
for tok_i in range(num_toks): for tok_i in range(self.length):
tok_c = self.c[tok_i] tok_c = self.c[tok_i]
num_tok_attr = tok_c.lex.orth if cs else tok_c.lex.lower num_tok_attr = tok_c.lex.orth if cs else tok_c.lex.lower
tok_str = self.vocab.strings.utf8_ptr(num_tok_attr) tok_str = self.vocab.strings.utf8_ptr(num_tok_attr)
tok_str_l = strlen(<char*> tok_str)
hash_idx = 0
if aff_l > 0: if p_max_l > 0:
_set_affix_lengths(tok_str, aff_l_buf, p_max_l, s_max_l) _set_prefix_lengths(tok_str, tok_str_l, pref_l_buf, p_max_l)
hash_idx = _write_hashes(tok_str, p_lengths, pref_l_buf, 0, hashes, tok_i, 0)
for hash_idx in range(p_h_num): if s_max_l > 0:
offset = aff_l_buf[p_lengths[hash_idx] - 1] _set_suffix_lengths(tok_str, tok_str_l, suff_l_buf, s_max_l)
if offset > 0: hash_idx = _write_hashes(tok_str, s_lengths, suff_l_buf, tok_str_l, hashes, tok_i, hash_idx)
hash_val = hash32(<void*> &tok_str[0], offset, 0)
hashes[tok_i, hash_idx] = hash_val
for hash_idx in range(p_h_num, s_h_end): if ps_max_l > 0:
offset = aff_l_buf[s_lengths[hash_idx - p_h_num] + p_max_l - 1] _search_for_chars(tok_str, tok_str_l, ps_1byte_ch, ps_1byte_ch_l, ps_2byte_ch, ps_2byte_ch_l,
if offset > 0: ps_3byte_ch, ps_3byte_ch_l, ps_4byte_ch, ps_4byte_ch_l, ps_res_buf, ps_max_l, ps_l_buf, False)
hash_val = hash32(<void*> &tok_str[len(tok_str) - offset], offset, 0) hash_idx = _write_hashes(ps_res_buf, ps_lengths, ps_l_buf, 0, hashes, tok_i, hash_idx)
hashes[tok_i, hash_idx] = hash_val
if ps_h_num > 0: if ss_max_l > 0:
_search_for_chars(tok_str, ps_1byte_ch, ps_2byte_ch, ps_3byte_ch, ps_4byte_ch, ps_res_buf, ps_max_l, ps_l_buf, False) _search_for_chars(tok_str, tok_str_l, ss_1byte_ch, ss_1byte_ch_l, ss_2byte_ch, ss_2byte_ch_l,
hash_val = 0 ss_3byte_ch, ss_3byte_ch_l, ss_4byte_ch, ss_4byte_ch_l, ss_res_buf, ss_max_l, ss_l_buf, True)
for hash_idx in range(s_h_end, ps_h_end): _write_hashes(ss_res_buf, ss_lengths, ss_l_buf, 0, hashes, tok_i, hash_idx)
offset = ps_l_buf[ps_lengths[hash_idx - s_h_end] - 1]
if offset > 0:
hash_val = hash32(ps_res_buf, offset, 0)
hashes[tok_i, hash_idx] = hash_val
if ss_h_num > 0:
_search_for_chars(tok_str, ss_1byte_ch, ss_2byte_ch, ss_3byte_ch, ss_4byte_ch, ss_res_buf, ss_max_l, ss_l_buf, True)
hash_val = 0
for hash_idx in range(ps_h_end, ss_h_end):
offset = ss_l_buf[ss_lengths[hash_idx - ps_h_end] - 1]
if offset > 0:
hash_val = hash32(ss_res_buf, offset, 0)
hashes[tok_i, hash_idx] = hash_val
return hashes return hashes
@ -2031,59 +2023,81 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
lca_matrix[k, j] = lca - start lca_matrix[k, j] = lca - start
return lca_matrix return lca_matrix
@cython.boundscheck(False) # Deactivate bounds checking #@cython.boundscheck(False) # Deactivate bounds checking
cdef void _set_affix_lengths( cdef void _set_prefix_lengths(
const unsigned char* tok_str, const unsigned char* tok_str,
unsigned char* aff_l_buf, const int tok_str_l,
const int pref_l, unsigned char* pref_l_buf,
const int suff_l, const int p_max_l,
) nogil: ) nogil:
""" Populate *aff_l_buf*, which has length *pref_l+suff_l* with the byte lengths of the first *pref_l* and the last """ Populate *pref_l_buf*, which has length *pref_l*, with the byte lengths of the first *pref_l* characters within *tok_str*.
*suff_l* characters within *tok_str*. Lengths that are greater than the character length of the whole word are Lengths that are greater than the character length of the whole word are populated with the byte length of the whole word.
populated with the byte length of the whole word.
tok_str: a memoryview of a UTF-8 representation of a string. tok_str: a UTF-8 representation of a string.
aff_l_buf: a buffer of length *pref_l+suff_l* in which to store the lengths. The calling code ensures that lengths tok_str_l: the length of *tok_str*.
pref_l_buf: a buffer of length *p_max_l* in which to store the lengths. The calling code ensures that lengths
greater than 255 cannot occur. greater than 255 cannot occur.
pref_l: the number of characters to process at the beginning of the word. p_max_l: the number of characters to process at the beginning of the word.
suff_l: the number of characters to process at the end of the word.
""" """
cdef int tok_str_idx = 1, aff_l_buf_idx = 0, tok_str_l = strlen(<char*> tok_str) cdef int tok_str_idx = 1, pref_l_buf_idx = 0
while aff_l_buf_idx < pref_l: while pref_l_buf_idx < p_max_l:
if (tok_str_idx == strlen(<char*> tok_str) if (tok_str[tok_str_idx] == 0 # end of string
or or
((tok_str[tok_str_idx] & 0xc0) != 0x80 # not a continuation character ((tok_str[tok_str_idx] & 0xc0) != 0x80) # not a continuation character
): ):
aff_l_buf[aff_l_buf_idx] = tok_str_idx pref_l_buf[pref_l_buf_idx] = tok_str_idx
aff_l_buf_idx += 1 pref_l_buf_idx += 1
tok_str_idx += 1 if tok_str[tok_str_idx] == 0: # end of string
if tok_str_idx > tok_str_l:
break break
tok_str_idx += 1
if aff_l_buf_idx < pref_l: if pref_l_buf_idx < p_max_l:
memset(aff_l_buf + aff_l_buf_idx, aff_l_buf[aff_l_buf_idx - 1], pref_l - aff_l_buf_idx) memset(pref_l_buf + pref_l_buf_idx, pref_l_buf[pref_l_buf_idx - 1], p_max_l - pref_l_buf_idx)
aff_l_buf_idx = pref_l
tok_str_idx = tok_str_l - 1
while aff_l_buf_idx < pref_l + suff_l: #@cython.boundscheck(False) # Deactivate bounds checking
cdef void _set_suffix_lengths(
const unsigned char* tok_str,
const int tok_str_l,
unsigned char* suff_l_buf,
const int s_max_l,
):
""" Populate *suff_l_buf*, which has length *s_max_l*, with the byte lengths of the last *s_max_l* characters within *tok_str*.
Lengths that are greater than the character length of the whole word are populated with the byte length of the whole word.
tok_str: a UTF-8 representation of a string.
tok_str_l: the length of *tok_str*.
suff_l_buf: a buffer of length *s_max_l* in which to store the lengths. The calling code ensures that lengths
greater than 255 cannot occur.
s_max_l: the number of characters to process at the end of the word.
"""
cdef int tok_str_idx = tok_str_l - 1, suff_l_buf_idx = 0
while suff_l_buf_idx < s_max_l:
if (tok_str[tok_str_idx] & 0xc0) != 0x80: # not a continuation character if (tok_str[tok_str_idx] & 0xc0) != 0x80: # not a continuation character
aff_l_buf[aff_l_buf_idx] = tok_str_l - tok_str_idx suff_l_buf[suff_l_buf_idx] = tok_str_l - tok_str_idx
aff_l_buf_idx += 1 suff_l_buf_idx += 1
tok_str_idx -= 1 tok_str_idx -= 1
if tok_str_idx < 0: if tok_str_idx < 0:
break break
if aff_l_buf_idx < pref_l + suff_l: if suff_l_buf_idx < s_max_l:
memset(aff_l_buf + aff_l_buf_idx, aff_l_buf[aff_l_buf_idx - 1], pref_l + suff_l - aff_l_buf_idx) memset(suff_l_buf + suff_l_buf_idx, suff_l_buf[suff_l_buf_idx - 1], s_max_l - suff_l_buf_idx)
@cython.boundscheck(False) # Deactivate bounds checking
#@cython.boundscheck(False) # Deactivate bounds checking
cdef void _search_for_chars( cdef void _search_for_chars(
const unsigned char* tok_str, const unsigned char* tok_str,
const int tok_str_l,
const unsigned char* s_1byte_ch, const unsigned char* s_1byte_ch,
const int s_1byte_ch_l,
const unsigned char* s_2byte_ch, const unsigned char* s_2byte_ch,
const int s_2byte_ch_l,
const unsigned char* s_3byte_ch, const unsigned char* s_3byte_ch,
const int s_3byte_ch_l,
const unsigned char* s_4byte_ch, const unsigned char* s_4byte_ch,
const int s_4byte_ch_l,
unsigned char* res_buf, unsigned char* res_buf,
int max_res_l, int max_res_l,
unsigned char* l_buf, unsigned char* l_buf,
@ -2096,6 +2110,7 @@ cdef void _search_for_chars(
which may be *0* if the search was not successful. which may be *0* if the search was not successful.
tok_str: a memoryview of a UTF-8 representation of a string. tok_str: a memoryview of a UTF-8 representation of a string.
tok_str_l: the length of *tok_str*.
s_<n>byte_ch: a byte array containing in order n-byte-wide characters to search for. s_<n>byte_ch: a byte array containing in order n-byte-wide characters to search for.
res_buf: the buffer in which to place the search results. res_buf: the buffer in which to place the search results.
max_res_l: the maximum number of found characters to place in *res_buf*. max_res_l: the maximum number of found characters to place in *res_buf*.
@ -2103,7 +2118,7 @@ cdef void _search_for_chars(
The calling code ensures that lengths greater than 255 cannot occur. The calling code ensures that lengths greater than 255 cannot occur.
suffs_not_prefs: if *True*, searching starts from the end of the word; if *False*, from the beginning. suffs_not_prefs: if *True*, searching starts from the end of the word; if *False*, from the beginning.
""" """
cdef int tok_str_l = strlen(<char*> tok_str), res_buf_idx = 0, l_buf_idx = 0, ch_wdth, tok_start_idx, search_char_idx cdef int res_buf_idx = 0, l_buf_idx = 0, ch_wdth, tok_start_idx, search_char_idx
cdef int search_chars_l cdef int search_chars_l
cdef const unsigned char* search_chars cdef const unsigned char* search_chars
@ -2121,13 +2136,16 @@ cdef void _search_for_chars(
ch_wdth = last_tok_str_idx - this_tok_str_idx ch_wdth = last_tok_str_idx - this_tok_str_idx
if ch_wdth == 1: if ch_wdth == 1:
search_chars = s_1byte_ch search_chars = s_1byte_ch
search_chars_l = s_1byte_ch_l
elif ch_wdth == 2: elif ch_wdth == 2:
search_chars = s_2byte_ch search_chars = s_2byte_ch
search_chars_l = s_2byte_ch_l
elif ch_wdth == 3: elif ch_wdth == 3:
search_chars = s_3byte_ch search_chars = s_3byte_ch
search_chars_l = s_3byte_ch_l
else: else:
search_chars = s_4byte_ch search_chars = s_4byte_ch
search_chars_l = strlen(<char*> search_chars) search_chars_l = s_4byte_ch_l
tok_start_idx = this_tok_str_idx if suffs_not_prefs else last_tok_str_idx tok_start_idx = this_tok_str_idx if suffs_not_prefs else last_tok_str_idx
search_char_idx = 0 search_char_idx = 0
@ -2157,6 +2175,43 @@ cdef void _search_for_chars(
memset(l_buf + l_buf_idx, res_buf_idx, max_res_l - l_buf_idx) memset(l_buf + l_buf_idx, res_buf_idx, max_res_l - l_buf_idx)
cdef int _write_hashes(
    const unsigned char* res_buf,
    const unsigned char* aff_l_buf,
    const unsigned char* offset_buf,
    const int end_idx,
    np.ndarray[np.int64_t, ndim=2] hashes,
    const int tok_i,
    const int start_hash_idx,
):
    """ Write hashes for a token/rich property group combination.

    res_buf: the string from which to generate the hash values.
    aff_l_buf: one-byte lengths describing how many characters to hash. The sequence must end with
        a *0* entry: reading that entry is the only thing that stops this function.
    offset_buf: one-byte lengths specifying the byte offset of each character within *res_buf*.
        It is indexed with *length - 1* for each length read from *aff_l_buf*.
    end_idx: if not *0*, the offset within *res_buf* that should end each affix being hashed;
        if *0*, affixes start at the beginning of *res_buf* rather than ending at the end.
    hashes: the 2D Numpy array in which the hashes are stored.
    tok_i: the index of axis 0 of *hashes* to write to.
    start_hash_idx: the index of axis 1 of *hashes* at which to start writing.

    Returns the index of axis 1 of *hashes* that follows the last value written, i.e.
    *start_hash_idx* plus the number of non-zero entries read from *aff_l_buf*.
    """
    cdef int offset, aff_l, hash_val = 0, hash_idx = start_hash_idx
    cdef const unsigned char* hash_start

    while True:
        aff_l = aff_l_buf[hash_idx - start_hash_idx]
        if aff_l == 0:
            # Terminator reached: report where the next group should continue writing.
            return hash_idx
        offset = offset_buf[aff_l - 1]
        if offset > 0:
            # Compute the start address on the typed pointer and cast afterwards;
            # arithmetic on a void* is not defined by standard C.
            hash_start = res_buf
            if end_idx != 0:
                hash_start = res_buf + end_idx - offset
            hash_val = hash32(<void*> hash_start, offset, 0)
        # When offset is 0, hash_val is deliberately left untouched, so the value written is
        # the one from the previous iteration (0 if nothing has been hashed yet).
        hashes[tok_i, hash_idx] = hash_val
        hash_idx += 1
def pickle_doc(doc): def pickle_doc(doc):
bytes_data = doc.to_bytes(exclude=["vocab", "user_data", "user_hooks"]) bytes_data = doc.to_bytes(exclude=["vocab", "user_data", "user_hooks"])
hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks, hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks,