mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-02 11:20:19 +03:00
Intermediate state
This commit is contained in:
parent
2552340fb8
commit
bbf058029a
|
@ -994,10 +994,8 @@ def test_doc_spans_setdefault(en_tokenizer):
|
||||||
assert len(doc.spans["key3"]) == 2
|
assert len(doc.spans["key3"]) == 2
|
||||||
|
|
||||||
|
|
||||||
def _get_unsigned_32_bit_hash(input: str) -> int:
|
def _get_32_bit_hash(input: str) -> int:
|
||||||
working_hash = hash(input.encode("UTF-8"))
|
working_hash = hash(input.encode("UTF-8"))
|
||||||
if working_hash < 0:
|
|
||||||
working_hash = working_hash + (2 << 31)
|
|
||||||
return working_hash
|
return working_hash
|
||||||
|
|
||||||
|
|
||||||
|
@ -1009,79 +1007,91 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
|
||||||
hashes = doc.get_character_combination_hashes(
|
hashes = doc.get_character_combination_hashes(
|
||||||
cs=case_sensitive,
|
cs=case_sensitive,
|
||||||
p_lengths=bytes((1, 3, 4,)),
|
p_lengths=bytes((1, 3, 4,)),
|
||||||
|
p_max_l = 4,
|
||||||
s_lengths=bytes((2, 3, 4, 5,)),
|
s_lengths=bytes((2, 3, 4, 5,)),
|
||||||
|
s_max_l = 5,
|
||||||
ps_1byte_ch=ps1,
|
ps_1byte_ch=ps1,
|
||||||
|
ps_1byte_ch_l = len(ps1),
|
||||||
ps_2byte_ch=ps2,
|
ps_2byte_ch=ps2,
|
||||||
|
ps_2byte_ch_l = len(ps2),
|
||||||
ps_3byte_ch=ps3,
|
ps_3byte_ch=ps3,
|
||||||
|
ps_3byte_ch_l = len(ps3),
|
||||||
ps_4byte_ch=ps4,
|
ps_4byte_ch=ps4,
|
||||||
|
ps_4byte_ch_l = len(ps4),
|
||||||
ps_lengths=bytes((2,)),
|
ps_lengths=bytes((2,)),
|
||||||
|
ps_max_l = 2,
|
||||||
ss_1byte_ch=ss1,
|
ss_1byte_ch=ss1,
|
||||||
|
ss_1byte_ch_l = len(ss1),
|
||||||
ss_2byte_ch=ss2,
|
ss_2byte_ch=ss2,
|
||||||
|
ss_2byte_ch_l = len(ss2),
|
||||||
ss_3byte_ch=ss3,
|
ss_3byte_ch=ss3,
|
||||||
|
ss_3byte_ch_l = len(ss3),
|
||||||
ss_4byte_ch=ss4,
|
ss_4byte_ch=ss4,
|
||||||
|
ss_4byte_ch_l = len(ss4),
|
||||||
ss_lengths=bytes((1, 2,)),
|
ss_lengths=bytes((1, 2,)),
|
||||||
|
ss_max_l = 2,
|
||||||
)
|
)
|
||||||
|
|
||||||
assert hashes[0][0] == _get_unsigned_32_bit_hash("s")
|
assert hashes[0][0] == _get_32_bit_hash("s")
|
||||||
assert hashes[0][1] == _get_unsigned_32_bit_hash("spa")
|
assert hashes[0][1] == _get_32_bit_hash("spa")
|
||||||
assert hashes[0][2] == _get_unsigned_32_bit_hash(
|
assert hashes[0][2] == _get_32_bit_hash(
|
||||||
"spaC" if case_sensitive else "spac"
|
"spaC" if case_sensitive else "spac"
|
||||||
)
|
)
|
||||||
assert hashes[0][3] == _get_unsigned_32_bit_hash("Cy" if case_sensitive else "cy")
|
assert hashes[0][3] == _get_32_bit_hash("Cy" if case_sensitive else "cy")
|
||||||
assert hashes[0][4] == _get_unsigned_32_bit_hash("aCy" if case_sensitive else "acy")
|
assert hashes[0][4] == _get_32_bit_hash("aCy" if case_sensitive else "acy")
|
||||||
assert hashes[0][5] == _get_unsigned_32_bit_hash(
|
assert hashes[0][5] == _get_32_bit_hash(
|
||||||
"paCy" if case_sensitive else "pacy"
|
"paCy" if case_sensitive else "pacy"
|
||||||
)
|
)
|
||||||
assert hashes[0][6] == _get_unsigned_32_bit_hash(
|
assert hashes[0][6] == _get_32_bit_hash(
|
||||||
"spaCy" if case_sensitive else "spacy"
|
"spaCy" if case_sensitive else "spacy"
|
||||||
)
|
)
|
||||||
|
|
||||||
assert hashes[0][7] == _get_unsigned_32_bit_hash("p")
|
assert hashes[0][7] == _get_32_bit_hash("p")
|
||||||
assert hashes[0][8] == _get_unsigned_32_bit_hash("p")
|
assert hashes[0][8] == _get_32_bit_hash("p")
|
||||||
assert hashes[0][9] == _get_unsigned_32_bit_hash("p")
|
assert hashes[0][9] == _get_32_bit_hash("p")
|
||||||
assert hashes[1][0] == _get_unsigned_32_bit_hash("✨")
|
assert hashes[1][0] == _get_32_bit_hash("✨")
|
||||||
assert hashes[1][1] == _get_unsigned_32_bit_hash("✨")
|
assert hashes[1][1] == _get_32_bit_hash("✨")
|
||||||
assert hashes[1][2] == _get_unsigned_32_bit_hash("✨")
|
assert hashes[1][2] == _get_32_bit_hash("✨")
|
||||||
assert hashes[1][3] == _get_unsigned_32_bit_hash("✨")
|
assert hashes[1][3] == _get_32_bit_hash("✨")
|
||||||
assert hashes[1][4] == _get_unsigned_32_bit_hash("✨")
|
assert hashes[1][4] == _get_32_bit_hash("✨")
|
||||||
assert hashes[1][5] == _get_unsigned_32_bit_hash("✨")
|
assert hashes[1][5] == _get_32_bit_hash("✨")
|
||||||
assert hashes[1][6] == _get_unsigned_32_bit_hash("✨")
|
assert hashes[1][6] == _get_32_bit_hash("✨")
|
||||||
assert hashes[1][7] == 0
|
assert hashes[1][7] == 0
|
||||||
assert hashes[1][8] == _get_unsigned_32_bit_hash("✨")
|
assert hashes[1][8] == _get_32_bit_hash("✨")
|
||||||
assert hashes[1][9] == _get_unsigned_32_bit_hash("✨")
|
assert hashes[1][9] == _get_32_bit_hash("✨")
|
||||||
assert hashes[2][0] == _get_unsigned_32_bit_hash("a")
|
assert hashes[2][0] == _get_32_bit_hash("a")
|
||||||
assert hashes[2][1] == _get_unsigned_32_bit_hash("and")
|
assert hashes[2][1] == _get_32_bit_hash("and")
|
||||||
assert hashes[2][2] == _get_unsigned_32_bit_hash("and")
|
assert hashes[2][2] == _get_32_bit_hash("and")
|
||||||
assert hashes[2][3] == _get_unsigned_32_bit_hash("nd")
|
assert hashes[2][3] == _get_32_bit_hash("nd")
|
||||||
assert hashes[2][4] == _get_unsigned_32_bit_hash("and")
|
assert hashes[2][4] == _get_32_bit_hash("and")
|
||||||
assert hashes[2][5] == _get_unsigned_32_bit_hash("and")
|
assert hashes[2][5] == _get_32_bit_hash("and")
|
||||||
assert hashes[2][6] == _get_unsigned_32_bit_hash("and")
|
assert hashes[2][6] == _get_32_bit_hash("and")
|
||||||
assert hashes[2][7] == 0
|
assert hashes[2][7] == 0
|
||||||
assert hashes[2][8] == 0
|
assert hashes[2][8] == 0
|
||||||
assert hashes[2][9] == 0
|
assert hashes[2][9] == 0
|
||||||
assert hashes[3][0] == _get_unsigned_32_bit_hash("P" if case_sensitive else "p")
|
assert hashes[3][0] == _get_32_bit_hash("P" if case_sensitive else "p")
|
||||||
assert hashes[3][1] == _get_unsigned_32_bit_hash("Pro" if case_sensitive else "pro")
|
assert hashes[3][1] == _get_32_bit_hash("Pro" if case_sensitive else "pro")
|
||||||
assert hashes[3][2] == _get_unsigned_32_bit_hash(
|
assert hashes[3][2] == _get_32_bit_hash(
|
||||||
"Prod" if case_sensitive else "prod"
|
"Prod" if case_sensitive else "prod"
|
||||||
)
|
)
|
||||||
assert hashes[3][3] == _get_unsigned_32_bit_hash("gy")
|
assert hashes[3][3] == _get_32_bit_hash("gy")
|
||||||
assert hashes[3][4] == _get_unsigned_32_bit_hash("igy")
|
assert hashes[3][4] == _get_32_bit_hash("igy")
|
||||||
assert hashes[3][5] == _get_unsigned_32_bit_hash("digy")
|
assert hashes[3][5] == _get_32_bit_hash("digy")
|
||||||
assert hashes[3][6] == _get_unsigned_32_bit_hash("odigy")
|
assert hashes[3][6] == _get_32_bit_hash("odigy")
|
||||||
assert hashes[3][7] == 0 if case_sensitive else _get_unsigned_32_bit_hash("pr")
|
assert hashes[3][7] == 0 if case_sensitive else _get_32_bit_hash("pr")
|
||||||
|
|
||||||
assert hashes[3][8] == _get_unsigned_32_bit_hash("r")
|
assert hashes[3][8] == _get_32_bit_hash("r")
|
||||||
|
|
||||||
if case_sensitive:
|
if case_sensitive:
|
||||||
assert hashes[3][9] == _get_unsigned_32_bit_hash("r")
|
assert hashes[3][9] == _get_32_bit_hash("r")
|
||||||
else:
|
else:
|
||||||
assert hashes[3][9] == _get_unsigned_32_bit_hash("rp")
|
assert hashes[3][9] == _get_32_bit_hash("rp")
|
||||||
|
|
||||||
# check values are the same cross-platform
|
# check values are the same cross-platform
|
||||||
if case_sensitive:
|
if case_sensitive:
|
||||||
assert hashes[0][2] == 3041529170
|
assert hashes[0][2] == -1253438126
|
||||||
else:
|
else:
|
||||||
assert hashes[0][2] == 2199614696
|
assert hashes[0][2] == -2095352600
|
||||||
assert hashes[1][3] == 910783208
|
assert hashes[1][3] == 910783208
|
||||||
assert hashes[3][8] == 1553167345
|
assert hashes[3][8] == 1553167345
|
||||||
|
|
||||||
|
@ -1092,40 +1102,52 @@ def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
|
||||||
hashes = doc.get_character_combination_hashes(
|
hashes = doc.get_character_combination_hashes(
|
||||||
cs=False,
|
cs=False,
|
||||||
p_lengths=bytes(),
|
p_lengths=bytes(),
|
||||||
|
p_max_l = 0,
|
||||||
s_lengths=bytes((2,3,4,5,)),
|
s_lengths=bytes((2,3,4,5,)),
|
||||||
|
s_max_l = 5,
|
||||||
ps_1byte_ch=ps1,
|
ps_1byte_ch=ps1,
|
||||||
|
ps_1byte_ch_l = len(ps1),
|
||||||
ps_2byte_ch=ps2,
|
ps_2byte_ch=ps2,
|
||||||
|
ps_2byte_ch_l = len(ps2),
|
||||||
ps_3byte_ch=ps3,
|
ps_3byte_ch=ps3,
|
||||||
|
ps_3byte_ch_l = len(ps3),
|
||||||
ps_4byte_ch=ps4,
|
ps_4byte_ch=ps4,
|
||||||
|
ps_4byte_ch_l = len(ps4),
|
||||||
ps_lengths=bytes((2,)),
|
ps_lengths=bytes((2,)),
|
||||||
|
ps_max_l = 2,
|
||||||
ss_1byte_ch=bytes(),
|
ss_1byte_ch=bytes(),
|
||||||
|
ss_1byte_ch_l = 0,
|
||||||
ss_2byte_ch=bytes(),
|
ss_2byte_ch=bytes(),
|
||||||
|
ss_2byte_ch_l = 0,
|
||||||
ss_3byte_ch=bytes(),
|
ss_3byte_ch=bytes(),
|
||||||
|
ss_3byte_ch_l = 0,
|
||||||
ss_4byte_ch=bytes(),
|
ss_4byte_ch=bytes(),
|
||||||
|
ss_4byte_ch_l = 0,
|
||||||
ss_lengths=bytes(),
|
ss_lengths=bytes(),
|
||||||
|
ss_max_l = 0,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
assert hashes[0][0] == _get_unsigned_32_bit_hash("cy")
|
assert hashes[0][0] == _get_32_bit_hash("cy")
|
||||||
assert hashes[0][1] == _get_unsigned_32_bit_hash("acy")
|
assert hashes[0][1] == _get_32_bit_hash("acy")
|
||||||
assert hashes[0][2] == _get_unsigned_32_bit_hash("pacy")
|
assert hashes[0][2] == _get_32_bit_hash("pacy")
|
||||||
assert hashes[0][3] == _get_unsigned_32_bit_hash("spacy")
|
assert hashes[0][3] == _get_32_bit_hash("spacy")
|
||||||
assert hashes[0][4] == _get_unsigned_32_bit_hash("p")
|
assert hashes[0][4] == _get_32_bit_hash("p")
|
||||||
assert hashes[1][0] == _get_unsigned_32_bit_hash("✨")
|
assert hashes[1][0] == _get_32_bit_hash("✨")
|
||||||
assert hashes[1][1] == _get_unsigned_32_bit_hash("✨")
|
assert hashes[1][1] == _get_32_bit_hash("✨")
|
||||||
assert hashes[1][2] == _get_unsigned_32_bit_hash("✨")
|
assert hashes[1][2] == _get_32_bit_hash("✨")
|
||||||
assert hashes[1][3] == _get_unsigned_32_bit_hash("✨")
|
assert hashes[1][3] == _get_32_bit_hash("✨")
|
||||||
assert hashes[1][4] == 0
|
assert hashes[1][4] == 0
|
||||||
assert hashes[2][0] == _get_unsigned_32_bit_hash("nd")
|
assert hashes[2][0] == _get_32_bit_hash("nd")
|
||||||
assert hashes[2][1] == _get_unsigned_32_bit_hash("and")
|
assert hashes[2][1] == _get_32_bit_hash("and")
|
||||||
assert hashes[2][2] == _get_unsigned_32_bit_hash("and")
|
assert hashes[2][2] == _get_32_bit_hash("and")
|
||||||
assert hashes[2][3] == _get_unsigned_32_bit_hash("and")
|
assert hashes[2][3] == _get_32_bit_hash("and")
|
||||||
assert hashes[2][4] == 0
|
assert hashes[2][4] == 0
|
||||||
assert hashes[3][0] == _get_unsigned_32_bit_hash("gy")
|
assert hashes[3][0] == _get_32_bit_hash("gy")
|
||||||
assert hashes[3][1] == _get_unsigned_32_bit_hash("igy")
|
assert hashes[3][1] == _get_32_bit_hash("igy")
|
||||||
assert hashes[3][2] == _get_unsigned_32_bit_hash("digy")
|
assert hashes[3][2] == _get_32_bit_hash("digy")
|
||||||
assert hashes[3][3] == _get_unsigned_32_bit_hash("odigy")
|
assert hashes[3][3] == _get_32_bit_hash("odigy")
|
||||||
assert hashes[3][4] == _get_unsigned_32_bit_hash("pr")
|
assert hashes[3][4] == _get_32_bit_hash("pr")
|
||||||
|
|
||||||
|
|
||||||
def test_get_character_combination_hashes_copying_in_middle(en_tokenizer):
|
def test_get_character_combination_hashes_copying_in_middle(en_tokenizer):
|
||||||
|
@ -1137,21 +1159,33 @@ def test_get_character_combination_hashes_copying_in_middle(en_tokenizer):
|
||||||
hashes = doc.get_character_combination_hashes(
|
hashes = doc.get_character_combination_hashes(
|
||||||
cs=False,
|
cs=False,
|
||||||
p_lengths=bytes((p_length,)),
|
p_lengths=bytes((p_length,)),
|
||||||
|
p_max_l = p_length,
|
||||||
s_lengths=bytes((s_length,)),
|
s_lengths=bytes((s_length,)),
|
||||||
|
s_max_l = s_length,
|
||||||
ps_1byte_ch=bytes(),
|
ps_1byte_ch=bytes(),
|
||||||
|
ps_1byte_ch_l = 0,
|
||||||
ps_2byte_ch=bytes(),
|
ps_2byte_ch=bytes(),
|
||||||
|
ps_2byte_ch_l = 0,
|
||||||
ps_3byte_ch=bytes(),
|
ps_3byte_ch=bytes(),
|
||||||
|
ps_3byte_ch_l = 0,
|
||||||
ps_4byte_ch=bytes(),
|
ps_4byte_ch=bytes(),
|
||||||
|
ps_4byte_ch_l = 0,
|
||||||
ps_lengths=bytes(),
|
ps_lengths=bytes(),
|
||||||
|
ps_max_l = 0,
|
||||||
ss_1byte_ch=bytes(),
|
ss_1byte_ch=bytes(),
|
||||||
|
ss_1byte_ch_l = 0,
|
||||||
ss_2byte_ch=bytes(),
|
ss_2byte_ch=bytes(),
|
||||||
|
ss_2byte_ch_l = 0,
|
||||||
ss_3byte_ch=bytes(),
|
ss_3byte_ch=bytes(),
|
||||||
|
ss_3byte_ch_l = 0,
|
||||||
ss_4byte_ch=bytes(),
|
ss_4byte_ch=bytes(),
|
||||||
|
ss_4byte_ch_l = 0,
|
||||||
ss_lengths=bytes(),
|
ss_lengths=bytes(),
|
||||||
|
ss_max_l = 0
|
||||||
)
|
)
|
||||||
|
|
||||||
assert hashes[0][0] == _get_unsigned_32_bit_hash("sp𐌞cé"[:p_length])
|
assert hashes[0][0] == _get_32_bit_hash("sp𐌞cé"[:p_length])
|
||||||
assert hashes[0][1] == _get_unsigned_32_bit_hash("sp𐌞cé"[-s_length:])
|
assert hashes[0][1] == _get_32_bit_hash("sp𐌞cé"[-s_length:])
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("case_sensitive", [True, False])
|
@pytest.mark.parametrize("case_sensitive", [True, False])
|
||||||
|
@ -1161,49 +1195,61 @@ def test_get_character_combination_hashes_turkish_i_with_dot(en_tokenizer, case_
|
||||||
hashes = doc.get_character_combination_hashes(
|
hashes = doc.get_character_combination_hashes(
|
||||||
cs=case_sensitive,
|
cs=case_sensitive,
|
||||||
p_lengths=bytes((1,2,3,4,)),
|
p_lengths=bytes((1,2,3,4,)),
|
||||||
|
p_max_l = 4,
|
||||||
s_lengths=bytes((1,2,3,4,)),
|
s_lengths=bytes((1,2,3,4,)),
|
||||||
|
s_max_l = 4,
|
||||||
ps_1byte_ch=s1,
|
ps_1byte_ch=s1,
|
||||||
|
ps_1byte_ch_l = len(s1),
|
||||||
ps_2byte_ch=s2,
|
ps_2byte_ch=s2,
|
||||||
|
ps_2byte_ch_l = len(s2),
|
||||||
ps_3byte_ch=s3,
|
ps_3byte_ch=s3,
|
||||||
|
ps_3byte_ch_l = len(s3),
|
||||||
ps_4byte_ch=s4,
|
ps_4byte_ch=s4,
|
||||||
|
ps_4byte_ch_l = len(s4),
|
||||||
ps_lengths=bytes((1,2,3,4,)),
|
ps_lengths=bytes((1,2,3,4,)),
|
||||||
|
ps_max_l = 4,
|
||||||
ss_1byte_ch=s1,
|
ss_1byte_ch=s1,
|
||||||
|
ss_1byte_ch_l = len(s1),
|
||||||
ss_2byte_ch=s2,
|
ss_2byte_ch=s2,
|
||||||
|
ss_2byte_ch_l = len(s2),
|
||||||
ss_3byte_ch=s3,
|
ss_3byte_ch=s3,
|
||||||
|
ss_3byte_ch_l = len(s3),
|
||||||
ss_4byte_ch=s4,
|
ss_4byte_ch=s4,
|
||||||
|
ss_4byte_ch_l = len(s4),
|
||||||
ss_lengths=bytes((1,2,3,4,)),
|
ss_lengths=bytes((1,2,3,4,)),
|
||||||
|
ss_max_l = 4
|
||||||
)
|
)
|
||||||
|
|
||||||
COMBINING_DOT_ABOVE = b"\xcc\x87".decode("UTF-8")
|
COMBINING_DOT_ABOVE = b"\xcc\x87".decode("UTF-8")
|
||||||
assert hashes[0][0] == _get_unsigned_32_bit_hash("i")
|
assert hashes[0][0] == _get_32_bit_hash("i")
|
||||||
assert hashes[0][1] == _get_unsigned_32_bit_hash("İ".lower())
|
assert hashes[0][1] == _get_32_bit_hash("İ".lower())
|
||||||
if case_sensitive:
|
if case_sensitive:
|
||||||
assert hashes[0][2] == _get_unsigned_32_bit_hash("İ".lower() + "İ")
|
assert hashes[0][2] == _get_32_bit_hash("İ".lower() + "İ")
|
||||||
assert hashes[0][3] == _get_unsigned_32_bit_hash("İ".lower() + "İ")
|
assert hashes[0][3] == _get_32_bit_hash("İ".lower() + "İ")
|
||||||
assert hashes[0][4] == _get_unsigned_32_bit_hash("İ")
|
assert hashes[0][4] == _get_32_bit_hash("İ")
|
||||||
assert hashes[0][5] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE + "İ")
|
assert hashes[0][5] == _get_32_bit_hash(COMBINING_DOT_ABOVE + "İ")
|
||||||
assert hashes[0][6] == _get_unsigned_32_bit_hash("İ".lower() + "İ")
|
assert hashes[0][6] == _get_32_bit_hash("İ".lower() + "İ")
|
||||||
assert hashes[0][7] == _get_unsigned_32_bit_hash("İ".lower() + "İ")
|
assert hashes[0][7] == _get_32_bit_hash("İ".lower() + "İ")
|
||||||
assert hashes[0][8] == _get_unsigned_32_bit_hash("İ")
|
assert hashes[0][8] == _get_32_bit_hash("İ")
|
||||||
assert hashes[0][9] == _get_unsigned_32_bit_hash("İ")
|
assert hashes[0][9] == _get_32_bit_hash("İ")
|
||||||
assert hashes[0][12] == _get_unsigned_32_bit_hash("İ")
|
assert hashes[0][12] == _get_32_bit_hash("İ")
|
||||||
assert hashes[0][13] == _get_unsigned_32_bit_hash("İ")
|
assert hashes[0][13] == _get_32_bit_hash("İ")
|
||||||
|
|
||||||
else:
|
else:
|
||||||
assert hashes[0][2] == _get_unsigned_32_bit_hash("İ".lower() + "i")
|
assert hashes[0][2] == _get_32_bit_hash("İ".lower() + "i")
|
||||||
assert hashes[0][3] == _get_unsigned_32_bit_hash("İ".lower() * 2)
|
assert hashes[0][3] == _get_32_bit_hash("İ".lower() * 2)
|
||||||
assert hashes[0][4] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE)
|
assert hashes[0][4] == _get_32_bit_hash(COMBINING_DOT_ABOVE)
|
||||||
assert hashes[0][5] == _get_unsigned_32_bit_hash("İ".lower())
|
assert hashes[0][5] == _get_32_bit_hash("İ".lower())
|
||||||
assert hashes[0][6] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE + "İ".lower())
|
assert hashes[0][6] == _get_32_bit_hash(COMBINING_DOT_ABOVE + "İ".lower())
|
||||||
assert hashes[0][7] == _get_unsigned_32_bit_hash("İ".lower() * 2)
|
assert hashes[0][7] == _get_32_bit_hash("İ".lower() * 2)
|
||||||
assert hashes[0][8] == _get_unsigned_32_bit_hash("i")
|
assert hashes[0][8] == _get_32_bit_hash("i")
|
||||||
assert hashes[0][9] == _get_unsigned_32_bit_hash("İ".lower())
|
assert hashes[0][9] == _get_32_bit_hash("İ".lower())
|
||||||
assert hashes[0][10] == _get_unsigned_32_bit_hash("İ".lower() + "i")
|
assert hashes[0][10] == _get_32_bit_hash("İ".lower() + "i")
|
||||||
assert hashes[0][11] == _get_unsigned_32_bit_hash("İ".lower() * 2)
|
assert hashes[0][11] == _get_32_bit_hash("İ".lower() * 2)
|
||||||
assert hashes[0][12] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE)
|
assert hashes[0][12] == _get_32_bit_hash(COMBINING_DOT_ABOVE)
|
||||||
assert hashes[0][13] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE + "i")
|
assert hashes[0][13] == _get_32_bit_hash(COMBINING_DOT_ABOVE + "i")
|
||||||
assert hashes[0][14] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE + "i" + COMBINING_DOT_ABOVE)
|
assert hashes[0][14] == _get_32_bit_hash(COMBINING_DOT_ABOVE + "i" + COMBINING_DOT_ABOVE)
|
||||||
assert hashes[0][15] == _get_unsigned_32_bit_hash((COMBINING_DOT_ABOVE + "i") * 2)
|
assert hashes[0][15] == _get_32_bit_hash((COMBINING_DOT_ABOVE + "i") * 2)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("case_sensitive", [True, False])
|
@pytest.mark.parametrize("case_sensitive", [True, False])
|
||||||
|
@ -1219,33 +1265,45 @@ def test_get_character_combination_hashes_string_store_spec_cases(en_tokenizer,
|
||||||
hashes = doc.get_character_combination_hashes(
|
hashes = doc.get_character_combination_hashes(
|
||||||
cs=case_sensitive,
|
cs=case_sensitive,
|
||||||
p_lengths=bytes((2,)),
|
p_lengths=bytes((2,)),
|
||||||
|
p_max_l = 2,
|
||||||
s_lengths=bytes((2,)),
|
s_lengths=bytes((2,)),
|
||||||
|
s_max_l = 2,
|
||||||
ps_1byte_ch=ps1,
|
ps_1byte_ch=ps1,
|
||||||
|
ps_1byte_ch_l = len(ps1),
|
||||||
ps_2byte_ch=ps2,
|
ps_2byte_ch=ps2,
|
||||||
|
ps_2byte_ch_l = len(ps2),
|
||||||
ps_3byte_ch=ps3,
|
ps_3byte_ch=ps3,
|
||||||
|
ps_3byte_ch_l = len(ps3),
|
||||||
ps_4byte_ch=ps4,
|
ps_4byte_ch=ps4,
|
||||||
|
ps_4byte_ch_l = len(ps4),
|
||||||
ps_lengths=bytes((2,)),
|
ps_lengths=bytes((2,)),
|
||||||
|
ps_max_l = 2,
|
||||||
ss_1byte_ch=bytes(),
|
ss_1byte_ch=bytes(),
|
||||||
|
ss_1byte_ch_l = 0,
|
||||||
ss_2byte_ch=bytes(),
|
ss_2byte_ch=bytes(),
|
||||||
|
ss_2byte_ch_l = 0,
|
||||||
ss_3byte_ch=bytes(),
|
ss_3byte_ch=bytes(),
|
||||||
|
ss_3byte_ch_l = 0,
|
||||||
ss_4byte_ch=bytes(),
|
ss_4byte_ch=bytes(),
|
||||||
|
ss_4byte_ch_l = 0,
|
||||||
ss_lengths=bytes(),
|
ss_lengths=bytes(),
|
||||||
|
ss_max_l = 0
|
||||||
)
|
)
|
||||||
assert hashes[0][0] == _get_unsigned_32_bit_hash("FL" if case_sensitive else "fl")
|
assert hashes[0][0] == _get_32_bit_hash("FL" if case_sensitive else "fl")
|
||||||
assert hashes[0][1] == _get_unsigned_32_bit_hash("19")
|
assert hashes[0][1] == _get_32_bit_hash("19")
|
||||||
assert hashes[0][2] == 0
|
assert hashes[0][2] == 0
|
||||||
assert hashes[1][0] == _get_unsigned_32_bit_hash("be")
|
assert hashes[1][0] == _get_32_bit_hash("be")
|
||||||
assert hashes[1][1] == _get_unsigned_32_bit_hash("ee")
|
assert hashes[1][1] == _get_32_bit_hash("ee")
|
||||||
if case_sensitive:
|
if case_sensitive:
|
||||||
assert hashes[1][2] == 0
|
assert hashes[1][2] == 0
|
||||||
else:
|
else:
|
||||||
assert hashes[1][2] == _get_unsigned_32_bit_hash("ee")
|
assert hashes[1][2] == _get_32_bit_hash("ee")
|
||||||
assert hashes[2][0] == hashes[3][0] == _get_unsigned_32_bit_hash("se")
|
assert hashes[2][0] == hashes[3][0] == _get_32_bit_hash("se")
|
||||||
assert hashes[2][1] == hashes[3][1] == _get_unsigned_32_bit_hash("ty")
|
assert hashes[2][1] == hashes[3][1] == _get_32_bit_hash("ty")
|
||||||
if case_sensitive:
|
if case_sensitive:
|
||||||
assert hashes[2][2] == hashes[3][2] == 0
|
assert hashes[2][2] == hashes[3][2] == 0
|
||||||
else:
|
else:
|
||||||
assert hashes[2][2] == hashes[3][2] == _get_unsigned_32_bit_hash("ee")
|
assert hashes[2][2] == hashes[3][2] == _get_32_bit_hash("ee")
|
||||||
|
|
||||||
|
|
||||||
def test_character_combination_hashes_empty_lengths(en_tokenizer):
|
def test_character_combination_hashes_empty_lengths(en_tokenizer):
|
||||||
|
@ -1253,15 +1311,27 @@ def test_character_combination_hashes_empty_lengths(en_tokenizer):
|
||||||
assert doc.get_character_combination_hashes(
|
assert doc.get_character_combination_hashes(
|
||||||
cs=True,
|
cs=True,
|
||||||
p_lengths=bytes(),
|
p_lengths=bytes(),
|
||||||
|
p_max_l = 0,
|
||||||
s_lengths=bytes(),
|
s_lengths=bytes(),
|
||||||
|
s_max_l = 0,
|
||||||
ps_1byte_ch=bytes(),
|
ps_1byte_ch=bytes(),
|
||||||
|
ps_1byte_ch_l=0,
|
||||||
ps_2byte_ch=bytes(),
|
ps_2byte_ch=bytes(),
|
||||||
|
ps_2byte_ch_l=0,
|
||||||
ps_3byte_ch=bytes(),
|
ps_3byte_ch=bytes(),
|
||||||
|
ps_3byte_ch_l=0,
|
||||||
ps_4byte_ch=bytes(),
|
ps_4byte_ch=bytes(),
|
||||||
|
ps_4byte_ch_l=0,
|
||||||
ps_lengths=bytes(),
|
ps_lengths=bytes(),
|
||||||
|
ps_max_l = 0,
|
||||||
ss_1byte_ch=bytes(),
|
ss_1byte_ch=bytes(),
|
||||||
|
ss_1byte_ch_l=0,
|
||||||
ss_2byte_ch=bytes(),
|
ss_2byte_ch=bytes(),
|
||||||
|
ss_2byte_ch_l=0,
|
||||||
ss_3byte_ch=bytes(),
|
ss_3byte_ch=bytes(),
|
||||||
|
ss_3byte_ch_l=0,
|
||||||
ss_4byte_ch=bytes(),
|
ss_4byte_ch=bytes(),
|
||||||
|
ss_4byte_ch_l=0,
|
||||||
ss_lengths=bytes(),
|
ss_lengths=bytes(),
|
||||||
|
ss_max_l = 0,
|
||||||
).shape == (1, 0)
|
).shape == (1, 0)
|
||||||
|
|
|
@ -38,20 +38,33 @@ cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
|
||||||
cdef int [:,:] _get_lca_matrix(Doc, int start, int end)
|
cdef int [:,:] _get_lca_matrix(Doc, int start, int end)
|
||||||
|
|
||||||
|
|
||||||
cdef void _set_affix_lengths(
|
cdef void _set_prefix_lengths(
|
||||||
const unsigned char* tok_str,
|
const unsigned char* tok_str,
|
||||||
unsigned char* aff_l_buf,
|
const int tok_str_l,
|
||||||
const int pref_l,
|
unsigned char* pref_l_buf,
|
||||||
const int suff_l,
|
const int p_max_l,
|
||||||
) nogil
|
) nogil
|
||||||
|
|
||||||
|
|
||||||
|
cdef void _set_suffix_lengths(
|
||||||
|
const unsigned char* tok_str,
|
||||||
|
const int tok_str_l,
|
||||||
|
unsigned char* suff_l_buf,
|
||||||
|
const int s_max_l,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
cdef void _search_for_chars(
|
cdef void _search_for_chars(
|
||||||
const unsigned char* tok_str,
|
const unsigned char* tok_str,
|
||||||
|
const int tok_str_l,
|
||||||
const unsigned char* s_1byte_ch,
|
const unsigned char* s_1byte_ch,
|
||||||
|
const int s_1byte_ch_l,
|
||||||
const unsigned char* s_2byte_ch,
|
const unsigned char* s_2byte_ch,
|
||||||
|
const int s_2byte_ch_l,
|
||||||
const unsigned char* s_3byte_ch,
|
const unsigned char* s_3byte_ch,
|
||||||
|
const int s_3byte_ch_l,
|
||||||
const unsigned char* s_4byte_ch,
|
const unsigned char* s_4byte_ch,
|
||||||
|
const int s_4byte_ch_l,
|
||||||
unsigned char* res_buf,
|
unsigned char* res_buf,
|
||||||
int max_res_l,
|
int max_res_l,
|
||||||
unsigned char* l_buf,
|
unsigned char* l_buf,
|
||||||
|
@ -59,6 +72,18 @@ cdef void _search_for_chars(
|
||||||
) nogil
|
) nogil
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
cdef int _write_hashes(
|
||||||
|
const unsigned char* res_buf,
|
||||||
|
const unsigned char* aff_l_buf,
|
||||||
|
const unsigned char* offset_buf,
|
||||||
|
const int end_idx,
|
||||||
|
np.ndarray[np.int64_t, ndim=2] hashes,
|
||||||
|
const int tok_i,
|
||||||
|
const int start_hash_idx,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
cdef class Doc:
|
cdef class Doc:
|
||||||
cdef readonly Pool mem
|
cdef readonly Pool mem
|
||||||
cdef readonly Vocab vocab
|
cdef readonly Vocab vocab
|
||||||
|
|
|
@ -179,17 +179,58 @@ class Doc:
|
||||||
*,
|
*,
|
||||||
cs: bool,
|
cs: bool,
|
||||||
p_lengths: bytes,
|
p_lengths: bytes,
|
||||||
|
p_max_l: int,
|
||||||
s_lengths: bytes,
|
s_lengths: bytes,
|
||||||
|
s_max_l: int,
|
||||||
ps_1byte_ch: bytes,
|
ps_1byte_ch: bytes,
|
||||||
|
ps_1_byte_ch_l: int,
|
||||||
ps_2byte_ch: bytes,
|
ps_2byte_ch: bytes,
|
||||||
|
ps_2_byte_ch_l: int,
|
||||||
ps_3byte_ch: bytes,
|
ps_3byte_ch: bytes,
|
||||||
|
ps_3_byte_ch_l: int,
|
||||||
ps_4byte_ch: bytes,
|
ps_4byte_ch: bytes,
|
||||||
|
ps_4_byte_ch_l: int,
|
||||||
ps_lengths: bytes,
|
ps_lengths: bytes,
|
||||||
|
ps_max_l: int,
|
||||||
ss_1byte_ch: bytes,
|
ss_1byte_ch: bytes,
|
||||||
|
ss_1_byte_ch_l: int,
|
||||||
ss_2byte_ch: bytes,
|
ss_2byte_ch: bytes,
|
||||||
|
ss_2_byte_ch_l: int,
|
||||||
ss_3byte_ch: bytes,
|
ss_3byte_ch: bytes,
|
||||||
|
ss_3_byte_ch_l: int,
|
||||||
ss_4byte_ch: bytes,
|
ss_4byte_ch: bytes,
|
||||||
|
ss_4_byte_ch_l: int,
|
||||||
ss_lengths: bytes,
|
ss_lengths: bytes,
|
||||||
|
ss_max_l: int,
|
||||||
) -> Ints2d: ...
|
) -> Ints2d: ...
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _get_array_attrs() -> Tuple[Any]: ...
|
def _get_array_attrs() -> Tuple[Any]: ...
|
||||||
|
|
||||||
|
def get_character_combination_hashes(self,
|
||||||
|
*,
|
||||||
|
const bint cs,
|
||||||
|
const unsigned char* p_lengths,
|
||||||
|
const int p_max_l,
|
||||||
|
const unsigned char* s_lengths,
|
||||||
|
const int s_max_l,
|
||||||
|
const unsigned char* ps_1byte_ch,
|
||||||
|
const int ps_1_byte_ch_l,
|
||||||
|
const unsigned char* ps_2byte_ch,
|
||||||
|
const int ps_2_byte_ch_l,
|
||||||
|
const unsigned char* ps_3byte_ch,
|
||||||
|
const int ps_3_byte_ch_l,
|
||||||
|
const unsigned char* ps_4byte_ch,
|
||||||
|
const int ps_4_byte_ch_l,
|
||||||
|
const unsigned char* ps_lengths,
|
||||||
|
const int ps_max_l,
|
||||||
|
const unsigned char* ss_1byte_ch,
|
||||||
|
const int ss_1_byte_ch_l,
|
||||||
|
const unsigned char* ss_2byte_ch,
|
||||||
|
const int ss_2_byte_ch_l,
|
||||||
|
const unsigned char* ss_3byte_ch,
|
||||||
|
const int ss_3_byte_ch_l,
|
||||||
|
const unsigned char* ss_4byte_ch,
|
||||||
|
const int ss_4_byte_ch_l,
|
||||||
|
const unsigned char* ss_lengths,
|
||||||
|
const int ss_max_l,
|
||||||
|
)
|
|
@ -956,7 +956,7 @@ cdef class Doc:
|
||||||
cdef int i, j
|
cdef int i, j
|
||||||
cdef attr_id_t feature
|
cdef attr_id_t feature
|
||||||
cdef np.ndarray[attr_t, ndim=2] output
|
cdef np.ndarray[attr_t, ndim=2] output
|
||||||
# Handle scalar/list inputs of strings/ints for py_attr_ids
|
# Handle scalar/list inputs of strings/ints for py_attr_ids
|
||||||
# See also #3064
|
# See also #3064
|
||||||
if isinstance(py_attr_ids, str):
|
if isinstance(py_attr_ids, str):
|
||||||
# Handle inputs like doc.to_array('ORTH')
|
# Handle inputs like doc.to_array('ORTH')
|
||||||
|
@ -1735,22 +1735,34 @@ cdef class Doc:
|
||||||
j += 1
|
j += 1
|
||||||
return output
|
return output
|
||||||
|
|
||||||
@cython.boundscheck(False) # Deactivate bounds checking
|
#@cython.boundscheck(False) # Deactivate bounds checking
|
||||||
def get_character_combination_hashes(self,
|
def get_character_combination_hashes(self,
|
||||||
*,
|
*,
|
||||||
const bint cs,
|
const bint cs,
|
||||||
const unsigned char* p_lengths,
|
const unsigned char* p_lengths,
|
||||||
|
const int p_max_l,
|
||||||
const unsigned char* s_lengths,
|
const unsigned char* s_lengths,
|
||||||
|
const int s_max_l,
|
||||||
const unsigned char* ps_1byte_ch,
|
const unsigned char* ps_1byte_ch,
|
||||||
|
const int ps_1byte_ch_l,
|
||||||
const unsigned char* ps_2byte_ch,
|
const unsigned char* ps_2byte_ch,
|
||||||
|
const int ps_2byte_ch_l,
|
||||||
const unsigned char* ps_3byte_ch,
|
const unsigned char* ps_3byte_ch,
|
||||||
|
const int ps_3byte_ch_l,
|
||||||
const unsigned char* ps_4byte_ch,
|
const unsigned char* ps_4byte_ch,
|
||||||
|
const int ps_4byte_ch_l,
|
||||||
const unsigned char* ps_lengths,
|
const unsigned char* ps_lengths,
|
||||||
|
const int ps_max_l,
|
||||||
const unsigned char* ss_1byte_ch,
|
const unsigned char* ss_1byte_ch,
|
||||||
|
const int ss_1byte_ch_l,
|
||||||
const unsigned char* ss_2byte_ch,
|
const unsigned char* ss_2byte_ch,
|
||||||
|
const int ss_2byte_ch_l,
|
||||||
const unsigned char* ss_3byte_ch,
|
const unsigned char* ss_3byte_ch,
|
||||||
|
const int ss_3byte_ch_l,
|
||||||
const unsigned char* ss_4byte_ch,
|
const unsigned char* ss_4byte_ch,
|
||||||
|
const int ss_4byte_ch_l,
|
||||||
const unsigned char* ss_lengths,
|
const unsigned char* ss_lengths,
|
||||||
|
const int ss_max_l,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations
|
Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations
|
||||||
|
@ -1766,39 +1778,33 @@ cdef class Doc:
|
||||||
cs: if *False*, hashes are generated based on the lower-case version of each token.
|
cs: if *False*, hashes are generated based on the lower-case version of each token.
|
||||||
p_lengths: an array of single-byte values specifying the lengths of prefixes to be hashed in ascending order.
|
p_lengths: an array of single-byte values specifying the lengths of prefixes to be hashed in ascending order.
|
||||||
For example, if *p_lengths==[2, 3]*, the prefixes hashed for "spaCy" would be "sp" and "spa".
|
For example, if *p_lengths==[2, 3]*, the prefixes hashed for "spaCy" would be "sp" and "spa".
|
||||||
|
p_max_l: the value of *p_lengths[-1]*, or *0* if *p_lengths==None*. Passed in for speed.
|
||||||
s_lengths: an array of single-byte values specifying the lengths of suffixes to be hashed in ascending order.
|
s_lengths: an array of single-byte values specifying the lengths of suffixes to be hashed in ascending order.
|
||||||
For example, if *s_lengths==[2, 3]* and *cs == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy".
|
For example, if *s_lengths==[2, 3]* and *cs == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy".
|
||||||
|
s_max_l: the value of *s_lengths[-1]*, or *0* if *s_lengths==None*. Passed in for speed.
|
||||||
ps_<n>byte_ch: a byte array containing in order n-byte-wide characters to search for within each token,
|
ps_<n>byte_ch: a byte array containing in order n-byte-wide characters to search for within each token,
|
||||||
starting at the beginning.
|
starting at the beginning.
|
||||||
|
ps_<n>byte_ch_l: the length of *ps_<n>byte_ch*. Passed in for speed.
|
||||||
ps_lengths: an array of single-byte values specifying the lengths of search results (from the beginning) to be hashed
|
ps_lengths: an array of single-byte values specifying the lengths of search results (from the beginning) to be hashed
|
||||||
in ascending order. For example, if *ps_lengths==[1, 2]*, *ps_search=="aC"* and *cs==False*, the searched strings
|
in ascending order. For example, if *ps_lengths==[1, 2]*, *ps_search=="aC"* and *cs==False*, the searched strings
|
||||||
hashed for "spaCy" would be "a" and "ac".
|
hashed for "spaCy" would be "a" and "ac".
|
||||||
|
ps_max_l: the value of *ps_lengths[-1]*, or *0* if *ps_lengths==None*. Passed in for speed.
|
||||||
ss_<n>byte_ch: a byte array containing in order n-byte-wide characters to search for within each token,
|
ss_<n>byte_ch: a byte array containing in order n-byte-wide characters to search for within each token,
|
||||||
starting at the end.
|
starting at the end.
|
||||||
|
ss_<n>byte_ch_l: the length of *ss_<n>byte_ch*. Passed in for speed.
|
||||||
ss_lengths: an array of single-byte values specifying the lengths of search results (from the end) to be hashed
|
ss_lengths: an array of single-byte values specifying the lengths of search results (from the end) to be hashed
|
||||||
in ascending order. For example, if *ss_lengths==[1, 2]*, *ss_search=="aC"* and *cs==False*, the searched strings
|
in ascending order. For example, if *ss_lengths==[1, 2]*, *ss_search=="aC"* and *cs==False*, the searched strings
|
||||||
hashed for "spaCy" would be "c" and "ca".
|
hashed for "spaCy" would be "c" and "ca".
|
||||||
|
ss_max_l: the value of *ss_lengths[-1]*, or *0* if *ss_lengths==None*. Passed in for speed.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Define the result array and work out what is used for what in axis 1
|
cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty(
|
||||||
cdef int num_toks = len(self)
|
(self.length, p_max_l + s_max_l + ps_max_l + ss_max_l), dtype="int64")
|
||||||
cdef int p_h_num = strlen(<char*> p_lengths)
|
|
||||||
cdef int s_h_num = strlen(<char*> s_lengths), s_h_end = p_h_num + s_h_num
|
|
||||||
cdef int ps_h_num = strlen(<char*> ps_lengths), ps_h_end = s_h_end + ps_h_num
|
|
||||||
cdef int ss_h_num = strlen(<char*> ss_lengths), ss_h_end = ps_h_end + ss_h_num
|
|
||||||
cdef np.ndarray[np.int64_t, ndim=2] hashes
|
|
||||||
hashes = numpy.empty((num_toks, ss_h_end), dtype="int64")
|
|
||||||
|
|
||||||
# Determine the maximum possible lengths of the affixes to work out how big the buffers need to be
|
|
||||||
cdef int p_max_l = p_lengths[-1] if p_h_num > 0 else 0
|
|
||||||
cdef int s_max_l = s_lengths[-1] if s_h_num > 0 else 0
|
|
||||||
cdef int ps_max_l = ps_lengths[-1] if ps_h_num > 0 else 0
|
|
||||||
cdef int ss_max_l = ss_lengths[-1] if ss_h_num > 0 else 0
|
|
||||||
|
|
||||||
# Define / allocate buffers
|
# Define / allocate buffers
|
||||||
cdef Pool mem = Pool()
|
cdef Pool mem = Pool()
|
||||||
cdef int aff_l = p_max_l + s_max_l
|
cdef unsigned char* pref_l_buf = <unsigned char*> mem.alloc(p_max_l, 1)
|
||||||
cdef unsigned char* aff_l_buf = <unsigned char*> mem.alloc(aff_l, 1)
|
cdef unsigned char* suff_l_buf = <unsigned char*> mem.alloc(p_max_l, 1)
|
||||||
cdef unsigned char* ps_res_buf = <unsigned char*> mem.alloc(ps_max_l, 4)
|
cdef unsigned char* ps_res_buf = <unsigned char*> mem.alloc(ps_max_l, 4)
|
||||||
cdef unsigned char* ps_l_buf = <unsigned char*> mem.alloc(ps_max_l, 1)
|
cdef unsigned char* ps_l_buf = <unsigned char*> mem.alloc(ps_max_l, 1)
|
||||||
cdef unsigned char* ss_res_buf = <unsigned char*> mem.alloc(ss_max_l, 4)
|
cdef unsigned char* ss_res_buf = <unsigned char*> mem.alloc(ss_max_l, 4)
|
||||||
|
@ -1806,48 +1812,34 @@ cdef class Doc:
|
||||||
|
|
||||||
# Define working variables
|
# Define working variables
|
||||||
cdef TokenC tok_c
|
cdef TokenC tok_c
|
||||||
cdef int tok_i, offset
|
cdef int hash_idx, tok_i, tok_str_l
|
||||||
cdef uint64_t hash_val = 0
|
|
||||||
cdef attr_t num_tok_attr
|
cdef attr_t num_tok_attr
|
||||||
cdef const unsigned char* tok_str
|
cdef const unsigned char* tok_str
|
||||||
|
|
||||||
for tok_i in range(num_toks):
|
for tok_i in range(self.length):
|
||||||
tok_c = self.c[tok_i]
|
tok_c = self.c[tok_i]
|
||||||
num_tok_attr = tok_c.lex.orth if cs else tok_c.lex.lower
|
num_tok_attr = tok_c.lex.orth if cs else tok_c.lex.lower
|
||||||
tok_str = self.vocab.strings.utf8_ptr(num_tok_attr)
|
tok_str = self.vocab.strings.utf8_ptr(num_tok_attr)
|
||||||
|
tok_str_l = strlen(<char*> tok_str)
|
||||||
|
hash_idx = 0
|
||||||
|
|
||||||
if aff_l > 0:
|
if p_max_l > 0:
|
||||||
_set_affix_lengths(tok_str, aff_l_buf, p_max_l, s_max_l)
|
_set_prefix_lengths(tok_str, tok_str_l, pref_l_buf, p_max_l)
|
||||||
|
hash_idx = _write_hashes(tok_str, p_lengths, pref_l_buf, 0, hashes, tok_i, 0)
|
||||||
|
|
||||||
for hash_idx in range(p_h_num):
|
if s_max_l > 0:
|
||||||
offset = aff_l_buf[p_lengths[hash_idx] - 1]
|
_set_suffix_lengths(tok_str, tok_str_l, suff_l_buf, s_max_l)
|
||||||
if offset > 0:
|
hash_idx = _write_hashes(tok_str, s_lengths, suff_l_buf, tok_str_l, hashes, tok_i, hash_idx)
|
||||||
hash_val = hash32(<void*> &tok_str[0], offset, 0)
|
|
||||||
hashes[tok_i, hash_idx] = hash_val
|
|
||||||
|
|
||||||
for hash_idx in range(p_h_num, s_h_end):
|
if ps_max_l > 0:
|
||||||
offset = aff_l_buf[s_lengths[hash_idx - p_h_num] + p_max_l - 1]
|
_search_for_chars(tok_str, tok_str_l, ps_1byte_ch, ps_1byte_ch_l, ps_2byte_ch, ps_2byte_ch_l,
|
||||||
if offset > 0:
|
ps_3byte_ch, ps_3byte_ch_l, ps_4byte_ch, ps_4byte_ch_l, ps_res_buf, ps_max_l, ps_l_buf, False)
|
||||||
hash_val = hash32(<void*> &tok_str[len(tok_str) - offset], offset, 0)
|
hash_idx = _write_hashes(ps_res_buf, ps_lengths, ps_l_buf, 0, hashes, tok_i, hash_idx)
|
||||||
hashes[tok_i, hash_idx] = hash_val
|
|
||||||
|
|
||||||
if ps_h_num > 0:
|
if ss_max_l > 0:
|
||||||
_search_for_chars(tok_str, ps_1byte_ch, ps_2byte_ch, ps_3byte_ch, ps_4byte_ch, ps_res_buf, ps_max_l, ps_l_buf, False)
|
_search_for_chars(tok_str, tok_str_l, ss_1byte_ch, ss_1byte_ch_l, ss_2byte_ch, ss_2byte_ch_l,
|
||||||
hash_val = 0
|
ss_3byte_ch, ss_3byte_ch_l, ss_4byte_ch, ss_4byte_ch_l, ss_res_buf, ss_max_l, ss_l_buf, True)
|
||||||
for hash_idx in range(s_h_end, ps_h_end):
|
_write_hashes(ss_res_buf, ss_lengths, ss_l_buf, 0, hashes, tok_i, hash_idx)
|
||||||
offset = ps_l_buf[ps_lengths[hash_idx - s_h_end] - 1]
|
|
||||||
if offset > 0:
|
|
||||||
hash_val = hash32(ps_res_buf, offset, 0)
|
|
||||||
hashes[tok_i, hash_idx] = hash_val
|
|
||||||
|
|
||||||
if ss_h_num > 0:
|
|
||||||
_search_for_chars(tok_str, ss_1byte_ch, ss_2byte_ch, ss_3byte_ch, ss_4byte_ch, ss_res_buf, ss_max_l, ss_l_buf, True)
|
|
||||||
hash_val = 0
|
|
||||||
for hash_idx in range(ps_h_end, ss_h_end):
|
|
||||||
offset = ss_l_buf[ss_lengths[hash_idx - ps_h_end] - 1]
|
|
||||||
if offset > 0:
|
|
||||||
hash_val = hash32(ss_res_buf, offset, 0)
|
|
||||||
hashes[tok_i, hash_idx] = hash_val
|
|
||||||
|
|
||||||
return hashes
|
return hashes
|
||||||
|
|
||||||
|
@ -2031,59 +2023,81 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
|
||||||
lca_matrix[k, j] = lca - start
|
lca_matrix[k, j] = lca - start
|
||||||
return lca_matrix
|
return lca_matrix
|
||||||
|
|
||||||
@cython.boundscheck(False) # Deactivate bounds checking
|
#@cython.boundscheck(False) # Deactivate bounds checking
|
||||||
cdef void _set_affix_lengths(
|
cdef void _set_prefix_lengths(
|
||||||
const unsigned char* tok_str,
|
const unsigned char* tok_str,
|
||||||
unsigned char* aff_l_buf,
|
const int tok_str_l,
|
||||||
const int pref_l,
|
unsigned char* pref_l_buf,
|
||||||
const int suff_l,
|
const int p_max_l,
|
||||||
) nogil:
|
) nogil:
|
||||||
""" Populate *aff_l_buf*, which has length *pref_l+suff_l* with the byte lengths of the first *pref_l* and the last
|
""" Populate *pref_l_buf*, which has length *p_max_l*, with the byte lengths of the first *p_max_l* characters within *tok_str*.
|
||||||
*suff_l* characters within *tok_str*. Lengths that are greater than the character length of the whole word are
|
Lengths that are greater than the character length of the whole word are populated with the byte length of the whole word.
|
||||||
populated with the byte length of the whole word.
|
|
||||||
|
|
||||||
tok_str: a memoryview of a UTF-8 representation of a string.
|
tok_str: a UTF-8 representation of a string.
|
||||||
aff_l_buf: a buffer of length *pref_l+suff_l* in which to store the lengths. The calling code ensures that lengths
|
tok_str_l: the length of *tok_str*.
|
||||||
|
pref_l_buf: a buffer of length *p_max_l* in which to store the lengths. The calling code ensures that lengths
|
||||||
greater than 255 cannot occur.
|
greater than 255 cannot occur.
|
||||||
pref_l: the number of characters to process at the beginning of the word.
|
p_max_l: the number of characters to process at the beginning of the word.
|
||||||
suff_l: the number of characters to process at the end of the word.
|
|
||||||
"""
|
"""
|
||||||
cdef int tok_str_idx = 1, aff_l_buf_idx = 0, tok_str_l = strlen(<char*> tok_str)
|
cdef int tok_str_idx = 1, pref_l_buf_idx = 0
|
||||||
|
|
||||||
while aff_l_buf_idx < pref_l:
|
while pref_l_buf_idx < p_max_l:
|
||||||
if (tok_str_idx == strlen(<char*> tok_str)
|
if (tok_str[tok_str_idx] == 0 # end of string
|
||||||
or
|
or
|
||||||
((tok_str[tok_str_idx] & 0xc0) != 0x80 # not a continuation character
|
((tok_str[tok_str_idx] & 0xc0) != 0x80) # not a continuation character
|
||||||
):
|
):
|
||||||
aff_l_buf[aff_l_buf_idx] = tok_str_idx
|
pref_l_buf[pref_l_buf_idx] = tok_str_idx
|
||||||
aff_l_buf_idx += 1
|
pref_l_buf_idx += 1
|
||||||
tok_str_idx += 1
|
if tok_str[tok_str_idx] == 0: # end of string
|
||||||
if tok_str_idx > tok_str_l:
|
|
||||||
break
|
break
|
||||||
|
tok_str_idx += 1
|
||||||
|
|
||||||
if aff_l_buf_idx < pref_l:
|
if pref_l_buf_idx < p_max_l:
|
||||||
memset(aff_l_buf + aff_l_buf_idx, aff_l_buf[aff_l_buf_idx - 1], pref_l - aff_l_buf_idx)
|
memset(pref_l_buf + pref_l_buf_idx, pref_l_buf[pref_l_buf_idx - 1], p_max_l - pref_l_buf_idx)
|
||||||
aff_l_buf_idx = pref_l
|
|
||||||
|
|
||||||
tok_str_idx = tok_str_l - 1
|
|
||||||
while aff_l_buf_idx < pref_l + suff_l:
|
#@cython.boundscheck(False) # Deactivate bounds checking
|
||||||
|
cdef void _set_suffix_lengths(
|
||||||
|
const unsigned char* tok_str,
|
||||||
|
const int tok_str_l,
|
||||||
|
unsigned char* suff_l_buf,
|
||||||
|
const int s_max_l,
|
||||||
|
):
|
||||||
|
""" Populate *suff_l_buf*, which has length *s_max_l*, with the byte lengths of the last *s_max_l* characters within *tok_str*.
|
||||||
|
Lengths that are greater than the character length of the whole word are populated with the byte length of the whole word.
|
||||||
|
|
||||||
|
tok_str: a UTF-8 representation of a string.
|
||||||
|
tok_str_l: the length of *tok_str*.
|
||||||
|
suff_l_buf: a buffer of length *s_max_l* in which to store the lengths. The calling code ensures that lengths
|
||||||
|
greater than 255 cannot occur.
|
||||||
|
s_max_l: the number of characters to process at the end of the word.
|
||||||
|
"""
|
||||||
|
cdef int tok_str_idx = tok_str_l - 1, suff_l_buf_idx = 0
|
||||||
|
|
||||||
|
while suff_l_buf_idx < s_max_l:
|
||||||
if (tok_str[tok_str_idx] & 0xc0) != 0x80: # not a continuation character
|
if (tok_str[tok_str_idx] & 0xc0) != 0x80: # not a continuation character
|
||||||
aff_l_buf[aff_l_buf_idx] = tok_str_l - tok_str_idx
|
suff_l_buf[suff_l_buf_idx] = tok_str_l - tok_str_idx
|
||||||
aff_l_buf_idx += 1
|
suff_l_buf_idx += 1
|
||||||
tok_str_idx -= 1
|
tok_str_idx -= 1
|
||||||
if tok_str_idx < 0:
|
if tok_str_idx < 0:
|
||||||
break
|
break
|
||||||
|
|
||||||
if aff_l_buf_idx < pref_l + suff_l:
|
if suff_l_buf_idx < s_max_l:
|
||||||
memset(aff_l_buf + aff_l_buf_idx, aff_l_buf[aff_l_buf_idx - 1], pref_l + suff_l - aff_l_buf_idx)
|
memset(suff_l_buf + suff_l_buf_idx, suff_l_buf[suff_l_buf_idx - 1], s_max_l - suff_l_buf_idx)
|
||||||
|
|
||||||
@cython.boundscheck(False) # Deactivate bounds checking
|
|
||||||
|
#@cython.boundscheck(False) # Deactivate bounds checking
|
||||||
cdef void _search_for_chars(
|
cdef void _search_for_chars(
|
||||||
const unsigned char* tok_str,
|
const unsigned char* tok_str,
|
||||||
|
const int tok_str_l,
|
||||||
const unsigned char* s_1byte_ch,
|
const unsigned char* s_1byte_ch,
|
||||||
|
const int s_1byte_ch_l,
|
||||||
const unsigned char* s_2byte_ch,
|
const unsigned char* s_2byte_ch,
|
||||||
|
const int s_2byte_ch_l,
|
||||||
const unsigned char* s_3byte_ch,
|
const unsigned char* s_3byte_ch,
|
||||||
|
const int s_3byte_ch_l,
|
||||||
const unsigned char* s_4byte_ch,
|
const unsigned char* s_4byte_ch,
|
||||||
|
const int s_4byte_ch_l,
|
||||||
unsigned char* res_buf,
|
unsigned char* res_buf,
|
||||||
int max_res_l,
|
int max_res_l,
|
||||||
unsigned char* l_buf,
|
unsigned char* l_buf,
|
||||||
|
@ -2096,6 +2110,7 @@ cdef void _search_for_chars(
|
||||||
which may be *0* if the search was not successful.
|
which may be *0* if the search was not successful.
|
||||||
|
|
||||||
tok_str: a UTF-8 representation of a string.
|
tok_str: a UTF-8 representation of a string.
|
||||||
|
tok_str_l: the length of *tok_str*.
|
||||||
s_<n>byte_ch: a byte array containing in order n-byte-wide characters to search for.
|
s_<n>byte_ch: a byte array containing in order n-byte-wide characters to search for.
|
||||||
res_buf: the buffer in which to place the search results.
|
res_buf: the buffer in which to place the search results.
|
||||||
max_res_l: the maximum number of found characters to place in *res_buf*.
|
max_res_l: the maximum number of found characters to place in *res_buf*.
|
||||||
|
@ -2103,7 +2118,7 @@ cdef void _search_for_chars(
|
||||||
The calling code ensures that lengths greater than 255 cannot occur.
|
The calling code ensures that lengths greater than 255 cannot occur.
|
||||||
suffs_not_prefs: if *True*, searching starts from the end of the word; if *False*, from the beginning.
|
suffs_not_prefs: if *True*, searching starts from the end of the word; if *False*, from the beginning.
|
||||||
"""
|
"""
|
||||||
cdef int tok_str_l = strlen(<char*> tok_str), res_buf_idx = 0, l_buf_idx = 0, ch_wdth, tok_start_idx, search_char_idx
|
cdef int res_buf_idx = 0, l_buf_idx = 0, ch_wdth, tok_start_idx, search_char_idx
|
||||||
cdef int search_chars_l
|
cdef int search_chars_l
|
||||||
cdef const unsigned char* search_chars
|
cdef const unsigned char* search_chars
|
||||||
|
|
||||||
|
@ -2121,13 +2136,16 @@ cdef void _search_for_chars(
|
||||||
ch_wdth = last_tok_str_idx - this_tok_str_idx
|
ch_wdth = last_tok_str_idx - this_tok_str_idx
|
||||||
if ch_wdth == 1:
|
if ch_wdth == 1:
|
||||||
search_chars = s_1byte_ch
|
search_chars = s_1byte_ch
|
||||||
|
search_chars_l = s_1byte_ch_l
|
||||||
elif ch_wdth == 2:
|
elif ch_wdth == 2:
|
||||||
search_chars = s_2byte_ch
|
search_chars = s_2byte_ch
|
||||||
|
search_chars_l = s_2byte_ch_l
|
||||||
elif ch_wdth == 3:
|
elif ch_wdth == 3:
|
||||||
search_chars = s_3byte_ch
|
search_chars = s_3byte_ch
|
||||||
|
search_chars_l = s_3byte_ch_l
|
||||||
else:
|
else:
|
||||||
search_chars = s_4byte_ch
|
search_chars = s_4byte_ch
|
||||||
search_chars_l = strlen(<char*> search_chars)
|
search_chars_l = s_4byte_ch_l
|
||||||
tok_start_idx = this_tok_str_idx if suffs_not_prefs else last_tok_str_idx
|
tok_start_idx = this_tok_str_idx if suffs_not_prefs else last_tok_str_idx
|
||||||
|
|
||||||
search_char_idx = 0
|
search_char_idx = 0
|
||||||
|
@ -2157,6 +2175,43 @@ cdef void _search_for_chars(
|
||||||
memset(l_buf + l_buf_idx, res_buf_idx, max_res_l - l_buf_idx)
|
memset(l_buf + l_buf_idx, res_buf_idx, max_res_l - l_buf_idx)
|
||||||
|
|
||||||
|
|
||||||
|
cdef int _write_hashes(
|
||||||
|
const unsigned char* res_buf,
|
||||||
|
const unsigned char* aff_l_buf,
|
||||||
|
const unsigned char* offset_buf,
|
||||||
|
const int end_idx,
|
||||||
|
np.ndarray[np.int64_t, ndim=2] hashes,
|
||||||
|
const int tok_i,
|
||||||
|
const int start_hash_idx,
|
||||||
|
):
|
||||||
|
""" Write hashes for a token/rich property group combination.
|
||||||
|
|
||||||
|
res_buf: the string from which to generate the hash values.
|
||||||
|
aff_l_buf: one-byte lengths describing how many characters to hash.
|
||||||
|
offset_buf: one-byte lengths specifying the byte offset of each character within *res_buf*.
|
||||||
|
end_idx: if not *0*, the offset within *res_buf* that should end each affix being hashed;
|
||||||
|
if *0*, affixes start at the beginning of *res_buf* rather than ending at the end.
|
||||||
|
hashes: the 2D Numpy array in which the hashes are stored.
|
||||||
|
tok_i: the index of axis 0 of *hashes* to write to.
|
||||||
|
start_hash_idx: the index of axis 1 of *hashes* at which to start writing.
|
||||||
|
"""
|
||||||
|
|
||||||
|
cdef int offset, aff_l, hash_val = 0, hash_idx = start_hash_idx
|
||||||
|
|
||||||
|
while True:
|
||||||
|
aff_l = aff_l_buf[hash_idx - start_hash_idx]
|
||||||
|
if aff_l == 0:
|
||||||
|
return hash_idx
|
||||||
|
offset = offset_buf[aff_l - 1]
|
||||||
|
if offset > 0:
|
||||||
|
if end_idx != 0:
|
||||||
|
hash_val = hash32(<void*> res_buf + end_idx - offset, offset, 0)
|
||||||
|
else:
|
||||||
|
hash_val = hash32(<void*> res_buf, offset, 0)
|
||||||
|
hashes[tok_i, hash_idx] = hash_val
|
||||||
|
hash_idx += 1
|
||||||
|
|
||||||
|
|
||||||
def pickle_doc(doc):
|
def pickle_doc(doc):
|
||||||
bytes_data = doc.to_bytes(exclude=["vocab", "user_data", "user_hooks"])
|
bytes_data = doc.to_bytes(exclude=["vocab", "user_data", "user_hooks"])
|
||||||
hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks,
|
hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks,
|
||||||
|
|
Loading…
Reference in New Issue
Block a user