diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index e03692f66..61eab311b 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -994,10 +994,8 @@ def test_doc_spans_setdefault(en_tokenizer):
     assert len(doc.spans["key3"]) == 2
 
 
-def _get_unsigned_32_bit_hash(input: str) -> int:
+def _get_32_bit_hash(input: str) -> int:
     working_hash = hash(input.encode("UTF-8"))
-    if working_hash < 0:
-        working_hash = working_hash + (2 << 31)
     return working_hash
 
 
@@ -1009,79 +1007,91 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
     hashes = doc.get_character_combination_hashes(
         cs=case_sensitive,
         p_lengths=bytes((1, 3, 4,)),
+        p_max_l=4,
         s_lengths=bytes((2, 3, 4, 5,)),
+        s_max_l=5,
         ps_1byte_ch=ps1,
+        ps_1byte_ch_l=len(ps1),
         ps_2byte_ch=ps2,
+        ps_2byte_ch_l=len(ps2),
         ps_3byte_ch=ps3,
+        ps_3byte_ch_l=len(ps3),
         ps_4byte_ch=ps4,
+        ps_4byte_ch_l=len(ps4),
         ps_lengths=bytes((2,)),
+        ps_max_l=2,
         ss_1byte_ch=ss1,
+        ss_1byte_ch_l=len(ss1),
         ss_2byte_ch=ss2,
+        ss_2byte_ch_l=len(ss2),
         ss_3byte_ch=ss3,
+        ss_3byte_ch_l=len(ss3),
         ss_4byte_ch=ss4,
+        ss_4byte_ch_l=len(ss4),
         ss_lengths=bytes((1, 2,)),
+        ss_max_l=2,
     )
-    assert hashes[0][0] == _get_unsigned_32_bit_hash("s")
-    assert hashes[0][1] == _get_unsigned_32_bit_hash("spa")
-    assert hashes[0][2] == _get_unsigned_32_bit_hash(
+    assert hashes[0][0] == _get_32_bit_hash("s")
+    assert hashes[0][1] == _get_32_bit_hash("spa")
+    assert hashes[0][2] == _get_32_bit_hash(
         "spaC" if case_sensitive else "spac"
     )
-    assert hashes[0][3] == _get_unsigned_32_bit_hash("Cy" if case_sensitive else "cy")
-    assert hashes[0][4] == _get_unsigned_32_bit_hash("aCy" if case_sensitive else "acy")
-    assert hashes[0][5] == _get_unsigned_32_bit_hash(
+    assert hashes[0][3] == _get_32_bit_hash("Cy" if case_sensitive else "cy")
+    assert hashes[0][4] == _get_32_bit_hash("aCy" if case_sensitive else "acy")
+    assert hashes[0][5] == _get_32_bit_hash(
         "paCy" if case_sensitive else "pacy"
     )
-    assert hashes[0][6] == _get_unsigned_32_bit_hash(
+    assert hashes[0][6] == _get_32_bit_hash(
         "spaCy" if case_sensitive else "spacy"
     )
-    assert hashes[0][7] == _get_unsigned_32_bit_hash("p")
-    assert hashes[0][8] == _get_unsigned_32_bit_hash("p")
-    assert hashes[0][9] == _get_unsigned_32_bit_hash("p")
-    assert hashes[1][0] == _get_unsigned_32_bit_hash("✨")
-    assert hashes[1][1] == _get_unsigned_32_bit_hash("✨")
-    assert hashes[1][2] == _get_unsigned_32_bit_hash("✨")
-    assert hashes[1][3] == _get_unsigned_32_bit_hash("✨")
-    assert hashes[1][4] == _get_unsigned_32_bit_hash("✨")
-    assert hashes[1][5] == _get_unsigned_32_bit_hash("✨")
-    assert hashes[1][6] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[0][7] == _get_32_bit_hash("p")
+    assert hashes[0][8] == _get_32_bit_hash("p")
+    assert hashes[0][9] == _get_32_bit_hash("p")
+    assert hashes[1][0] == _get_32_bit_hash("✨")
+    assert hashes[1][1] == _get_32_bit_hash("✨")
+    assert hashes[1][2] == _get_32_bit_hash("✨")
+    assert hashes[1][3] == _get_32_bit_hash("✨")
+    assert hashes[1][4] == _get_32_bit_hash("✨")
+    assert hashes[1][5] == _get_32_bit_hash("✨")
+    assert hashes[1][6] == _get_32_bit_hash("✨")
     assert hashes[1][7] == 0
-    assert hashes[1][8] == _get_unsigned_32_bit_hash("✨")
-    assert hashes[1][9] == _get_unsigned_32_bit_hash("✨")
-    assert hashes[2][0] == _get_unsigned_32_bit_hash("a")
-    assert hashes[2][1] == _get_unsigned_32_bit_hash("and")
-    assert hashes[2][2] == _get_unsigned_32_bit_hash("and")
-    assert hashes[2][3] == _get_unsigned_32_bit_hash("nd")
-    assert hashes[2][4] == _get_unsigned_32_bit_hash("and")
-    assert hashes[2][5] == _get_unsigned_32_bit_hash("and")
-    assert hashes[2][6] == _get_unsigned_32_bit_hash("and")
+    assert hashes[1][8] == _get_32_bit_hash("✨")
+    assert hashes[1][9] == _get_32_bit_hash("✨")
+    assert hashes[2][0] == _get_32_bit_hash("a")
+    assert hashes[2][1] == _get_32_bit_hash("and")
+    assert hashes[2][2] == _get_32_bit_hash("and")
+    assert hashes[2][3] == _get_32_bit_hash("nd")
+    assert hashes[2][4] == _get_32_bit_hash("and")
+    assert hashes[2][5] == _get_32_bit_hash("and")
+    assert hashes[2][6] == _get_32_bit_hash("and")
     assert hashes[2][7] == 0
     assert hashes[2][8] == 0
     assert hashes[2][9] == 0
-    assert hashes[3][0] == _get_unsigned_32_bit_hash("P" if case_sensitive else "p")
-    assert hashes[3][1] == _get_unsigned_32_bit_hash("Pro" if case_sensitive else "pro")
-    assert hashes[3][2] == _get_unsigned_32_bit_hash(
+    assert hashes[3][0] == _get_32_bit_hash("P" if case_sensitive else "p")
+    assert hashes[3][1] == _get_32_bit_hash("Pro" if case_sensitive else "pro")
+    assert hashes[3][2] == _get_32_bit_hash(
         "Prod" if case_sensitive else "prod"
     )
-    assert hashes[3][3] == _get_unsigned_32_bit_hash("gy")
-    assert hashes[3][4] == _get_unsigned_32_bit_hash("igy")
-    assert hashes[3][5] == _get_unsigned_32_bit_hash("digy")
-    assert hashes[3][6] == _get_unsigned_32_bit_hash("odigy")
-    assert hashes[3][7] == 0 if case_sensitive else _get_unsigned_32_bit_hash("pr")
+    assert hashes[3][3] == _get_32_bit_hash("gy")
+    assert hashes[3][4] == _get_32_bit_hash("igy")
+    assert hashes[3][5] == _get_32_bit_hash("digy")
+    assert hashes[3][6] == _get_32_bit_hash("odigy")
+    assert hashes[3][7] == (0 if case_sensitive else _get_32_bit_hash("pr"))
-    assert hashes[3][8] == _get_unsigned_32_bit_hash("r")
+    assert hashes[3][8] == _get_32_bit_hash("r")
     if case_sensitive:
-        assert hashes[3][9] == _get_unsigned_32_bit_hash("r")
+        assert hashes[3][9] == _get_32_bit_hash("r")
     else:
-        assert hashes[3][9] == _get_unsigned_32_bit_hash("rp")
+        assert hashes[3][9] == _get_32_bit_hash("rp")
 
     # check values are the same cross-platform
     if case_sensitive:
-        assert hashes[0][2] == 3041529170
+        assert hashes[0][2] == -1253438126
     else:
-        assert hashes[0][2] == 2199614696
+        assert hashes[0][2] == -2095352600
     assert hashes[1][3] == 910783208
     assert hashes[3][8] == 1553167345
 
 
@@ -1092,40 +1102,52 @@ def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
     hashes = doc.get_character_combination_hashes(
         cs=False,
         p_lengths=bytes(),
+        p_max_l=0,
         s_lengths=bytes((2,3,4,5,)),
+        s_max_l=5,
         ps_1byte_ch=ps1,
+        ps_1byte_ch_l=len(ps1),
         ps_2byte_ch=ps2,
+        ps_2byte_ch_l=len(ps2),
         ps_3byte_ch=ps3,
+        ps_3byte_ch_l=len(ps3),
         ps_4byte_ch=ps4,
+        ps_4byte_ch_l=len(ps4),
         ps_lengths=bytes((2,)),
+        ps_max_l=2,
         ss_1byte_ch=bytes(),
+        ss_1byte_ch_l=0,
         ss_2byte_ch=bytes(),
+        ss_2byte_ch_l=0,
         ss_3byte_ch=bytes(),
+        ss_3byte_ch_l=0,
         ss_4byte_ch=bytes(),
+        ss_4byte_ch_l=0,
         ss_lengths=bytes(),
+        ss_max_l=0,
     )
-    assert hashes[0][0] == _get_unsigned_32_bit_hash("cy")
-    assert hashes[0][1] == _get_unsigned_32_bit_hash("acy")
-    assert hashes[0][2] == _get_unsigned_32_bit_hash("pacy")
-    assert hashes[0][3] == _get_unsigned_32_bit_hash("spacy")
-    assert hashes[0][4] == _get_unsigned_32_bit_hash("p")
-    assert hashes[1][0] == _get_unsigned_32_bit_hash("✨")
-    assert hashes[1][1] == _get_unsigned_32_bit_hash("✨")
-    assert hashes[1][2] == _get_unsigned_32_bit_hash("✨")
-    assert hashes[1][3] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[0][0] == _get_32_bit_hash("cy")
_get_32_bit_hash("cy") + assert hashes[0][1] == _get_32_bit_hash("acy") + assert hashes[0][2] == _get_32_bit_hash("pacy") + assert hashes[0][3] == _get_32_bit_hash("spacy") + assert hashes[0][4] == _get_32_bit_hash("p") + assert hashes[1][0] == _get_32_bit_hash("✨") + assert hashes[1][1] == _get_32_bit_hash("✨") + assert hashes[1][2] == _get_32_bit_hash("✨") + assert hashes[1][3] == _get_32_bit_hash("✨") assert hashes[1][4] == 0 - assert hashes[2][0] == _get_unsigned_32_bit_hash("nd") - assert hashes[2][1] == _get_unsigned_32_bit_hash("and") - assert hashes[2][2] == _get_unsigned_32_bit_hash("and") - assert hashes[2][3] == _get_unsigned_32_bit_hash("and") + assert hashes[2][0] == _get_32_bit_hash("nd") + assert hashes[2][1] == _get_32_bit_hash("and") + assert hashes[2][2] == _get_32_bit_hash("and") + assert hashes[2][3] == _get_32_bit_hash("and") assert hashes[2][4] == 0 - assert hashes[3][0] == _get_unsigned_32_bit_hash("gy") - assert hashes[3][1] == _get_unsigned_32_bit_hash("igy") - assert hashes[3][2] == _get_unsigned_32_bit_hash("digy") - assert hashes[3][3] == _get_unsigned_32_bit_hash("odigy") - assert hashes[3][4] == _get_unsigned_32_bit_hash("pr") + assert hashes[3][0] == _get_32_bit_hash("gy") + assert hashes[3][1] == _get_32_bit_hash("igy") + assert hashes[3][2] == _get_32_bit_hash("digy") + assert hashes[3][3] == _get_32_bit_hash("odigy") + assert hashes[3][4] == _get_32_bit_hash("pr") def test_get_character_combination_hashes_copying_in_middle(en_tokenizer): @@ -1137,21 +1159,33 @@ def test_get_character_combination_hashes_copying_in_middle(en_tokenizer): hashes = doc.get_character_combination_hashes( cs=False, p_lengths=bytes((p_length,)), + p_max_l = p_length, s_lengths=bytes((s_length,)), + s_max_l = s_length, ps_1byte_ch=bytes(), + ps_1byte_ch_l = 0, ps_2byte_ch=bytes(), + ps_2byte_ch_l = 0, ps_3byte_ch=bytes(), + ps_3byte_ch_l = 0, ps_4byte_ch=bytes(), + ps_4byte_ch_l = 0, ps_lengths=bytes(), + ps_max_l = 0, ss_1byte_ch=bytes(), + ss_1byte_ch_l = 0, ss_2byte_ch=bytes(), + ss_2byte_ch_l = 0, ss_3byte_ch=bytes(), + ss_3byte_ch_l = 0, ss_4byte_ch=bytes(), + ss_4byte_ch_l = 0, ss_lengths=bytes(), + ss_max_l = 0 ) - assert hashes[0][0] == _get_unsigned_32_bit_hash("sp𐌞cé"[:p_length]) - assert hashes[0][1] == _get_unsigned_32_bit_hash("sp𐌞cé"[-s_length:]) + assert hashes[0][0] == _get_32_bit_hash("sp𐌞cé"[:p_length]) + assert hashes[0][1] == _get_32_bit_hash("sp𐌞cé"[-s_length:]) @pytest.mark.parametrize("case_sensitive", [True, False]) @@ -1161,49 +1195,61 @@ def test_get_character_combination_hashes_turkish_i_with_dot(en_tokenizer, case_ hashes = doc.get_character_combination_hashes( cs=case_sensitive, p_lengths=bytes((1,2,3,4,)), + p_max_l = 4, s_lengths=bytes((1,2,3,4,)), + s_max_l = 4, ps_1byte_ch=s1, + ps_1byte_ch_l = len(s1), ps_2byte_ch=s2, + ps_2byte_ch_l = len(s2), ps_3byte_ch=s3, + ps_3byte_ch_l = len(s3), ps_4byte_ch=s4, + ps_4byte_ch_l = len(s4), ps_lengths=bytes((1,2,3,4,)), + ps_max_l = 4, ss_1byte_ch=s1, + ss_1byte_ch_l = len(s1), ss_2byte_ch=s2, + ss_2byte_ch_l = len(s2), ss_3byte_ch=s3, + ss_3byte_ch_l = len(s3), ss_4byte_ch=s4, + ss_4byte_ch_l = len(s4), ss_lengths=bytes((1,2,3,4,)), + ss_max_l = 4 ) COMBINING_DOT_ABOVE = b"\xcc\x87".decode("UTF-8") - assert hashes[0][0] == _get_unsigned_32_bit_hash("i") - assert hashes[0][1] == _get_unsigned_32_bit_hash("İ".lower()) + assert hashes[0][0] == _get_32_bit_hash("i") + assert hashes[0][1] == _get_32_bit_hash("İ".lower()) if case_sensitive: - assert hashes[0][2] == _get_unsigned_32_bit_hash("İ".lower() + "İ") - 
-        assert hashes[0][3] == _get_unsigned_32_bit_hash("İ".lower() + "İ")
-        assert hashes[0][4] == _get_unsigned_32_bit_hash("İ")
-        assert hashes[0][5] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE + "İ")
-        assert hashes[0][6] == _get_unsigned_32_bit_hash("İ".lower() + "İ")
-        assert hashes[0][7] == _get_unsigned_32_bit_hash("İ".lower() + "İ")
-        assert hashes[0][8] == _get_unsigned_32_bit_hash("İ")
-        assert hashes[0][9] == _get_unsigned_32_bit_hash("İ")
-        assert hashes[0][12] == _get_unsigned_32_bit_hash("İ")
-        assert hashes[0][13] == _get_unsigned_32_bit_hash("İ")
+        assert hashes[0][2] == _get_32_bit_hash("İ".lower() + "İ")
+        assert hashes[0][3] == _get_32_bit_hash("İ".lower() + "İ")
+        assert hashes[0][4] == _get_32_bit_hash("İ")
+        assert hashes[0][5] == _get_32_bit_hash(COMBINING_DOT_ABOVE + "İ")
+        assert hashes[0][6] == _get_32_bit_hash("İ".lower() + "İ")
+        assert hashes[0][7] == _get_32_bit_hash("İ".lower() + "İ")
+        assert hashes[0][8] == _get_32_bit_hash("İ")
+        assert hashes[0][9] == _get_32_bit_hash("İ")
+        assert hashes[0][12] == _get_32_bit_hash("İ")
+        assert hashes[0][13] == _get_32_bit_hash("İ")
     else:
-        assert hashes[0][2] == _get_unsigned_32_bit_hash("İ".lower() + "i")
-        assert hashes[0][3] == _get_unsigned_32_bit_hash("İ".lower() * 2)
-        assert hashes[0][4] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE)
-        assert hashes[0][5] == _get_unsigned_32_bit_hash("İ".lower())
-        assert hashes[0][6] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE + "İ".lower())
-        assert hashes[0][7] == _get_unsigned_32_bit_hash("İ".lower() * 2)
-        assert hashes[0][8] == _get_unsigned_32_bit_hash("i")
-        assert hashes[0][9] == _get_unsigned_32_bit_hash("İ".lower())
-        assert hashes[0][10] == _get_unsigned_32_bit_hash("İ".lower() + "i")
-        assert hashes[0][11] == _get_unsigned_32_bit_hash("İ".lower() * 2)
-        assert hashes[0][12] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE)
-        assert hashes[0][13] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE + "i")
-        assert hashes[0][14] == _get_unsigned_32_bit_hash(COMBINING_DOT_ABOVE + "i" + COMBINING_DOT_ABOVE)
-        assert hashes[0][15] == _get_unsigned_32_bit_hash((COMBINING_DOT_ABOVE + "i") * 2)
+        assert hashes[0][2] == _get_32_bit_hash("İ".lower() + "i")
+        assert hashes[0][3] == _get_32_bit_hash("İ".lower() * 2)
+        assert hashes[0][4] == _get_32_bit_hash(COMBINING_DOT_ABOVE)
+        assert hashes[0][5] == _get_32_bit_hash("İ".lower())
+        assert hashes[0][6] == _get_32_bit_hash(COMBINING_DOT_ABOVE + "İ".lower())
+        assert hashes[0][7] == _get_32_bit_hash("İ".lower() * 2)
+        assert hashes[0][8] == _get_32_bit_hash("i")
+        assert hashes[0][9] == _get_32_bit_hash("İ".lower())
+        assert hashes[0][10] == _get_32_bit_hash("İ".lower() + "i")
+        assert hashes[0][11] == _get_32_bit_hash("İ".lower() * 2)
+        assert hashes[0][12] == _get_32_bit_hash(COMBINING_DOT_ABOVE)
+        assert hashes[0][13] == _get_32_bit_hash(COMBINING_DOT_ABOVE + "i")
+        assert hashes[0][14] == _get_32_bit_hash(COMBINING_DOT_ABOVE + "i" + COMBINING_DOT_ABOVE)
+        assert hashes[0][15] == _get_32_bit_hash((COMBINING_DOT_ABOVE + "i") * 2)
 
 
 @pytest.mark.parametrize("case_sensitive", [True, False])
@@ -1219,33 +1265,45 @@ def test_get_character_combination_hashes_string_store_spec_cases(en_tokenizer,
     hashes = doc.get_character_combination_hashes(
         cs=case_sensitive,
         p_lengths=bytes((2,)),
+        p_max_l=2,
         s_lengths=bytes((2,)),
+        s_max_l=2,
         ps_1byte_ch=ps1,
+        ps_1byte_ch_l=len(ps1),
         ps_2byte_ch=ps2,
+        ps_2byte_ch_l=len(ps2),
         ps_3byte_ch=ps3,
+        ps_3byte_ch_l=len(ps3),
         ps_4byte_ch=ps4,
+        ps_4byte_ch_l=len(ps4),
         ps_lengths=bytes((2,)),
+        ps_max_l=2,
         ss_1byte_ch=bytes(),
+        ss_1byte_ch_l=0,
         ss_2byte_ch=bytes(),
+        ss_2byte_ch_l=0,
         ss_3byte_ch=bytes(),
+        ss_3byte_ch_l=0,
         ss_4byte_ch=bytes(),
+        ss_4byte_ch_l=0,
         ss_lengths=bytes(),
+        ss_max_l=0
     )
-    assert hashes[0][0] == _get_unsigned_32_bit_hash("FL" if case_sensitive else "fl")
-    assert hashes[0][1] == _get_unsigned_32_bit_hash("19")
+    assert hashes[0][0] == _get_32_bit_hash("FL" if case_sensitive else "fl")
+    assert hashes[0][1] == _get_32_bit_hash("19")
     assert hashes[0][2] == 0
-    assert hashes[1][0] == _get_unsigned_32_bit_hash("be")
-    assert hashes[1][1] == _get_unsigned_32_bit_hash("ee")
+    assert hashes[1][0] == _get_32_bit_hash("be")
+    assert hashes[1][1] == _get_32_bit_hash("ee")
     if case_sensitive:
         assert hashes[1][2] == 0
     else:
-        assert hashes[1][2] == _get_unsigned_32_bit_hash("ee")
-    assert hashes[2][0] == hashes[3][0] == _get_unsigned_32_bit_hash("se")
-    assert hashes[2][1] == hashes[3][1] == _get_unsigned_32_bit_hash("ty")
+        assert hashes[1][2] == _get_32_bit_hash("ee")
+    assert hashes[2][0] == hashes[3][0] == _get_32_bit_hash("se")
+    assert hashes[2][1] == hashes[3][1] == _get_32_bit_hash("ty")
     if case_sensitive:
         assert hashes[2][2] == hashes[3][2] == 0
     else:
-        assert hashes[2][2] == hashes[3][2] == _get_unsigned_32_bit_hash("ee")
+        assert hashes[2][2] == hashes[3][2] == _get_32_bit_hash("ee")
 
 
 def test_character_combination_hashes_empty_lengths(en_tokenizer):
@@ -1253,15 +1311,27 @@
     assert doc.get_character_combination_hashes(
         cs=True,
         p_lengths=bytes(),
+        p_max_l=0,
         s_lengths=bytes(),
+        s_max_l=0,
         ps_1byte_ch=bytes(),
+        ps_1byte_ch_l=0,
         ps_2byte_ch=bytes(),
+        ps_2byte_ch_l=0,
         ps_3byte_ch=bytes(),
+        ps_3byte_ch_l=0,
         ps_4byte_ch=bytes(),
+        ps_4byte_ch_l=0,
         ps_lengths=bytes(),
+        ps_max_l=0,
         ss_1byte_ch=bytes(),
+        ss_1byte_ch_l=0,
         ss_2byte_ch=bytes(),
+        ss_2byte_ch_l=0,
         ss_3byte_ch=bytes(),
+        ss_3byte_ch_l=0,
         ss_4byte_ch=bytes(),
+        ss_4byte_ch_l=0,
         ss_lengths=bytes(),
+        ss_max_l=0,
     ).shape == (1, 0)
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index 994020744..8888939df 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -38,20 +38,33 @@ cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
 
 cdef int [:,:] _get_lca_matrix(Doc, int start, int end)
 
-cdef void _set_affix_lengths(
+cdef void _set_prefix_lengths(
     const unsigned char* tok_str,
-    unsigned char* aff_l_buf,
-    const int pref_l,
-    const int suff_l,
+    const int tok_str_l,
+    unsigned char* pref_l_buf,
+    const int p_max_l,
 ) nogil
 
 
+cdef void _set_suffix_lengths(
+    const unsigned char* tok_str,
+    const int tok_str_l,
+    unsigned char* suff_l_buf,
+    const int s_max_l,
+) nogil
+
+
 cdef void _search_for_chars(
     const unsigned char* tok_str,
+    const int tok_str_l,
     const unsigned char* s_1byte_ch,
+    const int s_1byte_ch_l,
     const unsigned char* s_2byte_ch,
+    const int s_2byte_ch_l,
     const unsigned char* s_3byte_ch,
+    const int s_3byte_ch_l,
     const unsigned char* s_4byte_ch,
+    const int s_4byte_ch_l,
     unsigned char* res_buf,
     int max_res_l,
     unsigned char* l_buf,
@@ -59,6 +72,18 @@
 ) nogil
 
+
+cdef int _write_hashes(
+    const unsigned char* res_buf,
+    const unsigned char* aff_l_buf,
+    const unsigned char* offset_buf,
+    const int end_idx,
+    np.ndarray[np.int64_t, ndim=2] hashes,
+    const int tok_i,
+    const int start_hash_idx,
+)
+
+
 cdef class Doc:
     cdef readonly Pool mem
     cdef readonly Vocab vocab
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index df431b460..b27c68386 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -179,17 +179,29 @@ class Doc:
         *,
         cs: bool,
         p_lengths: bytes,
+        p_max_l: int,
        s_lengths: bytes,
+        s_max_l: int,
         ps_1byte_ch: bytes,
+        ps_1byte_ch_l: int,
         ps_2byte_ch: bytes,
+        ps_2byte_ch_l: int,
         ps_3byte_ch: bytes,
+        ps_3byte_ch_l: int,
         ps_4byte_ch: bytes,
+        ps_4byte_ch_l: int,
         ps_lengths: bytes,
+        ps_max_l: int,
         ss_1byte_ch: bytes,
+        ss_1byte_ch_l: int,
         ss_2byte_ch: bytes,
+        ss_2byte_ch_l: int,
         ss_3byte_ch: bytes,
+        ss_3byte_ch_l: int,
         ss_4byte_ch: bytes,
+        ss_4byte_ch_l: int,
         ss_lengths: bytes,
+        ss_max_l: int,
     ) -> Ints2d: ...
     @staticmethod
     def _get_array_attrs() -> Tuple[Any]: ...
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 4bea8d656..91836e15e 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1735,22 +1735,34 @@ cdef class Doc:
                 j += 1
         return output
 
     @cython.boundscheck(False) # Deactivate bounds checking
     def get_character_combination_hashes(self,
         *,
         const bint cs,
         const unsigned char* p_lengths,
+        const int p_max_l,
         const unsigned char* s_lengths,
+        const int s_max_l,
         const unsigned char* ps_1byte_ch,
+        const int ps_1byte_ch_l,
         const unsigned char* ps_2byte_ch,
+        const int ps_2byte_ch_l,
         const unsigned char* ps_3byte_ch,
+        const int ps_3byte_ch_l,
         const unsigned char* ps_4byte_ch,
+        const int ps_4byte_ch_l,
         const unsigned char* ps_lengths,
+        const int ps_max_l,
         const unsigned char* ss_1byte_ch,
+        const int ss_1byte_ch_l,
         const unsigned char* ss_2byte_ch,
+        const int ss_2byte_ch_l,
         const unsigned char* ss_3byte_ch,
+        const int ss_3byte_ch_l,
         const unsigned char* ss_4byte_ch,
+        const int ss_4byte_ch_l,
         const unsigned char* ss_lengths,
+        const int ss_max_l,
     ):
         """
        Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations
@@ -1766,39 +1778,33 @@ cdef class Doc:
         cs: if *False*, hashes are generated based on the lower-case version of each token.
         p_lengths: an array of single-byte values specifying the lengths of prefixes to be hashed in ascending order. For example, if *p_lengths==[2, 3]*, the prefixes hashed for "spaCy" would be "sp" and "spa".
+        p_max_l: the value of *p_lengths[-1]*, or *0* if *p_lengths* is empty. Passed in for speed.
         s_lengths: an array of single-byte values specifying the lengths of suffixes to be hashed in ascending order. For example, if *s_lengths==[2, 3]* and *cs == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy".
+        s_max_l: the value of *s_lengths[-1]*, or *0* if *s_lengths* is empty. Passed in for speed.
         ps_<n>byte_ch: a byte array containing in order n-byte-wide characters to search for within each token, starting at the beginning.
+        ps_<n>byte_ch_l: the length of *ps_<n>byte_ch*. Passed in for speed.
         ps_lengths: an array of single-byte values specifying the lengths of search results (from the beginning) to be hashed in ascending order. For example, if *ps_lengths==[1, 2]*, *ps_search=="aC"* and *cs==False*, the searched strings hashed for "spaCy" would be "a" and "ac".
+        ps_max_l: the value of *ps_lengths[-1]*, or *0* if *ps_lengths* is empty. Passed in for speed.
         ss_<n>byte_ch: a byte array containing in order n-byte-wide characters to search for within each token, starting at the end.
+        ss_<n>byte_ch_l: the length of *ss_<n>byte_ch*. Passed in for speed.
         ss_lengths: an array of single-byte values specifying the lengths of search results (from the end) to be hashed in ascending order. For example, if *ss_lengths==[1, 2]*, *ss_search=="aC"* and *cs==False*, the searched strings hashed for "spaCy" would be "c" and "ca".
+        ss_max_l: the value of *ss_lengths[-1]*, or *0* if *ss_lengths* is empty. Passed in for speed.
         """
-        # Define the result array and work out what is used for what in axis 1
-        cdef int num_toks = len(self)
-        cdef int p_h_num = strlen(<char*>p_lengths)
-        cdef int s_h_num = strlen(<char*>s_lengths), s_h_end = p_h_num + s_h_num
-        cdef int ps_h_num = strlen(<char*>ps_lengths), ps_h_end = s_h_end + ps_h_num
-        cdef int ss_h_num = strlen(<char*>ss_lengths), ss_h_end = ps_h_end + ss_h_num
-        cdef np.ndarray[np.int64_t, ndim=2] hashes
-        hashes = numpy.empty((num_toks, ss_h_end), dtype="int64")
-
-        # Determine the maximum possible lengths of the affixes to work out how big the buffers need to be
-        cdef int p_max_l = p_lengths[-1] if p_h_num > 0 else 0
-        cdef int s_max_l = s_lengths[-1] if s_h_num > 0 else 0
-        cdef int ps_max_l = ps_lengths[-1] if ps_h_num > 0 else 0
-        cdef int ss_max_l = ss_lengths[-1] if ss_h_num > 0 else 0
-
+        # Define the result array: one column for each entry across the four length arrays
+        cdef int p_h_num = strlen(<char*>p_lengths)
+        cdef int s_h_num = strlen(<char*>s_lengths)
+        cdef int ps_h_num = strlen(<char*>ps_lengths)
+        cdef int ss_h_num = strlen(<char*>ss_lengths)
+        cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty(
+            (self.length, p_h_num + s_h_num + ps_h_num + ss_h_num), dtype="int64")
+
         # Define / allocate buffers
         cdef Pool mem = Pool()
-        cdef int aff_l = p_max_l + s_max_l
-        cdef unsigned char* aff_l_buf = <unsigned char*>mem.alloc(aff_l, 1)
+        cdef unsigned char* pref_l_buf = <unsigned char*>mem.alloc(p_max_l, 1)
+        cdef unsigned char* suff_l_buf = <unsigned char*>mem.alloc(s_max_l, 1)
         cdef unsigned char* ps_res_buf = <unsigned char*>mem.alloc(ps_max_l, 4)
         cdef unsigned char* ps_l_buf = <unsigned char*>mem.alloc(ps_max_l, 1)
         cdef unsigned char* ss_res_buf = <unsigned char*>mem.alloc(ss_max_l, 4)
@@ -1806,48 +1812,34 @@
 
         # Define working variables
         cdef TokenC tok_c
-        cdef int tok_i, offset
-        cdef uint64_t hash_val = 0
+        cdef int hash_idx, tok_i, tok_str_l
         cdef attr_t num_tok_attr
         cdef const unsigned char* tok_str
 
-        for tok_i in range(num_toks):
+        for tok_i in range(self.length):
             tok_c = self.c[tok_i]
             num_tok_attr = tok_c.lex.orth if cs else tok_c.lex.lower
             tok_str = self.vocab.strings.utf8_ptr(num_tok_attr)
+            tok_str_l = strlen(<char*>tok_str)
+            hash_idx = 0
 
-            if aff_l > 0:
-                _set_affix_lengths(tok_str, aff_l_buf, p_max_l, s_max_l)
+            if p_max_l > 0:
+                _set_prefix_lengths(tok_str, tok_str_l, pref_l_buf, p_max_l)
+                hash_idx = _write_hashes(tok_str, p_lengths, pref_l_buf, 0, hashes, tok_i, 0)
-                for hash_idx in range(p_h_num):
-                    offset = aff_l_buf[p_lengths[hash_idx] - 1]
-                    if offset > 0:
-                        hash_val = hash32(<void*>&tok_str[0], offset, 0)
-                    hashes[tok_i, hash_idx] = hash_val
+            if s_max_l > 0:
+                _set_suffix_lengths(tok_str, tok_str_l, suff_l_buf, s_max_l)
+                hash_idx = _write_hashes(tok_str, s_lengths, suff_l_buf, tok_str_l, hashes, tok_i, hash_idx)
 
-            for hash_idx in range(p_h_num, s_h_end):
-                offset = aff_l_buf[s_lengths[hash_idx - p_h_num] + p_max_l - 1]
-                if offset > 0:
-                    hash_val = hash32(<void*>&tok_str[len(tok_str) - offset], offset, 0)
-                hashes[tok_i, hash_idx] = hash_val
+            if ps_max_l > 0:
+                _search_for_chars(tok_str, tok_str_l, ps_1byte_ch, ps_1byte_ch_l, ps_2byte_ch, ps_2byte_ch_l,
+                    ps_3byte_ch, ps_3byte_ch_l, ps_4byte_ch, ps_4byte_ch_l, ps_res_buf, ps_max_l, ps_l_buf, False)
+                hash_idx = _write_hashes(ps_res_buf, ps_lengths, ps_l_buf, 0, hashes, tok_i, hash_idx)
 
-            if ps_h_num > 0:
-                _search_for_chars(tok_str, ps_1byte_ch, ps_2byte_ch, ps_3byte_ch, ps_4byte_ch, ps_res_buf, ps_max_l, ps_l_buf, False)
-                hash_val = 0
-                for hash_idx in range(s_h_end, ps_h_end):
-                    offset = ps_l_buf[ps_lengths[hash_idx - s_h_end] - 1]
-                    if offset > 0:
-                        hash_val = hash32(ps_res_buf, offset, 0)
-                    hashes[tok_i, hash_idx] = hash_val
-
-            if ss_h_num > 0:
-                _search_for_chars(tok_str, ss_1byte_ch, ss_2byte_ch, ss_3byte_ch, ss_4byte_ch, ss_res_buf, ss_max_l, ss_l_buf, True)
-                hash_val = 0
-                for hash_idx in range(ps_h_end, ss_h_end):
-                    offset = ss_l_buf[ss_lengths[hash_idx - ps_h_end] - 1]
-                    if offset > 0:
-                        hash_val = hash32(ss_res_buf, offset, 0)
-                    hashes[tok_i, hash_idx] = hash_val
+            if ss_max_l > 0:
+                _search_for_chars(tok_str, tok_str_l, ss_1byte_ch, ss_1byte_ch_l, ss_2byte_ch, ss_2byte_ch_l,
+                    ss_3byte_ch, ss_3byte_ch_l, ss_4byte_ch, ss_4byte_ch_l, ss_res_buf, ss_max_l, ss_l_buf, True)
+                _write_hashes(ss_res_buf, ss_lengths, ss_l_buf, 0, hashes, tok_i, hash_idx)
 
         return hashes
@@ -2031,59 +2023,81 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
                 lca_matrix[k, j] = lca - start
     return lca_matrix
 
 @cython.boundscheck(False) # Deactivate bounds checking
-cdef void _set_affix_lengths(
+cdef void _set_prefix_lengths(
     const unsigned char* tok_str,
-    unsigned char* aff_l_buf,
-    const int pref_l,
-    const int suff_l,
+    const int tok_str_l,
+    unsigned char* pref_l_buf,
+    const int p_max_l,
 ) nogil:
-    """ Populate *aff_l_buf*, which has length *pref_l+suff_l* with the byte lengths of the first *pref_l* and the last
-    *suff_l* characters within *tok_str*. Lengths that are greater than the character length of the whole word are
-    populated with the byte length of the whole word.
+    """ Populate *pref_l_buf*, which has length *p_max_l*, with the byte lengths of the first *p_max_l* characters within *tok_str*.
+    Lengths that are greater than the character length of the whole word are populated with the byte length of the whole word.
 
-    tok_str: a memoryview of a UTF-8 representation of a string.
-    aff_l_buf: a buffer of length *pref_l+suff_l* in which to store the lengths. The calling code ensures that lengths
+    tok_str: a UTF-8 representation of a string.
+    tok_str_l: the length of *tok_str*.
+    pref_l_buf: a buffer of length *p_max_l* in which to store the lengths. The calling code ensures that lengths
         greater than 255 cannot occur.
-    pref_l: the number of characters to process at the beginning of the word.
-    suff_l: the number of characters to process at the end of the word.
+    p_max_l: the number of characters to process at the beginning of the word.
""" - cdef int tok_str_idx = 1, aff_l_buf_idx = 0, tok_str_l = strlen( tok_str) + cdef int tok_str_idx = 1, pref_l_buf_idx = 0 - while aff_l_buf_idx < pref_l: - if (tok_str_idx == strlen( tok_str) + while pref_l_buf_idx < p_max_l: + if (tok_str[tok_str_idx] == 0 # end of string or - ((tok_str[tok_str_idx] & 0xc0) != 0x80 # not a continuation character + ((tok_str[tok_str_idx] & 0xc0) != 0x80) # not a continuation character ): - aff_l_buf[aff_l_buf_idx] = tok_str_idx - aff_l_buf_idx += 1 - tok_str_idx += 1 - if tok_str_idx > tok_str_l: + pref_l_buf[pref_l_buf_idx] = tok_str_idx + pref_l_buf_idx += 1 + if tok_str[tok_str_idx] == 0: # end of string break + tok_str_idx += 1 - if aff_l_buf_idx < pref_l: - memset(aff_l_buf + aff_l_buf_idx, aff_l_buf[aff_l_buf_idx - 1], pref_l - aff_l_buf_idx) - aff_l_buf_idx = pref_l + if pref_l_buf_idx < p_max_l: + memset(pref_l_buf + pref_l_buf_idx, pref_l_buf[pref_l_buf_idx - 1], p_max_l - pref_l_buf_idx) - tok_str_idx = tok_str_l - 1 - while aff_l_buf_idx < pref_l + suff_l: + +#@cython.boundscheck(False) # Deactivate bounds checking +cdef void _set_suffix_lengths( + const unsigned char* tok_str, + const int tok_str_l, + unsigned char* suff_l_buf, + const int s_max_l, +): + """ Populate *suff_l_buf*, which has length *suff_l*, with the byte lengths of the last *suff_l* characters within *tok_str*. + Lengths that are greater than the character length of the whole word are populated with the byte length of the whole word. + + tok_str: a UTF-8 representation of a string. + tok_str_l: the length of *tok_str*. + suff_l_buf: a buffer of length *s_max_l* in which to store the lengths. The calling code ensures that lengths + greater than 255 cannot occur. + s_max_l: the number of characters to process at the end of the word. + """ + cdef int tok_str_idx = tok_str_l - 1, suff_l_buf_idx = 0 + + while suff_l_buf_idx < s_max_l: if (tok_str[tok_str_idx] & 0xc0) != 0x80: # not a continuation character - aff_l_buf[aff_l_buf_idx] = tok_str_l - tok_str_idx - aff_l_buf_idx += 1 + suff_l_buf[suff_l_buf_idx] = tok_str_l - tok_str_idx + suff_l_buf_idx += 1 tok_str_idx -= 1 if tok_str_idx < 0: break - if aff_l_buf_idx < pref_l + suff_l: - memset(aff_l_buf + aff_l_buf_idx, aff_l_buf[aff_l_buf_idx - 1], pref_l + suff_l - aff_l_buf_idx) + if suff_l_buf_idx < s_max_l: + memset(suff_l_buf + suff_l_buf_idx, suff_l_buf[suff_l_buf_idx - 1], s_max_l - suff_l_buf_idx) -@cython.boundscheck(False) # Deactivate bounds checking + +#@cython.boundscheck(False) # Deactivate bounds checking cdef void _search_for_chars( const unsigned char* tok_str, + const int tok_str_l, const unsigned char* s_1byte_ch, + const int s_1byte_ch_l, const unsigned char* s_2byte_ch, + const int s_2byte_ch_l, const unsigned char* s_3byte_ch, + const int s_3byte_ch_l, const unsigned char* s_4byte_ch, + const int s_4byte_ch_l, unsigned char* res_buf, int max_res_l, unsigned char* l_buf, @@ -2096,6 +2110,7 @@ cdef void _search_for_chars( which may be *0* if the search was not successful. tok_str: a memoryview of a UTF-8 representation of a string. + tok_str_l: the length of *tok_str*. s_byte_ch: a byte array containing in order n-byte-wide characters to search for. res_buf: the buffer in which to place the search results. max_res_l: the maximum number of found characters to place in *res_buf*. @@ -2103,7 +2118,7 @@ cdef void _search_for_chars( The calling code ensures that lengths greater than 255 cannot occur. suffs_not_prefs: if *True*, searching starts from the end of the word; if *False*, from the beginning. 
""" - cdef int tok_str_l = strlen( tok_str), res_buf_idx = 0, l_buf_idx = 0, ch_wdth, tok_start_idx, search_char_idx + cdef int res_buf_idx = 0, l_buf_idx = 0, ch_wdth, tok_start_idx, search_char_idx cdef int search_chars_l cdef const unsigned char* search_chars @@ -2121,13 +2136,16 @@ cdef void _search_for_chars( ch_wdth = last_tok_str_idx - this_tok_str_idx if ch_wdth == 1: search_chars = s_1byte_ch + search_chars_l = s_1byte_ch_l elif ch_wdth == 2: search_chars = s_2byte_ch + search_chars_l = s_2byte_ch_l elif ch_wdth == 3: search_chars = s_3byte_ch + search_chars_l = s_3byte_ch_l else: search_chars = s_4byte_ch - search_chars_l = strlen( search_chars) + search_chars_l = s_4byte_ch_l tok_start_idx = this_tok_str_idx if suffs_not_prefs else last_tok_str_idx search_char_idx = 0 @@ -2157,6 +2175,43 @@ cdef void _search_for_chars( memset(l_buf + l_buf_idx, res_buf_idx, max_res_l - l_buf_idx) +cdef int _write_hashes( + const unsigned char* res_buf, + const unsigned char* aff_l_buf, + const unsigned char* offset_buf, + const int end_idx, + np.ndarray[np.int64_t, ndim=2] hashes, + const int tok_i, + const int start_hash_idx, +): + """ Write hashes for a token/rich property group combination. + + res_buf: the string from which to generate the hash values. + aff_l_buf: one-byte lengths describing how many characters to hash. + offset_buf: one-byte lengths specifying the byte offset of each character within *res_buf*. + end_idx: if not *0*, the offset within *res_buf* that should end each affix being hashed; + if *0*, affixes start at the beginning of *res_buf* rather than ending at the end. + hashes: the 2D Numpy array in which the hashes are stored. + tok_i: the index of axis 0 of *hashes* to write to. + start_hash_idx: the index of axis 1 of *hashes* at which to start writing. + """ + + cdef int offset, aff_l, hash_val = 0, hash_idx = start_hash_idx + + while True: + aff_l = aff_l_buf[hash_idx - start_hash_idx] + if aff_l == 0: + return hash_idx + offset = offset_buf[aff_l - 1] + if offset > 0: + if end_idx != 0: + hash_val = hash32( res_buf + end_idx - offset, offset, 0) + else: + hash_val = hash32( res_buf, offset, 0) + hashes[tok_i, hash_idx] = hash_val + hash_idx += 1 + + def pickle_doc(doc): bytes_data = doc.to_bytes(exclude=["vocab", "user_data", "user_hooks"]) hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks,