diff --git a/spacy/ml/richfeatureextractor.py b/spacy/ml/richfeatureextractor.py index 4186f3750..488c33a60 100644 --- a/spacy/ml/richfeatureextractor.py +++ b/spacy/ml/richfeatureextractor.py @@ -43,26 +43,26 @@ def RichFeatureExtractor( forward, attrs={ "case_sensitive": case_sensitive, - "pref_lengths": ops.asarray1i(pref_lengths) + "pref_lengths": bytes(pref_lengths) if pref_lengths is not None - else ops.asarray1i([]), - "suff_lengths": ops.asarray1i(suff_lengths) + else bytes(), + "suff_lengths": bytes(suff_lengths) if suff_lengths is not None - else ops.asarray1i([]), + else bytes(), "pref_search_1_byte": ps_1byte_ch, "pref_search_2_bytes": ps_2byte_ch, "pref_search_3_bytes": ps_3byte_ch, "pref_search_4_bytes": ps_4byte_ch, - "pref_search_lengths": ops.asarray1i(pref_search_lengths) + "pref_search_lengths": bytes(pref_search_lengths) if pref_search_lengths is not None - else ops.asarray1i([]), + else bytes(), "suff_search_1_byte": ss_1byte_ch, "suff_search_2_bytes": ss_2byte_ch, "suff_search_3_bytes": ss_3byte_ch, "suff_search_4_bytes": ss_4byte_ch, - "suff_search_lengths": ops.asarray1i(suff_search_lengths) + "suff_search_lengths": bytes(suff_search_lengths) if suff_search_lengths is not None - else ops.asarray1i([]), + else bytes(), }, ) @@ -72,18 +72,18 @@ def forward( ) -> Tuple[List[Ints2d], Callable]: ops = model.ops case_sensitive: bool = model.attrs["case_sensitive"] - pref_lengths: Ints1d = model.attrs["pref_lengths"] - suff_lengths: Ints1d = model.attrs["suff_lengths"] + pref_lengths: bytes = model.attrs["pref_lengths"] + suff_lengths: bytes = model.attrs["suff_lengths"] ps_1byte_ch: bytes = model.attrs["pref_search_1_byte"] ps_2byte_ch: bytes = model.attrs["pref_search_2_bytes"] ps_3byte_ch: bytes = model.attrs["pref_search_3_bytes"] ps_4byte_ch: bytes = model.attrs["pref_search_4_bytes"] - pref_search_lengths: Ints1d = model.attrs["pref_search_lengths"] + pref_search_lengths: bytes = model.attrs["pref_search_lengths"] ss_1byte_ch: bytes = 
model.attrs["pref_search_1_byte"]
     ss_2byte_ch: bytes = model.attrs["pref_search_2_bytes"]
     ss_3byte_ch: bytes = model.attrs["pref_search_3_bytes"]
     ss_4byte_ch: bytes = model.attrs["pref_search_4_bytes"]
-    suff_search_lengths: Ints1d = model.attrs["suff_search_lengths"]
+    suff_search_lengths: bytes = model.attrs["suff_search_lengths"]
     features: List[Ints2d] = []
     for doc in docs:
         hashes = doc.get_character_combination_hashes(
diff --git a/spacy/strings.pxd b/spacy/strings.pxd
index 7b33a498e..59cdff522 100644
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@@ -27,4 +27,4 @@ cdef class StringStore:
     cdef const Utf8Str* intern_unicode(self, str py_string)
     cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
-    cdef const unsigned char[:] utf8_view(self, attr_t hash_val)
+    cdef const unsigned char* utf8_ptr(self, attr_t hash_val)
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index d86bf600b..d2264fb9c 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -316,9 +316,9 @@ cdef class StringStore:
         self.keys.push_back(key)
         return value
-    cdef const unsigned char[:] utf8_view(self, attr_t hash_val):
+    cdef const unsigned char* utf8_ptr(self, attr_t hash_val):
         if hash_val == 0:
-            return ""
+            return b""
         elif hash_val < len(SYMBOLS_BY_INT):
             return SYMBOLS_BY_INT[hash_val].encode("utf-8")
         cdef Utf8Str* string = self._map.get(hash_val)
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index a9c974b8c..e03692f66 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -1004,23 +1004,22 @@ def _get_unsigned_32_bit_hash(input: str) -> int:
 @pytest.mark.parametrize("case_sensitive", [True, False])
 def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive):
     doc = en_tokenizer("spaCy✨ and Prodigy")
-    ops = get_current_ops()
     ps1, ps2, ps3, ps4 = get_search_char_byte_arrays("Rp", case_sensitive)
     ss1, ss2, ss3, ss4 = 
get_search_char_byte_arrays("xx✨rp", case_sensitive) hashes = doc.get_character_combination_hashes( cs=case_sensitive, - p_lengths=ops.asarray1i([1, 3, 4]), - s_lengths=ops.asarray1i([2, 3, 4, 5]), + p_lengths=bytes((1, 3, 4,)), + s_lengths=bytes((2, 3, 4, 5,)), ps_1byte_ch=ps1, ps_2byte_ch=ps2, ps_3byte_ch=ps3, ps_4byte_ch=ps4, - ps_lengths=ops.asarray1i([2]), + ps_lengths=bytes((2,)), ss_1byte_ch=ss1, ss_2byte_ch=ss2, ss_3byte_ch=ss3, ss_4byte_ch=ss4, - ss_lengths=ops.asarray1i([1, 2]), + ss_lengths=bytes((1, 2,)), ) assert hashes[0][0] == _get_unsigned_32_bit_hash("s") @@ -1089,22 +1088,21 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive def test_get_character_combination_hashes_good_case_partial(en_tokenizer): doc = en_tokenizer("spaCy✨ and Prodigy") - ops = get_current_ops() ps1, ps2, ps3, ps4 = get_search_char_byte_arrays("rp", False) hashes = doc.get_character_combination_hashes( cs=False, - p_lengths=ops.asarray1i([]), - s_lengths=ops.asarray1i([2, 3, 4, 5]), + p_lengths=bytes(), + s_lengths=bytes((2,3,4,5,)), ps_1byte_ch=ps1, ps_2byte_ch=ps2, ps_3byte_ch=ps3, ps_4byte_ch=ps4, - ps_lengths=ops.asarray1i([2]), + ps_lengths=bytes((2,)), ss_1byte_ch=bytes(), ss_2byte_ch=bytes(), ss_3byte_ch=bytes(), ss_4byte_ch=bytes(), - ss_lengths=ops.asarray1i([]), + ss_lengths=bytes(), ) @@ -1132,25 +1130,24 @@ def test_get_character_combination_hashes_good_case_partial(en_tokenizer): def test_get_character_combination_hashes_copying_in_middle(en_tokenizer): doc = en_tokenizer("sp𐌞Cé") - ops = get_current_ops() for p_length in range(1, 8): for s_length in range(1, 8): hashes = doc.get_character_combination_hashes( cs=False, - p_lengths=ops.asarray1i([p_length]), - s_lengths=ops.asarray1i([s_length]), + p_lengths=bytes((p_length,)), + s_lengths=bytes((s_length,)), ps_1byte_ch=bytes(), ps_2byte_ch=bytes(), ps_3byte_ch=bytes(), ps_4byte_ch=bytes(), - ps_lengths=ops.asarray1i([]), + ps_lengths=bytes(), ss_1byte_ch=bytes(), 
ss_2byte_ch=bytes(), ss_3byte_ch=bytes(), ss_4byte_ch=bytes(), - ss_lengths=ops.asarray1i([]), + ss_lengths=bytes(), ) assert hashes[0][0] == _get_unsigned_32_bit_hash("sp𐌞cé"[:p_length]) @@ -1160,22 +1157,21 @@ def test_get_character_combination_hashes_copying_in_middle(en_tokenizer): @pytest.mark.parametrize("case_sensitive", [True, False]) def test_get_character_combination_hashes_turkish_i_with_dot(en_tokenizer, case_sensitive): doc = en_tokenizer("İ".lower() + "İ") - ops = get_current_ops() s1, s2, s3, s4 = get_search_char_byte_arrays("İ", case_sensitive) hashes = doc.get_character_combination_hashes( cs=case_sensitive, - p_lengths=ops.asarray1i([1, 2, 3, 4]), - s_lengths=ops.asarray1i([1, 2, 3, 4]), + p_lengths=bytes((1,2,3,4,)), + s_lengths=bytes((1,2,3,4,)), ps_1byte_ch=s1, ps_2byte_ch=s2, ps_3byte_ch=s3, ps_4byte_ch=s4, - ps_lengths=ops.asarray1i([1, 2, 3, 4]), + ps_lengths=bytes((1,2,3,4,)), ss_1byte_ch=s1, ss_2byte_ch=s2, ss_3byte_ch=s3, ss_4byte_ch=s4, - ss_lengths=ops.asarray1i([1, 2, 3, 4]), + ss_lengths=bytes((1,2,3,4,)), ) COMBINING_DOT_ABOVE = b"\xcc\x87".decode("UTF-8") @@ -1219,22 +1215,21 @@ def test_get_character_combination_hashes_string_store_spec_cases(en_tokenizer, assert len(long_word) > 255 doc = en_tokenizer(' '.join((symbol, short_word, normal_word, long_word))) assert len(doc) == 4 - ops = get_current_ops() ps1, ps2, ps3, ps4 = get_search_char_byte_arrays("E", case_sensitive) hashes = doc.get_character_combination_hashes( cs=case_sensitive, - p_lengths=ops.asarray1i([2]), - s_lengths=ops.asarray1i([2]), + p_lengths=bytes((2,)), + s_lengths=bytes((2,)), ps_1byte_ch=ps1, ps_2byte_ch=ps2, ps_3byte_ch=ps3, ps_4byte_ch=ps4, - ps_lengths=ops.asarray1i([2]), + ps_lengths=bytes((2,)), ss_1byte_ch=bytes(), ss_2byte_ch=bytes(), ss_3byte_ch=bytes(), ss_4byte_ch=bytes(), - ss_lengths=ops.asarray1i([]), + ss_lengths=bytes(), ) assert hashes[0][0] == _get_unsigned_32_bit_hash("FL" if case_sensitive else "fl") assert hashes[0][1] == 
_get_unsigned_32_bit_hash("19") @@ -1255,19 +1250,18 @@ def test_get_character_combination_hashes_string_store_spec_cases(en_tokenizer, def test_character_combination_hashes_empty_lengths(en_tokenizer): doc = en_tokenizer("and𐌞") - ops = get_current_ops() - hashes = doc.get_character_combination_hashes( + assert doc.get_character_combination_hashes( cs=True, - p_lengths=ops.asarray1i([]), - s_lengths=ops.asarray1i([]), + p_lengths=bytes(), + s_lengths=bytes(), ps_1byte_ch=bytes(), ps_2byte_ch=bytes(), ps_3byte_ch=bytes(), ps_4byte_ch=bytes(), - ps_lengths=ops.asarray1i([]), + ps_lengths=bytes(), ss_1byte_ch=bytes(), ss_2byte_ch=bytes(), ss_3byte_ch=bytes(), ss_4byte_ch=bytes(), - ss_lengths=ops.asarray1i([]), + ss_lengths=bytes(), ).shape == (1, 0) diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 796525663..994020744 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -39,7 +39,7 @@ cdef int [:,:] _get_lca_matrix(Doc, int start, int end) cdef void _set_affix_lengths( - const unsigned char[:] tok_str, + const unsigned char* tok_str, unsigned char* aff_l_buf, const int pref_l, const int suff_l, @@ -47,11 +47,11 @@ cdef void _set_affix_lengths( cdef void _search_for_chars( - const unsigned char[:] tok_str, - const unsigned char[:] s_1byte_ch, - const unsigned char[:] s_2byte_ch, - const unsigned char[:] s_3byte_ch, - const unsigned char[:] s_4byte_ch, + const unsigned char* tok_str, + const unsigned char* s_1byte_ch, + const unsigned char* s_2byte_ch, + const unsigned char* s_3byte_ch, + const unsigned char* s_4byte_ch, unsigned char* res_buf, int max_res_l, unsigned char* l_buf, diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index dc26b6010..df431b460 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -178,18 +178,18 @@ class Doc: self, *, cs: bool, - p_lengths: Ints1d, - s_lengths: Ints1d, + p_lengths: bytes, + s_lengths: bytes, ps_1byte_ch: bytes, ps_2byte_ch: bytes, ps_3byte_ch: bytes, ps_4byte_ch: bytes, - 
ps_lengths: Ints1d, + ps_lengths: bytes, ss_1byte_ch: bytes, ss_2byte_ch: bytes, ss_3byte_ch: bytes, ss_4byte_ch: bytes, - ss_lengths: Ints1d, + ss_lengths: bytes, ) -> Ints2d: ... @staticmethod def _get_array_attrs() -> Tuple[Any]: ... diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 668353024..4bea8d656 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -4,7 +4,7 @@ from typing import Set, List cimport cython cimport numpy as np from cpython cimport array -from libc.string cimport memcpy, memcmp, memset +from libc.string cimport memcpy, memcmp, memset, strlen from libc.math cimport sqrt from libc.stdint cimport int32_t, uint64_t @@ -1739,18 +1739,18 @@ cdef class Doc: def get_character_combination_hashes(self, *, const bint cs, - int[:] p_lengths, - int[:] s_lengths, - const unsigned char[:] ps_1byte_ch, - const unsigned char[:] ps_2byte_ch, - const unsigned char[:] ps_3byte_ch, - const unsigned char[:] ps_4byte_ch, - int[:] ps_lengths, - const unsigned char[:] ss_1byte_ch, - const unsigned char[:] ss_2byte_ch, - const unsigned char[:] ss_3byte_ch, - const unsigned char[:] ss_4byte_ch, - int[:] ss_lengths, + const unsigned char* p_lengths, + const unsigned char* s_lengths, + const unsigned char* ps_1byte_ch, + const unsigned char* ps_2byte_ch, + const unsigned char* ps_3byte_ch, + const unsigned char* ps_4byte_ch, + const unsigned char* ps_lengths, + const unsigned char* ss_1byte_ch, + const unsigned char* ss_2byte_ch, + const unsigned char* ss_3byte_ch, + const unsigned char* ss_4byte_ch, + const unsigned char* ss_lengths, ): """ Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations @@ -1764,28 +1764,28 @@ cdef class Doc: ss_ variables relate to searches starting at the end of the word cs: if *False*, hashes are generated based on the lower-case version of each token. - p_lengths: an Ints1d specifying the lengths of prefixes to be hashed in ascending order. 
For example, - if *p_lengths==[2, 3]*, the prefixes hashed for "spaCy" would be "sp" and "spa". - s_lengths: an Ints1d specifying the lengths of suffixes to be hashed in ascending order. For example, - if *s_lengths==[2, 3]* and *cs == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy". + p_lengths: an array of single-byte values specifying the lengths of prefixes to be hashed in ascending order. + For example, if *p_lengths==[2, 3]*, the prefixes hashed for "spaCy" would be "sp" and "spa". + s_lengths: an array of single-byte values specifying the lengths of suffixes to be hashed in ascending order. + For example, if *s_lengths==[2, 3]* and *cs == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy". ps_byte_ch: a byte array containing in order n-byte-wide characters to search for within each token, starting at the beginning. - ps_lengths: an Ints1d specifying the lengths of search results (from the beginning) to be hashed in ascending order. - For example, if *ps_lengths==[1, 2]*, *ps_search=="aC" and *cs==False*, the searched strings hashed for - "spaCy" would be "a" and "ac". + ps_lengths: an array of single-byte values specifying the lengths of search results (from the beginning) to be hashed + in ascending order. For example, if *ps_lengths==[1, 2]*, *ps_search=="aC" and *cs==False*, the searched strings + hashed for "spaCy" would be "a" and "ac". ss_byte_ch: a byte array containing in order n-byte-wide characters to search for within each token, starting at the end. - ss_lengths: an integer list specifying the lengths of search results (from the end) to be hashed in ascending order. - For example, if *ss_lengths==[1, 2]*, *ss_search=="aC" and *cs==False*, the searched strings hashed for - "spaCy" would be "c" and "ca". + ss_lengths: an array of single-byte values specifying the lengths of search results (from the end) to be hashed + in ascending order. 
For example, if *ss_lengths==[1, 2]*, *ss_search=="aC" and *cs==False*, the searched strings + hashed for "spaCy" would be "c" and "ca". """ # Define the result array and work out what is used for what in axis 1 cdef int num_toks = len(self) - cdef int p_h_num = len(p_lengths) - cdef int s_h_num = len(s_lengths), s_h_end = p_h_num + s_h_num - cdef int ps_h_num = len(ps_lengths), ps_h_end = s_h_end + ps_h_num - cdef int ss_h_num = len(ss_lengths), ss_h_end = ps_h_end + ss_h_num + cdef int p_h_num = strlen( p_lengths) + cdef int s_h_num = strlen( s_lengths), s_h_end = p_h_num + s_h_num + cdef int ps_h_num = strlen( ps_lengths), ps_h_end = s_h_end + ps_h_num + cdef int ss_h_num = strlen( ss_lengths), ss_h_end = ps_h_end + ss_h_num cdef np.ndarray[np.int64_t, ndim=2] hashes hashes = numpy.empty((num_toks, ss_h_end), dtype="int64") @@ -1804,35 +1804,29 @@ cdef class Doc: cdef unsigned char* ss_res_buf = mem.alloc(ss_max_l, 4) cdef unsigned char* ss_l_buf = mem.alloc(ss_max_l, 1) - # Define memory views on length arrays - cdef int[:] p_lengths_v = p_lengths - cdef int[:] s_lengths_v = s_lengths - cdef int[:] ps_lengths_v = ps_lengths - cdef int[:] ss_lengths_v = ss_lengths - # Define working variables cdef TokenC tok_c cdef int tok_i, offset cdef uint64_t hash_val = 0 cdef attr_t num_tok_attr - cdef const unsigned char[:] tok_str + cdef const unsigned char* tok_str for tok_i in range(num_toks): tok_c = self.c[tok_i] num_tok_attr = tok_c.lex.orth if cs else tok_c.lex.lower - tok_str = self.vocab.strings.utf8_view(num_tok_attr) + tok_str = self.vocab.strings.utf8_ptr(num_tok_attr) if aff_l > 0: _set_affix_lengths(tok_str, aff_l_buf, p_max_l, s_max_l) for hash_idx in range(p_h_num): - offset = aff_l_buf[p_lengths_v[hash_idx] - 1] + offset = aff_l_buf[p_lengths[hash_idx] - 1] if offset > 0: hash_val = hash32( &tok_str[0], offset, 0) hashes[tok_i, hash_idx] = hash_val for hash_idx in range(p_h_num, s_h_end): - offset = aff_l_buf[s_lengths_v[hash_idx - p_h_num] + p_max_l - 1] 
+ offset = aff_l_buf[s_lengths[hash_idx - p_h_num] + p_max_l - 1] if offset > 0: hash_val = hash32( &tok_str[len(tok_str) - offset], offset, 0) hashes[tok_i, hash_idx] = hash_val @@ -1841,7 +1835,7 @@ cdef class Doc: _search_for_chars(tok_str, ps_1byte_ch, ps_2byte_ch, ps_3byte_ch, ps_4byte_ch, ps_res_buf, ps_max_l, ps_l_buf, False) hash_val = 0 for hash_idx in range(s_h_end, ps_h_end): - offset = ps_l_buf[ps_lengths_v[hash_idx - s_h_end] - 1] + offset = ps_l_buf[ps_lengths[hash_idx - s_h_end] - 1] if offset > 0: hash_val = hash32(ps_res_buf, offset, 0) hashes[tok_i, hash_idx] = hash_val @@ -1850,7 +1844,7 @@ cdef class Doc: _search_for_chars(tok_str, ss_1byte_ch, ss_2byte_ch, ss_3byte_ch, ss_4byte_ch, ss_res_buf, ss_max_l, ss_l_buf, True) hash_val = 0 for hash_idx in range(ps_h_end, ss_h_end): - offset = ss_l_buf[ss_lengths_v[hash_idx - ps_h_end] - 1] + offset = ss_l_buf[ss_lengths[hash_idx - ps_h_end] - 1] if offset > 0: hash_val = hash32(ss_res_buf, offset, 0) hashes[tok_i, hash_idx] = hash_val @@ -2039,7 +2033,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end): @cython.boundscheck(False) # Deactivate bounds checking cdef void _set_affix_lengths( - const unsigned char[:] tok_str, + const unsigned char* tok_str, unsigned char* aff_l_buf, const int pref_l, const int suff_l, @@ -2054,14 +2048,17 @@ cdef void _set_affix_lengths( pref_l: the number of characters to process at the beginning of the word. suff_l: the number of characters to process at the end of the word. 
"""
-    cdef int tok_str_idx = 1, aff_l_buf_idx = 0, tok_str_l = len(tok_str)
+    cdef int tok_str_idx = 1, aff_l_buf_idx = 0, tok_str_l = strlen(tok_str)
     while aff_l_buf_idx < pref_l:
-        if tok_str_idx == len(tok_str) or ((tok_str[tok_str_idx] & 0xc0) != 0x80): # not a continuation character
+        if (tok_str_idx == tok_str_l
+            or
+            (tok_str[tok_str_idx] & 0xc0) != 0x80 # not a continuation character
+            ):
             aff_l_buf[aff_l_buf_idx] = tok_str_idx
             aff_l_buf_idx += 1
         tok_str_idx += 1
-        if tok_str_idx > len(tok_str):
+        if tok_str_idx > tok_str_l:
             break
     if aff_l_buf_idx < pref_l:
@@ -2082,11 +2079,11 @@
 cdef void _search_for_chars(
-    const unsigned char[:] tok_str,
-    const unsigned char[:] s_1byte_ch,
-    const unsigned char[:] s_2byte_ch,
-    const unsigned char[:] s_3byte_ch,
-    const unsigned char[:] s_4byte_ch,
+    const unsigned char* tok_str,
+    const unsigned char* s_1byte_ch,
+    const unsigned char* s_2byte_ch,
+    const unsigned char* s_3byte_ch,
+    const unsigned char* s_4byte_ch,
     unsigned char* res_buf,
     int max_res_l,
     unsigned char* l_buf,
@@ -2106,9 +2103,9 @@ cdef void _search_for_chars(
         The calling code ensures that lengths greater than 255 cannot occur.
     suffs_not_prefs: if *True*, searching starts from the end of the word; if *False*, from the beginning.
""" - cdef int tok_str_l = len(tok_str), res_buf_idx = 0, l_buf_idx = 0, ch_wdth, tok_start_idx, search_char_idx + cdef int tok_str_l = strlen( tok_str), res_buf_idx = 0, l_buf_idx = 0, ch_wdth, tok_start_idx, search_char_idx cdef int search_chars_l - cdef const unsigned char[:] search_chars + cdef const unsigned char* search_chars cdef int last_tok_str_idx = tok_str_l if suffs_not_prefs else 0 cdef int this_tok_str_idx = tok_str_l - 1 if suffs_not_prefs else 1 @@ -2130,7 +2127,7 @@ cdef void _search_for_chars( search_chars = s_3byte_ch else: search_chars = s_4byte_ch - search_chars_l = len(search_chars) + search_chars_l = strlen( search_chars) tok_start_idx = this_tok_str_idx if suffs_not_prefs else last_tok_str_idx search_char_idx = 0