diff --git a/spacy/ml/richfeatureextractor.py b/spacy/ml/richfeatureextractor.py index 8c9e6b5e0..e76a92c86 100644 --- a/spacy/ml/richfeatureextractor.py +++ b/spacy/ml/richfeatureextractor.py @@ -52,21 +52,16 @@ def forward( suff_search_lengths: List[int] = model.attrs["suff_search_lengths"] features: List[Ints2d] = [] for doc in docs: - prefix_hashes = doc.get_character_combination_hashes( + hashes = doc.get_character_combination_hashes( case_sensitive=case_sensitive, - suffs_not_prefs=False, - affix_lengths=pref_lengths, - search_chars=pref_search_chars, - search_lengths=pref_search_lengths, + pref_lengths=pref_lengths, + suff_lengths=suff_lengths, + pref_search_chars=pref_search_chars, + pref_search_lengths=pref_search_lengths, + suff_search_chars=suff_search_chars, + suff_search_lengths=suff_search_lengths, ) - suffix_hashes = doc.get_character_combination_hashes( - case_sensitive=case_sensitive, - suffs_not_prefs=True, - affix_lengths=suff_lengths, - search_chars=suff_search_chars, - search_lengths=suff_search_lengths, - ) - features.append(ops.asarray2i(ops.xp.hstack([prefix_hashes, suffix_hashes]))) + features.append(ops.asarray2i(hashes)) backprop: Callable[[List[Ints2d]], List] = lambda d_features: [] return features, backprop diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index bc68a2ab0..de097f2e8 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -989,121 +989,132 @@ def _get_unsigned_32_bit_hash(input: str) -> int: def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive): doc = en_tokenizer("spaCy✨ and Prodigy") - prefixes = doc.get_character_combination_hashes(case_sensitive=case_sensitive, - suffs_not_prefs=False, - affix_lengths=[1, 4, 3], - search_chars="", - search_lengths=[2]) - suffixes = doc.get_character_combination_hashes(case_sensitive=case_sensitive, - suffs_not_prefs=True, - affix_lengths=[2, 3, 4, 5], - search_chars="xx✨rp", - search_lengths=[2, 1]) - assert prefixes[0][0] == _get_unsigned_32_bit_hash("s") - assert prefixes[0][1] == _get_unsigned_32_bit_hash( + hashes = doc.get_character_combination_hashes( + case_sensitive=case_sensitive, + pref_lengths=[1, 4, 3], + suff_lengths=[2, 3, 4, 5], + pref_search_chars="", + pref_search_lengths=[2], + suff_search_chars="xx✨rp", + suff_search_lengths=[2, 1], + ) + assert hashes[0][0] == _get_unsigned_32_bit_hash("s") + assert hashes[0][1] == _get_unsigned_32_bit_hash( "spaC" if case_sensitive else "spac" ) - assert prefixes[0][2] == _get_unsigned_32_bit_hash("spa") - assert prefixes[0][3] == _get_unsigned_32_bit_hash(" ") - assert prefixes[1][0] == _get_unsigned_32_bit_hash("✨") - assert prefixes[1][1] == _get_unsigned_32_bit_hash("✨") - assert prefixes[1][2] == _get_unsigned_32_bit_hash("✨") - assert prefixes[1][3] == _get_unsigned_32_bit_hash(" ") - assert prefixes[2][0] == _get_unsigned_32_bit_hash("a") - assert prefixes[2][1] == _get_unsigned_32_bit_hash("and") - assert prefixes[2][2] == _get_unsigned_32_bit_hash("and") - assert prefixes[2][3] == _get_unsigned_32_bit_hash(" ") - assert prefixes[3][0] == _get_unsigned_32_bit_hash("P" if case_sensitive else "p") - assert prefixes[3][1] == _get_unsigned_32_bit_hash( - "Prod" if case_sensitive else "prod" - ) - assert prefixes[3][2] == _get_unsigned_32_bit_hash( - "Pro" if case_sensitive else "pro" - ) - assert prefixes[3][3] == _get_unsigned_32_bit_hash(" ") - - assert suffixes[0][0] == _get_unsigned_32_bit_hash("Cy" if case_sensitive else "cy") - assert suffixes[0][1] == 
_get_unsigned_32_bit_hash( - "aCy" if case_sensitive else "acy" - ) - assert suffixes[0][2] == _get_unsigned_32_bit_hash( + assert hashes[0][2] == _get_unsigned_32_bit_hash("spa") + assert hashes[0][3] == _get_unsigned_32_bit_hash("Cy" if case_sensitive else "cy") + assert hashes[0][4] == _get_unsigned_32_bit_hash("aCy" if case_sensitive else "acy") + assert hashes[0][5] == _get_unsigned_32_bit_hash( "paCy" if case_sensitive else "pacy" ) - assert suffixes[0][3] == _get_unsigned_32_bit_hash( + assert hashes[0][6] == _get_unsigned_32_bit_hash( "spaCy" if case_sensitive else "spacy" ) - assert suffixes[0][4] == _get_unsigned_32_bit_hash("p ") - assert suffixes[0][5] == _get_unsigned_32_bit_hash("p") - assert suffixes[1][0] == _get_unsigned_32_bit_hash("✨") - assert suffixes[1][1] == _get_unsigned_32_bit_hash("✨") - assert suffixes[1][2] == _get_unsigned_32_bit_hash("✨") - assert suffixes[1][3] == _get_unsigned_32_bit_hash("✨") - assert suffixes[1][4] == _get_unsigned_32_bit_hash("✨ ") - assert suffixes[1][5] == _get_unsigned_32_bit_hash("✨") - assert suffixes[2][0] == _get_unsigned_32_bit_hash("nd") - assert suffixes[2][1] == _get_unsigned_32_bit_hash("and") - assert suffixes[2][2] == _get_unsigned_32_bit_hash("and") - assert suffixes[2][3] == _get_unsigned_32_bit_hash("and") - assert suffixes[2][4] == _get_unsigned_32_bit_hash(" ") - assert suffixes[2][5] == _get_unsigned_32_bit_hash(" ") - assert suffixes[3][0] == _get_unsigned_32_bit_hash("gy") - assert suffixes[3][1] == _get_unsigned_32_bit_hash("igy") - assert suffixes[3][2] == _get_unsigned_32_bit_hash("digy") - assert suffixes[3][3] == _get_unsigned_32_bit_hash("odigy") - assert suffixes[3][5] == _get_unsigned_32_bit_hash("r") + + assert hashes[0][7] == _get_unsigned_32_bit_hash(" ") + assert hashes[0][8] == _get_unsigned_32_bit_hash("p ") + assert hashes[0][9] == _get_unsigned_32_bit_hash("p") + assert hashes[1][0] == _get_unsigned_32_bit_hash("✨") + assert hashes[1][1] == _get_unsigned_32_bit_hash("✨") + assert hashes[1][2] == _get_unsigned_32_bit_hash("✨") + assert hashes[1][3] == _get_unsigned_32_bit_hash("✨") + assert hashes[1][4] == _get_unsigned_32_bit_hash("✨") + assert hashes[1][5] == _get_unsigned_32_bit_hash("✨") + assert hashes[1][6] == _get_unsigned_32_bit_hash("✨") + assert hashes[1][7] == _get_unsigned_32_bit_hash(" ") + assert hashes[1][8] == _get_unsigned_32_bit_hash("✨ ") + assert hashes[1][9] == _get_unsigned_32_bit_hash("✨") + assert hashes[2][0] == _get_unsigned_32_bit_hash("a") + assert hashes[2][1] == _get_unsigned_32_bit_hash("and") + assert hashes[2][2] == _get_unsigned_32_bit_hash("and") + assert hashes[2][3] == _get_unsigned_32_bit_hash("nd") + assert hashes[2][4] == _get_unsigned_32_bit_hash("and") + assert hashes[2][5] == _get_unsigned_32_bit_hash("and") + assert hashes[2][6] == _get_unsigned_32_bit_hash("and") + assert hashes[2][7] == _get_unsigned_32_bit_hash(" ") + assert hashes[2][8] == _get_unsigned_32_bit_hash(" ") + assert hashes[2][9] == _get_unsigned_32_bit_hash(" ") + assert hashes[3][0] == _get_unsigned_32_bit_hash("P" if case_sensitive else "p") + assert hashes[3][1] == _get_unsigned_32_bit_hash( + "Prod" if case_sensitive else "prod" + ) + assert hashes[3][2] == _get_unsigned_32_bit_hash("Pro" if case_sensitive else "pro") + assert hashes[3][3] == _get_unsigned_32_bit_hash("gy") + assert hashes[3][4] == _get_unsigned_32_bit_hash("igy") + assert hashes[3][5] == _get_unsigned_32_bit_hash("digy") + assert hashes[3][6] == _get_unsigned_32_bit_hash("odigy") + assert hashes[3][7] == 
_get_unsigned_32_bit_hash(" ") + + assert hashes[3][9] == _get_unsigned_32_bit_hash("r") if case_sensitive: - assert suffixes[3][4] == _get_unsigned_32_bit_hash("r ") + assert hashes[3][8] == _get_unsigned_32_bit_hash("r ") else: - assert suffixes[3][4] == _get_unsigned_32_bit_hash("rp") + assert hashes[3][8] == _get_unsigned_32_bit_hash("rp") # check values are the same cross-platform - assert prefixes[0][1] == 753329845 if case_sensitive else 18446744071614199016 - assert suffixes[1][0] == 3425774424 - assert suffixes[2][4] == 3076404432 + assert hashes[0][1] == 753329845 if case_sensitive else 18446744071614199016 + assert hashes[1][3] == 3425774424 + assert hashes[2][8] == 3076404432 def test_get_character_combination_hashes_4_byte_char_at_end(en_tokenizer): doc = en_tokenizer("and𐌞") - suffixes = doc.get_character_combination_hashes( - case_sensitive=True, - suffs_not_prefs=True, - affix_lengths=[1, 2, 3], - search_chars="a", - search_lengths=[1]) - assert suffixes[0][1] == _get_unsigned_32_bit_hash("𐌞") - assert suffixes[0][2] == _get_unsigned_32_bit_hash("d𐌞") - assert suffixes[0][3] == _get_unsigned_32_bit_hash("a") + hashes = doc.get_character_combination_hashes( + case_sensitive=True, + pref_lengths=[], + suff_lengths=[1, 2, 3], + pref_search_chars="", + pref_search_lengths=[], + suff_search_chars="a", + suff_search_lengths=[1], + ) + assert hashes[0][1] == _get_unsigned_32_bit_hash("𐌞") + assert hashes[0][2] == _get_unsigned_32_bit_hash("d𐌞") + assert hashes[0][3] == _get_unsigned_32_bit_hash("a") def test_get_character_combination_hashes_4_byte_char_in_middle(en_tokenizer): doc = en_tokenizer("and𐌞a") - suffixes = doc.get_character_combination_hashes( - case_sensitive=False, - suffs_not_prefs=True, - affix_lengths=[1, 2, 3, 4], - search_chars="a", - search_lengths=[1, 2]) - assert suffixes[0][0] == _get_unsigned_32_bit_hash("a") - assert suffixes[0][2] == _get_unsigned_32_bit_hash("𐌞a") - assert suffixes[0][3] == _get_unsigned_32_bit_hash("d𐌞a") - assert suffixes[0][4] == _get_unsigned_32_bit_hash("a") - assert suffixes[0][5] == _get_unsigned_32_bit_hash("aa") + hashes = doc.get_character_combination_hashes( + case_sensitive=False, + pref_lengths=[], + suff_lengths=[1, 2, 3, 4], + pref_search_chars="", + pref_search_lengths=[], + suff_search_chars="a", + suff_search_lengths=[1, 2], + ) + assert hashes[0][0] == _get_unsigned_32_bit_hash("a") + assert hashes[0][2] == _get_unsigned_32_bit_hash("𐌞a") + assert hashes[0][3] == _get_unsigned_32_bit_hash("d𐌞a") + assert hashes[0][4] == _get_unsigned_32_bit_hash("a") + assert hashes[0][5] == _get_unsigned_32_bit_hash("aa") def test_get_character_combination_hashes_4_byte_special_char(en_tokenizer): doc = en_tokenizer("and𐌞") with pytest.raises(ValueError): - doc.get_character_combination_hashes(case_sensitive=True, - suffs_not_prefs=True, - affix_lengths=[2, 3, 4, 5], - search_chars="𐌞", - search_lengths=[2]) + doc.get_character_combination_hashes( + case_sensitive=True, + pref_lengths=[], + suff_lengths=[2, 3, 4, 5], + pref_search_chars="", + pref_search_lengths=[], + suff_search_chars="𐌞", + suff_search_lengths=[2], + ) + def test_character_combination_hashes_empty_lengths(en_tokenizer): doc = en_tokenizer("and𐌞") - assert doc.get_character_combination_hashes(case_sensitive=True, - suffs_not_prefs=True, - affix_lengths=[], - search_chars="", - search_lengths=[]).shape == (1, 0) + assert doc.get_character_combination_hashes( + case_sensitive=True, + pref_lengths=[], + suff_lengths=[], + pref_search_chars="", + pref_search_lengths=[], + 
+        suff_search_chars="",
+        suff_search_lengths=[],
+    ).shape == (1, 0)
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index c4569e9d6..f97c7baf6 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -178,10 +178,12 @@ class Doc:
         self,
         *,
         case_sensitive: bool,
-        suffs_not_prefs: bool,
-        affix_lengths: List[int],
-        search_chars: str,
-        search_lengths: List[int]
+        pref_lengths: List[int],
+        suff_lengths: List[int],
+        pref_search_chars: str,
+        pref_search_lengths: List[int],
+        suff_search_chars: str,
+        suff_search_lengths: List[int]
     ) -> Ints2d: ...
     @staticmethod
     def _get_array_attrs() -> Tuple[Any]: ...
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index dba844adc..c47855111 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1740,10 +1740,12 @@ cdef class Doc:
         self,
         *,
         bint case_sensitive,
-        bint suffs_not_prefs,
-        affix_lengths: List[int],
-        str search_chars,
-        search_lengths: List[int]
+        pref_lengths: List[int],
+        suff_lengths: List[int],
+        str pref_search_chars,
+        pref_search_lengths: List[int],
+        str suff_search_chars,
+        suff_search_lengths: List[int]
     ):
         """
         Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations
@@ -1751,22 +1753,25 @@ cdef class Doc:
         case_sensitive: if *False*, the lower-case version of each token string is used as the basis for generating hashes. Note that
             if *case_sensitive==False*, upper-case characters in *search_chars* will not be found in token strings.
-        suffs_not_prefs: if *True*, affixes are suffixes, and searching are from the end of each token;
-            if *False*, affixes are prefixes, and searching is from the start of each token.
-        affix_lengths: an integer list specifying the lengths of affixes to be hashed. For example, if *affix_lengths==[2, 3]*,
-            *suffs_not_prefs==True* and *case_sensitive==True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy".
-        search_chars: a string containing characters to search for within each token, starting at the beginning or end depending on the
-            value of *suffs_not_prefs*.
-        search_lengths: an integer list specifying the lengths of search results to be hashed. For example if *search_lengths==[1, 2]*,
-            *search_chars=="aC", *suffs_not_prefs==True* and *case_sensitive==True*, the searched strings hashed for "spaCy" would be
-            "C" and "Ca".
+        pref_lengths: an integer list specifying the lengths of prefixes to be hashed. For example, if *pref_lengths==[2, 3]*,
+            the prefixes hashed for "spaCy" would be "sp" and "spa".
+        suff_lengths: an integer list specifying the lengths of suffixes to be hashed. For example, if *suff_lengths==[2, 3]* and
+            *case_sensitive==True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy".
+        pref_search_chars: a string containing characters to search for within each token, starting at the beginning.
+        pref_search_lengths: an integer list specifying the lengths of search results to be hashed. For example, if
+            *pref_search_lengths==[1, 2]*, *pref_search_chars=="aC"* and *case_sensitive==False*, the searched strings hashed for
+            "spaCy" would be "a" and "ac".
+        suff_search_chars: a string containing characters to search for within each token, starting at the end.
+        suff_search_lengths: an integer list specifying the lengths of search results to be hashed. For example, if
+            *suff_search_lengths==[1, 2]*, *suff_search_chars=="aC"* and *case_sensitive==False*, the searched strings hashed for
+            "spaCy" would be "c" and "ca".
 
         For a document with tokens ["spaCy", "and", "Prodigy"], the NumPy array returned by
-        *get_affix_hashes(True, True, [2, 4, 6], "yC", [1, 2])* would correspond to
+        *get_character_combination_hashes(True, [2], [2, 4, 6], "", [], "yC", [1, 2])* would correspond to
 
-        [[hash("Cy"), hash("paCy"), hash("spaCy"), hash("y"), hash("yC")],
-        [hash("nd"), hash("and", hash("and"), hash(" "), hash(" "))],
-        [hash("gy"), hash("digy"), hash("rodigy"), hash("y"), hash("y ")]]
+        [[hash("sp"), hash("Cy"), hash("paCy"), hash("spaCy"), hash("y"), hash("yC")],
+        [hash("an"), hash("nd"), hash("and"), hash("and"), hash(" "), hash("  ")],
+        [hash("Pr"), hash("gy"), hash("digy"), hash("rodigy"), hash("y"), hash("y ")]]
 
         UTF-16 is used to encode the token texts, as this results in two-byte representations for all characters that are realistically
         interesting when learning features from words. UTF-16 can also contain four-byte representations, but neither of the byte pairs in
@@ -1775,14 +1780,18 @@ cdef class Doc:
         representation in *search_chars*, on the other hand, is not supported and results in a ValueError(E1046).
         """
-        cdef const unsigned char[:] search_chars_v = _get_utf16_memoryview(search_chars, True)
-        cdef unsigned int longest_search_length = max(search_lengths) if len(search_lengths) > 0 else 0
+        cdef const unsigned char[:] pref_search_chars_v = _get_utf16_memoryview(pref_search_chars, True)
+        cdef const unsigned char[:] suff_search_chars_v = _get_utf16_memoryview(suff_search_chars, True)
+        cdef unsigned int longest_search_length = max(pref_search_lengths + suff_search_lengths) if len(pref_search_lengths + suff_search_lengths) > 0 else 0
         cdef bytes found_char_buf_bytes = (bytes(" " * longest_search_length, "UTF-16"))[2:]  # first two bytes express endianness
         cdef char* found_char_buf = found_char_buf_bytes
-        cdef unsigned int search_chars_v_len = len(search_chars_v), found_char_buf_len = len(found_char_buf_bytes)
+        cdef unsigned int pref_search_chars_v_len = len(pref_search_chars_v), suff_search_chars_v_len = len(suff_search_chars_v)
+        cdef unsigned int found_char_buf_len = len(found_char_buf_bytes)
 
-        cdef unsigned int num_toks = len(self), num_norm_hashes = len(affix_lengths), num_search_hashes = len(search_lengths)
-        cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, num_norm_hashes + num_search_hashes), dtype="int64")
+        cdef unsigned int num_toks = len(self), num_pref_norm_hashes = len(pref_lengths), num_suff_norm_hashes = len(suff_lengths)
+        cdef unsigned int num_pref_search_hashes = len(pref_search_lengths)
+        cdef unsigned int num_suff_search_hashes = len(suff_search_lengths)
+        cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, num_pref_norm_hashes + num_suff_norm_hashes + num_pref_search_hashes + num_suff_search_hashes), dtype="int64")
         cdef const unsigned char[:] tok_str_v
         cdef unsigned int tok_idx, tok_str_v_len, hash_idx, affix_start, char_comb_len
 
@@ -1795,25 +1804,45 @@ cdef class Doc:
             tok_str_v = _get_utf16_memoryview(str_tok_attr, False)
             tok_str_v_len = len(tok_str_v)
 
-            for hash_idx in range(num_norm_hashes):
-                char_comb_len = affix_lengths[hash_idx] * 2
+            for hash_idx in range(num_pref_norm_hashes):
+                char_comb_len = pref_lengths[hash_idx] * 2
                 if char_comb_len > tok_str_v_len:
                     char_comb_len = tok_str_v_len
-                affix_start = tok_str_v_len - char_comb_len if suffs_not_prefs else 0
+                hashes[tok_idx, hash_idx] = hash32(<char*>&tok_str_v[0], char_comb_len, 0)
+
+            for hash_idx in range(num_pref_norm_hashes, num_pref_norm_hashes + num_suff_norm_hashes):
+                char_comb_len = suff_lengths[hash_idx - num_pref_norm_hashes] * 2
+                if char_comb_len > tok_str_v_len:
+                    char_comb_len = tok_str_v_len
+                affix_start = tok_str_v_len - char_comb_len
                 hashes[tok_idx, hash_idx] = hash32(<char*>&tok_str_v[affix_start], char_comb_len, 0)
 
             _set_found_char_buf(
-                suffs_not_prefs,
+                False,
                 tok_str_v,
                 tok_str_v_len,
-                search_chars_v,
-                search_chars_v_len,
+                pref_search_chars_v,
+                pref_search_chars_v_len,
                 found_char_buf,
                 found_char_buf_len,
             )
 
-            for hash_idx in range(num_norm_hashes, num_norm_hashes + num_search_hashes):
-                char_comb_len = search_lengths[hash_idx - num_norm_hashes] * 2
+            for hash_idx in range(num_pref_norm_hashes + num_suff_norm_hashes, num_pref_norm_hashes + num_suff_norm_hashes + num_pref_search_hashes):
+                char_comb_len = pref_search_lengths[hash_idx - (num_pref_norm_hashes + num_suff_norm_hashes)] * 2
                 hashes[tok_idx, hash_idx] = hash32(found_char_buf, char_comb_len, 0)
+
+            _set_found_char_buf(
+                True,
+                tok_str_v,
+                tok_str_v_len,
+                suff_search_chars_v,
+                suff_search_chars_v_len,
+                found_char_buf,
+                found_char_buf_len,
+            )
+
+            for hash_idx in range(num_pref_norm_hashes + num_suff_norm_hashes + num_pref_search_hashes, num_pref_norm_hashes + num_suff_norm_hashes + num_pref_search_hashes + num_suff_search_hashes):
+                char_comb_len = suff_search_lengths[hash_idx - (num_pref_norm_hashes + num_suff_norm_hashes + num_pref_search_hashes)] * 2
                 hashes[tok_idx, hash_idx] = hash32(found_char_buf, char_comb_len, 0)
 
         return hashes
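Note on the new column layout: each row of the returned array now concatenates, in order, the prefix hashes, the suffix hashes, the prefix-search hashes and the suffix-search hashes. The pure-Python sketch below is not part of this patch; char_combinations is a hypothetical helper that mirrors only the substring semantics (the Cython code hashes the corresponding UTF-16 bytes with MurmurHash, clips oversized affix lengths to the token length, and pads search buffers with spaces). It assumes search characters are matched exactly against the (possibly lower-cased) token text and ignores surrogate-pair handling for 4-byte characters.

from typing import List


def char_combinations(
    token: str,
    case_sensitive: bool,
    pref_lengths: List[int],
    suff_lengths: List[int],
    pref_search_chars: str,
    pref_search_lengths: List[int],
    suff_search_chars: str,
    suff_search_lengths: List[int],
) -> List[str]:
    # Lower-case the token when case_sensitive is False, as the Cython code does.
    text = token if case_sensitive else token.lower()

    def search(chars: str, lengths: List[int], from_end: bool) -> List[str]:
        # Collect characters of *text* that occur in *chars*, scanning from the
        # start or the end, then pad with spaces like the found-character buffer.
        source = reversed(text) if from_end else text
        found = [c for c in source if c in chars]
        longest = max(lengths, default=0)
        buf = "".join(found[:longest]).ljust(longest)
        return [buf[:n] for n in lengths]

    prefs = [text[:n] for n in pref_lengths]   # slicing clips to the token length
    suffs = [text[-n:] for n in suff_lengths]
    return (
        prefs
        + suffs
        + search(pref_search_chars, pref_search_lengths, from_end=False)
        + search(suff_search_chars, suff_search_lengths, from_end=True)
    )


# Reproduces the docstring example: one prefix, three suffixes and two
# suffix-search results per token.
for tok in ["spaCy", "and", "Prodigy"]:
    print(char_combinations(tok, True, [2], [2, 4, 6], "", [], "yC", [1, 2]))
# ['sp', 'Cy', 'paCy', 'spaCy', 'y', 'yC']
# ['an', 'nd', 'and', 'and', ' ', '  ']
# ['Pr', 'gy', 'digy', 'rodigy', 'y', 'y ']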