From f2c73aa85d70e4e25bea96afa093d04c4d6dfa74 Mon Sep 17 00:00:00 2001
From: richardpaulhudson
Date: Thu, 6 Oct 2022 07:50:35 +0200
Subject: [PATCH] Corrections

---
 spacy/tokens/doc.pxd |  4 ++--
 spacy/tokens/doc.pyi |  5 +++--
 spacy/tokens/doc.pyx | 23 +++++++++++------------
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index e8b00051b..44d7e65b9 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -39,7 +39,8 @@ cdef const unsigned char[:] _get_utf16_memoryview(str unicode_string, const bint
 cdef bint _is_searched_char_in_search_chars_v(
     const unsigned short searched_char,
     const unsigned char[:] search_chars_v,
-    const unsigned int search_chars_v_len)
+    const unsigned int search_chars_v_len,
+)
 
 
 cdef void _set_found_char_buf(
@@ -50,7 +51,6 @@ cdef void _set_found_char_buf(
     const unsigned int search_chars_v_len,
     char* found_char_buf,
     const unsigned int found_char_buf_len,
-
 )
 
 
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index 70e5b3e2b..c4569e9d6 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -176,11 +176,12 @@ class Doc:
     def to_utf8_array(self, nr_char: int = ...) -> Ints2d: ...
     def get_character_combination_hashes(
         self,
-        *case_sensitive: bool,
+        *,
+        case_sensitive: bool,
         suffs_not_prefs: bool,
         affix_lengths: List[int],
         search_chars: str,
         search_lengths: List[int]
-    ): ...
+    ) -> Ints2d: ...
     @staticmethod
     def _get_array_attrs() -> Tuple[Any]: ...
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index ef4d5e706..3a689c60e 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1769,19 +1769,20 @@ cdef class Doc:
             [hash("gy"), hash("digy"), hash("rodigy"),
              hash("y"), hash("y ")]]
 
         UTF-16 is used to encode the token texts, as this results in two-byte representations for all characters that are realistically
-        likely to occur in normal spaCy documents. UTF-16 can also contain four-byte representations, but neither of the byte pairs in
-        a four-byte representation is ever valid in its own right as a two-byte representation. in the rare case that a four-byte
-        representation occurs in a string being analysed, each of its two-byte pairs is treated as a separate character, while a four-byte
-        representation in *search_chars* is not supported and results in a ValueError(E1046).
+        likely to be of interest when learning features from words. UTF-16 can also contain four-byte representations, but neither of the byte pairs in
+        a four-byte representation is ever valid in its own right as a two-byte representation. In the rare case that a four-byte
+        representation occurs in a string being analysed, each of its two-byte pairs is treated as a separate character. A four-byte
+        representation in *search_chars*, on the other hand, is not supported and results in a ValueError(E1046).
""" + cdef const unsigned char[:] search_chars_v = _get_utf16_memoryview(search_chars, True) cdef unsigned int longest_search_length = max(search_lengths) if len(search_lengths) > 0 else 0 cdef bytes found_char_buf_bytes = (bytes(" " * longest_search_length, "UTF-16"))[2:] # first two bytes express endianness cdef char* found_char_buf = found_char_buf_bytes cdef unsigned int search_chars_v_len = len(search_chars_v), found_char_buf_len = len(found_char_buf_bytes) - cdef unsigned int num_toks = len(self), num_norm_hashes = len(affix_lengths), num_spec_hashes = len(search_lengths) - cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, num_norm_hashes + num_spec_hashes), dtype="int64") + cdef unsigned int num_toks = len(self), num_norm_hashes = len(affix_lengths), num_search_hashes = len(search_lengths) + cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, num_norm_hashes + num_search_hashes), dtype="int64") cdef const unsigned char[:] tok_str_v cdef unsigned int tok_idx, tok_str_v_len, hash_idx, affix_start, hash_len @@ -1798,10 +1799,7 @@ cdef class Doc: hash_len = affix_lengths[hash_idx] * 2 if hash_len > tok_str_v_len: hash_len = tok_str_v_len - if suffs_not_prefs: - affix_start = tok_str_v_len - hash_len - else: - affix_start = 0 + affix_start = tok_str_v_len - hash_len if suffs_not_prefs else 0 hashes[tok_idx, hash_idx] = hash32( &tok_str_v[affix_start], hash_len, 0) _set_found_char_buf( @@ -1814,7 +1812,7 @@ cdef class Doc: found_char_buf_len, ) - for hash_idx in range(num_norm_hashes, num_norm_hashes + num_spec_hashes): + for hash_idx in range(num_norm_hashes, num_norm_hashes + num_search_hashes): hash_len = search_lengths[hash_idx - num_norm_hashes] * 2 hashes[tok_idx, hash_idx] = hash32(found_char_buf, hash_len, 0) @@ -2005,7 +2003,7 @@ cdef const unsigned char[:] _get_utf16_memoryview(str unicode_string, const bint """ Return a memory view of the UTF-16 representation of a string with the default endianness of the platform. Throw a ValueError if *check_2_bytes == True* and one or more characters in the UTF-16 representation - occupy four bytes rather than two. + occupies four bytes rather than two. """ cdef const unsigned char[:] view = unicode_string.encode("UTF-16") view = view[2:] # first two bytes express endianness @@ -2064,6 +2062,7 @@ cdef void _set_found_char_buf( memcpy(found_char_buf + found_char_buf_idx, &SPACE, 2) found_char_buf_idx += 2 + def pickle_doc(doc): bytes_data = doc.to_bytes(exclude=["vocab", "user_data", "user_hooks"]) hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks,