Corrections

This commit is contained in:
parent 7d4e99425b
commit f2c73aa85d
@@ -39,7 +39,8 @@ cdef const unsigned char[:] _get_utf16_memoryview(str unicode_string, const bint
 cdef bint _is_searched_char_in_search_chars_v(
     const unsigned short searched_char,
     const unsigned char[:] search_chars_v,
-    const unsigned int search_chars_v_len)
+    const unsigned int search_chars_v_len,
+)


 cdef void _set_found_char_buf(
@@ -50,7 +51,6 @@ cdef void _set_found_char_buf(
     const unsigned int search_chars_v_len,
     char* found_char_buf,
     const unsigned int found_char_buf_len,
-
 )

@@ -176,11 +176,12 @@ class Doc:
     def to_utf8_array(self, nr_char: int = ...) -> Ints2d: ...
     def get_character_combination_hashes(
         self,
-        *case_sensitive: bool,
+        *,
+        case_sensitive: bool,
         suffs_not_prefs: bool,
         affix_lengths: List[int],
         search_chars: str,
         search_lengths: List[int]
-    ): ...
+    ) -> Ints2d: ...
     @staticmethod
     def _get_array_attrs() -> Tuple[Any]: ...
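Note on the stub change above: replacing the variadic *case_sensitive: bool with a bare * followed by case_sensitive: bool turns the remaining parameters into ordinary keyword-only arguments, and the Ints2d return annotation is added. A minimal plain-Python illustration of the difference, using hypothetical function names rather than anything from the commit:

def old_style(*case_sensitive: bool):
    # the old annotation made case_sensitive a variadic parameter,
    # collecting a tuple of positional booleans
    return case_sensitive

def new_style(*, case_sensitive: bool, suffs_not_prefs: bool):
    # the bare "*" makes everything after it keyword-only
    return case_sensitive, suffs_not_prefs

new_style(case_sensitive=True, suffs_not_prefs=False)   # accepted
# new_style(True, False) would raise TypeError: no positional arguments allowed
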
@@ -1769,19 +1769,20 @@ cdef class Doc:
             [hash("gy"), hash("digy"), hash("rodigy"), hash("y"), hash("y ")]]

         UTF-16 is used to encode the token texts, as this results in two-byte representations for all characters that are realistically
-        likely to occur in normal spaCy documents. UTF-16 can also contain four-byte representations, but neither of the byte pairs in
-        a four-byte representation is ever valid in its own right as a two-byte representation. in the rare case that a four-byte
-        representation occurs in a string being analysed, each of its two-byte pairs is treated as a separate character, while a four-byte
-        representation in *search_chars* is not supported and results in a ValueError(E1046).
+        interesting in learning features from words. UTF-16 can also contain four-byte representations, but neither of the byte pairs in
+        a four-byte representation is ever valid in its own right as a two-byte representation. In the rare case that a four-byte
+        representation occurs in a string being analysed, each of its two-byte pairs is treated as a separate character. A four-byte
+        representation in *search_chars*, on the other hand, is not supported and results in a ValueError(E1046).
         """

         cdef const unsigned char[:] search_chars_v = _get_utf16_memoryview(search_chars, True)
         cdef unsigned int longest_search_length = max(search_lengths) if len(search_lengths) > 0 else 0
         cdef bytes found_char_buf_bytes = (bytes(" " * longest_search_length, "UTF-16"))[2:] # first two bytes express endianness
         cdef char* found_char_buf = found_char_buf_bytes
         cdef unsigned int search_chars_v_len = len(search_chars_v), found_char_buf_len = len(found_char_buf_bytes)

-        cdef unsigned int num_toks = len(self), num_norm_hashes = len(affix_lengths), num_spec_hashes = len(search_lengths)
-        cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, num_norm_hashes + num_spec_hashes), dtype="int64")
+        cdef unsigned int num_toks = len(self), num_norm_hashes = len(affix_lengths), num_search_hashes = len(search_lengths)
+        cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, num_norm_hashes + num_search_hashes), dtype="int64")

         cdef const unsigned char[:] tok_str_v
         cdef unsigned int tok_idx, tok_str_v_len, hash_idx, affix_start, hash_len
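Note on the docstring wording above: the two-byte versus four-byte behaviour it describes can be checked directly with plain Python. The following sketch is illustrative only and not part of the commit; the variable names are made up:

text = "spaCy"
encoded = text.encode("UTF-16")               # the first two bytes are the byte-order mark
assert len(encoded) == 2 + 2 * len(text)      # each BMP character occupies two bytes

emoji = "\U0001F600"                          # outside the BMP: a four-byte surrogate pair
assert len(emoji.encode("UTF-16")) == 2 + 4   # BOM plus two two-byte units
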
@@ -1798,10 +1799,7 @@ cdef class Doc:
                 hash_len = affix_lengths[hash_idx] * 2
                 if hash_len > tok_str_v_len:
                     hash_len = tok_str_v_len
-                if suffs_not_prefs:
-                    affix_start = tok_str_v_len - hash_len
-                else:
-                    affix_start = 0
+                affix_start = tok_str_v_len - hash_len if suffs_not_prefs else 0
                 hashes[tok_idx, hash_idx] = hash32(<void*> &tok_str_v[affix_start], hash_len, 0)

             _set_found_char_buf(
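Note on the change above: the four-line branch is collapsed into one conditional expression that picks the affix start offset, the end of the token for suffixes or zero for prefixes. A small plain-Python sketch of the same slicing logic, with made-up names and byte strings:

def affix_slice(tok_bytes: bytes, hash_len: int, suffs_not_prefs: bool) -> bytes:
    # clamp the affix length to the token length, then choose the start offset
    hash_len = min(hash_len, len(tok_bytes))
    affix_start = len(tok_bytes) - hash_len if suffs_not_prefs else 0
    return tok_bytes[affix_start:affix_start + hash_len]

assert affix_slice(b"spacy!", 4, suffs_not_prefs=True) == b"acy!"   # suffix
assert affix_slice(b"spacy!", 4, suffs_not_prefs=False) == b"spac"  # prefix
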
@@ -1814,7 +1812,7 @@ cdef class Doc:
                 found_char_buf_len,
             )

-            for hash_idx in range(num_norm_hashes, num_norm_hashes + num_spec_hashes):
+            for hash_idx in range(num_norm_hashes, num_norm_hashes + num_search_hashes):
                 hash_len = search_lengths[hash_idx - num_norm_hashes] * 2
                 hashes[tok_idx, hash_idx] = hash32(found_char_buf, hash_len, 0)

@@ -2005,7 +2003,7 @@ cdef const unsigned char[:] _get_utf16_memoryview(str unicode_string, const bint
     """
     Return a memory view of the UTF-16 representation of a string with the default endianness of the platform.
     Throw a ValueError if *check_2_bytes == True* and one or more characters in the UTF-16 representation
-    occupy four bytes rather than two.
+    occupies four bytes rather than two.
     """
     cdef const unsigned char[:] view = unicode_string.encode("UTF-16")
     view = view[2:] # first two bytes express endianness
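Note on _get_utf16_memoryview above: its documented contract, encode to UTF-16, drop the two-byte byte-order mark, and optionally reject characters that need four bytes, can be approximated in plain Python. This is only a sketch of that contract, not the Cython implementation, and the function name and error message are invented:

def get_utf16_view(unicode_string: str, check_2_bytes: bool) -> memoryview:
    view = memoryview(unicode_string.encode("UTF-16"))[2:]  # first two bytes express endianness
    if check_2_bytes and len(view) != 2 * len(unicode_string):
        # a character outside the BMP needed four bytes; spaCy raises ValueError(E1046) here
        raise ValueError("search_chars may not contain characters outside the BMP")
    return view

assert bytes(get_utf16_view("ab", True)) == "ab".encode("UTF-16")[2:]
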
@@ -2064,6 +2062,7 @@ cdef void _set_found_char_buf(
             memcpy(found_char_buf + found_char_buf_idx, &SPACE, 2)
             found_char_buf_idx += 2

+
 def pickle_doc(doc):
     bytes_data = doc.to_bytes(exclude=["vocab", "user_data", "user_hooks"])
     hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks,