mirror of https://github.com/explosion/spaCy.git, synced 2025-08-02 19:30:19 +03:00

Intermediate state necessary to test equivalence

This commit is contained in:
parent f410c066f4
commit be363a7710
@@ -52,21 +52,16 @@ def forward(
     suff_search_lengths: List[int] = model.attrs["suff_search_lengths"]
     features: List[Ints2d] = []
     for doc in docs:
-        prefix_hashes = doc.get_character_combination_hashes(
+        hashes = doc.get_character_combination_hashes(
             case_sensitive=case_sensitive,
-            suffs_not_prefs=False,
-            affix_lengths=pref_lengths,
-            search_chars=pref_search_chars,
-            search_lengths=pref_search_lengths,
+            pref_lengths=pref_lengths,
+            suff_lengths=suff_lengths,
+            pref_search_chars=pref_search_chars,
+            pref_search_lengths=pref_search_lengths,
+            suff_search_chars=suff_search_chars,
+            suff_search_lengths=suff_search_lengths,
         )
-        suffix_hashes = doc.get_character_combination_hashes(
-            case_sensitive=case_sensitive,
-            suffs_not_prefs=True,
-            affix_lengths=suff_lengths,
-            search_chars=suff_search_chars,
-            search_lengths=suff_search_lengths,
-        )
-        features.append(ops.asarray2i(ops.xp.hstack([prefix_hashes, suffix_hashes])))
+        features.append(ops.asarray2i(hashes))

     backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
     return features, backprop
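The change above folds the two per-affix calls and the ops.xp.hstack into a single call that lays prefix and suffix hash columns side by side. A minimal pure-Python sketch of the equivalence being preserved (hash_rows is a hypothetical stand-in for get_character_combination_hashes, not code from this commit):

    import numpy

    def hash_rows(tokens, pref_lengths, suff_lengths):
        # One row per token: prefix-hash columns first, then suffix-hash columns.
        rows = []
        for tok in tokens:
            prefs = [hash(tok[:n]) for n in pref_lengths]
            suffs = [hash(tok[-n:]) for n in suff_lengths]
            rows.append(prefs + suffs)
        return numpy.asarray(rows, dtype="int64")

    tokens = ["spaCy", "and", "Prodigy"]
    combined = hash_rows(tokens, [2, 3], [2, 3])
    separate = numpy.hstack(
        [hash_rows(tokens, [2, 3], []), hash_rows(tokens, [], [2, 3])]
    )
    assert (combined == separate).all()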
@@ -989,121 +989,132 @@ def _get_unsigned_32_bit_hash(input: str) -> int:
 def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive):

     doc = en_tokenizer("spaCy✨ and Prodigy")
-    prefixes = doc.get_character_combination_hashes(case_sensitive=case_sensitive,
-                                                    suffs_not_prefs=False,
-                                                    affix_lengths=[1, 4, 3],
-                                                    search_chars="",
-                                                    search_lengths=[2])
-    suffixes = doc.get_character_combination_hashes(case_sensitive=case_sensitive,
-                                                    suffs_not_prefs=True,
-                                                    affix_lengths=[2, 3, 4, 5],
-                                                    search_chars="xx✨rp",
-                                                    search_lengths=[2, 1])
-    assert prefixes[0][0] == _get_unsigned_32_bit_hash("s")
-    assert prefixes[0][1] == _get_unsigned_32_bit_hash(
+    hashes = doc.get_character_combination_hashes(
+        case_sensitive=case_sensitive,
+        pref_lengths=[1, 4, 3],
+        suff_lengths=[2, 3, 4, 5],
+        pref_search_chars="",
+        pref_search_lengths=[2],
+        suff_search_chars="xx✨rp",
+        suff_search_lengths=[2, 1],
+    )
+    assert hashes[0][0] == _get_unsigned_32_bit_hash("s")
+    assert hashes[0][1] == _get_unsigned_32_bit_hash(
         "spaC" if case_sensitive else "spac"
     )
-    assert prefixes[0][2] == _get_unsigned_32_bit_hash("spa")
-    assert prefixes[0][3] == _get_unsigned_32_bit_hash(" ")
-    assert prefixes[1][0] == _get_unsigned_32_bit_hash("✨")
-    assert prefixes[1][1] == _get_unsigned_32_bit_hash("✨")
-    assert prefixes[1][2] == _get_unsigned_32_bit_hash("✨")
-    assert prefixes[1][3] == _get_unsigned_32_bit_hash(" ")
-    assert prefixes[2][0] == _get_unsigned_32_bit_hash("a")
-    assert prefixes[2][1] == _get_unsigned_32_bit_hash("and")
-    assert prefixes[2][2] == _get_unsigned_32_bit_hash("and")
-    assert prefixes[2][3] == _get_unsigned_32_bit_hash(" ")
-    assert prefixes[3][0] == _get_unsigned_32_bit_hash("P" if case_sensitive else "p")
-    assert prefixes[3][1] == _get_unsigned_32_bit_hash(
-        "Prod" if case_sensitive else "prod"
-    )
-    assert prefixes[3][2] == _get_unsigned_32_bit_hash(
-        "Pro" if case_sensitive else "pro"
-    )
-    assert prefixes[3][3] == _get_unsigned_32_bit_hash(" ")
-
-    assert suffixes[0][0] == _get_unsigned_32_bit_hash("Cy" if case_sensitive else "cy")
-    assert suffixes[0][1] == _get_unsigned_32_bit_hash(
-        "aCy" if case_sensitive else "acy"
-    )
-    assert suffixes[0][2] == _get_unsigned_32_bit_hash(
+    assert hashes[0][2] == _get_unsigned_32_bit_hash("spa")
+    assert hashes[0][3] == _get_unsigned_32_bit_hash("Cy" if case_sensitive else "cy")
+    assert hashes[0][4] == _get_unsigned_32_bit_hash("aCy" if case_sensitive else "acy")
+    assert hashes[0][5] == _get_unsigned_32_bit_hash(
         "paCy" if case_sensitive else "pacy"
     )
-    assert suffixes[0][3] == _get_unsigned_32_bit_hash(
+    assert hashes[0][6] == _get_unsigned_32_bit_hash(
         "spaCy" if case_sensitive else "spacy"
     )
-    assert suffixes[0][4] == _get_unsigned_32_bit_hash("p ")
-    assert suffixes[0][5] == _get_unsigned_32_bit_hash("p")
-    assert suffixes[1][0] == _get_unsigned_32_bit_hash("✨")
-    assert suffixes[1][1] == _get_unsigned_32_bit_hash("✨")
-    assert suffixes[1][2] == _get_unsigned_32_bit_hash("✨")
-    assert suffixes[1][3] == _get_unsigned_32_bit_hash("✨")
-    assert suffixes[1][4] == _get_unsigned_32_bit_hash("✨ ")
-    assert suffixes[1][5] == _get_unsigned_32_bit_hash("✨")
-    assert suffixes[2][0] == _get_unsigned_32_bit_hash("nd")
-    assert suffixes[2][1] == _get_unsigned_32_bit_hash("and")
-    assert suffixes[2][2] == _get_unsigned_32_bit_hash("and")
-    assert suffixes[2][3] == _get_unsigned_32_bit_hash("and")
-    assert suffixes[2][4] == _get_unsigned_32_bit_hash(" ")
-    assert suffixes[2][5] == _get_unsigned_32_bit_hash(" ")
-    assert suffixes[3][0] == _get_unsigned_32_bit_hash("gy")
-    assert suffixes[3][1] == _get_unsigned_32_bit_hash("igy")
-    assert suffixes[3][2] == _get_unsigned_32_bit_hash("digy")
-    assert suffixes[3][3] == _get_unsigned_32_bit_hash("odigy")
-    assert suffixes[3][5] == _get_unsigned_32_bit_hash("r")
+    assert hashes[0][7] == _get_unsigned_32_bit_hash(" ")
+    assert hashes[0][8] == _get_unsigned_32_bit_hash("p ")
+    assert hashes[0][9] == _get_unsigned_32_bit_hash("p")
+    assert hashes[1][0] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[1][1] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[1][2] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[1][3] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[1][4] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[1][5] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[1][6] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[1][7] == _get_unsigned_32_bit_hash(" ")
+    assert hashes[1][8] == _get_unsigned_32_bit_hash("✨ ")
+    assert hashes[1][9] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[2][0] == _get_unsigned_32_bit_hash("a")
+    assert hashes[2][1] == _get_unsigned_32_bit_hash("and")
+    assert hashes[2][2] == _get_unsigned_32_bit_hash("and")
+    assert hashes[2][3] == _get_unsigned_32_bit_hash("nd")
+    assert hashes[2][4] == _get_unsigned_32_bit_hash("and")
+    assert hashes[2][5] == _get_unsigned_32_bit_hash("and")
+    assert hashes[2][6] == _get_unsigned_32_bit_hash("and")
+    assert hashes[2][7] == _get_unsigned_32_bit_hash(" ")
+    assert hashes[2][8] == _get_unsigned_32_bit_hash(" ")
+    assert hashes[2][9] == _get_unsigned_32_bit_hash(" ")
+    assert hashes[3][0] == _get_unsigned_32_bit_hash("P" if case_sensitive else "p")
+    assert hashes[3][1] == _get_unsigned_32_bit_hash(
+        "Prod" if case_sensitive else "prod"
+    )
+    assert hashes[3][2] == _get_unsigned_32_bit_hash("Pro" if case_sensitive else "pro")
+    assert hashes[3][3] == _get_unsigned_32_bit_hash("gy")
+    assert hashes[3][4] == _get_unsigned_32_bit_hash("igy")
+    assert hashes[3][5] == _get_unsigned_32_bit_hash("digy")
+    assert hashes[3][6] == _get_unsigned_32_bit_hash("odigy")
+    assert hashes[3][7] == _get_unsigned_32_bit_hash(" ")
+    assert hashes[3][9] == _get_unsigned_32_bit_hash("r")

     if case_sensitive:
-        assert suffixes[3][4] == _get_unsigned_32_bit_hash("r ")
+        assert hashes[3][8] == _get_unsigned_32_bit_hash("r ")
     else:
-        assert suffixes[3][4] == _get_unsigned_32_bit_hash("rp")
+        assert hashes[3][8] == _get_unsigned_32_bit_hash("rp")

     # check values are the same cross-platform
-    assert prefixes[0][1] == 753329845 if case_sensitive else 18446744071614199016
-    assert suffixes[1][0] == 3425774424
-    assert suffixes[2][4] == 3076404432
+    assert hashes[0][1] == 753329845 if case_sensitive else 18446744071614199016
+    assert hashes[1][3] == 3425774424
+    assert hashes[2][8] == 3076404432


 def test_get_character_combination_hashes_4_byte_char_at_end(en_tokenizer):
     doc = en_tokenizer("and𐌞")
-    suffixes = doc.get_character_combination_hashes(
+    hashes = doc.get_character_combination_hashes(
         case_sensitive=True,
-        suffs_not_prefs=True,
-        affix_lengths=[1, 2, 3],
-        search_chars="a",
-        search_lengths=[1])
-    assert suffixes[0][1] == _get_unsigned_32_bit_hash("𐌞")
-    assert suffixes[0][2] == _get_unsigned_32_bit_hash("d𐌞")
-    assert suffixes[0][3] == _get_unsigned_32_bit_hash("a")
+        pref_lengths=[],
+        suff_lengths=[1, 2, 3],
+        pref_search_chars="",
+        pref_search_lengths=[],
+        suff_search_chars="a",
+        suff_search_lengths=[1],
+    )
+    assert hashes[0][1] == _get_unsigned_32_bit_hash("𐌞")
+    assert hashes[0][2] == _get_unsigned_32_bit_hash("d𐌞")
+    assert hashes[0][3] == _get_unsigned_32_bit_hash("a")


 def test_get_character_combination_hashes_4_byte_char_in_middle(en_tokenizer):
     doc = en_tokenizer("and𐌞a")
-    suffixes = doc.get_character_combination_hashes(
+    hashes = doc.get_character_combination_hashes(
         case_sensitive=False,
-        suffs_not_prefs=True,
-        affix_lengths=[1, 2, 3, 4],
-        search_chars="a",
-        search_lengths=[1, 2])
-    assert suffixes[0][0] == _get_unsigned_32_bit_hash("a")
-    assert suffixes[0][2] == _get_unsigned_32_bit_hash("𐌞a")
-    assert suffixes[0][3] == _get_unsigned_32_bit_hash("d𐌞a")
-    assert suffixes[0][4] == _get_unsigned_32_bit_hash("a")
-    assert suffixes[0][5] == _get_unsigned_32_bit_hash("aa")
+        pref_lengths=[],
+        suff_lengths=[1, 2, 3, 4],
+        pref_search_chars="",
+        pref_search_lengths=[],
+        suff_search_chars="a",
+        suff_search_lengths=[1, 2],
+    )
+    assert hashes[0][0] == _get_unsigned_32_bit_hash("a")
+    assert hashes[0][2] == _get_unsigned_32_bit_hash("𐌞a")
+    assert hashes[0][3] == _get_unsigned_32_bit_hash("d𐌞a")
+    assert hashes[0][4] == _get_unsigned_32_bit_hash("a")
+    assert hashes[0][5] == _get_unsigned_32_bit_hash("aa")


 def test_get_character_combination_hashes_4_byte_special_char(en_tokenizer):
     doc = en_tokenizer("and𐌞")
     with pytest.raises(ValueError):
-        doc.get_character_combination_hashes(case_sensitive=True,
-                                             suffs_not_prefs=True,
-                                             affix_lengths=[2, 3, 4, 5],
-                                             search_chars="𐌞",
-                                             search_lengths=[2])
+        doc.get_character_combination_hashes(
+            case_sensitive=True,
+            pref_lengths=[],
+            suff_lengths=[2, 3, 4, 5],
+            pref_search_chars="",
+            pref_search_lengths=[],
+            suff_search_chars="𐌞",
+            suff_search_lengths=[2],
+        )


 def test_character_combination_hashes_empty_lengths(en_tokenizer):
     doc = en_tokenizer("and𐌞")
-    assert doc.get_character_combination_hashes(case_sensitive=True,
-                                                suffs_not_prefs=True,
-                                                affix_lengths=[],
-                                                search_chars="",
-                                                search_lengths=[]).shape == (1, 0)
+    assert doc.get_character_combination_hashes(
+        case_sensitive=True,
+        pref_lengths=[],
+        suff_lengths=[],
+        pref_search_chars="",
+        pref_search_lengths=[],
+        suff_search_chars="",
+        suff_search_lengths=[],
+    ).shape == (1, 0)
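The updated assertions encode a fixed column order per token row: prefix hashes first, then suffix hashes, then prefix-search hashes, then suffix-search hashes. A small illustrative sketch of that layout (not part of the commit; expected_columns is a hypothetical helper):

    def expected_columns(token, pref_lengths, suff_lengths,
                         pref_search_count, suff_search_count):
        cols = [token[:n] for n in pref_lengths]       # hashes[i][0:len(pref_lengths)]
        cols += [token[-n:] for n in suff_lengths]     # then suffix columns
        cols += ["<pref search>"] * pref_search_count  # then prefix-search results
        cols += ["<suff search>"] * suff_search_count  # then suffix-search results
        return cols

    # For the good case: pref_lengths=[1, 4, 3], suff_lengths=[2, 3, 4, 5],
    # pref_search_lengths=[2], suff_search_lengths=[2, 1] gives 10 columns per
    # row, matching the hashes[i][0] .. hashes[i][9] assertions above.
    print(expected_columns("spacy", [1, 4, 3], [2, 3, 4, 5], 1, 2))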
@@ -178,10 +178,12 @@ class Doc:
         self,
         *,
         case_sensitive: bool,
-        suffs_not_prefs: bool,
-        affix_lengths: List[int],
-        search_chars: str,
-        search_lengths: List[int]
+        pref_lengths: List[int],
+        suff_lengths: List[int],
+        pref_search_chars: str,
+        pref_search_lengths: List[int],
+        suff_search_chars: str,
+        suff_search_lengths: List[int]
     ) -> Ints2d: ...
     @staticmethod
     def _get_array_attrs() -> Tuple[Any]: ...
@@ -1740,10 +1740,12 @@ cdef class Doc:
         self,
         *,
         bint case_sensitive,
-        bint suffs_not_prefs,
-        affix_lengths: List[int],
-        str search_chars,
-        search_lengths: List[int]
+        pref_lengths: List[int],
+        suff_lengths: List[int],
+        str pref_search_chars,
+        pref_search_lengths: List[int],
+        str suff_search_chars,
+        suff_search_lengths: List[int]
     ):
         """
         Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations
@@ -1751,22 +1753,25 @@ cdef class Doc:

         case_sensitive: if *False*, the lower-case version of each token string is used as the basis for generating hashes. Note that
             if *case_sensitive==False*, upper-case characters in *search_chars* will not be found in token strings.
-        suffs_not_prefs: if *True*, affixes are suffixes, and searching is from the end of each token;
-            if *False*, affixes are prefixes, and searching is from the start of each token.
-        affix_lengths: an integer list specifying the lengths of affixes to be hashed. For example, if *affix_lengths==[2, 3]*,
-            *suffs_not_prefs==True* and *case_sensitive==True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy".
-        search_chars: a string containing characters to search for within each token, starting at the beginning or end depending on the
-            value of *suffs_not_prefs*.
-        search_lengths: an integer list specifying the lengths of search results to be hashed. For example if *search_lengths==[1, 2]*,
-            *search_chars=="aC"*, *suffs_not_prefs==True* and *case_sensitive==True*, the searched strings hashed for "spaCy" would be
-            "C" and "Ca".
+        pref_lengths: an integer list specifying the lengths of prefixes to be hashed. For example, if *pref_lengths==[2, 3]*,
+            the prefixes hashed for "spaCy" would be "sp" and "spa".
+        suff_lengths: an integer list specifying the lengths of suffixes to be hashed. For example, if *suff_lengths==[2, 3]* and
+            *case_sensitive == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy".
+        pref_search_chars: a string containing characters to search for within each token, starting at the beginning.
+        pref_search_lengths: an integer list specifying the lengths of search results to be hashed. For example if
+            *pref_search_lengths==[1, 2]*, *pref_search_chars=="aC"* and *case_sensitive==False*, the searched strings hashed for
+            "spaCy" would be "a" and "ac".
+        suff_search_chars: a string containing characters to search for within each token, starting at the end.
+        suff_search_lengths: an integer list specifying the lengths of search results to be hashed. For example if
+            *suff_search_lengths==[1, 2]*, *suff_search_chars=="aC"* and *case_sensitive==False*, the searched strings hashed for
+            "spaCy" would be "c" and "ca".

         For a document with tokens ["spaCy", "and", "Prodigy"], the NumPy array returned by
-        *get_affix_hashes(True, True, [2, 4, 6], "yC", [1, 2])* would correspond to
+        *get_character_combination_hashes(True, [2], [2, 4, 6], "yC", [1], [2])* would correspond to

-        [[hash("Cy"), hash("paCy"), hash("spaCy"), hash("y"), hash("yC")],
-        [hash("nd"), hash("and"), hash("and"), hash(" "), hash(" ")],
-        [hash("gy"), hash("digy"), hash("rodigy"), hash("y"), hash("y ")]]
+        [[hash("sp"), hash("Cy"), hash("paCy"), hash("spaCy"), hash("y"), hash("yC")],
+        [hash("an"), hash("nd"), hash("and"), hash("and"), hash(" "), hash(" ")],
+        [hash("Pr"), hash("gy"), hash("digy"), hash("rodigy"), hash("y"), hash("y ")]]

         UTF-16 is used to encode the token texts, as this results in two-byte representations for all characters that are realistically
         interesting when learning features from words. UTF-16 can also contain four-byte representations, but neither of the byte pairs in
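Based on the docstring above, a hedged usage sketch (this assumes a spaCy build from this branch; get_character_combination_hashes is not part of any released spaCy API):

    import spacy

    nlp = spacy.blank("en")
    doc = nlp("spaCy and Prodigy")
    hashes = doc.get_character_combination_hashes(
        case_sensitive=True,
        pref_lengths=[2],        # hash 2-character prefixes: "sp", "an", "Pr"
        suff_lengths=[2, 4],     # hash 2- and 4-character suffixes
        pref_search_chars="",    # no prefix-side character search
        pref_search_lengths=[],
        suff_search_chars="yC",  # search for "y"/"C" from the end of each token
        suff_search_lengths=[1, 2],
    )
    print(hashes.shape)          # (3, 5): one row per token, one column per hash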
@@ -1775,14 +1780,18 @@ cdef class Doc:
         representation in *search_chars*, on the other hand, is not supported and results in a ValueError(E1046).
         """

-        cdef const unsigned char[:] search_chars_v = _get_utf16_memoryview(search_chars, True)
-        cdef unsigned int longest_search_length = max(search_lengths) if len(search_lengths) > 0 else 0
+        cdef const unsigned char[:] pref_search_chars_v = _get_utf16_memoryview(pref_search_chars, True)
+        cdef const unsigned char[:] suff_search_chars_v = _get_utf16_memoryview(suff_search_chars, True)
+        cdef unsigned int longest_search_length = max(pref_search_lengths + suff_search_lengths) if len(pref_search_lengths + suff_search_lengths) > 0 else 0
         cdef bytes found_char_buf_bytes = (bytes(" " * longest_search_length, "UTF-16"))[2:]  # first two bytes express endianness
         cdef char* found_char_buf = found_char_buf_bytes
-        cdef unsigned int search_chars_v_len = len(search_chars_v), found_char_buf_len = len(found_char_buf_bytes)
+        cdef unsigned int pref_search_chars_v_len = len(pref_search_chars_v), suff_search_chars_v_len = len(suff_search_chars_v)
+        cdef unsigned int found_char_buf_len = len(found_char_buf_bytes)

-        cdef unsigned int num_toks = len(self), num_norm_hashes = len(affix_lengths), num_search_hashes = len(search_lengths)
-        cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, num_norm_hashes + num_search_hashes), dtype="int64")
+        cdef unsigned int num_toks = len(self), num_pref_norm_hashes = len(pref_lengths), num_suff_norm_hashes = len(suff_lengths)
+        cdef unsigned int num_pref_search_hashes = len(pref_search_lengths)
+        cdef unsigned int num_suff_search_hashes = len(suff_search_lengths)
+        cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, num_pref_norm_hashes + num_suff_norm_hashes + num_pref_search_hashes + num_suff_search_hashes), dtype="int64")

         cdef const unsigned char[:] tok_str_v
         cdef unsigned int tok_idx, tok_str_v_len, hash_idx, affix_start, char_comb_len
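The found_char_buf setup above relies on Python's "UTF-16" codec emitting a two-byte byte-order mark, which the [2:] slice removes. A quick standalone check of that behaviour (illustrative only, not code from the commit):

    raw = bytes(" " * 3, "UTF-16")
    assert raw[:2] in (b"\xff\xfe", b"\xfe\xff")  # BOM, endianness-dependent
    assert len(raw[2:]) == 3 * 2                  # two bytes per space character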
@@ -1795,25 +1804,45 @@ cdef class Doc:
             tok_str_v = _get_utf16_memoryview(str_tok_attr, False)
             tok_str_v_len = len(tok_str_v)

-            for hash_idx in range(num_norm_hashes):
-                char_comb_len = affix_lengths[hash_idx] * 2
+            for hash_idx in range(num_pref_norm_hashes):
+                char_comb_len = pref_lengths[hash_idx] * 2
                 if char_comb_len > tok_str_v_len:
                     char_comb_len = tok_str_v_len
-                affix_start = tok_str_v_len - char_comb_len if suffs_not_prefs else 0
+                hashes[tok_idx, hash_idx] = hash32(<void*> &tok_str_v[0], char_comb_len, 0)
+
+            for hash_idx in range(num_pref_norm_hashes, num_pref_norm_hashes + num_suff_norm_hashes):
+                char_comb_len = suff_lengths[hash_idx - num_pref_norm_hashes] * 2
+                if char_comb_len > tok_str_v_len:
+                    char_comb_len = tok_str_v_len
+                affix_start = tok_str_v_len - char_comb_len
                 hashes[tok_idx, hash_idx] = hash32(<void*> &tok_str_v[affix_start], char_comb_len, 0)

             _set_found_char_buf(
-                suffs_not_prefs,
+                False,
                 tok_str_v,
                 tok_str_v_len,
-                search_chars_v,
-                search_chars_v_len,
+                pref_search_chars_v,
+                pref_search_chars_v_len,
                 found_char_buf,
                 found_char_buf_len,
             )

-            for hash_idx in range(num_norm_hashes, num_norm_hashes + num_search_hashes):
-                char_comb_len = search_lengths[hash_idx - num_norm_hashes] * 2
+            for hash_idx in range(num_pref_norm_hashes + num_suff_norm_hashes, num_pref_norm_hashes + num_suff_norm_hashes + num_pref_search_hashes):
+                char_comb_len = pref_search_lengths[hash_idx - (num_pref_norm_hashes + num_suff_norm_hashes)] * 2
+                hashes[tok_idx, hash_idx] = hash32(found_char_buf, char_comb_len, 0)
+
+            _set_found_char_buf(
+                True,
+                tok_str_v,
+                tok_str_v_len,
+                suff_search_chars_v,
+                suff_search_chars_v_len,
+                found_char_buf,
+                found_char_buf_len,
+            )
+
+            for hash_idx in range(num_pref_norm_hashes + num_suff_norm_hashes + num_pref_search_hashes, num_pref_norm_hashes + num_suff_norm_hashes + num_pref_search_hashes + num_suff_search_hashes):
+                char_comb_len = suff_search_lengths[hash_idx - (num_pref_norm_hashes + num_suff_norm_hashes + num_pref_search_hashes)] * 2
                 hashes[tok_idx, hash_idx] = hash32(found_char_buf, char_comb_len, 0)

         return hashes
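The loop bounds above amount to four consecutive column ranges in the output array. An illustrative sketch of that index arithmetic (values chosen to match the good-case test; not code from the commit):

    num_pref = 3          # len(pref_lengths)
    num_suff = 4          # len(suff_lengths)
    num_pref_search = 1   # len(pref_search_lengths)
    num_suff_search = 2   # len(suff_search_lengths)

    pref_cols = range(0, num_pref)
    suff_cols = range(num_pref, num_pref + num_suff)
    pref_search_cols = range(num_pref + num_suff,
                             num_pref + num_suff + num_pref_search)
    suff_search_cols = range(num_pref + num_suff + num_pref_search,
                             num_pref + num_suff + num_pref_search + num_suff_search)
    # Matches hashes[i][8] and hashes[i][9] in the updated tests.
    assert list(suff_search_cols) == [8, 9]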