Get rid of memory views
This commit is contained in:
parent 749da9d348, commit 2552340fb8
@@ -43,26 +43,26 @@ def RichFeatureExtractor(
         forward,
         attrs={
             "case_sensitive": case_sensitive,
-            "pref_lengths": ops.asarray1i(pref_lengths)
+            "pref_lengths": bytes(pref_lengths)
             if pref_lengths is not None
-            else ops.asarray1i([]),
-            "suff_lengths": ops.asarray1i(suff_lengths)
+            else bytes(),
+            "suff_lengths": bytes(suff_lengths)
             if suff_lengths is not None
-            else ops.asarray1i([]),
+            else bytes(),
             "pref_search_1_byte": ps_1byte_ch,
             "pref_search_2_bytes": ps_2byte_ch,
             "pref_search_3_bytes": ps_3byte_ch,
             "pref_search_4_bytes": ps_4byte_ch,
-            "pref_search_lengths": ops.asarray1i(pref_search_lengths)
+            "pref_search_lengths": bytes(pref_search_lengths)
             if pref_search_lengths is not None
-            else ops.asarray1i([]),
+            else bytes(),
             "suff_search_1_byte": ss_1byte_ch,
             "suff_search_2_bytes": ss_2byte_ch,
             "suff_search_3_bytes": ss_3byte_ch,
             "suff_search_4_bytes": ss_4byte_ch,
-            "suff_search_lengths": ops.asarray1i(suff_search_lengths)
+            "suff_search_lengths": bytes(suff_search_lengths)
             if suff_search_lengths is not None
-            else ops.asarray1i([]),
+            else bytes(),
         },
     )

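Note: under the new scheme the length specifications travel as plain Python bytes objects rather than thinc integer arrays. A minimal sketch of the equivalence (illustrative values only, not from the diff):

    # A list of small positive ints becomes a compact byte string:
    pref_lengths = [1, 3, 4]
    assert bytes(pref_lengths) == b"\x01\x03\x04"
    # Each length must fit in 1..255, and zero is unusable as a value,
    # because the Cython side recovers the count with strlen(), which
    # stops at the first zero byte.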
@@ -72,18 +72,18 @@ def forward(
 ) -> Tuple[List[Ints2d], Callable]:
     ops = model.ops
     case_sensitive: bool = model.attrs["case_sensitive"]
-    pref_lengths: Ints1d = model.attrs["pref_lengths"]
-    suff_lengths: Ints1d = model.attrs["suff_lengths"]
+    pref_lengths: bytes = model.attrs["pref_lengths"]
+    suff_lengths: bytes = model.attrs["suff_lengths"]
     ps_1byte_ch: bytes = model.attrs["pref_search_1_byte"]
     ps_2byte_ch: bytes = model.attrs["pref_search_2_bytes"]
     ps_3byte_ch: bytes = model.attrs["pref_search_3_bytes"]
     ps_4byte_ch: bytes = model.attrs["pref_search_4_bytes"]
-    pref_search_lengths: Ints1d = model.attrs["pref_search_lengths"]
+    pref_search_lengths: bytes = model.attrs["pref_search_lengths"]
     ss_1byte_ch: bytes = model.attrs["pref_search_1_byte"]
     ss_2byte_ch: bytes = model.attrs["pref_search_2_bytes"]
     ss_3byte_ch: bytes = model.attrs["pref_search_3_bytes"]
     ss_4byte_ch: bytes = model.attrs["pref_search_4_bytes"]
-    suff_search_lengths: Ints1d = model.attrs["suff_search_lengths"]
+    suff_search_lengths: bytes = model.attrs["suff_search_lengths"]
     features: List[Ints2d] = []
     for doc in docs:
         hashes = doc.get_character_combination_hashes(

@@ -27,4 +27,4 @@ cdef class StringStore:

     cdef const Utf8Str* intern_unicode(self, str py_string)
     cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
-    cdef const unsigned char[:] utf8_view(self, attr_t hash_val)
+    cdef const unsigned char* utf8_ptr(self, attr_t hash_val)

@@ -316,9 +316,9 @@ cdef class StringStore:
             self.keys.push_back(key)
         return value

-    cdef const unsigned char[:] utf8_view(self, attr_t hash_val):
+    cdef const unsigned char* utf8_ptr(self, attr_t hash_val):
         if hash_val == 0:
-            return ""
+            return "".encode("utf-8")
         elif hash_val < len(SYMBOLS_BY_INT):
             return SYMBOLS_BY_INT[hash_val].encode("utf-8")
         cdef Utf8Str* string = <Utf8Str*>self._map.get(hash_val)

@@ -1004,23 +1004,22 @@ def _get_unsigned_32_bit_hash(input: str) -> int:
 @pytest.mark.parametrize("case_sensitive", [True, False])
 def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive):
     doc = en_tokenizer("spaCy✨ and Prodigy")
-    ops = get_current_ops()
     ps1, ps2, ps3, ps4 = get_search_char_byte_arrays("Rp", case_sensitive)
     ss1, ss2, ss3, ss4 = get_search_char_byte_arrays("xx✨rp", case_sensitive)
     hashes = doc.get_character_combination_hashes(
         cs=case_sensitive,
-        p_lengths=ops.asarray1i([1, 3, 4]),
-        s_lengths=ops.asarray1i([2, 3, 4, 5]),
+        p_lengths=bytes((1, 3, 4,)),
+        s_lengths=bytes((2, 3, 4, 5,)),
         ps_1byte_ch=ps1,
         ps_2byte_ch=ps2,
         ps_3byte_ch=ps3,
         ps_4byte_ch=ps4,
-        ps_lengths=ops.asarray1i([2]),
+        ps_lengths=bytes((2,)),
         ss_1byte_ch=ss1,
         ss_2byte_ch=ss2,
         ss_3byte_ch=ss3,
         ss_4byte_ch=ss4,
-        ss_lengths=ops.asarray1i([1, 2]),
+        ss_lengths=bytes((1, 2,)),
     )

     assert hashes[0][0] == _get_unsigned_32_bit_hash("s")

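As a rough pure-Python picture of what the requested prefix lengths select (illustrative only; the real code hashes UTF-8 bytes with hash32 and respects character boundaries):

    word = "spaCy"
    p_lengths = bytes((1, 3, 4))             # iterating a bytes object yields ints
    prefixes = [word[:n] for n in p_lengths]
    assert prefixes == ["s", "spa", "spaC"]  # hashes[0][0] covers "s"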
@@ -1089,22 +1088,21 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive

 def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
     doc = en_tokenizer("spaCy✨ and Prodigy")
-    ops = get_current_ops()
     ps1, ps2, ps3, ps4 = get_search_char_byte_arrays("rp", False)
     hashes = doc.get_character_combination_hashes(
         cs=False,
-        p_lengths=ops.asarray1i([]),
-        s_lengths=ops.asarray1i([2, 3, 4, 5]),
+        p_lengths=bytes(),
+        s_lengths=bytes((2,3,4,5,)),
         ps_1byte_ch=ps1,
         ps_2byte_ch=ps2,
         ps_3byte_ch=ps3,
         ps_4byte_ch=ps4,
-        ps_lengths=ops.asarray1i([2]),
+        ps_lengths=bytes((2,)),
         ss_1byte_ch=bytes(),
         ss_2byte_ch=bytes(),
         ss_3byte_ch=bytes(),
         ss_4byte_ch=bytes(),
-        ss_lengths=ops.asarray1i([]),
+        ss_lengths=bytes(),
     )

@@ -1132,25 +1130,24 @@ def test_get_character_combination_hashes_good_case_partial(en_tokenizer):

 def test_get_character_combination_hashes_copying_in_middle(en_tokenizer):
     doc = en_tokenizer("sp𐌞Cé")
-    ops = get_current_ops()

     for p_length in range(1, 8):
         for s_length in range(1, 8):

             hashes = doc.get_character_combination_hashes(
                 cs=False,
-                p_lengths=ops.asarray1i([p_length]),
-                s_lengths=ops.asarray1i([s_length]),
+                p_lengths=bytes((p_length,)),
+                s_lengths=bytes((s_length,)),
                 ps_1byte_ch=bytes(),
                 ps_2byte_ch=bytes(),
                 ps_3byte_ch=bytes(),
                 ps_4byte_ch=bytes(),
-                ps_lengths=ops.asarray1i([]),
+                ps_lengths=bytes(),
                 ss_1byte_ch=bytes(),
                 ss_2byte_ch=bytes(),
                 ss_3byte_ch=bytes(),
                 ss_4byte_ch=bytes(),
-                ss_lengths=ops.asarray1i([]),
+                ss_lengths=bytes(),
             )

             assert hashes[0][0] == _get_unsigned_32_bit_hash("sp𐌞cé"[:p_length])

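The copying-in-middle test sweeps prefix and suffix lengths across multi-byte characters; the UTF-8 widths involved are:

    word = "sp𐌞cé"
    assert [len(ch.encode("utf-8")) for ch in word] == [1, 1, 4, 1, 2]
    assert len(word.encode("utf-8")) == 9   # 𐌞 takes 4 bytes, é takes 2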
@@ -1160,22 +1157,21 @@ def test_get_character_combination_hashes_copying_in_middle(en_tokenizer):
 @pytest.mark.parametrize("case_sensitive", [True, False])
 def test_get_character_combination_hashes_turkish_i_with_dot(en_tokenizer, case_sensitive):
     doc = en_tokenizer("İ".lower() + "İ")
-    ops = get_current_ops()
     s1, s2, s3, s4 = get_search_char_byte_arrays("İ", case_sensitive)
     hashes = doc.get_character_combination_hashes(
         cs=case_sensitive,
-        p_lengths=ops.asarray1i([1, 2, 3, 4]),
-        s_lengths=ops.asarray1i([1, 2, 3, 4]),
+        p_lengths=bytes((1,2,3,4,)),
+        s_lengths=bytes((1,2,3,4,)),
         ps_1byte_ch=s1,
         ps_2byte_ch=s2,
         ps_3byte_ch=s3,
         ps_4byte_ch=s4,
-        ps_lengths=ops.asarray1i([1, 2, 3, 4]),
+        ps_lengths=bytes((1,2,3,4,)),
         ss_1byte_ch=s1,
         ss_2byte_ch=s2,
         ss_3byte_ch=s3,
         ss_4byte_ch=s4,
-        ss_lengths=ops.asarray1i([1, 2, 3, 4]),
+        ss_lengths=bytes((1,2,3,4,)),
     )

     COMBINING_DOT_ABOVE = b"\xcc\x87".decode("UTF-8")

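Background on why this input is a special case: lowercasing the Turkish dotted capital I grows the string from one character to two, so case-insensitive processing cannot assume equal lengths:

    dotted_i = "İ"               # U+0130, a single character
    lowered = dotted_i.lower()   # "i" plus U+0307 COMBINING DOT ABOVE
    assert len(lowered) == 2
    assert lowered.encode("utf-8") == b"i\xcc\x87"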
@@ -1219,22 +1215,21 @@ def test_get_character_combination_hashes_string_store_spec_cases(en_tokenizer,
     assert len(long_word) > 255
     doc = en_tokenizer(' '.join((symbol, short_word, normal_word, long_word)))
     assert len(doc) == 4
-    ops = get_current_ops()
     ps1, ps2, ps3, ps4 = get_search_char_byte_arrays("E", case_sensitive)
     hashes = doc.get_character_combination_hashes(
         cs=case_sensitive,
-        p_lengths=ops.asarray1i([2]),
-        s_lengths=ops.asarray1i([2]),
+        p_lengths=bytes((2,)),
+        s_lengths=bytes((2,)),
         ps_1byte_ch=ps1,
         ps_2byte_ch=ps2,
         ps_3byte_ch=ps3,
         ps_4byte_ch=ps4,
-        ps_lengths=ops.asarray1i([2]),
+        ps_lengths=bytes((2,)),
         ss_1byte_ch=bytes(),
         ss_2byte_ch=bytes(),
         ss_3byte_ch=bytes(),
         ss_4byte_ch=bytes(),
-        ss_lengths=ops.asarray1i([]),
+        ss_lengths=bytes(),
     )
     assert hashes[0][0] == _get_unsigned_32_bit_hash("FL" if case_sensitive else "fl")
     assert hashes[0][1] == _get_unsigned_32_bit_hash("19")

@@ -1255,19 +1250,18 @@ def test_get_character_combination_hashes_string_store_spec_cases(en_tokenizer,

 def test_character_combination_hashes_empty_lengths(en_tokenizer):
     doc = en_tokenizer("and𐌞")
-    ops = get_current_ops()
-    hashes = doc.get_character_combination_hashes(
+    assert doc.get_character_combination_hashes(
         cs=True,
-        p_lengths=ops.asarray1i([]),
-        s_lengths=ops.asarray1i([]),
+        p_lengths=bytes(),
+        s_lengths=bytes(),
         ps_1byte_ch=bytes(),
         ps_2byte_ch=bytes(),
         ps_3byte_ch=bytes(),
         ps_4byte_ch=bytes(),
-        ps_lengths=ops.asarray1i([]),
+        ps_lengths=bytes(),
         ss_1byte_ch=bytes(),
         ss_2byte_ch=bytes(),
         ss_3byte_ch=bytes(),
         ss_4byte_ch=bytes(),
-        ss_lengths=ops.asarray1i([]),
+        ss_lengths=bytes(),
     ).shape == (1, 0)

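The number of result columns is simply the sum of the four length counts, so all-empty specifications give zero columns; a sketch of the arithmetic used in the Cython body:

    p_lengths = s_lengths = ps_lengths = ss_lengths = bytes()
    num_columns = (len(p_lengths) + len(s_lengths)
                   + len(ps_lengths) + len(ss_lengths))
    assert num_columns == 0            # a one-token doc gives shape (1, 0)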
@@ -39,7 +39,7 @@ cdef int [:,:] _get_lca_matrix(Doc, int start, int end)


 cdef void _set_affix_lengths(
-    const unsigned char[:] tok_str,
+    const unsigned char* tok_str,
     unsigned char* aff_l_buf,
     const int pref_l,
     const int suff_l,

|
@ -47,11 +47,11 @@ cdef void _set_affix_lengths(
|
||||||
|
|
||||||
|
|
||||||
cdef void _search_for_chars(
|
cdef void _search_for_chars(
|
||||||
const unsigned char[:] tok_str,
|
const unsigned char* tok_str,
|
||||||
const unsigned char[:] s_1byte_ch,
|
const unsigned char* s_1byte_ch,
|
||||||
const unsigned char[:] s_2byte_ch,
|
const unsigned char* s_2byte_ch,
|
||||||
const unsigned char[:] s_3byte_ch,
|
const unsigned char* s_3byte_ch,
|
||||||
const unsigned char[:] s_4byte_ch,
|
const unsigned char* s_4byte_ch,
|
||||||
unsigned char* res_buf,
|
unsigned char* res_buf,
|
||||||
int max_res_l,
|
int max_res_l,
|
||||||
unsigned char* l_buf,
|
unsigned char* l_buf,
|
||||||
|
|
|
@@ -178,18 +178,18 @@ class Doc:
         self,
         *,
         cs: bool,
-        p_lengths: Ints1d,
-        s_lengths: Ints1d,
+        p_lengths: bytes,
+        s_lengths: bytes,
         ps_1byte_ch: bytes,
         ps_2byte_ch: bytes,
         ps_3byte_ch: bytes,
         ps_4byte_ch: bytes,
-        ps_lengths: Ints1d,
+        ps_lengths: bytes,
         ss_1byte_ch: bytes,
         ss_2byte_ch: bytes,
         ss_3byte_ch: bytes,
         ss_4byte_ch: bytes,
-        ss_lengths: Ints1d,
+        ss_lengths: bytes,
     ) -> Ints2d: ...
     @staticmethod
     def _get_array_attrs() -> Tuple[Any]: ...

@@ -4,7 +4,7 @@ from typing import Set, List
 cimport cython
 cimport numpy as np
 from cpython cimport array
-from libc.string cimport memcpy, memcmp, memset
+from libc.string cimport memcpy, memcmp, memset, strlen
 from libc.math cimport sqrt
 from libc.stdint cimport int32_t, uint64_t

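Note: swapping len() on a memoryview for strlen() on a raw pointer relies on CPython's guarantee that bytes objects carry a trailing NUL byte. A quick demonstration via ctypes (assumes a platform where libc can be loaded; this snippet is not part of the commit):

    import ctypes, ctypes.util

    libc = ctypes.CDLL(ctypes.util.find_library("c"))
    assert libc.strlen(bytes((1, 3, 4))) == 3   # full length recovered
    assert libc.strlen(bytes((1, 0, 4))) == 1   # an embedded zero truncates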
@@ -1739,18 +1739,18 @@ cdef class Doc:
     def get_character_combination_hashes(self,
         *,
         const bint cs,
-        int[:] p_lengths,
-        int[:] s_lengths,
-        const unsigned char[:] ps_1byte_ch,
-        const unsigned char[:] ps_2byte_ch,
-        const unsigned char[:] ps_3byte_ch,
-        const unsigned char[:] ps_4byte_ch,
-        int[:] ps_lengths,
-        const unsigned char[:] ss_1byte_ch,
-        const unsigned char[:] ss_2byte_ch,
-        const unsigned char[:] ss_3byte_ch,
-        const unsigned char[:] ss_4byte_ch,
-        int[:] ss_lengths,
+        const unsigned char* p_lengths,
+        const unsigned char* s_lengths,
+        const unsigned char* ps_1byte_ch,
+        const unsigned char* ps_2byte_ch,
+        const unsigned char* ps_3byte_ch,
+        const unsigned char* ps_4byte_ch,
+        const unsigned char* ps_lengths,
+        const unsigned char* ss_1byte_ch,
+        const unsigned char* ss_2byte_ch,
+        const unsigned char* ss_3byte_ch,
+        const unsigned char* ss_4byte_ch,
+        const unsigned char* ss_lengths,
     ):
         """
         Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations

@@ -1764,28 +1764,28 @@ cdef class Doc:
         ss_ variables relate to searches starting at the end of the word

         cs: if *False*, hashes are generated based on the lower-case version of each token.
-        p_lengths: an Ints1d specifying the lengths of prefixes to be hashed in ascending order. For example,
-            if *p_lengths==[2, 3]*, the prefixes hashed for "spaCy" would be "sp" and "spa".
-        s_lengths: an Ints1d specifying the lengths of suffixes to be hashed in ascending order. For example,
-            if *s_lengths==[2, 3]* and *cs == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy".
+        p_lengths: an array of single-byte values specifying the lengths of prefixes to be hashed in ascending order.
+            For example, if *p_lengths==[2, 3]*, the prefixes hashed for "spaCy" would be "sp" and "spa".
+        s_lengths: an array of single-byte values specifying the lengths of suffixes to be hashed in ascending order.
+            For example, if *s_lengths==[2, 3]* and *cs == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy".
         ps_<n>byte_ch: a byte array containing in order n-byte-wide characters to search for within each token,
             starting at the beginning.
-        ps_lengths: an Ints1d specifying the lengths of search results (from the beginning) to be hashed in ascending order.
-            For example, if *ps_lengths==[1, 2]*, *ps_search=="aC" and *cs==False*, the searched strings hashed for
-            "spaCy" would be "a" and "ac".
+        ps_lengths: an array of single-byte values specifying the lengths of search results (from the beginning) to be hashed
+            in ascending order. For example, if *ps_lengths==[1, 2]*, *ps_search=="aC" and *cs==False*, the searched strings
+            hashed for "spaCy" would be "a" and "ac".
         ss_<n>byte_ch: a byte array containing in order n-byte-wide characters to search for within each token,
             starting at the end.
-        ss_lengths: an integer list specifying the lengths of search results (from the end) to be hashed in ascending order.
-            For example, if *ss_lengths==[1, 2]*, *ss_search=="aC" and *cs==False*, the searched strings hashed for
-            "spaCy" would be "c" and "ca".
+        ss_lengths: an array of single-byte values specifying the lengths of search results (from the end) to be hashed
+            in ascending order. For example, if *ss_lengths==[1, 2]*, *ss_search=="aC" and *cs==False*, the searched strings
+            hashed for "spaCy" would be "c" and "ca".
         """

         # Define the result array and work out what is used for what in axis 1
         cdef int num_toks = len(self)
-        cdef int p_h_num = len(p_lengths)
-        cdef int s_h_num = len(s_lengths), s_h_end = p_h_num + s_h_num
-        cdef int ps_h_num = len(ps_lengths), ps_h_end = s_h_end + ps_h_num
-        cdef int ss_h_num = len(ss_lengths), ss_h_end = ps_h_end + ss_h_num
+        cdef int p_h_num = strlen(<char*> p_lengths)
+        cdef int s_h_num = strlen(<char*> s_lengths), s_h_end = p_h_num + s_h_num
+        cdef int ps_h_num = strlen(<char*> ps_lengths), ps_h_end = s_h_end + ps_h_num
+        cdef int ss_h_num = strlen(<char*> ss_lengths), ss_h_end = ps_h_end + ss_h_num
         cdef np.ndarray[np.int64_t, ndim=2] hashes
         hashes = numpy.empty((num_toks, ss_h_end), dtype="int64")

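Taken together, p_h_num, s_h_end, ps_h_end and ss_h_end partition axis 1 of the result. A pure-Python sketch with the lengths used in the good-case test above:

    p_lengths, s_lengths = bytes((1, 3, 4)), bytes((2, 3, 4, 5))
    ps_lengths, ss_lengths = bytes((2,)), bytes((1, 2))
    p_h_num = len(p_lengths)               # columns 0-2: prefix hashes
    s_h_end = p_h_num + len(s_lengths)     # columns 3-6: suffix hashes
    ps_h_end = s_h_end + len(ps_lengths)   # column 7: prefix-search hash
    ss_h_end = ps_h_end + len(ss_lengths)  # columns 8-9: suffix-search hashes
    assert ss_h_end == 10                  # hashes.shape == (num_toks, 10)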
@@ -1804,35 +1804,29 @@ cdef class Doc:
         cdef unsigned char* ss_res_buf = <unsigned char*> mem.alloc(ss_max_l, 4)
         cdef unsigned char* ss_l_buf = <unsigned char*> mem.alloc(ss_max_l, 1)

-        # Define memory views on length arrays
-        cdef int[:] p_lengths_v = p_lengths
-        cdef int[:] s_lengths_v = s_lengths
-        cdef int[:] ps_lengths_v = ps_lengths
-        cdef int[:] ss_lengths_v = ss_lengths
-
         # Define working variables
         cdef TokenC tok_c
         cdef int tok_i, offset
         cdef uint64_t hash_val = 0
         cdef attr_t num_tok_attr
-        cdef const unsigned char[:] tok_str
+        cdef const unsigned char* tok_str

         for tok_i in range(num_toks):
             tok_c = self.c[tok_i]
             num_tok_attr = tok_c.lex.orth if cs else tok_c.lex.lower
-            tok_str = self.vocab.strings.utf8_view(num_tok_attr)
+            tok_str = self.vocab.strings.utf8_ptr(num_tok_attr)

             if aff_l > 0:
                 _set_affix_lengths(tok_str, aff_l_buf, p_max_l, s_max_l)

                 for hash_idx in range(p_h_num):
-                    offset = aff_l_buf[p_lengths_v[hash_idx] - 1]
+                    offset = aff_l_buf[p_lengths[hash_idx] - 1]
                     if offset > 0:
                         hash_val = hash32(<void*> &tok_str[0], offset, 0)
                     hashes[tok_i, hash_idx] = hash_val

                 for hash_idx in range(p_h_num, s_h_end):
-                    offset = aff_l_buf[s_lengths_v[hash_idx - p_h_num] + p_max_l - 1]
+                    offset = aff_l_buf[s_lengths[hash_idx - p_h_num] + p_max_l - 1]
                     if offset > 0:
                         hash_val = hash32(<void*> &tok_str[len(tok_str) - offset], offset, 0)
                     hashes[tok_i, hash_idx] = hash_val

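The aff_l_buf lookups map a character count to a byte offset, so hashing a k-character prefix only needs the first aff_l_buf[k - 1] bytes of tok_str. A pure-Python model of the buffer contents (illustrative):

    word = "sp𐌞cé"
    aff_l_buf = [len(word[:k].encode("utf-8")) for k in range(1, len(word) + 1)]
    assert aff_l_buf == [1, 2, 6, 7, 9]
    # the 3-character prefix "sp𐌞" spans aff_l_buf[3 - 1] == 6 bytes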
|
@ -1841,7 +1835,7 @@ cdef class Doc:
|
||||||
_search_for_chars(tok_str, ps_1byte_ch, ps_2byte_ch, ps_3byte_ch, ps_4byte_ch, ps_res_buf, ps_max_l, ps_l_buf, False)
|
_search_for_chars(tok_str, ps_1byte_ch, ps_2byte_ch, ps_3byte_ch, ps_4byte_ch, ps_res_buf, ps_max_l, ps_l_buf, False)
|
||||||
hash_val = 0
|
hash_val = 0
|
||||||
for hash_idx in range(s_h_end, ps_h_end):
|
for hash_idx in range(s_h_end, ps_h_end):
|
||||||
offset = ps_l_buf[ps_lengths_v[hash_idx - s_h_end] - 1]
|
offset = ps_l_buf[ps_lengths[hash_idx - s_h_end] - 1]
|
||||||
if offset > 0:
|
if offset > 0:
|
||||||
hash_val = hash32(ps_res_buf, offset, 0)
|
hash_val = hash32(ps_res_buf, offset, 0)
|
||||||
hashes[tok_i, hash_idx] = hash_val
|
hashes[tok_i, hash_idx] = hash_val
|
||||||
|
@@ -1850,7 +1844,7 @@ cdef class Doc:
             _search_for_chars(tok_str, ss_1byte_ch, ss_2byte_ch, ss_3byte_ch, ss_4byte_ch, ss_res_buf, ss_max_l, ss_l_buf, True)
             hash_val = 0
             for hash_idx in range(ps_h_end, ss_h_end):
-                offset = ss_l_buf[ss_lengths_v[hash_idx - ps_h_end] - 1]
+                offset = ss_l_buf[ss_lengths[hash_idx - ps_h_end] - 1]
                 if offset > 0:
                     hash_val = hash32(ss_res_buf, offset, 0)
                 hashes[tok_i, hash_idx] = hash_val

@@ -2039,7 +2033,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):

 @cython.boundscheck(False) # Deactivate bounds checking
 cdef void _set_affix_lengths(
-    const unsigned char[:] tok_str,
+    const unsigned char* tok_str,
     unsigned char* aff_l_buf,
     const int pref_l,
     const int suff_l,

@@ -2054,14 +2048,17 @@ cdef void _set_affix_lengths(
     pref_l: the number of characters to process at the beginning of the word.
     suff_l: the number of characters to process at the end of the word.
     """
-    cdef int tok_str_idx = 1, aff_l_buf_idx = 0, tok_str_l = len(tok_str)
+    cdef int tok_str_idx = 1, aff_l_buf_idx = 0, tok_str_l = strlen(<char*> tok_str)

     while aff_l_buf_idx < pref_l:
-        if tok_str_idx == len(tok_str) or ((tok_str[tok_str_idx] & 0xc0) != 0x80): # not a continuation character
+        if (tok_str_idx == strlen(<char*> tok_str)
+            or
+            ((tok_str[tok_str_idx] & 0xc0) != 0x80) # not a continuation character
+        ):
             aff_l_buf[aff_l_buf_idx] = tok_str_idx
             aff_l_buf_idx += 1
         tok_str_idx += 1
-        if tok_str_idx > len(tok_str):
+        if tok_str_idx > tok_str_l:
             break

     if aff_l_buf_idx < pref_l:

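The boundary check uses the UTF-8 rule that continuation bytes match the bit pattern 10xxxxxx. A compact Python restatement (the helper name is hypothetical):

    def utf8_char_starts(utf8: bytes):
        # a byte begins a new character unless (b & 0xC0) == 0x80
        return [i for i, b in enumerate(utf8) if (b & 0xC0) != 0x80]

    assert utf8_char_starts("sp𐌞cé".encode("utf-8")) == [0, 1, 2, 6, 7]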
@@ -2082,11 +2079,11 @@ cdef void _set_affix_lengths(

 @cython.boundscheck(False) # Deactivate bounds checking
 cdef void _search_for_chars(
-    const unsigned char[:] tok_str,
-    const unsigned char[:] s_1byte_ch,
-    const unsigned char[:] s_2byte_ch,
-    const unsigned char[:] s_3byte_ch,
-    const unsigned char[:] s_4byte_ch,
+    const unsigned char* tok_str,
+    const unsigned char* s_1byte_ch,
+    const unsigned char* s_2byte_ch,
+    const unsigned char* s_3byte_ch,
+    const unsigned char* s_4byte_ch,
     unsigned char* res_buf,
     int max_res_l,
     unsigned char* l_buf,

@@ -2106,9 +2103,9 @@ cdef void _search_for_chars(
     The calling code ensures that lengths greater than 255 cannot occur.
     suffs_not_prefs: if *True*, searching starts from the end of the word; if *False*, from the beginning.
     """
-    cdef int tok_str_l = len(tok_str), res_buf_idx = 0, l_buf_idx = 0, ch_wdth, tok_start_idx, search_char_idx
+    cdef int tok_str_l = strlen(<char*> tok_str), res_buf_idx = 0, l_buf_idx = 0, ch_wdth, tok_start_idx, search_char_idx
     cdef int search_chars_l
-    cdef const unsigned char[:] search_chars
+    cdef const unsigned char* search_chars

     cdef int last_tok_str_idx = tok_str_l if suffs_not_prefs else 0
     cdef int this_tok_str_idx = tok_str_l - 1 if suffs_not_prefs else 1

@@ -2130,7 +2127,7 @@ cdef void _search_for_chars(
                 search_chars = s_3byte_ch
             else:
                 search_chars = s_4byte_ch
-            search_chars_l = len(search_chars)
+            search_chars_l = strlen(<char*> search_chars)
             tok_start_idx = this_tok_str_idx if suffs_not_prefs else last_tok_str_idx

             search_char_idx = 0
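For context, the four width-grouped search strings these branches select between are produced by the tests' get_search_char_byte_arrays helper. A rough reference for the grouping idea (an assumption about the helper's behavior, not its actual code):

    def group_search_chars_by_width(chars: str, case_sensitive: bool):
        if not case_sensitive:
            chars = chars.lower()
        groups = {1: b"", 2: b"", 3: b"", 4: b""}
        for ch in sorted(set(chars)):          # deduplicate, keep stable order
            encoded = ch.encode("utf-8")
            groups[len(encoded)] += encoded    # bucket by UTF-8 byte width
        return groups[1], groups[2], groups[3], groups[4]

    s1, s2, s3, s4 = group_search_chars_by_width("xx✨rp", False)
    assert s1 == b"prx" and s3 == "✨".encode("utf-8")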