Everything working after refactoring

richardpaulhudson 2022-11-04 09:33:06 +01:00
parent 5d210a0f3b
commit 7f1873ad81
4 changed files with 48 additions and 61 deletions

View File

@@ -199,7 +199,7 @@ def _verify_rich_config_group(
     if lengths is not None or rows is not None:
         if is_search_char_group and (search_chars is None or len(search_chars) == 0):
             raise ValueError(Errors.E1047.format(label=label))
-        if len(search_chars) > 63:
+        if search_chars is not None and len(search_chars) > 63:
            raise ValueError(Errors.E1048.format(label=label))
         if lengths is None or rows is None:
             raise ValueError(Errors.E1047.format(label=label))
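
The one-line fix above closes a crash path: for a group that defines lengths or rows but no search characters, the old code called len(None). A minimal sketch of the failure mode, using simplified names from the diff:

    search_chars = None  # group defines lengths/rows but no search characters

    # Old check: raises TypeError: object of type 'NoneType' has no len()
    #     if len(search_chars) > 63: ...

    # New check: the None test short-circuits, so len() is never reached.
    if search_chars is not None and len(search_chars) > 63:
        raise ValueError("E1048: more than 63 search characters")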
@@ -262,6 +262,12 @@ def RichMultiHashEmbed(
     plural noun does not become `a` if it is the third or fourth vowel from the
     end of the word.
 
+    There are a few rare situations where a single graphical character is
+    expressed as more than one Unicode character, e.g. *i* followed by a
+    combining dot above when representing the lower-case form of the Turkish
+    letter *İ*. Such situations are supported, but the lengths of prefixes,
+    suffixes and character search results may need to be increased accordingly.
+
     All lengths must be specified in ascending order.
 
     width (int): The output width. Also used as the width of the embedding tables.
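
The docstring's Turkish example is easy to verify in plain Python; this snippet is purely illustrative and independent of the diff:

    # Lower-casing the Turkish capital dotted I yields two Unicode characters:
    # 'i' (U+0069) followed by a combining dot above (U+0307).
    s = "İ".lower()
    print(len(s))                      # 2
    print([hex(ord(ch)) for ch in s])  # ['0x69', '0x307']
    # The UTF-8 encoding is three bytes, which is why prefix/suffix and
    # search-result lengths may need to be raised to cover such sequences.
    print(len(s.encode("utf8")))       # 3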

View File

@@ -2,7 +2,7 @@ from typing import List, Optional, Callable, Tuple
 from spacy.util import get_search_char_byte_arrays
 # from ..util import get_arrays_for_search_chars
-from thinc.types import Ints1d, Ints2d
+from thinc.types import Ints2d
 from thinc.api import Model, registry, get_current_ops
 from ..tokens import Doc
@@ -21,46 +21,35 @@ def RichFeatureExtractor(
 ) -> Model[List[Doc], List[Ints2d]]:
     ops = get_current_ops()
     if pref_search_chars is not None:
-        (
-            ps_1byte_ch,
-            ps_2byte_ch,
-            ps_3byte_ch,
-            ps_4byte_ch,
-        ) = get_search_char_byte_arrays(pref_search_chars, case_sensitive)
+        ps_search_chars, ps_width_offsets = get_search_char_byte_arrays(pref_search_chars, case_sensitive)
     else:
-        ps_1byte_ch = ps_2byte_ch = ps_3byte_ch = ps_4byte_ch = bytes()
+        ps_search_chars = bytes()
+        ps_width_offsets = bytes()
     if suff_search_chars is not None:
-        (
-            ss_1byte_ch,
-            ss_2byte_ch,
-            ss_3byte_ch,
-            ss_4byte_ch,
-        ) = get_search_char_byte_arrays(suff_search_chars, case_sensitive)
+        ss_search_chars, ss_width_offsets = get_search_char_byte_arrays(suff_search_chars, case_sensitive)
     else:
-        ss_1byte_ch = ss_2byte_ch = ss_3byte_ch = ss_4byte_ch = bytes()
+        ss_search_chars = bytes()
+        ss_width_offsets = bytes()
     return Model(
         "extract_character_combination_hashes",
         forward,
         attrs={
             "case_sensitive": case_sensitive,
-            "pref_lengths": bytes(pref_lengths)
+            "p_lengths": bytes(pref_lengths)
             if pref_lengths is not None
             else bytes(),
-            "suff_lengths": bytes(suff_lengths)
+            "s_lengths": bytes(suff_lengths)
             if suff_lengths is not None
             else bytes(),
-            "pref_search_1_byte": ps_1byte_ch,
-            "pref_search_2_bytes": ps_2byte_ch,
-            "pref_search_3_bytes": ps_3byte_ch,
-            "pref_search_4_bytes": ps_4byte_ch,
-            "pref_search_lengths": bytes(pref_search_lengths)
+            "ps_search_chars": ps_search_chars,
+            "ps_width_offsets": ps_width_offsets,
+            "ps_lengths": bytes(pref_search_lengths)
             if pref_search_lengths is not None
             else bytes(),
-            "suff_search_1_byte": ss_1byte_ch,
-            "suff_search_2_bytes": ss_2byte_ch,
-            "suff_search_3_bytes": ss_3byte_ch,
-            "suff_search_4_bytes": ss_4byte_ch,
-            "suff_search_lengths": bytes(suff_search_lengths)
+            "ss_search_chars": ss_search_chars,
+            "ss_width_offsets": ss_width_offsets,
+            "ss_lengths": bytes(suff_search_lengths)
             if suff_search_lengths is not None
             else bytes(),
         },
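
This hunk shows only the call sites of get_search_char_byte_arrays, not its body. A hypothetical sketch of the new two-value contract that is consistent with those call sites (the function name is real, but the grouping behaviour and offset layout below are assumptions, not the actual spacy.util implementation):

    def get_search_char_byte_arrays_sketch(search_chars: str, case_sensitive: bool):
        """Hypothetical stand-in returning (search_chars_utf8, width_offsets)."""
        if not case_sensitive:
            search_chars = search_chars.lower()
        # Deduplicate the characters and sort their UTF-8 encodings by byte width.
        encoded = sorted({ch.encode("utf8") for ch in search_chars}, key=lambda b: (len(b), b))
        concatenated = b"".join(encoded)
        # Record where each width group (1-4 bytes) starts in the buffer, plus a
        # final end offset, giving five entries in total.
        offsets, pos = [], 0
        for width in (1, 2, 3, 4):
            offsets.append(pos)
            pos += sum(len(b) for b in encoded if len(b) == width)
        offsets.append(pos)
        return concatenated, bytes(offsets)

    print(get_search_char_byte_arrays_sketch("eéaá", True))
    # (b'ae\xc3\xa1\xc3\xa9', b'\x00\x02\x06\x06\x06')

Whatever the exact layout, collapsing four per-width buffers into a single buffer plus an offsets table shrinks the model's attrs surface and matches the shortened ps_/ss_ key names introduced above.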
@@ -72,36 +61,28 @@ def forward(
 ) -> Tuple[List[Ints2d], Callable]:
     ops = model.ops
     case_sensitive: bool = model.attrs["case_sensitive"]
-    pref_lengths: bytes = model.attrs["pref_lengths"]
-    suff_lengths: bytes = model.attrs["suff_lengths"]
-    ps_1byte_ch: bytes = model.attrs["pref_search_1_byte"]
-    ps_2byte_ch: bytes = model.attrs["pref_search_2_bytes"]
-    ps_3byte_ch: bytes = model.attrs["pref_search_3_bytes"]
-    ps_4byte_ch: bytes = model.attrs["pref_search_4_bytes"]
-    pref_search_lengths: bytes = model.attrs["pref_search_lengths"]
-    ss_1byte_ch: bytes = model.attrs["pref_search_1_byte"]
-    ss_2byte_ch: bytes = model.attrs["pref_search_2_bytes"]
-    ss_3byte_ch: bytes = model.attrs["pref_search_3_bytes"]
-    ss_4byte_ch: bytes = model.attrs["pref_search_4_bytes"]
-    suff_search_lengths: bytes = model.attrs["suff_search_lengths"]
+    p_lengths: bytes = model.attrs["p_lengths"]
+    s_lengths: bytes = model.attrs["s_lengths"]
+    ps_search_chars: bytes = model.attrs["ps_search_chars"]
+    ps_width_offsets: bytes = model.attrs["ps_width_offsets"]
+    ps_lengths: bytes = model.attrs["ps_lengths"]
+    ss_search_chars: bytes = model.attrs["ss_search_chars"]
+    ss_width_offsets: bytes = model.attrs["ss_width_offsets"]
+    ss_lengths: bytes = model.attrs["ss_lengths"]
     features: List[Ints2d] = []
     for doc in docs:
         hashes = doc.get_character_combination_hashes(
             cs=case_sensitive,
-            p_lengths=pref_lengths,
-            s_lengths=suff_lengths,
-            ps_1byte_ch=ps_1byte_ch,
-            ps_2byte_ch=ps_2byte_ch,
-            ps_3byte_ch=ps_3byte_ch,
-            ps_4byte_ch=ps_4byte_ch,
-            ps_lengths=pref_search_lengths,
-            ss_1byte_ch=ss_1byte_ch,
-            ss_2byte_ch=ss_2byte_ch,
-            ss_3byte_ch=ss_3byte_ch,
-            ss_4byte_ch=ss_4byte_ch,
-            ss_lengths=suff_search_lengths,
+            p_lengths=p_lengths,
+            s_lengths=s_lengths,
+            ps_search_chars=ps_search_chars,
+            ps_width_offsets=ps_width_offsets,
+            ps_lengths=ps_lengths,
+            ss_search_chars=ss_search_chars,
+            ss_width_offsets=ss_width_offsets,
+            ss_lengths=ss_lengths,
         )
-        features.append(ops.asarray2i(hashes))
+        features.append(ops.asarray2i(hashes, dtype="uint64"))
 
     backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
     return features, backprop
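
For orientation, a usage sketch of the refactored layer. The constructor parameters come from this diff; the import path is an assumption, and the snippet only runs on this branch, where Doc.get_character_combination_hashes exists:

    import spacy
    from spacy.ml.richfeatureextractor import RichFeatureExtractor  # assumed path

    nlp = spacy.blank("en")
    docs = [nlp("small example text")]

    # Hash 2- and 3-character prefixes/suffixes, plus the first one or two
    # vowels found when scanning each token.
    extractor = RichFeatureExtractor(
        case_sensitive=False,
        pref_lengths=[2, 3],
        suff_lengths=[2, 3],
        pref_search_chars="aeiou",
        pref_search_lengths=[1, 2],
    )
    features, backprop = extractor(docs, is_train=False)
    print(features[0].shape)  # (n_tokens, hashes_per_token)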

View File

@@ -73,7 +73,7 @@ cdef int _write_hashes(
     const unsigned char* aff_l_buf,
     const unsigned char* offset_buf,
     const int res_buf_last,
-    np.uint32_t* hashes_ptr,
+    np.uint64_t* hashes_ptr,
 ) nogil

View File

@@ -1803,15 +1803,15 @@ cdef class Doc:
         cdef unsigned char* ss_res_buf = <unsigned char*> mem.alloc(ss_max_l, 4)
         cdef unsigned char* ss_l_buf = <unsigned char*> mem.alloc(ss_max_l, 1)
         cdef int doc_l = self.length, total_hashes = doc_l * hashes_per_tok
-        cdef np.uint32_t* hashes_ptr = <np.uint32_t*> mem.alloc(
-            total_hashes, sizeof(np.uint32_t))
+        cdef np.uint64_t* hashes_ptr = <np.uint64_t*> mem.alloc(
+            total_hashes, sizeof(np.uint64_t))
 
         # Define working variables
         cdef TokenC tok_c
         cdef int tok_i, tok_str_l
         cdef attr_t num_tok_attr
         cdef const unsigned char* tok_str
-        cdef np.uint32_t* w_hashes_ptr = hashes_ptr
+        cdef np.uint64_t* w_hashes_ptr = hashes_ptr
 
         for tok_i in range(doc_l):
             tok_c = self.c[tok_i]
@@ -1837,9 +1837,9 @@ cdef class Doc:
                 ss_max_l, True, ss_res_buf, ss_l_buf)
             w_hashes_ptr += _write_hashes(ss_res_buf, ss_lengths, ss_l_buf, 0, w_hashes_ptr)
 
-        cdef np.ndarray[np.uint32_t, ndim=2] hashes = numpy.empty(
-            (doc_l, hashes_per_tok), dtype="uint32")
-        memcpy(hashes.data, hashes_ptr, total_hashes * sizeof(np.uint32_t))
+        cdef np.ndarray[np.uint64_t, ndim=2] hashes = numpy.empty(
+            (doc_l, hashes_per_tok), dtype="uint64")
+        memcpy(hashes.data, hashes_ptr, total_hashes * sizeof(np.uint64_t))
 
         return hashes
@@ -2173,7 +2173,7 @@ cdef int _write_hashes(
     const unsigned char* aff_l_buf,
     const unsigned char* offset_buf,
     const int res_buf_last,
-    np.uint32_t* hashes_ptr,
+    np.uint64_t* hashes_ptr,
 ) nogil:
     """ Write FNV1A hashes for a token/rich property group combination.