Everything working after refactoring

richardpaulhudson 2022-11-04 09:33:06 +01:00
parent 5d210a0f3b
commit 7f1873ad81
4 changed files with 48 additions and 61 deletions

View File

@@ -199,7 +199,7 @@ def _verify_rich_config_group(
     if lengths is not None or rows is not None:
         if is_search_char_group and (search_chars is None or len(search_chars) == 0):
             raise ValueError(Errors.E1047.format(label=label))
-        if len(search_chars) > 63:
+        if search_chars is not None and len(search_chars) > 63:
            raise ValueError(Errors.E1048.format(label=label))
         if lengths is None or rows is None:
             raise ValueError(Errors.E1047.format(label=label))
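
The added `is not None` guard fixes a latent crash rather than tightening validation: the earlier E1047 check only fires when `is_search_char_group` is true, so a group that defined lengths or rows without search characters previously reached `len(None)`. A minimal illustration of the behaviour before and after (the error message is a placeholder, not spaCy's E1048 text):

    search_chars = None

    # Old check: len(search_chars) > 63 raises
    # TypeError: object of type 'NoneType' has no len()

    # New check: short-circuits safely when no search characters are set.
    if search_chars is not None and len(search_chars) > 63:
        raise ValueError("too many search characters")
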
@@ -262,6 +262,12 @@ def RichMultiHashEmbed(
     plural noun does not become `a` if it is the third or fourth vowel from the
     end of the word.
 
+    There are a few rare situations where a graphical character is expressed
+    using more than one Unicode code point, e.g. *i* followed by a combining
+    dot when representing the lower-case form of the Turkish letter *İ*. Such
+    situations are supported, but the lengths of prefixes, suffixes and
+    character search results may need to be increased accordingly.
+
     All lengths must be specified in ascending order.
 
     width (int): The output width. Also used as the width of the embedding tables.
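
The Turkish example is easy to verify in plain Python: the lower-case form of U+0130 really is two code points, which is why affix lengths may need a little headroom:

    assert "İ".lower() == "i\u0307"  # LATIN SMALL LETTER I + COMBINING DOT ABOVE
    assert len("İ") == 1 and len("İ".lower()) == 2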

View File

@@ -2,7 +2,7 @@ from typing import List, Optional, Callable, Tuple
 from spacy.util import get_search_char_byte_arrays
 # from ..util import get_arrays_for_search_chars
-from thinc.types import Ints1d, Ints2d
+from thinc.types import Ints2d
 from thinc.api import Model, registry, get_current_ops
 from ..tokens import Doc
@@ -21,46 +21,35 @@ def RichFeatureExtractor(
 ) -> Model[List[Doc], List[Ints2d]]:
     ops = get_current_ops()
     if pref_search_chars is not None:
-        (
-            ps_1byte_ch,
-            ps_2byte_ch,
-            ps_3byte_ch,
-            ps_4byte_ch,
-        ) = get_search_char_byte_arrays(pref_search_chars, case_sensitive)
+        ps_search_chars, ps_width_offsets = get_search_char_byte_arrays(pref_search_chars, case_sensitive)
     else:
-        ps_1byte_ch = ps_2byte_ch = ps_3byte_ch = ps_4byte_ch = bytes()
+        ps_search_chars = bytes()
+        ps_width_offsets = bytes()
     if suff_search_chars is not None:
-        (
-            ss_1byte_ch,
-            ss_2byte_ch,
-            ss_3byte_ch,
-            ss_4byte_ch,
-        ) = get_search_char_byte_arrays(suff_search_chars, case_sensitive)
+        ss_search_chars, ss_width_offsets = get_search_char_byte_arrays(suff_search_chars, case_sensitive)
     else:
-        ss_1byte_ch = ss_2byte_ch = ss_3byte_ch = ss_4byte_ch = bytes()
+        ss_search_chars = bytes()
+        ss_width_offsets = bytes()
     return Model(
         "extract_character_combination_hashes",
         forward,
         attrs={
             "case_sensitive": case_sensitive,
-            "pref_lengths": bytes(pref_lengths)
+            "p_lengths": bytes(pref_lengths)
             if pref_lengths is not None
             else bytes(),
-            "suff_lengths": bytes(suff_lengths)
+            "s_lengths": bytes(suff_lengths)
             if suff_lengths is not None
             else bytes(),
-            "pref_search_1_byte": ps_1byte_ch,
-            "pref_search_2_bytes": ps_2byte_ch,
-            "pref_search_3_bytes": ps_3byte_ch,
-            "pref_search_4_bytes": ps_4byte_ch,
-            "pref_search_lengths": bytes(pref_search_lengths)
+            "ps_search_chars": ps_search_chars,
+            "ps_width_offsets": ps_width_offsets,
+            "ps_lengths": bytes(pref_search_lengths)
             if pref_search_lengths is not None
             else bytes(),
-            "suff_search_1_byte": ss_1byte_ch,
-            "suff_search_2_bytes": ss_2byte_ch,
-            "suff_search_3_bytes": ss_3byte_ch,
-            "suff_search_4_bytes": ss_4byte_ch,
-            "suff_search_lengths": bytes(suff_search_lengths)
+            "ss_search_chars": ss_search_chars,
+            "ss_width_offsets": ss_width_offsets,
+            "ss_lengths": bytes(suff_search_lengths)
             if suff_search_lengths is not None
             else bytes(),
         },
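
The refactoring collapses the four per-width byte arrays into a single packed array plus width offsets. The diff does not show `get_search_char_byte_arrays` itself, so the following is only a sketch of the contract the new attrs suggest: search characters UTF-8-encoded and grouped by encoded width, with the offsets recording where each width group starts. The 63-character cap enforced via E1048 above would, plausibly, keep every offset within a single byte (63 × 4 = 252):

    def sketch_search_char_byte_arrays(search_chars: str, case_sensitive: bool):
        # Illustrative sketch only, not spaCy's actual implementation.
        if not case_sensitive:
            search_chars = search_chars.lower()
        chars = sorted(set(search_chars), key=lambda c: (len(c.encode("utf-8")), c))
        packed = b"".join(c.encode("utf-8") for c in chars)
        widths = [len(c.encode("utf-8")) for c in chars]
        offsets, pos = [], 0
        for width in (1, 2, 3, 4):
            offsets.append(pos)              # where this width group starts
            pos += widths.count(width) * width
        offsets.append(pos)                  # total packed length
        return packed, bytes(offsets)
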
@@ -72,36 +61,28 @@ def forward(
 ) -> Tuple[List[Ints2d], Callable]:
     ops = model.ops
     case_sensitive: bool = model.attrs["case_sensitive"]
-    pref_lengths: bytes = model.attrs["pref_lengths"]
-    suff_lengths: bytes = model.attrs["suff_lengths"]
-    ps_1byte_ch: bytes = model.attrs["pref_search_1_byte"]
-    ps_2byte_ch: bytes = model.attrs["pref_search_2_bytes"]
-    ps_3byte_ch: bytes = model.attrs["pref_search_3_bytes"]
-    ps_4byte_ch: bytes = model.attrs["pref_search_4_bytes"]
-    pref_search_lengths: bytes = model.attrs["pref_search_lengths"]
-    ss_1byte_ch: bytes = model.attrs["pref_search_1_byte"]
-    ss_2byte_ch: bytes = model.attrs["pref_search_2_bytes"]
-    ss_3byte_ch: bytes = model.attrs["pref_search_3_bytes"]
-    ss_4byte_ch: bytes = model.attrs["pref_search_4_bytes"]
-    suff_search_lengths: bytes = model.attrs["suff_search_lengths"]
+    p_lengths: bytes = model.attrs["p_lengths"]
+    s_lengths: bytes = model.attrs["s_lengths"]
+    ps_search_chars: bytes = model.attrs["ps_search_chars"]
+    ps_width_offsets: bytes = model.attrs["ps_width_offsets"]
+    ps_lengths: bytes = model.attrs["ps_lengths"]
+    ss_search_chars: bytes = model.attrs["ss_search_chars"]
+    ss_width_offsets: bytes = model.attrs["ss_width_offsets"]
+    ss_lengths: bytes = model.attrs["ss_lengths"]
 
     features: List[Ints2d] = []
     for doc in docs:
         hashes = doc.get_character_combination_hashes(
             cs=case_sensitive,
-            p_lengths=pref_lengths,
-            s_lengths=suff_lengths,
-            ps_1byte_ch=ps_1byte_ch,
-            ps_2byte_ch=ps_2byte_ch,
-            ps_3byte_ch=ps_3byte_ch,
-            ps_4byte_ch=ps_4byte_ch,
-            ps_lengths=pref_search_lengths,
-            ss_1byte_ch=ss_1byte_ch,
-            ss_2byte_ch=ss_2byte_ch,
-            ss_3byte_ch=ss_3byte_ch,
-            ss_4byte_ch=ss_4byte_ch,
-            ss_lengths=suff_search_lengths,
+            p_lengths=p_lengths,
+            s_lengths=s_lengths,
+            ps_search_chars=ps_search_chars,
+            ps_width_offsets=ps_width_offsets,
+            ps_lengths=ps_lengths,
+            ss_search_chars=ss_search_chars,
+            ss_width_offsets=ss_width_offsets,
+            ss_lengths=ss_lengths,
         )
-        features.append(ops.asarray2i(hashes))
+        features.append(ops.asarray2i(hashes, dtype="uint64"))
     backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
     return features, backprop
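
Note in passing that the old `forward` read the *prefix* attrs (`pref_search_1_byte` etc.) into the `ss_*` suffix variables, a copy-paste bug that disappears with the renaming. For orientation, a hedged usage sketch of the refactored layer; the keyword names come from the diff, but the exact defaults and registration are not shown, and `predict` is just the standard thinc `Model` API:

    extractor = RichFeatureExtractor(
        case_sensitive=False,
        pref_lengths=[1, 3],
        suff_lengths=[2, 4],
        suff_search_chars="aeiou",
        suff_search_lengths=[1, 2],
    )
    # docs: a list of spaCy Doc objects prepared elsewhere.
    hash_arrays = extractor.predict(docs)  # one uint64 array per Doc, shape (n_tokens, n_hashes)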

View File

@@ -73,7 +73,7 @@ cdef int _write_hashes(
     const unsigned char* aff_l_buf,
     const unsigned char* offset_buf,
     const int res_buf_last,
-    np.uint32_t* hashes_ptr,
+    np.uint64_t* hashes_ptr,
 ) nogil

View File

@@ -1803,15 +1803,15 @@ cdef class Doc:
         cdef unsigned char* ss_res_buf = <unsigned char*> mem.alloc(ss_max_l, 4)
         cdef unsigned char* ss_l_buf = <unsigned char*> mem.alloc(ss_max_l, 1)
         cdef int doc_l = self.length, total_hashes = doc_l * hashes_per_tok
-        cdef np.uint32_t* hashes_ptr = <np.uint32_t*> mem.alloc(
-            total_hashes, sizeof(np.uint32_t))
+        cdef np.uint64_t* hashes_ptr = <np.uint64_t*> mem.alloc(
+            total_hashes, sizeof(np.uint64_t))
 
         # Define working variables
         cdef TokenC tok_c
         cdef int tok_i, tok_str_l
         cdef attr_t num_tok_attr
         cdef const unsigned char* tok_str
-        cdef np.uint32_t* w_hashes_ptr = hashes_ptr
+        cdef np.uint64_t* w_hashes_ptr = hashes_ptr
 
         for tok_i in range(doc_l):
             tok_c = self.c[tok_i]
@@ -1837,9 +1837,9 @@ cdef class Doc:
                 ss_max_l, True, ss_res_buf, ss_l_buf)
             w_hashes_ptr += _write_hashes(ss_res_buf, ss_lengths, ss_l_buf, 0, w_hashes_ptr)
 
-        cdef np.ndarray[np.uint32_t, ndim=2] hashes = numpy.empty(
-            (doc_l, hashes_per_tok), dtype="uint32")
-        memcpy(hashes.data, hashes_ptr, total_hashes * sizeof(np.uint32_t))
+        cdef np.ndarray[np.uint64_t, ndim=2] hashes = numpy.empty(
+            (doc_l, hashes_per_tok), dtype="uint64")
+        memcpy(hashes.data, hashes_ptr, total_hashes * sizeof(np.uint64_t))
         return hashes
@@ -2173,7 +2173,7 @@ cdef int _write_hashes(
     const unsigned char* aff_l_buf,
     const unsigned char* offset_buf,
     const int res_buf_last,
-    np.uint32_t* hashes_ptr,
+    np.uint64_t* hashes_ptr,
 ) nogil:
     """ Write FNV1A hashes for a token/rich property group combination.