Mirror of https://github.com/explosion/spaCy.git
Everything working after refactoring

parent 5d210a0f3b
commit 7f1873ad81
@@ -199,7 +199,7 @@ def _verify_rich_config_group(
     if lengths is not None or rows is not None:
         if is_search_char_group and (search_chars is None or len(search_chars) == 0):
             raise ValueError(Errors.E1047.format(label=label))
-        if len(search_chars) > 63:
+        if search_chars is not None and len(search_chars) > 63:
             raise ValueError(Errors.E1048.format(label=label))
         if lengths is None or rows is None:
             raise ValueError(Errors.E1047.format(label=label))
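For context on the guard being fixed here: `search_chars` may legitimately be `None` for groups that are not search-character groups, so the old `len(search_chars) > 63` comparison could raise a `TypeError` before the intended `ValueError` ever fired. A minimal sketch of the fixed check follows; the signature and error strings are assumptions, since only the hunk above is visible:

    # Sketch of the fixed validation; signature and messages are assumed,
    # only the guard logic mirrors the hunk above.
    from typing import List, Optional

    def _verify_rich_config_group_sketch(
        label: str,
        lengths: Optional[List[int]],
        rows: Optional[List[int]],
        search_chars: Optional[str],
        is_search_char_group: bool,
    ) -> None:
        if lengths is not None or rows is not None:
            if is_search_char_group and (search_chars is None or len(search_chars) == 0):
                raise ValueError(f"{label}: search characters must be provided")  # E1047
            # The commit's fix: guard against search_chars being None, which is
            # valid for non-search-char groups and previously caused a TypeError.
            if search_chars is not None and len(search_chars) > 63:
                raise ValueError(f"{label}: at most 63 search characters allowed")  # E1048
            if lengths is None or rows is None:
                raise ValueError(f"{label}: lengths and rows must be given together")  # E1047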
@@ -262,6 +262,12 @@ def RichMultiHashEmbed(
     plural noun does not become `a` if it is the third or fourth vowel from the
     end of the word.
 
+    There are a few rare situations where a graphical character is expressed as
+    more than one UTF-8 character, e.g. *i* when representing the lower-case form
+    of the Turkish letter *İ*. Such situations are supported, but the lengths of
+    prefixes, suffixes and character search results may need to be increased
+    accordingly.
+
     All lengths must be specified in ascending order.
 
     width (int): The output width. Also used as the width of the embedding tables.
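The Turkish example in the new docstring is easy to verify in plain Python: lower-casing *İ* (U+0130) yields two code points, `i` plus a combining dot above, which in turn occupy three UTF-8 bytes:

    # Lower-casing Turkish İ (U+0130) produces two code points, not one.
    lowered = "İ".lower()
    print(len("İ"), len(lowered))            # 1 2
    print([hex(ord(ch)) for ch in lowered])  # ['0x69', '0x307']
    print(lowered.encode("utf-8"))           # b'i\xcc\x87' (three bytes)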
@@ -2,7 +2,7 @@ from typing import List, Optional, Callable, Tuple
 from spacy.util import get_search_char_byte_arrays
 
-# from ..util import get_arrays_for_search_chars
-from thinc.types import Ints1d, Ints2d
+from thinc.types import Ints2d
+
 from thinc.api import Model, registry, get_current_ops
 
 from ..tokens import Doc
@@ -21,46 +21,35 @@ def RichFeatureExtractor(
 ) -> Model[List[Doc], List[Ints2d]]:
     ops = get_current_ops()
     if pref_search_chars is not None:
-        (
-            ps_1byte_ch,
-            ps_2byte_ch,
-            ps_3byte_ch,
-            ps_4byte_ch,
-        ) = get_search_char_byte_arrays(pref_search_chars, case_sensitive)
+        ps_search_chars, ps_width_offsets = get_search_char_byte_arrays(pref_search_chars, case_sensitive)
     else:
-        ps_1byte_ch = ps_2byte_ch = ps_3byte_ch = ps_4byte_ch = bytes()
+        ps_search_chars = bytes()
+        ps_width_offsets = bytes()
     if suff_search_chars is not None:
-        (
-            ss_1byte_ch,
-            ss_2byte_ch,
-            ss_3byte_ch,
-            ss_4byte_ch,
-        ) = get_search_char_byte_arrays(suff_search_chars, case_sensitive)
+        ss_search_chars, ss_width_offsets = get_search_char_byte_arrays(suff_search_chars, case_sensitive)
     else:
-        ss_1byte_ch = ss_2byte_ch = ss_3byte_ch = ss_4byte_ch = bytes()
+        ss_search_chars = bytes()
+        ss_width_offsets = bytes()
     return Model(
         "extract_character_combination_hashes",
         forward,
         attrs={
             "case_sensitive": case_sensitive,
-            "pref_lengths": bytes(pref_lengths)
+            "p_lengths": bytes(pref_lengths)
             if pref_lengths is not None
             else bytes(),
-            "suff_lengths": bytes(suff_lengths)
+            "s_lengths": bytes(suff_lengths)
             if suff_lengths is not None
             else bytes(),
-            "pref_search_1_byte": ps_1byte_ch,
-            "pref_search_2_bytes": ps_2byte_ch,
-            "pref_search_3_bytes": ps_3byte_ch,
-            "pref_search_4_bytes": ps_4byte_ch,
-            "pref_search_lengths": bytes(pref_search_lengths)
+            "ps_search_chars": ps_search_chars,
+            "ps_width_offsets": ps_width_offsets,
+            "ps_lengths": bytes(pref_search_lengths)
             if pref_search_lengths is not None
             else bytes(),
-            "suff_search_1_byte": ss_1byte_ch,
-            "suff_search_2_bytes": ss_2byte_ch,
-            "suff_search_3_bytes": ss_3byte_ch,
-            "suff_search_4_bytes": ss_4byte_ch,
-            "suff_search_lengths": bytes(suff_search_lengths)
+            "ss_search_chars": ss_search_chars,
+            "ss_width_offsets": ss_width_offsets,
+            "ss_lengths": bytes(suff_search_lengths)
             if suff_search_lengths is not None
             else bytes(),
         },
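This hunk collapses the four per-byte-width character arrays into a single packed buffer plus a width-offsets table. The diff does not show the new body of `get_search_char_byte_arrays`, so the following is a hedged sketch of one plausible packed representation, not the actual spaCy implementation; note how the 63-character cap validated above keeps every offset within a single byte:

    # Illustrative packing only: concatenate the UTF-8 encodings of the search
    # characters in ascending byte-width order and record where each width
    # class starts. The real get_search_char_byte_arrays may differ.
    from typing import Tuple

    def sketch_search_char_byte_arrays(search_chars: str, case_sensitive: bool) -> Tuple[bytes, bytes]:
        if not case_sensitive:
            search_chars = search_chars.lower()
        encodings = sorted({ch.encode("utf-8") for ch in search_chars}, key=lambda b: (len(b), b))
        packed = b"".join(encodings)
        # offsets[w-1] = position in `packed` where w-byte encodings begin;
        # the final entry marks the end of the buffer.
        offsets, pos = [], 0
        for width in (1, 2, 3, 4):
            offsets.append(pos)
            pos += sum(len(e) for e in encodings if len(e) == width)
        offsets.append(pos)
        return packed, bytes(offsets)

    packed, width_offsets = sketch_search_char_byte_arrays("abΩé", True)
    print(packed)               # b'ab\xc3\xa9\xcf\x89' (two 1-byte, then two 2-byte chars)
    print(list(width_offsets))  # [0, 2, 6, 6, 6]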
@@ -72,36 +61,28 @@ def forward(
 ) -> Tuple[List[Ints2d], Callable]:
     ops = model.ops
     case_sensitive: bool = model.attrs["case_sensitive"]
-    pref_lengths: bytes = model.attrs["pref_lengths"]
-    suff_lengths: bytes = model.attrs["suff_lengths"]
-    ps_1byte_ch: bytes = model.attrs["pref_search_1_byte"]
-    ps_2byte_ch: bytes = model.attrs["pref_search_2_bytes"]
-    ps_3byte_ch: bytes = model.attrs["pref_search_3_bytes"]
-    ps_4byte_ch: bytes = model.attrs["pref_search_4_bytes"]
-    pref_search_lengths: bytes = model.attrs["pref_search_lengths"]
-    ss_1byte_ch: bytes = model.attrs["pref_search_1_byte"]
-    ss_2byte_ch: bytes = model.attrs["pref_search_2_bytes"]
-    ss_3byte_ch: bytes = model.attrs["pref_search_3_bytes"]
-    ss_4byte_ch: bytes = model.attrs["pref_search_4_bytes"]
-    suff_search_lengths: bytes = model.attrs["suff_search_lengths"]
+    p_lengths: bytes = model.attrs["p_lengths"]
+    s_lengths: bytes = model.attrs["s_lengths"]
+    ps_search_chars: bytes = model.attrs["ps_search_chars"]
+    ps_width_offsets: bytes = model.attrs["ps_width_offsets"]
+    ps_lengths: bytes = model.attrs["ps_lengths"]
+    ss_search_chars: bytes = model.attrs["ss_search_chars"]
+    ss_width_offsets: bytes = model.attrs["ss_width_offsets"]
+    ss_lengths: bytes = model.attrs["ss_lengths"]
     features: List[Ints2d] = []
     for doc in docs:
         hashes = doc.get_character_combination_hashes(
             cs=case_sensitive,
-            p_lengths=pref_lengths,
-            s_lengths=suff_lengths,
-            ps_1byte_ch=ps_1byte_ch,
-            ps_2byte_ch=ps_2byte_ch,
-            ps_3byte_ch=ps_3byte_ch,
-            ps_4byte_ch=ps_4byte_ch,
-            ps_lengths=pref_search_lengths,
-            ss_1byte_ch=ss_1byte_ch,
-            ss_2byte_ch=ss_2byte_ch,
-            ss_3byte_ch=ss_3byte_ch,
-            ss_4byte_ch=ss_4byte_ch,
-            ss_lengths=suff_search_lengths,
+            p_lengths=p_lengths,
+            s_lengths=s_lengths,
+            ps_search_chars=ps_search_chars,
+            ps_width_offsets=ps_width_offsets,
+            ps_lengths=ps_lengths,
+            ss_search_chars=ss_search_chars,
+            ss_width_offsets=ss_width_offsets,
+            ss_lengths=ss_lengths,
         )
-        features.append(ops.asarray2i(hashes))
+        features.append(ops.asarray2i(hashes, dtype="uint64"))
 
     backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
     return features, backprop
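The added `dtype="uint64"` matters because the hashes no longer fit in 32 bits; without it, casting to a default 32-bit integer array (our assumption about `asarray2i`'s default) would silently keep only the low 32 bits:

    # A 64-bit hash cast to a 32-bit integer type keeps only the low 32 bits.
    # Illustrative value, not a real token hash.
    import numpy

    h = numpy.array([[0x123456789ABCDEF0]], dtype="uint64")
    print(h.astype("int32"))   # [[-1698898192]]  (low 32 bits, reinterpreted)
    print(h.astype("uint64"))  # [[1311768467463790320]]  (value preserved)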
@@ -73,7 +73,7 @@ cdef int _write_hashes(
     const unsigned char* aff_l_buf,
     const unsigned char* offset_buf,
     const int res_buf_last,
-    np.uint32_t* hashes_ptr,
+    np.uint64_t* hashes_ptr,
 ) nogil
@@ -1803,15 +1803,15 @@ cdef class Doc:
         cdef unsigned char* ss_res_buf = <unsigned char*> mem.alloc(ss_max_l, 4)
         cdef unsigned char* ss_l_buf = <unsigned char*> mem.alloc(ss_max_l, 1)
         cdef int doc_l = self.length, total_hashes = doc_l * hashes_per_tok
-        cdef np.uint32_t* hashes_ptr = <np.uint32_t*> mem.alloc(
-            total_hashes, sizeof(np.uint32_t))
+        cdef np.uint64_t* hashes_ptr = <np.uint64_t*> mem.alloc(
+            total_hashes, sizeof(np.uint64_t))
 
         # Define working variables
         cdef TokenC tok_c
         cdef int tok_i, tok_str_l
         cdef attr_t num_tok_attr
         cdef const unsigned char* tok_str
-        cdef np.uint32_t* w_hashes_ptr = hashes_ptr
+        cdef np.uint64_t* w_hashes_ptr = hashes_ptr
 
         for tok_i in range(doc_l):
             tok_c = self.c[tok_i]
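A plausible motivation for widening the hash storage from `uint32` to `uint64` (the commit message does not say) is collision headroom; the birthday bound makes the difference concrete:

    # Birthday bound: probability of at least one collision among n uniform
    # hashes drawn from a space of 2**bits values.
    import math

    def collision_prob(n: int, bits: int) -> float:
        return 1.0 - math.exp(-n * (n - 1) / (2.0 * 2.0 ** bits))

    for bits in (32, 64):
        print(bits, f"{collision_prob(1_000_000, bits):.2e}")
    # 32 -> 1.00e+00 (a collision is virtually certain among a million hashes)
    # 64 -> 2.71e-08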
@@ -1837,9 +1837,9 @@ cdef class Doc:
                 ss_max_l, True, ss_res_buf, ss_l_buf)
             w_hashes_ptr += _write_hashes(ss_res_buf, ss_lengths, ss_l_buf, 0, w_hashes_ptr)
 
-        cdef np.ndarray[np.uint32_t, ndim=2] hashes = numpy.empty(
-            (doc_l, hashes_per_tok), dtype="uint32")
-        memcpy(hashes.data, hashes_ptr, total_hashes * sizeof(np.uint32_t))
+        cdef np.ndarray[np.uint64_t, ndim=2] hashes = numpy.empty(
+            (doc_l, hashes_per_tok), dtype="uint64")
+        memcpy(hashes.data, hashes_ptr, total_hashes * sizeof(np.uint64_t))
         return hashes
@@ -2173,7 +2173,7 @@ cdef int _write_hashes(
     const unsigned char* aff_l_buf,
     const unsigned char* offset_buf,
     const int res_buf_last,
-    np.uint32_t* hashes_ptr,
+    np.uint64_t* hashes_ptr,
 ) nogil:
     """ Write FNV1A hashes for a token/rich property group combination.