Mirror of https://github.com/explosion/spaCy.git, synced 2025-08-02 19:30:19 +03:00

Everything working after refactoring

This commit is contained in:
parent 5d210a0f3b
commit 7f1873ad81
@@ -199,7 +199,7 @@ def _verify_rich_config_group(
     if lengths is not None or rows is not None:
         if is_search_char_group and (search_chars is None or len(search_chars) == 0):
             raise ValueError(Errors.E1047.format(label=label))
-        if len(search_chars) > 63:
+        if search_chars is not None and len(search_chars) > 63:
             raise ValueError(Errors.E1048.format(label=label))
         if lengths is None or rows is None:
             raise ValueError(Errors.E1047.format(label=label))
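The guard added above matters because `_verify_rich_config_group` is also called for groups that are not search-char groups, where `search_chars` is `None`: the old code then evaluated `len(None)`, which raises a `TypeError` instead of reporting a config error:

>>> len(None)
Traceback (most recent call last):
  ...
TypeError: object of type 'NoneType' has no len()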
@@ -262,6 +262,12 @@ def RichMultiHashEmbed(
     plural noun does not become `a` if it is the third or fourth vowel from the
     end of the word.
 
+    There are a few rare situations where a graphical character is expressed as
+    more than one UTF-8 character, e.g. *i* when representing the lower-case form
+    of the Turkish letter *İ*. Such situations are supported, but the lengths of
+    prefixes, suffixes and character search results may need to be increased
+    accordingly.
+
     All lengths must be specified in ascending order.
 
     width (int): The output width. Also used as the width of the embedding tables.
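The situation described in the new docstring text can be checked directly in Python: lower-casing the Turkish *İ* (U+0130) yields two code points, *i* plus a combining dot above, occupying three UTF-8 bytes:

>>> lowered = "İ".lower()
>>> len(lowered), len(lowered.encode("utf8"))
(2, 3)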
@@ -2,7 +2,7 @@ from typing import List, Optional, Callable, Tuple
 from spacy.util import get_search_char_byte_arrays
 
 # from ..util import get_arrays_for_search_chars
-from thinc.types import Ints1d, Ints2d
+from thinc.types import Ints2d
 from thinc.api import Model, registry, get_current_ops
 
 from ..tokens import Doc
@@ -21,46 +21,35 @@ def RichFeatureExtractor(
 ) -> Model[List[Doc], List[Ints2d]]:
     ops = get_current_ops()
     if pref_search_chars is not None:
-        (
-            ps_1byte_ch,
-            ps_2byte_ch,
-            ps_3byte_ch,
-            ps_4byte_ch,
-        ) = get_search_char_byte_arrays(pref_search_chars, case_sensitive)
+        ps_search_chars, ps_width_offsets = get_search_char_byte_arrays(pref_search_chars, case_sensitive)
     else:
-        ps_1byte_ch = ps_2byte_ch = ps_3byte_ch = ps_4byte_ch = bytes()
+        ps_search_chars = bytes()
+        ps_width_offsets = bytes()
     if suff_search_chars is not None:
-        (
-            ss_1byte_ch,
-            ss_2byte_ch,
-            ss_3byte_ch,
-            ss_4byte_ch,
-        ) = get_search_char_byte_arrays(suff_search_chars, case_sensitive)
+        ss_search_chars, ss_width_offsets = get_search_char_byte_arrays(suff_search_chars, case_sensitive)
     else:
-        ss_1byte_ch = ss_2byte_ch = ss_3byte_ch = ss_4byte_ch = bytes()
+        ss_search_chars = bytes()
+        ss_width_offsets = bytes()
     return Model(
         "extract_character_combination_hashes",
         forward,
         attrs={
             "case_sensitive": case_sensitive,
-            "pref_lengths": bytes(pref_lengths)
+            "p_lengths": bytes(pref_lengths)
             if pref_lengths is not None
             else bytes(),
-            "suff_lengths": bytes(suff_lengths)
+            "s_lengths": bytes(suff_lengths)
             if suff_lengths is not None
             else bytes(),
-            "pref_search_1_byte": ps_1byte_ch,
-            "pref_search_2_bytes": ps_2byte_ch,
-            "pref_search_3_bytes": ps_3byte_ch,
-            "pref_search_4_bytes": ps_4byte_ch,
-            "pref_search_lengths": bytes(pref_search_lengths)
+            "ps_search_chars": ps_search_chars,
+            "ps_width_offsets": ps_width_offsets,
+            "ps_lengths": bytes(pref_search_lengths)
             if pref_search_lengths is not None
             else bytes(),
-            "suff_search_1_byte": ss_1byte_ch,
-            "suff_search_2_bytes": ss_2byte_ch,
-            "suff_search_3_bytes": ss_3byte_ch,
-            "suff_search_4_bytes": ss_4byte_ch,
-            "suff_search_lengths": bytes(suff_search_lengths)
+            "ss_search_chars": ss_search_chars,
+            "ss_width_offsets": ss_width_offsets,
+            "ss_lengths": bytes(suff_search_lengths)
             if suff_search_lengths is not None
             else bytes(),
         },
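The hunk above collapses the four per-width byte strings (`ps_1byte_ch` … `ps_4byte_ch`) into a single packed byte string plus a width-offset table. The exact layout produced by `get_search_char_byte_arrays` is not shown in this diff; the following is only a sketch of one plausible packing, with `pack_search_chars` a hypothetical stand-in (the 63-character cap enforced by E1048 above keeps every offset below 256, so the offsets fit in a `bytes` object):

from typing import Tuple


def pack_search_chars(search_chars: str, case_sensitive: bool) -> Tuple[bytes, bytes]:
    # Hypothetical stand-in for spacy.util.get_search_char_byte_arrays;
    # the layout is assumed, not taken from the commit.
    # Deduplicate and optionally case-fold, then sort the UTF-8 encodings
    # by width so all 1-byte characters precede all 2-byte ones, and so on.
    if not case_sensitive:
        search_chars = search_chars.lower()
    encodings = sorted(
        {ch.encode("utf8") for ch in search_chars}, key=lambda e: (len(e), e)
    )
    # One flat buffer; offsets[w - 1]:offsets[w] spans the w-byte-wide group.
    packed = b"".join(encodings)
    offsets, pos = [], 0
    for width in range(1, 5):
        offsets.append(pos)
        pos += sum(len(e) for e in encodings if len(e) == width)
    offsets.append(pos)
    return packed, bytes(offsets)

Whatever the real layout is, a single contiguous buffer with offsets lets the Cython search loop index one array instead of dispatching across four separate ones.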
@@ -72,36 +61,28 @@ def forward(
 ) -> Tuple[List[Ints2d], Callable]:
     ops = model.ops
     case_sensitive: bool = model.attrs["case_sensitive"]
-    pref_lengths: bytes = model.attrs["pref_lengths"]
-    suff_lengths: bytes = model.attrs["suff_lengths"]
-    ps_1byte_ch: bytes = model.attrs["pref_search_1_byte"]
-    ps_2byte_ch: bytes = model.attrs["pref_search_2_bytes"]
-    ps_3byte_ch: bytes = model.attrs["pref_search_3_bytes"]
-    ps_4byte_ch: bytes = model.attrs["pref_search_4_bytes"]
-    pref_search_lengths: bytes = model.attrs["pref_search_lengths"]
-    ss_1byte_ch: bytes = model.attrs["pref_search_1_byte"]
-    ss_2byte_ch: bytes = model.attrs["pref_search_2_bytes"]
-    ss_3byte_ch: bytes = model.attrs["pref_search_3_bytes"]
-    ss_4byte_ch: bytes = model.attrs["pref_search_4_bytes"]
-    suff_search_lengths: bytes = model.attrs["suff_search_lengths"]
+    p_lengths: bytes = model.attrs["p_lengths"]
+    s_lengths: bytes = model.attrs["s_lengths"]
+    ps_search_chars: bytes = model.attrs["ps_search_chars"]
+    ps_width_offsets: bytes = model.attrs["ps_width_offsets"]
+    ps_lengths: bytes = model.attrs["ps_lengths"]
+    ss_search_chars: bytes = model.attrs["ss_search_chars"]
+    ss_width_offsets: bytes = model.attrs["ss_width_offsets"]
+    ss_lengths: bytes = model.attrs["ss_lengths"]
     features: List[Ints2d] = []
     for doc in docs:
         hashes = doc.get_character_combination_hashes(
             cs=case_sensitive,
-            p_lengths=pref_lengths,
-            s_lengths=suff_lengths,
-            ps_1byte_ch=ps_1byte_ch,
-            ps_2byte_ch=ps_2byte_ch,
-            ps_3byte_ch=ps_3byte_ch,
-            ps_4byte_ch=ps_4byte_ch,
-            ps_lengths=pref_search_lengths,
-            ss_1byte_ch=ss_1byte_ch,
-            ss_2byte_ch=ss_2byte_ch,
-            ss_3byte_ch=ss_3byte_ch,
-            ss_4byte_ch=ss_4byte_ch,
-            ss_lengths=suff_search_lengths,
+            p_lengths=p_lengths,
+            s_lengths=s_lengths,
+            ps_search_chars=ps_search_chars,
+            ps_width_offsets=ps_width_offsets,
+            ps_lengths=ps_lengths,
+            ss_search_chars=ss_search_chars,
+            ss_width_offsets=ss_width_offsets,
+            ss_lengths=ss_lengths,
         )
-        features.append(ops.asarray2i(hashes))
+        features.append(ops.asarray2i(hashes, dtype="uint64"))
 
     backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
     return features, backprop
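Note that the old code read the `ss_*` values from the `pref_search_*` attr keys, a copy-paste slip that the renaming also clears up. As the hunk shows, the extractor itself has no trainable parameters: `backprop` ignores its input and returns an empty list. A minimal self-contained Thinc model with the same contract looks roughly like this (the sizes and the `hashes_per_tok` attr name are invented for illustration):

from typing import Callable, List, Tuple

import numpy
from thinc.api import Model


def toy_forward(model: Model, docs: List[list], is_train: bool) -> Tuple[list, Callable]:
    # One row of uint64 feature hashes per token; zeros stand in for the
    # real character-combination hashes.
    outputs = [
        numpy.zeros((len(doc), model.attrs["hashes_per_tok"]), dtype="uint64")
        for doc in docs
    ]
    # Nothing to learn, so the gradient flowing back is discarded.
    backprop: Callable = lambda d_outputs: []
    return outputs, backprop


model = Model("toy_feature_extractor", toy_forward, attrs={"hashes_per_tok": 4})
hashes, _ = model([["Short", "sentence"]], is_train=False)
assert hashes[0].shape == (2, 4)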
@@ -73,7 +73,7 @@ cdef int _write_hashes(
     const unsigned char* aff_l_buf,
     const unsigned char* offset_buf,
     const int res_buf_last,
-    np.uint32_t* hashes_ptr,
+    np.uint64_t* hashes_ptr,
 ) nogil
 
 
@@ -1803,15 +1803,15 @@ cdef class Doc:
         cdef unsigned char* ss_res_buf = <unsigned char*> mem.alloc(ss_max_l, 4)
         cdef unsigned char* ss_l_buf = <unsigned char*> mem.alloc(ss_max_l, 1)
         cdef int doc_l = self.length, total_hashes = doc_l * hashes_per_tok
-        cdef np.uint32_t* hashes_ptr = <np.uint32_t*> mem.alloc(
-            total_hashes, sizeof(np.uint32_t))
+        cdef np.uint64_t* hashes_ptr = <np.uint64_t*> mem.alloc(
+            total_hashes, sizeof(np.uint64_t))
 
         # Define working variables
         cdef TokenC tok_c
         cdef int tok_i, tok_str_l
         cdef attr_t num_tok_attr
         cdef const unsigned char* tok_str
-        cdef np.uint32_t* w_hashes_ptr = hashes_ptr
+        cdef np.uint64_t* w_hashes_ptr = hashes_ptr
 
         for tok_i in range(doc_l):
             tok_c = self.c[tok_i]
@@ -1837,9 +1837,9 @@ cdef class Doc:
                 ss_max_l, True, ss_res_buf, ss_l_buf)
             w_hashes_ptr += _write_hashes(ss_res_buf, ss_lengths, ss_l_buf, 0, w_hashes_ptr)
 
-        cdef np.ndarray[np.uint32_t, ndim=2] hashes = numpy.empty(
-            (doc_l, hashes_per_tok), dtype="uint32")
+        cdef np.ndarray[np.uint64_t, ndim=2] hashes = numpy.empty(
+            (doc_l, hashes_per_tok), dtype="uint64")
-        memcpy(hashes.data, hashes_ptr, total_hashes * sizeof(np.uint32_t))
+        memcpy(hashes.data, hashes_ptr, total_hashes * sizeof(np.uint64_t))
         return hashes
 
 
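Widening the hashes from `uint32` to `uint64` doubles both the scratch buffer and the output array, since each hash now occupies 8 bytes. For example (sizes invented):

import numpy

doc_l, hashes_per_tok = 20, 10  # example sizes only
hashes = numpy.empty((doc_l, hashes_per_tok), dtype="uint64")
assert hashes.nbytes == doc_l * hashes_per_tok * 8  # vs. * 4 for uint32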
@@ -2173,7 +2173,7 @@ cdef int _write_hashes(
     const unsigned char* aff_l_buf,
     const unsigned char* offset_buf,
     const int res_buf_last,
-    np.uint32_t* hashes_ptr,
+    np.uint64_t* hashes_ptr,
 ) nogil:
     """ Write FNV1A hashes for a token/rich property group combination.
 
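For reference, the FNV1A scheme named in the docstring, at the 64-bit width the hashes now use, is simply the following (pure-Python reference implementation; whether the Cython code applies it to exactly these buffers is not shown in this hunk):

def fnv1a_64(data: bytes) -> int:
    # 64-bit FNV-1a: start from the offset basis, XOR in each byte, then
    # multiply by the FNV prime, truncating to 64 bits.
    h = 0xCBF29CE484222325
    for byte in data:
        h ^= byte
        h = (h * 0x100000001B3) & 0xFFFFFFFFFFFFFFFF
    return h


assert fnv1a_64(b"") == 0xCBF29CE484222325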