diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 5cdbabf52..7f6898fe4 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -199,7 +199,7 @@ def _verify_rich_config_group(
     if lengths is not None or rows is not None:
         if is_search_char_group and (search_chars is None or len(search_chars) == 0):
             raise ValueError(Errors.E1047.format(label=label))
-        if len(search_chars) > 63:
+        if search_chars is not None and len(search_chars) > 63:
             raise ValueError(Errors.E1048.format(label=label))
         if lengths is None or rows is None:
             raise ValueError(Errors.E1047.format(label=label))
@@ -262,6 +262,12 @@ def RichMultiHashEmbed(
     plural noun does not become `a` if it is the third or fourth vowel from the
     end of the word.

+    There are a few rare situations where a graphical character is expressed
+    using more than one Unicode code point, e.g. the lower-case form of the
+    Turkish letter *İ*, which is *i* plus a combining dot above. Such
+    situations are supported, but the lengths of prefixes, suffixes and
+    character search results may need to be increased accordingly.
+
     All lengths must be specified in ascending order.

     width (int): The output width. Also used as the width of the embedding tables.
diff --git a/spacy/ml/richfeatureextractor.py b/spacy/ml/richfeatureextractor.py
index 488c33a60..e623da45f 100644
--- a/spacy/ml/richfeatureextractor.py
+++ b/spacy/ml/richfeatureextractor.py
@@ -2,7 +2,7 @@ from typing import List, Optional, Callable, Tuple
 from spacy.util import get_search_char_byte_arrays

 # from ..util import get_arrays_for_search_chars
-from thinc.types import Ints1d, Ints2d
+from thinc.types import Ints2d
 from thinc.api import Model, registry, get_current_ops

 from ..tokens import Doc
@@ -21,46 +21,35 @@ def RichFeatureExtractor(
 ) -> Model[List[Doc], List[Ints2d]]:
     ops = get_current_ops()
     if pref_search_chars is not None:
-        (
-            ps_1byte_ch,
-            ps_2byte_ch,
-            ps_3byte_ch,
-            ps_4byte_ch,
-        ) = get_search_char_byte_arrays(pref_search_chars, case_sensitive)
+        ps_search_chars, ps_width_offsets = get_search_char_byte_arrays(pref_search_chars, case_sensitive)
     else:
-        ps_1byte_ch = ps_2byte_ch = ps_3byte_ch = ps_4byte_ch = bytes()
+        ps_search_chars = bytes()
+        ps_width_offsets = bytes()
     if suff_search_chars is not None:
-        (
-            ss_1byte_ch,
-            ss_2byte_ch,
-            ss_3byte_ch,
-            ss_4byte_ch,
-        ) = get_search_char_byte_arrays(suff_search_chars, case_sensitive)
+
+        ss_search_chars, ss_width_offsets = get_search_char_byte_arrays(suff_search_chars, case_sensitive)
     else:
-        ss_1byte_ch = ss_2byte_ch = ss_3byte_ch = ss_4byte_ch = bytes()
+        ss_search_chars = bytes()
+        ss_width_offsets = bytes()
     return Model(
         "extract_character_combination_hashes",
         forward,
         attrs={
             "case_sensitive": case_sensitive,
-            "pref_lengths": bytes(pref_lengths)
+            "p_lengths": bytes(pref_lengths)
             if pref_lengths is not None
             else bytes(),
-            "suff_lengths": bytes(suff_lengths)
+            "s_lengths": bytes(suff_lengths)
             if suff_lengths is not None
             else bytes(),
-            "pref_search_1_byte": ps_1byte_ch,
-            "pref_search_2_bytes": ps_2byte_ch,
-            "pref_search_3_bytes": ps_3byte_ch,
-            "pref_search_4_bytes": ps_4byte_ch,
-            "pref_search_lengths": bytes(pref_search_lengths)
+            "ps_search_chars": ps_search_chars,
+            "ps_width_offsets": ps_width_offsets,
+            "ps_lengths": bytes(pref_search_lengths)
             if pref_search_lengths is not None
             else bytes(),
-            "suff_search_1_byte": ss_1byte_ch,
-            "suff_search_2_bytes": ss_2byte_ch,
-            "suff_search_3_bytes": ss_3byte_ch,
-            "suff_search_4_bytes": ss_4byte_ch,
-            "suff_search_lengths": bytes(suff_search_lengths)
+            "ss_search_chars": ss_search_chars,
+            "ss_width_offsets": ss_width_offsets,
+            "ss_lengths": bytes(suff_search_lengths)
             if suff_search_lengths is not None
             else bytes(),
         },
@@ -72,36 +61,28 @@ def forward(
 ) -> Tuple[List[Ints2d], Callable]:
     ops = model.ops
     case_sensitive: bool = model.attrs["case_sensitive"]
-    pref_lengths: bytes = model.attrs["pref_lengths"]
-    suff_lengths: bytes = model.attrs["suff_lengths"]
-    ps_1byte_ch: bytes = model.attrs["pref_search_1_byte"]
-    ps_2byte_ch: bytes = model.attrs["pref_search_2_bytes"]
-    ps_3byte_ch: bytes = model.attrs["pref_search_3_bytes"]
-    ps_4byte_ch: bytes = model.attrs["pref_search_4_bytes"]
-    pref_search_lengths: bytes = model.attrs["pref_search_lengths"]
-    ss_1byte_ch: bytes = model.attrs["pref_search_1_byte"]
-    ss_2byte_ch: bytes = model.attrs["pref_search_2_bytes"]
-    ss_3byte_ch: bytes = model.attrs["pref_search_3_bytes"]
-    ss_4byte_ch: bytes = model.attrs["pref_search_4_bytes"]
-    suff_search_lengths: bytes = model.attrs["suff_search_lengths"]
+    p_lengths: bytes = model.attrs["p_lengths"]
+    s_lengths: bytes = model.attrs["s_lengths"]
+    ps_search_chars: bytes = model.attrs["ps_search_chars"]
+    ps_width_offsets: bytes = model.attrs["ps_width_offsets"]
+    ps_lengths: bytes = model.attrs["ps_lengths"]
+    ss_search_chars: bytes = model.attrs["ss_search_chars"]
+    ss_width_offsets: bytes = model.attrs["ss_width_offsets"]
+    ss_lengths: bytes = model.attrs["ss_lengths"]
     features: List[Ints2d] = []
     for doc in docs:
         hashes = doc.get_character_combination_hashes(
             cs=case_sensitive,
-            p_lengths=pref_lengths,
-            s_lengths=suff_lengths,
-            ps_1byte_ch=ps_1byte_ch,
-            ps_2byte_ch=ps_2byte_ch,
-            ps_3byte_ch=ps_3byte_ch,
-            ps_4byte_ch=ps_4byte_ch,
-            ps_lengths=pref_search_lengths,
-            ss_1byte_ch=ss_1byte_ch,
-            ss_2byte_ch=ss_2byte_ch,
-            ss_3byte_ch=ss_3byte_ch,
-            ss_4byte_ch=ss_4byte_ch,
-            ss_lengths=suff_search_lengths,
+            p_lengths=p_lengths,
+            s_lengths=s_lengths,
+            ps_search_chars=ps_search_chars,
+            ps_width_offsets=ps_width_offsets,
+            ps_lengths=ps_lengths,
+            ss_search_chars=ss_search_chars,
+            ss_width_offsets=ss_width_offsets,
+            ss_lengths=ss_lengths,
         )
-        features.append(ops.asarray2i(hashes))
+        features.append(ops.asarray2i(hashes, dtype="uint64"))

     backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
     return features, backprop
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index fbb537408..5f215c009 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -73,7 +73,7 @@ cdef int _write_hashes(
     const unsigned char* aff_l_buf,
     const unsigned char* offset_buf,
     const int res_buf_last,
-    np.uint32_t* hashes_ptr,
+    np.uint64_t* hashes_ptr,
 ) nogil


diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 452dfb652..711436a0f 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1803,15 +1803,15 @@ cdef class Doc:
         cdef unsigned char* ss_res_buf = <unsigned char*>mem.alloc(ss_max_l, 4)
         cdef unsigned char* ss_l_buf = <unsigned char*>mem.alloc(ss_max_l, 1)
         cdef int doc_l = self.length, total_hashes = doc_l * hashes_per_tok
-        cdef np.uint32_t* hashes_ptr = <np.uint32_t*>mem.alloc(
-            total_hashes, sizeof(np.uint32_t))
+        cdef np.uint64_t* hashes_ptr = <np.uint64_t*>mem.alloc(
+            total_hashes, sizeof(np.uint64_t))

         # Define working variables
         cdef TokenC tok_c
         cdef int tok_i, tok_str_l
         cdef attr_t num_tok_attr
         cdef const unsigned char* tok_str
-        cdef np.uint32_t* w_hashes_ptr = hashes_ptr
+        cdef np.uint64_t* w_hashes_ptr = hashes_ptr

         for tok_i in range(doc_l):
             tok_c = self.c[tok_i]
@@ -1837,9 +1837,9 @@ cdef class Doc:
                     ss_max_l, True, ss_res_buf, ss_l_buf)
                 w_hashes_ptr += _write_hashes(
                     ss_res_buf, ss_lengths, ss_l_buf, 0, w_hashes_ptr)

-        cdef np.ndarray[np.uint32_t, ndim=2] hashes = numpy.empty(
-            (doc_l, hashes_per_tok), dtype="uint32")
-        memcpy(hashes.data, hashes_ptr, total_hashes * sizeof(np.uint32_t))
+        cdef np.ndarray[np.uint64_t, ndim=2] hashes = numpy.empty(
+            (doc_l, hashes_per_tok), dtype="uint64")
+        memcpy(hashes.data, hashes_ptr, total_hashes * sizeof(np.uint64_t))

         return hashes
@@ -2173,7 +2173,7 @@ cdef int _write_hashes(
     const unsigned char* aff_l_buf,
     const unsigned char* offset_buf,
     const int res_buf_last,
-    np.uint32_t* hashes_ptr,
+    np.uint64_t* hashes_ptr,
 ) nogil:
     """
     Write FNV1A hashes for a token/rich property group combination.
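
Note on the `RichMultiHashEmbed` docstring addition above: the Turkish example is directly verifiable, since Unicode's default case mapping lowercases *İ* (U+0130) to *i* followed by U+0307 COMBINING DOT ABOVE. The snippet below illustrates why a single graphical character can consume more than one unit of the configured prefix, suffix, or search-result lengths:

```python
# One graphical character can be several code points (and more UTF-8 bytes):
# Unicode lowercases Turkish İ (U+0130) to "i" + U+0307 COMBINING DOT ABOVE.
lowered = "İ".lower()
print(len("İ"))                        # 1 code point
print(len(lowered))                    # 2 code points
print([hex(ord(c)) for c in lowered])  # ['0x69', '0x307']
print(len(lowered.encode("utf-8")))    # 3 UTF-8 bytes
```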
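The refactor in `richfeatureextractor.py` collapses the four per-width byte arrays (`*_1byte_ch` through `*_4byte_ch`) into a single `search_chars` buffer plus a `width_offsets` byte string. The authoritative packing is defined by `spacy.util.get_search_char_byte_arrays`, which this diff does not show; the sketch below is only a hypothetical reconstruction, assuming the UTF-8-encoded search characters are deduplicated, sorted, concatenated in order of encoded width, and the offsets mark where each width group begins and ends. Under that assumption, the 63-character cap enforced in `_verify_rich_config_group` would also keep every offset within a single byte (63 × 4 = 252 ≤ 255).

```python
from typing import Tuple


def packed_search_chars(search_chars: str, case_sensitive: bool) -> Tuple[bytes, bytes]:
    """Hypothetical sketch of the (search_chars, width_offsets) packing.

    The real layout lives in spacy.util.get_search_char_byte_arrays.
    """
    if not case_sensitive:
        search_chars = search_chars.lower()
    encoded = sorted({ch.encode("utf-8") for ch in search_chars})
    buf = b""
    offsets = []
    for width in (1, 2, 3, 4):
        offsets.append(len(buf))  # byte offset where this width group starts
        buf += b"".join(e for e in encoded if len(e) == width)
    offsets.append(len(buf))      # end of the 4-byte group
    return buf, bytes(offsets)


buf, offsets = packed_search_chars("aeéöü", case_sensitive=False)
print(buf, list(offsets))  # b'ae\xc3\xa9\xc3\xb6\xc3\xbc' [0, 2, 8, 8, 8]
```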
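Finally, the `doc.pxd`/`doc.pyx` hunks widen the hash buffers and the returned array from `uint32` to `uint64`, matching the FNV1A hashes that `_write_hashes` computes (its Cython body is not part of this diff). For reference, a minimal pure-Python 64-bit FNV-1a with the standard offset basis and prime shows why the stored values no longer fit in 32 bits:

```python
def fnv1a_64(data: bytes) -> int:
    """Reference 64-bit FNV-1a; the actual Cython hashing code is not shown in this diff."""
    h = 0xCBF29CE484222325  # standard 64-bit FNV offset basis
    for byte in data:
        h ^= byte
        h = (h * 0x100000001B3) & 0xFFFFFFFFFFFFFFFF  # 64-bit FNV prime, wrapped to 64 bits
    return h


print(hex(fnv1a_64("spacy".encode("utf-8"))))  # typically a full 64-bit value
```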