Everything working after refactoring

richardpaulhudson 2022-11-04 09:33:06 +01:00
parent 5d210a0f3b
commit 7f1873ad81
4 changed files with 48 additions and 61 deletions

View File

@@ -199,7 +199,7 @@ def _verify_rich_config_group(
     if lengths is not None or rows is not None:
         if is_search_char_group and (search_chars is None or len(search_chars) == 0):
             raise ValueError(Errors.E1047.format(label=label))
-        if len(search_chars) > 63:
+        if search_chars is not None and len(search_chars) > 63:
            raise ValueError(Errors.E1048.format(label=label))
         if lengths is None or rows is None:
             raise ValueError(Errors.E1047.format(label=label))
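
The one-line fix above closes a crash path: for a group that defines lengths or rows but no search characters, the old code called len(None). A minimal sketch of the failure mode, using simplified names from the diff:

    search_chars = None  # group defines lengths/rows but no search characters

    # Old check: raises TypeError: object of type 'NoneType' has no len()
    #     if len(search_chars) > 63: ...

    # New check: the None test short-circuits, so len() is never reached.
    if search_chars is not None and len(search_chars) > 63:
        raise ValueError("E1048: more than 63 search characters")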
@@ -262,6 +262,12 @@ def RichMultiHashEmbed(
     plural noun does not become `a` if it is the third or fourth vowel from the
     end of the word.
 
+    There are a few rare situations where a single graphical character is
+    expressed as more than one Unicode character, e.g. *i* followed by a
+    combining dot above when representing the lower-case form of the Turkish
+    letter *İ*. Such situations are supported, but the lengths of prefixes,
+    suffixes and character search results may need to be increased accordingly.
+
     All lengths must be specified in ascending order.
 
     width (int): The output width. Also used as the width of the embedding tables.
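
The docstring's Turkish example is easy to verify in plain Python; this snippet is purely illustrative and independent of the diff:

    # Lower-casing the Turkish capital dotted I yields two Unicode characters:
    # 'i' (U+0069) followed by a combining dot above (U+0307).
    s = "İ".lower()
    print(len(s))                      # 2
    print([hex(ord(ch)) for ch in s])  # ['0x69', '0x307']
    # The UTF-8 encoding is three bytes, which is why prefix/suffix and
    # search-result lengths may need to be raised to cover such sequences.
    print(len(s.encode("utf8")))       # 3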

View File

@@ -2,7 +2,7 @@ from typing import List, Optional, Callable, Tuple
 from spacy.util import get_search_char_byte_arrays
 # from ..util import get_arrays_for_search_chars
-from thinc.types import Ints1d, Ints2d
+from thinc.types import Ints2d
 from thinc.api import Model, registry, get_current_ops
 from ..tokens import Doc
@@ -21,46 +21,35 @@ def RichFeatureExtractor(
 ) -> Model[List[Doc], List[Ints2d]]:
     ops = get_current_ops()
     if pref_search_chars is not None:
-        (
-            ps_1byte_ch,
-            ps_2byte_ch,
-            ps_3byte_ch,
-            ps_4byte_ch,
-        ) = get_search_char_byte_arrays(pref_search_chars, case_sensitive)
+        ps_search_chars, ps_width_offsets = get_search_char_byte_arrays(pref_search_chars, case_sensitive)
     else:
-        ps_1byte_ch = ps_2byte_ch = ps_3byte_ch = ps_4byte_ch = bytes()
+        ps_search_chars = bytes()
+        ps_width_offsets = bytes()
     if suff_search_chars is not None:
-        (
-            ss_1byte_ch,
-            ss_2byte_ch,
-            ss_3byte_ch,
-            ss_4byte_ch,
-        ) = get_search_char_byte_arrays(suff_search_chars, case_sensitive)
+        ss_search_chars, ss_width_offsets = get_search_char_byte_arrays(suff_search_chars, case_sensitive)
     else:
-        ss_1byte_ch = ss_2byte_ch = ss_3byte_ch = ss_4byte_ch = bytes()
+        ss_search_chars = bytes()
+        ss_width_offsets = bytes()
     return Model(
         "extract_character_combination_hashes",
         forward,
         attrs={
             "case_sensitive": case_sensitive,
-            "pref_lengths": bytes(pref_lengths)
+            "p_lengths": bytes(pref_lengths)
             if pref_lengths is not None
             else bytes(),
-            "suff_lengths": bytes(suff_lengths)
+            "s_lengths": bytes(suff_lengths)
             if suff_lengths is not None
             else bytes(),
-            "pref_search_1_byte": ps_1byte_ch,
-            "pref_search_2_bytes": ps_2byte_ch,
-            "pref_search_3_bytes": ps_3byte_ch,
-            "pref_search_4_bytes": ps_4byte_ch,
-            "pref_search_lengths": bytes(pref_search_lengths)
+            "ps_search_chars": ps_search_chars,
+            "ps_width_offsets": ps_width_offsets,
+            "ps_lengths": bytes(pref_search_lengths)
             if pref_search_lengths is not None
             else bytes(),
-            "suff_search_1_byte": ss_1byte_ch,
-            "suff_search_2_bytes": ss_2byte_ch,
-            "suff_search_3_bytes": ss_3byte_ch,
-            "suff_search_4_bytes": ss_4byte_ch,
-            "suff_search_lengths": bytes(suff_search_lengths)
+            "ss_search_chars": ss_search_chars,
+            "ss_width_offsets": ss_width_offsets,
+            "ss_lengths": bytes(suff_search_lengths)
             if suff_search_lengths is not None
             else bytes(),
         },
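
This hunk shows only the call sites of get_search_char_byte_arrays, not its body. A hypothetical sketch of the new two-value contract that is consistent with those call sites (the function name is real, but the grouping behaviour and offset layout below are assumptions, not the actual spacy.util implementation):

    def get_search_char_byte_arrays_sketch(search_chars: str, case_sensitive: bool):
        """Hypothetical stand-in returning (search_chars_utf8, width_offsets)."""
        if not case_sensitive:
            search_chars = search_chars.lower()
        # Deduplicate the characters and sort their UTF-8 encodings by byte width.
        encoded = sorted({ch.encode("utf8") for ch in search_chars}, key=lambda b: (len(b), b))
        concatenated = b"".join(encoded)
        # Record where each width group (1-4 bytes) starts in the buffer, plus a
        # final end offset, giving five entries in total.
        offsets, pos = [], 0
        for width in (1, 2, 3, 4):
            offsets.append(pos)
            pos += sum(len(b) for b in encoded if len(b) == width)
        offsets.append(pos)
        return concatenated, bytes(offsets)

    print(get_search_char_byte_arrays_sketch("eéaá", True))
    # (b'ae\xc3\xa1\xc3\xa9', b'\x00\x02\x06\x06\x06')

Whatever the exact layout, collapsing four per-width buffers into a single buffer plus an offsets table shrinks the model's attrs surface and matches the shortened ps_/ss_ key names introduced above.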
@@ -72,36 +61,28 @@ def forward(
 ) -> Tuple[List[Ints2d], Callable]:
     ops = model.ops
     case_sensitive: bool = model.attrs["case_sensitive"]
-    pref_lengths: bytes = model.attrs["pref_lengths"]
-    suff_lengths: bytes = model.attrs["suff_lengths"]
-    ps_1byte_ch: bytes = model.attrs["pref_search_1_byte"]
-    ps_2byte_ch: bytes = model.attrs["pref_search_2_bytes"]
-    ps_3byte_ch: bytes = model.attrs["pref_search_3_bytes"]
-    ps_4byte_ch: bytes = model.attrs["pref_search_4_bytes"]
-    pref_search_lengths: bytes = model.attrs["pref_search_lengths"]
-    ss_1byte_ch: bytes = model.attrs["pref_search_1_byte"]
-    ss_2byte_ch: bytes = model.attrs["pref_search_2_bytes"]
-    ss_3byte_ch: bytes = model.attrs["pref_search_3_bytes"]
-    ss_4byte_ch: bytes = model.attrs["pref_search_4_bytes"]
-    suff_search_lengths: bytes = model.attrs["suff_search_lengths"]
+    p_lengths: bytes = model.attrs["p_lengths"]
+    s_lengths: bytes = model.attrs["s_lengths"]
+    ps_search_chars: bytes = model.attrs["ps_search_chars"]
+    ps_width_offsets: bytes = model.attrs["ps_width_offsets"]
+    ps_lengths: bytes = model.attrs["ps_lengths"]
+    ss_search_chars: bytes = model.attrs["ss_search_chars"]
+    ss_width_offsets: bytes = model.attrs["ss_width_offsets"]
+    ss_lengths: bytes = model.attrs["ss_lengths"]
     features: List[Ints2d] = []
     for doc in docs:
         hashes = doc.get_character_combination_hashes(
             cs=case_sensitive,
-            p_lengths=pref_lengths,
-            s_lengths=suff_lengths,
-            ps_1byte_ch=ps_1byte_ch,
-            ps_2byte_ch=ps_2byte_ch,
-            ps_3byte_ch=ps_3byte_ch,
-            ps_4byte_ch=ps_4byte_ch,
-            ps_lengths=pref_search_lengths,
-            ss_1byte_ch=ss_1byte_ch,
-            ss_2byte_ch=ss_2byte_ch,
-            ss_3byte_ch=ss_3byte_ch,
-            ss_4byte_ch=ss_4byte_ch,
-            ss_lengths=suff_search_lengths,
+            p_lengths=p_lengths,
+            s_lengths=s_lengths,
+            ps_search_chars=ps_search_chars,
+            ps_width_offsets=ps_width_offsets,
+            ps_lengths=ps_lengths,
+            ss_search_chars=ss_search_chars,
+            ss_width_offsets=ss_width_offsets,
+            ss_lengths=ss_lengths,
         )
-        features.append(ops.asarray2i(hashes))
+        features.append(ops.asarray2i(hashes, dtype="uint64"))
 
     backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
     return features, backprop
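
For orientation, a usage sketch of the refactored layer. The constructor parameters come from this diff; the import path is an assumption, and the snippet only runs on this branch, where Doc.get_character_combination_hashes exists:

    import spacy
    from spacy.ml.richfeatureextractor import RichFeatureExtractor  # assumed path

    nlp = spacy.blank("en")
    docs = [nlp("small example text")]

    # Hash 2- and 3-character prefixes/suffixes, plus the first one or two
    # vowels found when scanning each token.
    extractor = RichFeatureExtractor(
        case_sensitive=False,
        pref_lengths=[2, 3],
        suff_lengths=[2, 3],
        pref_search_chars="aeiou",
        pref_search_lengths=[1, 2],
    )
    features, backprop = extractor(docs, is_train=False)
    print(features[0].shape)  # (n_tokens, hashes_per_token)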

View File

@@ -73,7 +73,7 @@ cdef int _write_hashes(
     const unsigned char* aff_l_buf,
     const unsigned char* offset_buf,
     const int res_buf_last,
-    np.uint32_t* hashes_ptr,
+    np.uint64_t* hashes_ptr,
 ) nogil

View File

@@ -1803,15 +1803,15 @@ cdef class Doc:
         cdef unsigned char* ss_res_buf = <unsigned char*> mem.alloc(ss_max_l, 4)
         cdef unsigned char* ss_l_buf = <unsigned char*> mem.alloc(ss_max_l, 1)
         cdef int doc_l = self.length, total_hashes = doc_l * hashes_per_tok
-        cdef np.uint32_t* hashes_ptr = <np.uint32_t*> mem.alloc(
-            total_hashes, sizeof(np.uint32_t))
+        cdef np.uint64_t* hashes_ptr = <np.uint64_t*> mem.alloc(
+            total_hashes, sizeof(np.uint64_t))
 
         # Define working variables
         cdef TokenC tok_c
         cdef int tok_i, tok_str_l
         cdef attr_t num_tok_attr
         cdef const unsigned char* tok_str
-        cdef np.uint32_t* w_hashes_ptr = hashes_ptr
+        cdef np.uint64_t* w_hashes_ptr = hashes_ptr
 
         for tok_i in range(doc_l):
             tok_c = self.c[tok_i]
@@ -1837,9 +1837,9 @@ cdef class Doc:
                 ss_max_l, True, ss_res_buf, ss_l_buf)
             w_hashes_ptr += _write_hashes(ss_res_buf, ss_lengths, ss_l_buf, 0, w_hashes_ptr)
 
-        cdef np.ndarray[np.uint32_t, ndim=2] hashes = numpy.empty(
-            (doc_l, hashes_per_tok), dtype="uint32")
-        memcpy(hashes.data, hashes_ptr, total_hashes * sizeof(np.uint32_t))
+        cdef np.ndarray[np.uint64_t, ndim=2] hashes = numpy.empty(
+            (doc_l, hashes_per_tok), dtype="uint64")
+        memcpy(hashes.data, hashes_ptr, total_hashes * sizeof(np.uint64_t))
 
         return hashes
@@ -2173,7 +2173,7 @@ cdef int _write_hashes(
     const unsigned char* aff_l_buf,
     const unsigned char* offset_buf,
     const int res_buf_last,
-    np.uint32_t* hashes_ptr,
+    np.uint64_t* hashes_ptr,
 ) nogil:
     """ Write FNV1A hashes for a token/rich property group combination.