Changes after review discussion — intermediate state

richardpaulhudson 2022-10-27 18:03:25 +02:00
parent 7d8258bec8
commit a1b8697aab
10 changed files with 294 additions and 308 deletions

View File

@@ -214,6 +214,8 @@ class Warnings(metaclass=ErrorsWithCodes):
"is a Cython extension type.")
W123 = ("Argument {arg} with value {arg_value} is used instead of {config_value} as specified in the config. Be "
"aware that this might affect other components in your pipeline.")
W124 = ("Using the features PREFIX and/or SUFFIX in a RichFeatureExtractor configuration may lead to the same "
"information being fed forward twice if prefixes and suffixes of corresponding lengths are specified.")
class Errors(metaclass=ErrorsWithCodes):
@@ -953,6 +955,8 @@ class Errors(metaclass=ErrorsWithCodes):
E1046 = ("{cls_name} is an abstract class and cannot be instantiated. If you are looking for spaCy's default "
"knowledge base, use `InMemoryLookupKB`.")
E1047 = ("Invalid rich group config '{label}'.")
E1048 = ("Length > 63 in rich group config '{label}'.")
E1049 = ("Error splitting UTF-8 byte string into separate characters.")
# Deprecated model shortcuts, only used in errors and warnings

View File

@@ -1,5 +1,6 @@
from typing import Optional, List, Union, cast
import warnings
from spacy.ml.richfeatureextractor import RichFeatureExtractor
from thinc.types import Floats2d, Ints2d, Ragged
from thinc.api import chain, clone, concatenate, with_array, with_padded
@@ -8,7 +9,7 @@ from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
from ...tokens import Doc
from ...util import registry
from ...errors import Errors
from ...errors import Errors, Warnings
from ...ml import _character_embed
from ..staticvectors import StaticVectors
from ..featureextractor import FeatureExtractor
@@ -207,6 +208,8 @@ def _verify_rich_config_group(
raise ValueError(Errors.E1047.format(label=label))
elif search_chars is not None:
raise ValueError(Errors.E1047.format(label=label))
if lengths is not None and max(lengths) > 63:
raise ValueError(Errors.E1048.format(label=label))
@registry.architectures("spacy.RichMultiHashEmbed.v1")
@@ -246,13 +249,13 @@ def RichMultiHashEmbed(
depending on the presence of some other letter before or after it, e.g. German
plural nouns where the final two vowels are `ä-e` regularly correspond to
singular lemmas where the `e` is no longer present and the `ä` has become `a`.
For most languages used with spaCy, searching is likely to be useful starting
at the end (`suff_*`), but the ability to search from the beginning (`pref_*`)
is also offered for completeness. Search characters should consist of all
characters that regularly alternate with other characters in the language in
question or whose presence before or after characters that would otherwise
alternate prevents the alternation from occurring, e.g. an `ä` in a German
plural noun does not become `a` if it is the third or fourth vowel from the
end of the word.
width (int): The output width. Also used as the width of the embedding tables.
@@ -263,27 +266,27 @@ def RichMultiHashEmbed(
same length as attrs.
include_static_vectors (bool): Whether to also use static word vectors.
Requires a vectors table to be loaded in the Doc objects' vocab.
case_sensitive (bool): Whether lower-case and upper-case letters should be
distinguished when generating the character combinations to use as features.
pref_lengths (Optional[List[int]]): The lengths of prefixes to use as features
for each word, e.g. for the word `spaCy`:
`[1, 3]` would lead to `s` and `spa` being used as features.
pref_rows (Optional[List[int]]): The number of rows for each of `pref_lengths`.
suff_lengths (Optional[List[int]]): The lengths of suffixes to use as features
for each word, e.g. for the word `spaCy`:
`[1, 3]` would lead to `y` and `aCy` being used as features.
suff_rows (Optional[List[int]]): The number of rows for each of `suff_lengths`.
pref_search_chars (Optional[str]): A string containing characters to search for
starting from the beginning of each word.
pref_search_lengths (Optional[List[int]]): The lengths of search result strings
to use as features, where the searches start from the beginning of each word.
pref_search_rows (Optional[List[int]]): The number of rows for each of
`pref_search_lengths`.
suff_search_chars (Optional[str]): A string containing characters to search for
starting from the end of each word.
suff_search_lengths (Optional[List[int]]): The lengths of search result strings
to use as features, where the searches start from the end of each word.
suff_search_rows (Optional[List[int]]): The number of rows for each of
`suff_search_lengths`.
"""
@@ -313,6 +316,9 @@ def RichMultiHashEmbed(
case_sensitive,
)
if "PREFIX" in attrs or "SUFFIX" in attrs:
warnings.warn(Warnings.W124)
if pref_rows is not None:
rows.extend(pref_rows)
if suff_rows is not None:

View File

@@ -1,5 +1,7 @@
from typing import List, Optional, Callable, Tuple
from ..util import get_arrays_for_search_chars
from spacy.util import get_search_char_byte_arrays
# from ..util import get_arrays_for_search_chars
from thinc.types import Ints1d, Ints2d
from thinc.api import Model, registry, get_current_ops
@@ -19,17 +21,23 @@ def RichFeatureExtractor(
) -> Model[List[Doc], List[Ints2d]]:
ops = get_current_ops()
if pref_search_chars is not None:
pref_search, pref_lookup = get_arrays_for_search_chars(
pref_search_chars, case_sensitive
)
(
ps_1byte_ch,
ps_2byte_ch,
ps_3byte_ch,
ps_4byte_ch,
) = get_search_char_byte_arrays(pref_search_chars, case_sensitive)
else:
pref_search, pref_lookup = bytes(), bytes()
ps_1byte_ch = ps_2byte_ch = ps_3byte_ch = ps_4byte_ch = bytes()
if suff_search_chars is not None:
suff_search, suff_lookup = get_arrays_for_search_chars(
suff_search_chars, case_sensitive
)
(
ss_1byte_ch,
ss_2byte_ch,
ss_3byte_ch,
ss_4byte_ch,
) = get_search_char_byte_arrays(suff_search_chars, case_sensitive)
else:
suff_search, suff_lookup = bytes(), bytes()
ss_1byte_ch = ss_2byte_ch = ss_3byte_ch = ss_4byte_ch = bytes()
return Model(
"extract_character_combination_hashes",
forward,
@@ -41,19 +49,17 @@ def RichFeatureExtractor(
"suff_lengths": ops.asarray1i(suff_lengths)
if suff_lengths is not None
else ops.asarray1i([]),
"pref_search": pref_search,
"pref_lookup": pref_lookup,
"pref_search_char_len": len(pref_search) / 4
if pref_search_chars is not None
else 0,
"pref_search_1_byte": ps_1byte_ch,
"pref_search_2_bytes": ps_2byte_ch,
"pref_search_3_bytes": ps_3byte_ch,
"pref_search_4_bytes": ps_4byte_ch,
"pref_search_lengths": ops.asarray1i(pref_search_lengths)
if pref_search_lengths is not None
else ops.asarray1i([]),
"suff_search": suff_search,
"suff_lookup": suff_lookup,
"suff_search_char_len": len(suff_search) / 4
if suff_search_chars is not None
else 0,
"suff_search_1_byte": ss_1byte_ch,
"suff_search_2_bytes": ss_2byte_ch,
"suff_search_3_bytes": ss_3byte_ch,
"suff_search_4_bytes": ss_4byte_ch,
"suff_search_lengths": ops.asarray1i(suff_search_lengths)
if suff_search_lengths is not None
else ops.asarray1i([]),
@@ -68,13 +74,15 @@ def forward(
case_sensitive: bool = model.attrs["case_sensitive"]
pref_lengths: Ints1d = model.attrs["pref_lengths"]
suff_lengths: Ints1d = model.attrs["suff_lengths"]
pref_search: bytes = model.attrs["pref_search"]
pref_lookup: bytes = model.attrs["pref_lookup"]
pref_search_char_len: int = model.attrs["pref_search_char_len"]
ps_1byte_ch: bytes = model.attrs["pref_search_1_byte"]
ps_2byte_ch: bytes = model.attrs["pref_search_2_bytes"]
ps_3byte_ch: bytes = model.attrs["pref_search_3_bytes"]
ps_4byte_ch: bytes = model.attrs["pref_search_4_bytes"]
pref_search_lengths: Ints1d = model.attrs["pref_search_lengths"]
suff_search: bytes = model.attrs["suff_search"]
suff_lookup: bytes = model.attrs["suff_lookup"]
suff_search_char_len: int = model.attrs["suff_search_char_len"]
ss_1byte_ch: bytes = model.attrs["suff_search_1_byte"]
ss_2byte_ch: bytes = model.attrs["suff_search_2_bytes"]
ss_3byte_ch: bytes = model.attrs["suff_search_3_bytes"]
ss_4byte_ch: bytes = model.attrs["suff_search_4_bytes"]
suff_search_lengths: Ints1d = model.attrs["suff_search_lengths"]
features: List[Ints2d] = []
for doc in docs:
@@ -82,13 +90,15 @@ def forward(
cs=case_sensitive,
p_lengths=pref_lengths,
s_lengths=suff_lengths,
ps_search=pref_search,
ps_lookup=pref_lookup,
ps_l=pref_search_char_len,
ps_1byte_ch=ps_1byte_ch,
ps_2byte_ch=ps_2byte_ch,
ps_3byte_ch=ps_3byte_ch,
ps_4byte_ch=ps_4byte_ch,
ps_lengths=pref_search_lengths,
ss_search=suff_search,
ss_lookup=suff_lookup,
ss_l=suff_search_char_len,
ss_1byte_ch=ss_1byte_ch,
ss_2byte_ch=ss_2byte_ch,
ss_3byte_ch=ss_3byte_ch,
ss_4byte_ch=ss_4byte_ch,
ss_lengths=suff_search_lengths,
)
features.append(ops.asarray2i(hashes))
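
For each doc, the returned array has one row per token and one column per configured hash, with the columns ordered as prefix hashes, suffix hashes, prefix-search hashes, then suffix-search hashes. A quick shape check as a sketch, with hypothetical configuration values:

import numpy

pref_lengths = [1, 3]         # 2 prefix columns
suff_lengths = [2, 3, 4]      # 3 suffix columns
pref_search_lengths = []      # 0 prefix-search columns
suff_search_lengths = [1, 2]  # 2 suffix-search columns

n_columns = (len(pref_lengths) + len(suff_lengths)
             + len(pref_search_lengths) + len(suff_search_lengths))
hashes = numpy.empty((5, n_columns), dtype="int64")  # 5 tokens in the doc
assert hashes.shape == (5, 7)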

View File

@@ -27,3 +27,4 @@ cdef class StringStore:
cdef const Utf8Str* intern_unicode(self, str py_string)
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
cdef const unsigned char[:] utf8_view(self, attr_t hash_val)

View File

@@ -315,3 +315,25 @@ cdef class StringStore:
self._map.set(key, value)
self.keys.push_back(key)
return value
cdef const unsigned char[:] utf8_view(self, attr_t hash_val):
if hash_val == 0:
return b""
elif hash_val < len(SYMBOLS_BY_INT):
return SYMBOLS_BY_INT[hash_val].encode("utf-8")
cdef Utf8Str* string = <Utf8Str*>self._map.get(hash_val)
cdef int i, length
if string.s[0] < sizeof(string.s) and string.s[0] != 0:
return string.s[1:string.s[0]+1]
elif string.p[0] < 255:
return string.p[1:string.p[0]+1]
else:
i = 0
length = 0
while string.p[i] == 255:
i += 1
length += 255
length += string.p[i]
i += 1
return string.p[i:length + i]
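
The three branches mirror the Utf8Str length encoding: short strings keep their length in the first byte of the inline *s* buffer, medium strings in the first byte behind the *p* pointer, and lengths of 255 or more are spelled as a run of 0xFF bytes followed by a remainder byte. A pure-Python sketch of that last decoding step (illustrative only, not spaCy API):

from typing import Tuple

def decode_utf8str_length(buf: bytes) -> Tuple[int, int]:
    """Return (payload_offset, payload_length) for a length-prefixed buffer
    in which lengths of 255 or more are encoded as a run of 0xFF bytes
    followed by a remainder byte."""
    i = 0
    length = 0
    while buf[i] == 255:  # each 0xFF byte contributes 255 to the total length
        i += 1
        length += 255
    length += buf[i]      # the final prefix byte holds the remainder
    i += 1
    return i, length

offset, length = decode_utf8str_length(bytes([255, 255, 10]) + b"x" * 520)
assert (offset, length) == (3, 520)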

View File

@@ -1,55 +1,57 @@
import spacy
import pytest
def test_get_arrays_for_search_chars_width_2_not_case_sensitive():
(
search,
lookup,
) = spacy.util.get_arrays_for_search_chars("bféwfw", False)
assert (
lookup
== b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
)
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_get_search_char_byte_arrays_1_width_only(case_sensitive):
sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("zzaaEP", case_sensitive)
if case_sensitive:
assert sc1 == b"EPaz"
else:
assert sc1 == b"aepz"
assert sc2 == b""
assert sc3 == b""
assert sc4 == b""
assert (
search
== b"B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
)
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_get_search_char_byte_arrays_4_width_only(case_sensitive):
sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("𐌞", case_sensitive)
assert sc1 == b""
assert sc2 == b""
assert sc3 == b""
assert sc4 == "𐌞".encode("utf-8")
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_get_search_char_byte_arrays_all_widths(case_sensitive):
sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("𐌞Éabé—B𐌞", case_sensitive)
if case_sensitive:
assert sc1 == b"Bab"
assert sc2 == "Éé".encode("utf-8")
else:
assert sc1 == b"ab"
assert sc2 == "é".encode("utf-8")
assert sc3 == "".encode("utf-8")
assert sc4 == "𐌞".encode("utf-8")
def test_get_arrays_for_search_chars_width_2_case_sensitive():
(
search,
lookup,
) = spacy.util.get_arrays_for_search_chars("bféwfw", True)
assert (
lookup == search == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00"
)
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_turkish_i_with_dot(case_sensitive):
sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("İ", case_sensitive)
if case_sensitive:
assert sc2 == "İ".encode("utf-8")
assert sc1 == sc3 == sc4 == b""
else:
assert sc1 == b"i"
assert sc2 == b"\xcc\x87"
assert sc3 == sc4 == b""
def test_get_arrays_for_search_chars_width_4_not_case_sensitive():
(
search,
lookup,
) = spacy.util.get_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", False)
assert (
search
== b"\x1e\x03\x01\x00B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
)
assert (
lookup
== b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
)
def test_get_arrays_for_search_chars_width_4_case_sensitive():
(
search,
lookup,
) = spacy.util.get_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", True)
assert search == lookup
assert (
lookup
== b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
)
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_turkish_i_with_dot_and_normal_i(case_sensitive):
sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("İI", case_sensitive)
if case_sensitive:
assert sc1 == b"I"
assert sc2 == "İ".encode("utf-8")
assert sc3 == sc4 == b""
else:
assert sc1 == b"i"
assert sc2 == b"\xcc\x87"
assert sc3 == sc4 == b""

View File

@@ -38,34 +38,22 @@ cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
cdef int [:,:] _get_lca_matrix(Doc, int start, int end)
cdef void _copy_chars(
Py_UCS4* target,
const Py_UCS4* source,
const int length,
const bint to_lower
)
cdef void _set_affixes(
const Py_UCS4* text_buf,
const int tok_idx,
const int tok_len,
Py_UCS4* aff_buf,
cdef void _set_affix_lengths(
const unsigned char[:] text_buf,
char* aff_len_buf,
const int pref_len,
const int suff_len,
const bint to_lower
)
) nogil
cdef void _search_for_chars(
const Py_UCS4* text_buf,
const int tok_idx,
const int tok_len,
Py_UCS4* search_buf,
Py_UCS4* lookup_buf,
const int search_buf_len,
Py_UCS4* result_buf,
const int result_buf_len,
cdef bint _search_for_chars(
const unsigned char[:] tok_str,
const unsigned char[:] s_1byte_ch,
const unsigned char[:] s_2byte_ch,
const unsigned char[:] s_3byte_ch,
const unsigned char[:] s_4byte_ch,
unsigned char* res_buf,
unsigned char* len_buf,
bint suffs_not_prefs
) nogil

View File

@@ -126,7 +126,7 @@ class Doc:
blocked: Optional[List[Span]] = ...,
missing: Optional[List[Span]] = ...,
outside: Optional[List[Span]] = ...,
default: str = ...
default: str = ...,
) -> None: ...
@property
def noun_chunks(self) -> Iterator[Span]: ...
@@ -178,16 +178,18 @@ class Doc:
self,
*,
cs: bool,
pref_lengths: Ints1d,
suff_lengths: Ints1d,
pref_search_chars: str,
pref_lookup_chars: str,
pref_search_char_length: int,
pref_search_lengths: Ints1d,
suff_search_chars: str,
suff_lookup_chars: str,
suff_search_char_length: int,
suff_search_lengths: Ints1d,
p_lengths: Ints1d,
s_lengths: Ints1d,
ps_1byte_ch: bytes,
ps_2byte_ch: bytes,
ps_3byte_ch: bytes,
ps_4byte_ch: bytes,
ps_lengths: Ints1d,
ss_1byte_ch: bytes,
ss_2byte_ch: bytes,
ss_3byte_ch: bytes,
ss_4byte_ch: bytes,
ss_lengths: Ints1d,
) -> Ints2d: ...
@staticmethod
def _get_array_attrs() -> Tuple[Any]: ...

View File

@@ -1736,18 +1736,20 @@ cdef class Doc:
return output
def get_character_combination_hashes(self,
def np.ndarray get_character_combination_hashes(self,
*,
const bint cs,
np.ndarray p_lengths,
np.ndarray s_lengths,
const char* ps_search,
const char* ps_lookup,
const int ps_l,
const unsigned char[:] ps_1byte_ch,
const unsigned char[:] ps_2byte_ch,
const unsigned char[:] ps_3byte_ch,
const unsigned char[:] ps_4byte_ch,
np.ndarray ps_lengths,
const char* ss_search,
const char* ss_lookup,
const int ss_l,
const unsigned char[:] ss_1byte_ch,
const unsigned char[:] ss_2byte_ch,
const unsigned char[:] ss_3byte_ch,
const unsigned char[:] ss_4byte_ch,
np.ndarray ss_lengths,
):
"""
@@ -1766,44 +1768,26 @@ cdef class Doc:
the prefixes hashed for "spaCy" would be "sp" and "spa".
s_lengths: an Ints1d specifying the lengths of suffixes to be hashed. For example, if *s_lengths==[2, 3]* and
*cs == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy".
ps_search: a byte array containing characters to search for within each token, starting at the beginning.
ps_lookup: a byte array containing characters that are added to the result string when a character at
the corresponding position in *ps_search* is matched. Having separate search and lookup arrays enables
case-insensitivity to be handled efficiently.
ps_search_l: the number of characters in *ps_search* and hence also in *ps_lookup*
ps_<n>byte_ch: a byte array containing, in order, the n-byte-wide characters to search for within each token,
starting at the beginning.
ps_lengths: an Ints1d specifying the lengths of search results (from the beginning) to be hashed. For example, if
*ps_lengths==[1, 2]*, *ps_1byte_ch==b"ac"* and *cs==False*, the searched strings hashed for
"spaCy" would be "a" and "ac".
ss_search: a byte array containing characters to search for within each token, starting at the end.
ss_lookup: a byte array containing characters that are added to the result string when a character at
the corresponding position in *ss_search* is matched. Having separate search and lookup arrays enables
case-insensitivity to be handled efficiently.
ss_l: the number of characters in *ss_search* and hence also in *ss_lookup*
ss_<n>byte_ch: a byte array containing, in order, the n-byte-wide characters to search for within each token,
starting at the end.
ss_lengths: an Ints1d specifying the lengths of search results (from the end) to be hashed. For example, if
*ss_lengths==[1, 2]*, *ss_1byte_ch==b"ac"* and *cs==False*, the searched strings hashed for
"spaCy" would be "c" and "ca".
For a document with tokens ["spaCy", "and", "Prodigy"], the NumPy array returned by
*get_character_combination_hashes(cs=True, p_lengths=[2], s_lengths=[2, 4, 6], ps_1byte_ch=b"", ps_2byte_ch=b"",
ps_3byte_ch=b"", ps_4byte_ch=b"", ps_lengths=[], ss_1byte_ch=b"Cy", ss_2byte_ch=b"", ss_3byte_ch=b"",
ss_4byte_ch=b"", ss_lengths=[1, 2])*
would correspond to
[[hash("sp"), hash("Cy"), hash("paCy"), hash(" spaCy"), hash("y"), hash("yC")],
[hash("an"), hash("nd"), hash(" and"), hash("   and"), hash(" "), hash("  ")],
[hash("Pr"), hash("gy"), hash("digy"), hash("rodigy"), hash("y"), hash("y ")]]
"""
# Encode the document text
cdef bytes encoded_text = self.text.encode("utf-32le")
cdef char* intermediate_text = encoded_text
cdef Py_UCS4* text_buf = <Py_UCS4*> intermediate_text
# Define the result array and work out what is used for what in axis 1
cdef int num_toks = len(self)
cdef int p_h_num = p_lengths.shape[0]
cdef int s_h_num = s_lengths.shape[0], s_h_end = p_h_num + s_h_num
cdef int ps_h_num = ps_lengths.shape[0], ps_h_end = s_h_end + ps_h_num
cdef int ss_h_num = ss_lengths.shape[0], ss_h_end = ps_h_end + ss_h_num
cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, ss_h_end), dtype="int64")
cdef np.ndarray[np.int64_t, ndim=2] hashes
hashes = numpy.empty((num_toks, ss_h_end), dtype="int64")
# Determine the maximum possible lengths of the affixes to work out how big the buffers need to be
cdef int p_max_l = max(p_lengths) if p_h_num > 0 else 0
@@ -1811,15 +1795,13 @@ cdef class Doc:
cdef int ps_max_l = max(ps_lengths) if ps_h_num > 0 else 0
cdef int ss_max_l = max(ss_lengths) if ss_h_num > 0 else 0
# Define / allocate buffer (pr/sr: result buffers)
cdef int aff_buf_l = p_max_l + s_max_l
cdef Py_UCS4* aff_buf = <Py_UCS4*> self.mem.alloc(aff_buf_l, sizeof(Py_UCS4))
cdef Py_UCS4* ps_s_buf = <Py_UCS4*> ps_search
cdef Py_UCS4* ps_l_buf = <Py_UCS4*> ps_lookup
cdef Py_UCS4* ps_r_buf = <Py_UCS4*> self.mem.alloc(ps_max_l, sizeof(Py_UCS4))
cdef Py_UCS4* ss_s_buf = <Py_UCS4*> ss_search
cdef Py_UCS4* ss_l_buf = <Py_UCS4*> ss_lookup
cdef Py_UCS4* ss_r_buf = <Py_UCS4*> self.mem.alloc(ss_max_l, sizeof(Py_UCS4))
# Define / allocate buffers
cdef int aff_l = p_max_l + s_max_l
cdef char* aff_len_buf = self.mem.alloc(aff_l, 1)
cdef char* ps_res_buf = self.mem.alloc(ps_max_l, 4)
cdef char* ps_len_buf = self.mem.alloc(ps_max_l, 1)
cdef char* ss_res_buf = self.mem.alloc(ss_max_l, 4)
cdef char* ss_len_buf = self.mem.alloc(ss_max_l, 1)
# Define memory views on length arrays
cdef int[:] p_lengths_v = p_lengths
@@ -1829,38 +1811,51 @@ cdef class Doc:
# Define working variables
cdef TokenC tok_c
cdef int tok_i, tok_idx, tok_len, aff_len
cdef int tok_i, offset
cdef uint64_t hash_val
cdef attr_t num_tok_attr
cdef const unsigned char[:] tok_str
for tok_i in range(num_toks):
tok_c = self.c[tok_i]
tok_idx = tok_c.idx
tok_len = tok_c.lex.length
num_tok_attr = tok_c.lex.orth if cs else tok_c.lex.lower
tok_str = self.vocab.strings.utf8_view(num_tok_attr)
if aff_buf_l > 0:
_set_affixes(text_buf, tok_idx, tok_len, aff_buf, p_max_l, s_max_l, not cs)
if aff_l > 0:
_set_affix_lengths(tok_str, aff_len_buf, p_max_l, s_max_l)
for hash_idx in range(p_h_num):
hashes[tok_i, hash_idx] = hash32(aff_buf, p_lengths_v[hash_idx] * sizeof(Py_UCS4), 0)
offset = aff_len_buf[p_lengths_v[hash_idx]]
if offset > 0:
hash_val = hash32(<void*> &tok_str[0], offset, 0)
hashes[tok_i, hash_idx] = hash_val
for hash_idx in range(p_h_num, s_h_end):
aff_len = s_lengths_v[hash_idx - p_h_num]
hashes[tok_i, hash_idx] = hash32(aff_buf + aff_buf_l - aff_len, aff_len * sizeof(Py_UCS4), 0)
offset = s_lengths_v[hash_idx - p_h_num]
if offset > 0:
hash_val = hash32(<void*> &tok_str[len(tok_str) - offset], offset, 0)
hashes[tok_i, hash_idx] = hash_val
if ps_h_num > 0:
_search_for_chars(text_buf, tok_idx, tok_len, ps_s_buf, ps_l_buf, ps_l, ps_r_buf, ps_max_l, False)
if (
ps_h_num > 0 and
_search_for_chars(tok_str, ps_1byte_ch, ps_2byte_ch, ps_3byte_ch, ps_4byte_ch, ps_res_buf, ps_len_buf, False)
):
for hash_idx in range(s_h_end, ps_h_end):
aff_len = ps_lengths_v[hash_idx - s_h_end]
hashes[tok_i, hash_idx] = hash32(ps_r_buf, aff_len * sizeof(Py_UCS4), 0)
if ss_h_num > 0:
_search_for_chars(text_buf, tok_idx, tok_len, ss_s_buf, ss_l_buf, ss_l, ss_r_buf, ss_max_l, True)
if (
ss_h_num > 0 and
_search_for_chars(tok_str, ss_1byte_ch, ss_2byte_ch, ss_3byte_ch, ss_4byte_ch, ss_res_buf, ss_len_buf, True)
):
for hash_idx in range(ps_h_end, ss_h_end):
aff_len = ss_lengths_v[hash_idx - ps_h_end]
hashes[tok_i, hash_idx] = hash32(ss_r_buf, aff_len * sizeof(Py_UCS4), 0)
self.mem.free(aff_buf)
self.mem.free(ps_r_buf)
self.mem.free(ss_r_buf)
self.mem.free(aff_len_buf)
self.mem.free(ps_res_buf)
self.mem.free(ps_len_buf)
self.mem.free(ss_res_buf)
self.mem.free(ss_len_buf)
return hashes
@staticmethod
@@ -2044,34 +2039,13 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
return lca_matrix
cdef void _copy_chars(
Py_UCS4* target,
const Py_UCS4* source,
const int length,
const bint to_lower
):
"""Copies *length* Py_UCS4 characters from *source* to *target*. If *to_lower==True*, converts
any upper-case characters to lower case within the target buffer.
"""
cdef int idx
memcpy(target, source, length * sizeof(Py_UCS4))
if to_lower:
for idx in range(length):
if Py_UNICODE_ISUPPER(target[idx]):
target[idx] = Py_UNICODE_TOLOWER(target[idx])
cdef void _set_affixes(
const Py_UCS4* text_buf,
const int tok_idx,
const int tok_len,
Py_UCS4* aff_buf,
cdef void _set_affix_lengths(
const unsigned char[:] text_buf,
char* aff_len_buf,
const int pref_len,
const int suff_len,
const bint to_lower
):
""" Populate a buffer of length pref+suff with the first pref and the last suff characters of a word within a string.
) nogil:
""" TODO : Populate a buffer of length pref+suff with the first pref and the last suff characters of a word within a string.
If the word is shorter than pref and/or suff, the empty character positions in the middle are filled with zeros.
text_buf: a pointer to a UTF-32LE representation of the containing string.
@@ -2082,41 +2056,41 @@ cdef void _set_affixes(
suff_len: the length of the suffix.
"""
cdef int aff_buf_idx = 0, aff_buf_len = pref_len + suff_len, in_word_idx, filled_pref_len
if pref_len > 0:
filled_pref_len = pref_len if pref_len < tok_len else tok_len
_copy_chars(aff_buf, text_buf + tok_idx, filled_pref_len, to_lower)
aff_buf_idx = filled_pref_len
cdef int text_buf_idx = 0, aff_len_buf_idx = 0, text_buf_len = len(text_buf)
if tok_len < pref_len:
memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (pref_len - tok_len))
aff_buf_idx = aff_buf_len - suff_len
while aff_len_buf_idx < pref_len:
if (text_buf[text_buf_idx] >> 6) ^ 2 != 0: # not a continuation character
aff_len_buf[aff_len_buf_idx] = text_buf_idx + 1
aff_len_buf_idx += 1
text_buf_idx += 1
if text_buf_idx == len(text_buf):
break
if tok_len < suff_len:
memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (suff_len - tok_len))
aff_buf_idx = aff_buf_len - tok_len
if aff_len_buf_idx < pref_len:
memset(aff_len_buf + aff_len_buf_idx, 0, pref_len - aff_len_buf_idx)
aff_len_buf_idx = pref_len
if suff_len > 0:
# in_word_idx: the index within the token where the suffix starts
in_word_idx = aff_buf_idx + tok_len - aff_buf_len
if in_word_idx < pref_len:
memcpy(aff_buf + aff_buf_idx, aff_buf + in_word_idx, sizeof(Py_UCS4) * (filled_pref_len - in_word_idx))
aff_buf_idx += filled_pref_len - in_word_idx
in_word_idx = aff_buf_idx + tok_len - aff_buf_len
if aff_buf_idx < aff_buf_len:
_copy_chars(aff_buf + aff_buf_idx, text_buf + tok_idx + in_word_idx, aff_buf_len - aff_buf_idx, to_lower)
text_buf_idx = 1
while aff_len_buf_idx < pref_len + suff_len:
if (text_buf[text_buf_len - text_buf_idx] >> 6) ^ 2 != 0: # not a continuation character
aff_len_buf[aff_len_buf_idx] = text_buf_len - text_buf_idx
aff_len_buf_idx += 1
text_buf_idx += 1
if text_buf_idx > text_buf_len:
break
if aff_len_buf_idx < pref_len + suff_len:
memset(aff_len_buf + aff_len_buf_idx, 0, pref_len + suff_len - aff_len_buf_idx)
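
The scans above rely on one UTF-8 invariant: every byte of a multi-byte character after the first matches the bit pattern 10xxxxxx, i.e. `byte & 0xC0 == 0x80`. A pure-Python sketch of the prefix pass (illustrative only; the zero-padding convention follows the docstring above):

def utf8_prefix_byte_lengths(token, pref_len):
    """Byte length of each 1..pref_len character prefix of *token*,
    zero-padded when the token has fewer than pref_len characters."""
    data = token.encode("utf-8")
    lengths = []
    end = 0
    while len(lengths) < pref_len and end < len(data):
        end += 1
        # A character is complete when the buffer ends or the next
        # byte is not a continuation byte (10xxxxxx).
        if end == len(data) or data[end] & 0xC0 != 0x80:
            lengths.append(end)
    return lengths + [0] * (pref_len - len(lengths))

assert utf8_prefix_byte_lengths("ämy", 4) == [2, 3, 4, 0]  # "ä" takes 2 bytes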
cdef void _search_for_chars(
const Py_UCS4* text_buf,
const int tok_idx,
const int tok_len,
Py_UCS4* search_buf,
Py_UCS4* lookup_buf,
const int search_buf_len,
Py_UCS4* result_buf,
const int result_buf_len,
cdef bint _search_for_chars(
const unsigned char[:] tok_str,
const unsigned char[:] s_1byte_ch,
const unsigned char[:] s_2byte_ch,
const unsigned char[:] s_3byte_ch,
const unsigned char[:] s_4byte_ch,
unsigned char* res_buf,
unsigned char* len_buf,
bint suffs_not_prefs
) nogil:
""" Search a word within a string for characters within *search_buf*, starting at the beginning or
@@ -2133,6 +2107,8 @@ cdef void _search_for_chars(
res_buf: the buffer in which to place the byte sequences of the matched characters.
len_buf: the buffer in which to record the byte lengths of the matched character sequences.
suffs_not_prefs: if *True*, searching starts from the end of the word; if *False*, from the beginning.
Returns *True* if at least one character from *search_buf* was found in the word, otherwise *False*.
"""
cdef int result_buf_idx = 0, text_string_idx = tok_idx + (tok_len - 1) if suffs_not_prefs else tok_idx
cdef int search_buf_idx
@@ -2158,6 +2134,8 @@ cdef void _search_for_chars(
# fill in any unused characters in the result buffer with zeros
if result_buf_idx < result_buf_len:
memset(result_buf + result_buf_idx, 0, (result_buf_len - result_buf_idx) * sizeof(Py_UCS4))
return result_buf_idx > 0
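
In pure Python the search boils down to walking the token's characters from one end, keeping those that belong to the configured search set, and reporting whether anything matched; a sketch (not the Cython implementation, which works on the split byte arrays):

def search_for_chars(token, search_chars, max_results, suffs_not_prefs):
    """Hypothetical sketch: collect up to max_results search characters,
    scanning from the end when suffs_not_prefs is True."""
    order = reversed(token) if suffs_not_prefs else iter(token)
    found = [ch for ch in order if ch in search_chars][:max_results]
    return "".join(found), bool(found)  # results, "anything matched?"

assert search_for_chars("spaCy", {"y", "C"}, 2, True) == ("yC", True)
assert search_for_chars("and", {"y", "C"}, 2, True) == ("", False)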
def pickle_doc(doc):

View File

@@ -1737,69 +1737,42 @@ def all_equal(iterable):
return next(g, True) and not next(g, False)
def get_arrays_for_search_chars(
def get_search_char_byte_arrays(
search_chars: str, case_sensitive: bool
) -> Tuple[bytes, bytes]:
) -> Tuple[bytes, bytes, bytes, bytes]:
"""
This function supports the rich feature extractor. It returns search byte arrays with
4-byte character width that are used for comparison when searching document texts
for search characters. The encoding is little-endian regardless of architecture, as
this is what is expected by the murmurhash library used downstream.
Alongside the "search array" against which words from document texts are compared
is the "lookup array". When a character from the search array is matched,
the character at the corresponding position in the lookup array is added to the
sequence that then goes on to be hashed. This enables case-sensitivity
to be handled without converting the case of the words being searched: if
*case_sensitive==False*, the lower- or uppercase counterparts of any characters that
have case are added to the search array, and both the original character and its
other-cased counterpart map to the lower-case version in the lookup array.
This function supports the rich feature extractor. It splits the UTF-8 representation
of *search_chars* into separate byte arrays containing 1-, 2-, 3-, and 4-byte
characters respectively. Any duplicates in *search_chars* are removed, and *search_chars*
is converted to lower case if *case_sensitive==False*.
"""
def encode(ch: str) -> bytes:
"""
ch: a single character
"""
return ch.encode("UTF-32LE")
def add_to_arrays(
search: List[bytes], lookup: List[bytes], ch: str
) -> None:
"""Add the byte representations of *ch* to the two byte array lists.
"""
this_char_bytes = encode(ch)
if not case_sensitive and ch.islower():
if this_char_bytes not in search:
search.append(this_char_bytes)
lookup.append(this_char_bytes)
upper_char_bytes = encode(ch.upper())
if upper_char_bytes not in search:
search.append(upper_char_bytes)
lookup.append(this_char_bytes)
elif not case_sensitive and ch.isupper():
lower_char_bytes = encode(ch.lower())
if this_char_bytes not in search:
search.append(this_char_bytes)
lookup.append(lower_char_bytes)
if lower_char_bytes not in search:
search.append(lower_char_bytes)
lookup.append(lower_char_bytes)
elif this_char_bytes not in search:
search.append(this_char_bytes)
lookup.append(this_char_bytes)
def get_ordered_raw_bytes(
search: List[bytes], lookup: List[bytes]
) -> Tuple[bytes, bytes]:
"""Flatten the two lists, ordering both by the entries in *search*.
"""
num_search = [list(entry) for entry in search]
search = [entry for _, entry in sorted(zip(num_search, search))]
lookup = [entry for _, entry in sorted(zip(num_search, lookup))]
return b"".join(search), b"".join(lookup)
search: List[bytes] = []
lookup: List[bytes] = []
for ch in search_chars:
add_to_arrays(search, lookup, ch)
return get_ordered_raw_bytes(search, lookup)
sc1 = bytearray()
sc2 = bytearray()
sc3 = bytearray()
sc4 = bytearray()
if not case_sensitive:
search_chars = search_chars.lower()
ordered_search_chars = "".join(sorted(set(search_chars)))
encoded_search_char_bytes = ordered_search_chars.encode("UTF-8")
working_start = 0
for idx in range(1, len(encoded_search_char_bytes) + 1):
if (
idx == len(encoded_search_char_bytes)
or encoded_search_char_bytes[idx] & 0xC0 != 0x80 # not continuation byte
):
char_length = idx - working_start
if char_length == 1:
sc1.extend(encoded_search_char_bytes[working_start:idx])
elif char_length == 2:
sc2.extend(encoded_search_char_bytes[working_start:idx])
elif char_length == 3:
sc3.extend(encoded_search_char_bytes[working_start:idx])
elif char_length == 4:
sc4.extend(encoded_search_char_bytes[working_start:idx])
else:
raise RuntimeError(Errors.E1049)
working_start = idx
return bytes(sc1), bytes(sc2), bytes(sc3), bytes(sc4)
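
A usage sketch, with the expected values taken from the new tests above:

import spacy

# 1-byte characters only: deduplicated, lower-cased, sorted
sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("zzaaEP", False)
assert sc1 == b"aepz"
assert sc2 == sc3 == sc4 == b""

# a 4-byte character lands in the fourth array
sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("𐌞", True)
assert sc4 == "𐌞".encode("utf-8")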