Changes after review discussion — intermediate state

richardpaulhudson 2022-10-27 18:03:25 +02:00
parent 7d8258bec8
commit a1b8697aab
10 changed files with 294 additions and 308 deletions

View File: spacy/errors.py

@@ -214,6 +214,8 @@ class Warnings(metaclass=ErrorsWithCodes):
             "is a Cython extension type.")
     W123 = ("Argument {arg} with value {arg_value} is used instead of {config_value} as specified in the config. Be "
             "aware that this might affect other components in your pipeline.")
+    W124 = ("Using the features PREFIX and/or SUFFIX in a RichFeatureExtractor configuration may lead to the same "
+            "information being fed forward twice if prefixes and suffixes of corresponding lengths are specified.")

 class Errors(metaclass=ErrorsWithCodes):
@@ -953,6 +955,8 @@ class Errors(metaclass=ErrorsWithCodes):
     E1046 = ("{cls_name} is an abstract class and cannot be instantiated. If you are looking for spaCy's default "
              "knowledge base, use `InMemoryLookupKB`.")
     E1047 = ("Invalid rich group config '{label}'.")
+    E1048 = ("Length > 63 in rich group config '{label}'.")
+    E1049 = ("Error splitting UTF-8 byte string into separate characters.")

     # Deprecated model shortcuts, only used in errors and warnings
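Both new codes follow the existing `ErrorsWithCodes` pattern: attribute access yields the message prefixed with its code, and `.format()` fills the placeholders. A minimal sketch of how a caller would raise E1048 (the 63-character cap plausibly keeps the byte length of any UTF-8 affix within a single length byte, since 63 × 4 = 252 < 256, though the commit does not state this rationale):

    from spacy.errors import Errors

    lengths = [2, 3, 64]
    if max(lengths) > 63:
        # Renders as "[E1048] Length > 63 in rich group config 'suff_search_lengths'."
        raise ValueError(Errors.E1048.format(label="suff_search_lengths"))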

View File: spacy/ml/models/tok2vec.py

@@ -1,5 +1,6 @@
 from encodings import search_function
 from typing import Optional, List, Union, cast
+import warnings
 from spacy.ml.richfeatureextractor import RichFeatureExtractor
 from thinc.types import Floats2d, Ints2d, Ragged
 from thinc.api import chain, clone, concatenate, with_array, with_padded
@@ -8,7 +9,7 @@ from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
 from ...tokens import Doc
 from ...util import registry
-from ...errors import Errors
+from ...errors import Errors, Warnings
 from ...ml import _character_embed
 from ..staticvectors import StaticVectors
 from ..featureextractor import FeatureExtractor
@@ -207,6 +208,8 @@ def _verify_rich_config_group(
         raise ValueError(Errors.E1047.format(label=label))
     elif search_chars is not None:
         raise ValueError(Errors.E1047.format(label=label))
+    if lengths is not None and max(lengths) > 63:
+        raise ValueError(Errors.E1048.format(label=label))


 @registry.architectures("spacy.RichMultiHashEmbed.v1")
@@ -246,13 +249,13 @@ def RichMultiHashEmbed(
     depending on the presence of some other letter before or after it, e.g. German
     plural nouns where the final two vowels are `ä-e` regularly correspond to
     singular lemmas where the `e` is no longer present and the `ä` has become `a`.
     For most languages used with spaCy, searching is likely to be useful starting
     at the end (`suff_*`), but the ability to search from the beginning (`pref_*`)
     is also offered for completeness. Search characters should consist of all
     characters that regularly alternate with other characters in the language in
     question or whose presence before or after characters that would otherwise
     alternate prevents the alternation from occurring, e.g. an `ä` in a German
     plural noun does not become `a` if it is the third or fourth vowel from the
     end of the word.

     width (int): The output width. Also used as the width of the embedding tables.
@@ -263,27 +266,27 @@ def RichMultiHashEmbed(
         same length as attrs.
     include_static_vectors (bool): Whether to also use static word vectors.
         Requires a vectors table to be loaded in the Doc objects' vocab.
     case_sensitive (bool): Whether lower-case and upper-case letters should be
         distinguished when generating the character combinations to use as features.
     pref_lengths (Optional[List[int]]): The lengths of prefixes to use as features
         for each word, e.g. for the word `spaCy`:
         `[1, 3]` would lead to `s` and `spa` being used as features.
     pref_rows (Optional[List[int]]): The number of rows for each of `pref_lengths`.
     suff_lengths (Optional[List[int]]): The lengths of suffixes to use as features
         for each word, e.g. for the word `spaCy`:
         `[1, 3]` would lead to `y` and `aCy` being used as features.
     suff_rows (Optional[List[int]]): The number of rows for each of `suff_lengths`.
     pref_search_chars (Optional[str]): A string containing characters to search for
         starting from the beginning of each word.
     pref_search_lengths (Optional[List[int]]): The lengths of search result strings
         to use as features, where the searches start from the beginning of each word.
     pref_search_rows (Optional[List[int]]): The number of rows for each of
         `pref_search_lengths`.
     suff_search_chars (Optional[str]): A string containing characters to search for
         starting from the end of each word.
     suff_search_lengths (Optional[List[int]]): The lengths of search result strings
         to use as features, where the searches start from the end of each word.
     suff_search_rows (Optional[List[int]]): The number of rows for each of
         `suff_search_lengths`.
     """
@@ -313,6 +316,9 @@ def RichMultiHashEmbed(
         case_sensitive,
     )

+    if "PREFIX" in attrs or "SUFFIX" in attrs:
+        warnings.warn(Warnings.W124)
+
     if pref_rows is not None:
         rows.extend(pref_rows)
     if suff_rows is not None:
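Together the two additions validate the architecture's inputs: affix lengths over 63 now raise E1048 in `_verify_rich_config_group`, and combining the classic PREFIX/SUFFIX attributes with explicit affix lengths emits W124. A hypothetical call that would trigger the warning (argument values are illustrative, not defaults):

    import warnings
    from spacy.ml.models.tok2vec import RichMultiHashEmbed

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        RichMultiHashEmbed(
            width=96,
            attrs=["NORM", "PREFIX"],  # PREFIX overlaps with pref_lengths below
            rows=[5000, 2500],
            include_static_vectors=False,
            case_sensitive=False,
            pref_lengths=[1, 3],  # may duplicate what PREFIX already feeds forward
            pref_rows=[1000, 1000],
        )
    assert any("W124" in str(w.message) for w in caught)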

View File: spacy/ml/richfeatureextractor.py

@@ -1,5 +1,7 @@
 from typing import List, Optional, Callable, Tuple
-from ..util import get_arrays_for_search_chars
+from spacy.util import get_search_char_byte_arrays
+# from ..util import get_arrays_for_search_chars
 from thinc.types import Ints1d, Ints2d
 from thinc.api import Model, registry, get_current_ops
@@ -19,17 +21,23 @@ def RichFeatureExtractor(
 ) -> Model[List[Doc], List[Ints2d]]:
     ops = get_current_ops()
     if pref_search_chars is not None:
-        pref_search, pref_lookup = get_arrays_for_search_chars(
-            pref_search_chars, case_sensitive
-        )
+        (
+            ps_1byte_ch,
+            ps_2byte_ch,
+            ps_3byte_ch,
+            ps_4byte_ch,
+        ) = get_search_char_byte_arrays(pref_search_chars, case_sensitive)
     else:
-        pref_search, pref_lookup = bytes(), bytes()
+        ps_1byte_ch = ps_2byte_ch = ps_3byte_ch = ps_4byte_ch = bytes()
     if suff_search_chars is not None:
-        suff_search, suff_lookup = get_arrays_for_search_chars(
-            suff_search_chars, case_sensitive
-        )
+        (
+            ss_1byte_ch,
+            ss_2byte_ch,
+            ss_3byte_ch,
+            ss_4byte_ch,
+        ) = get_search_char_byte_arrays(suff_search_chars, case_sensitive)
     else:
-        suff_search, suff_lookup = bytes(), bytes()
+        ss_1byte_ch = ss_2byte_ch = ss_3byte_ch = ss_4byte_ch = bytes()
     return Model(
         "extract_character_combination_hashes",
         forward,
@@ -41,19 +49,17 @@ def RichFeatureExtractor(
             "suff_lengths": ops.asarray1i(suff_lengths)
             if suff_lengths is not None
             else ops.asarray1i([]),
-            "pref_search": pref_search,
-            "pref_lookup": pref_lookup,
-            "pref_search_char_len": len(pref_search) / 4
-            if pref_search_chars is not None
-            else 0,
+            "pref_search_1_byte": ps_1byte_ch,
+            "pref_search_2_bytes": ps_2byte_ch,
+            "pref_search_3_bytes": ps_3byte_ch,
+            "pref_search_4_bytes": ps_4byte_ch,
             "pref_search_lengths": ops.asarray1i(pref_search_lengths)
             if pref_search_lengths is not None
             else ops.asarray1i([]),
-            "suff_search": suff_search,
-            "suff_lookup": suff_lookup,
-            "suff_search_char_len": len(suff_search) / 4
-            if suff_search_chars is not None
-            else 0,
+            "suff_search_1_byte": ss_1byte_ch,
+            "suff_search_2_bytes": ss_2byte_ch,
+            "suff_search_3_bytes": ss_3byte_ch,
+            "suff_search_4_bytes": ss_4byte_ch,
             "suff_search_lengths": ops.asarray1i(suff_search_lengths)
             if suff_search_lengths is not None
             else ops.asarray1i([]),
@@ -68,13 +74,15 @@ def forward(
     case_sensitive: bool = model.attrs["case_sensitive"]
     pref_lengths: Ints1d = model.attrs["pref_lengths"]
     suff_lengths: Ints1d = model.attrs["suff_lengths"]
-    pref_search: bytes = model.attrs["pref_search"]
-    pref_lookup: bytes = model.attrs["pref_lookup"]
-    pref_search_char_len: int = model.attrs["pref_search_char_len"]
+    ps_1byte_ch: bytes = model.attrs["pref_search_1_byte"]
+    ps_2byte_ch: bytes = model.attrs["pref_search_2_bytes"]
+    ps_3byte_ch: bytes = model.attrs["pref_search_3_bytes"]
+    ps_4byte_ch: bytes = model.attrs["pref_search_4_bytes"]
     pref_search_lengths: Ints1d = model.attrs["pref_search_lengths"]
-    suff_search: bytes = model.attrs["suff_search"]
-    suff_lookup: bytes = model.attrs["suff_lookup"]
-    suff_search_char_len: int = model.attrs["suff_search_char_len"]
+    ss_1byte_ch: bytes = model.attrs["suff_search_1_byte"]
+    ss_2byte_ch: bytes = model.attrs["suff_search_2_bytes"]
+    ss_3byte_ch: bytes = model.attrs["suff_search_3_bytes"]
+    ss_4byte_ch: bytes = model.attrs["suff_search_4_bytes"]
     suff_search_lengths: Ints1d = model.attrs["suff_search_lengths"]
     features: List[Ints2d] = []
     for doc in docs:
@@ -82,13 +90,15 @@ def forward(
             cs=case_sensitive,
             p_lengths=pref_lengths,
             s_lengths=suff_lengths,
-            ps_search=pref_search,
-            ps_lookup=pref_lookup,
-            ps_l=pref_search_char_len,
+            ps_1byte_ch=ps_1byte_ch,
+            ps_2byte_ch=ps_2byte_ch,
+            ps_3byte_ch=ps_3byte_ch,
+            ps_4byte_ch=ps_4byte_ch,
             ps_lengths=pref_search_lengths,
-            ss_search=suff_search,
-            ss_lookup=suff_lookup,
-            ss_l=suff_search_char_len,
+            ss_1byte_ch=ss_1byte_ch,
+            ss_2byte_ch=ss_2byte_ch,
+            ss_3byte_ch=ss_3byte_ch,
+            ss_4byte_ch=ss_4byte_ch,
             ss_lengths=suff_search_lengths,
         )
         features.append(ops.asarray2i(hashes))
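The old two-array scheme (search plus lookup, fixed 4-byte characters) is thus replaced by four plain UTF-8 byte arrays, one per character width, stored under self-describing `attrs` keys and passed straight through to `Doc.get_character_combination_hashes`. A sketch of what the refactored factory stores, assuming the remaining arguments default to `None` and that `get_search_char_byte_arrays` behaves as pinned down by the tests further below:

    from spacy.ml.richfeatureextractor import RichFeatureExtractor

    model = RichFeatureExtractor(
        case_sensitive=False,
        suff_search_chars="eé",
        suff_search_lengths=[1, 2],
    )
    assert model.attrs["suff_search_1_byte"] == b"e"
    assert model.attrs["suff_search_2_bytes"] == "é".encode("utf-8")
    assert model.attrs["suff_search_3_bytes"] == b""
    assert model.attrs["suff_search_4_bytes"] == b""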

View File: spacy/strings.pxd

@@ -27,3 +27,4 @@ cdef class StringStore:
     cdef const Utf8Str* intern_unicode(self, str py_string)
     cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
+    cdef const unsigned char[:] utf8_view(self, attr_t hash_val)

View File: spacy/strings.pyx

@@ -315,3 +315,25 @@ cdef class StringStore:
             self._map.set(key, value)
             self.keys.push_back(key)
         return value
+
+    cdef const unsigned char[:] utf8_view(self, attr_t hash_val):
+        if hash_val == 0:
+            return ""
+        elif hash_val < len(SYMBOLS_BY_INT):
+            return SYMBOLS_BY_INT[hash_val]
+        cdef Utf8Str* string = <Utf8Str*>self._map.get(hash_val)
+        cdef int i, length
+        if string.s[0] < sizeof(string.s) and string.s[0] != 0:
+            return string.s[1:string.s[0]+1]
+        elif string.p[0] < 255:
+            return string.p[1:string.p[0]+1]
+        else:
+            i = 0
+            length = 0
+            while string.p[i] == 255:
+                i += 1
+                length += 255
+            length += string.p[i]
+            i += 1
+            return string.p[i:length + i]
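`utf8_view` returns a view over the store's internal UTF-8 data instead of decoding it to a `str`, mirroring the length encoding that the existing `decode_Utf8Str` helper in this file uses: short strings keep their data inline in `s` with the length in `s[0]`, medium strings store a one-byte length at `p[0]`, and longer strings prefix the data with a run of 255 bytes plus a remainder byte. A pure-Python model of that long-string length decoding (a hypothetical helper, for illustration only):

    def decode_length(p: bytes) -> tuple:
        # Return (data_offset, byte_length) for the long-string layout.
        i, length = 0, 0
        while p[i] == 255:  # each leading 255 byte contributes 255 to the length
            i += 1
            length += 255
        length += p[i]  # the first non-255 byte holds the remainder
        return i + 1, length

    # A 600-byte string is prefixed 255, 255, 90 (255 + 255 + 90 == 600):
    assert decode_length(bytes([255, 255, 90]) + b"x" * 600) == (3, 600)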

View File: tests for get_search_char_byte_arrays

@@ -1,55 +1,57 @@
 import spacy
+import pytest


-def test_get_arrays_for_search_chars_width_2_not_case_sensitive():
-    (
-        search,
-        lookup,
-    ) = spacy.util.get_arrays_for_search_chars("bféwfw", False)
-    assert (
-        lookup
-        == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
-    )
-    assert (
-        search
-        == b"B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
-    )
-
-
-def test_get_arrays_for_search_chars_width_2_case_sensitive():
-    (
-        search,
-        lookup,
-    ) = spacy.util.get_arrays_for_search_chars("bféwfw", True)
-    assert (
-        lookup == search == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00"
-    )
-
-
-def test_get_arrays_for_search_chars_width_4_not_case_sensitive():
-    (
-        search,
-        lookup,
-    ) = spacy.util.get_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", False)
-    assert (
-        search
-        == b"\x1e\x03\x01\x00B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
-    )
-    assert (
-        lookup
-        == b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
-    )
-
-
-def test_get_arrays_for_search_chars_width_4_case_sensitive():
-    (
-        search,
-        lookup,
-    ) = spacy.util.get_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", True)
-    assert search == lookup
-    assert (
-        lookup
-        == b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
-    )
+@pytest.mark.parametrize("case_sensitive", [True, False])
+def test_get_search_char_byte_arrays_1_width_only(case_sensitive):
+    sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("zzaaEP", case_sensitive)
+    if case_sensitive:
+        assert sc1 == b"EPaz"
+    else:
+        assert sc1 == b"aepz"
+    assert sc2 == b""
+    assert sc3 == b""
+    assert sc4 == b""
+
+
+@pytest.mark.parametrize("case_sensitive", [True, False])
+def test_get_search_char_byte_arrays_4_width_only(case_sensitive):
+    sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("𐌞", case_sensitive)
+    assert sc1 == b""
+    assert sc2 == b""
+    assert sc3 == b""
+    assert sc4 == "𐌞".encode("utf-8")
+
+
+@pytest.mark.parametrize("case_sensitive", [True, False])
+def test_get_search_char_byte_arrays_all_widths(case_sensitive):
+    sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("𐌞Éabé—B𐌞", case_sensitive)
+    if case_sensitive:
+        assert sc1 == b"Bab"
+        assert sc2 == "Éé".encode("utf-8")
+    else:
+        assert sc1 == b"ab"
+        assert sc2 == "é".encode("utf-8")
+    assert sc3 == "—".encode("utf-8")
+    assert sc4 == "𐌞".encode("utf-8")
+
+
+@pytest.mark.parametrize("case_sensitive", [True, False])
+def test_turkish_i_with_dot(case_sensitive):
+    sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("İ", case_sensitive)
+    if case_sensitive:
+        assert sc2 == "İ".encode("utf-8")
+        assert sc1 == sc3 == sc4 == b""
+    else:
+        assert sc1 == b"i"
+        assert sc2 == b"\xcc\x87"
+        assert sc3 == sc4 == b""
+
+
+@pytest.mark.parametrize("case_sensitive", [True, False])
+def test_turkish_i_with_dot_and_normal_i(case_sensitive):
+    sc1, sc2, sc3, sc4 = spacy.util.get_search_char_byte_arrays("İI", case_sensitive)
+    if case_sensitive:
+        assert sc1 == b"I"
+        assert sc2 == "İ".encode("utf-8")
+        assert sc3 == sc4 == b""
+    else:
+        assert sc1 == b"i"
+        assert sc2 == b"\xcc\x87"
+        assert sc3 == sc4 == b""
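The Turkish expectations follow directly from Unicode case mapping in Python: lower-casing U+0130 (İ, dotted capital I) yields two code points, a plain `i` followed by the combining dot above (U+0307), whose UTF-8 encoding is the two-byte sequence `\xcc\x87`:

    assert "İ".lower() == "i\u0307"  # U+0130 lower-cases to two code points
    assert "\u0307".encode("utf-8") == b"\xcc\x87"  # the combining dot is 2 bytes wide

This is why the case-insensitive branch expects entries in both the 1-byte and the 2-byte arrays from a single input character.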

View File: spacy/tokens/doc.pxd

@@ -38,34 +38,22 @@ cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2

 cdef int [:,:] _get_lca_matrix(Doc, int start, int end)

-cdef void _copy_chars(
-    Py_UCS4* target,
-    const Py_UCS4* source,
-    const int length,
-    const bint to_lower
-)
-
-cdef void _set_affixes(
-    const Py_UCS4* text_buf,
-    const int tok_idx,
-    const int tok_len,
-    Py_UCS4* aff_buf,
+cdef void _set_affix_lengths(
+    const unsigned char[:] text_buf,
+    char* aff_len_buf,
     const int pref_len,
     const int suff_len,
-    const bint to_lower
-)
+) nogil

-cdef void _search_for_chars(
-    const Py_UCS4* text_buf,
-    const int tok_idx,
-    const int tok_len,
-    Py_UCS4* search_buf,
-    Py_UCS4* lookup_buf,
-    const int search_buf_len,
-    Py_UCS4* result_buf,
-    const int result_buf_len,
+cdef bint _search_for_chars(
+    const unsigned char[:] tok_str,
+    const unsigned char[:] s_1byte_ch,
+    const unsigned char[:] s_2byte_ch,
+    const unsigned char[:] s_3byte_ch,
+    const unsigned char[:] s_4byte_ch,
+    unsigned char* res_buf,
+    unsigned char* len_buf,
     bint suffs_not_prefs
 ) nogil
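The declarations show the shape of the new approach: instead of comparing UCS-4 code points against a search buffer with a parallel lookup buffer, each token is kept as UTF-8 and each of its characters is compared only against the search array of the matching byte width, while `res_buf` collects matched characters and `len_buf` their cumulative byte lengths. A rough Python model of the matching loop implied by the signature (an interpretation, not the Cython implementation):

    def search_for_chars(tok: bytes, by_width: dict, max_results: int, suffs_not_prefs: bool) -> bytes:
        # Split the token's UTF-8 bytes into characters.
        chars, i = [], 0
        while i < len(tok):
            j = i + 1
            while j < len(tok) and tok[j] & 0xC0 == 0x80:  # skip continuation bytes
                j += 1
            chars.append(tok[i:j])
            i = j
        if suffs_not_prefs:  # search from the end of the word
            chars.reverse()
        matched = []
        for ch in chars:
            cands = by_width.get(len(ch), b"")
            # Compare only against search characters of the same byte width.
            if any(ch == cands[k:k + len(ch)] for k in range(0, len(cands), len(ch))):
                matched.append(ch)
                if len(matched) == max_results:
                    break
        return b"".join(matched)

    # Suffix search of "spacy" for the characters {a, c, y}:
    assert search_for_chars(b"spacy", {1: b"acy"}, 2, True) == b"yc"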

View File: spacy/tokens/doc.pyi

@@ -126,7 +126,7 @@ class Doc:
         blocked: Optional[List[Span]] = ...,
         missing: Optional[List[Span]] = ...,
         outside: Optional[List[Span]] = ...,
-        default: str = ...
+        default: str = ...,
     ) -> None: ...
     @property
     def noun_chunks(self) -> Iterator[Span]: ...
@@ -178,16 +178,18 @@ class Doc:
         self,
         *,
         cs: bool,
-        pref_lengths: Ints1d,
-        suff_lengths: Ints1d,
-        pref_search_chars: str,
-        pref_lookup_chars: str,
-        pref_search_char_length: int,
-        pref_search_lengths: Ints1d,
-        suff_search_chars: str,
-        suff_lookup_chars: str,
-        suff_search_char_length: int,
-        suff_search_lengths: Ints1d,
+        p_lengths: Ints1d,
+        s_lengths: Ints1d,
+        ps_1byte_ch: bytes,
+        ps_2byte_ch: bytes,
+        ps_3byte_ch: bytes,
+        ps_4byte_ch: bytes,
+        ps_lengths: Ints1d,
+        ss_1byte_ch: bytes,
+        ss_2byte_ch: bytes,
+        ss_3byte_ch: bytes,
+        ss_4byte_ch: bytes,
+        ss_lengths: Ints1d,
     ) -> Ints2d: ...
     @staticmethod
     def _get_array_attrs() -> Tuple[Any]: ...

View File: spacy/tokens/doc.pyx

@@ -1736,18 +1736,20 @@ cdef class Doc:
         return output

-    def get_character_combination_hashes(self,
+    def np.ndarray get_character_combination_hashes(self,
         *,
         const bint cs,
         np.ndarray p_lengths,
         np.ndarray s_lengths,
-        const char* ps_search,
-        const char* ps_lookup,
-        const int ps_l,
+        const unsigned char[:] ps_1byte_ch,
+        const unsigned char[:] ps_2byte_ch,
+        const unsigned char[:] ps_3byte_ch,
+        const unsigned char[:] ps_4byte_ch,
         np.ndarray ps_lengths,
-        const char* ss_search,
-        const char* ss_lookup,
-        const int ss_l,
+        const unsigned char[:] ss_1byte_ch,
+        const unsigned char[:] ss_2byte_ch,
+        const unsigned char[:] ss_3byte_ch,
+        const unsigned char[:] ss_4byte_ch,
         np.ndarray ss_lengths,
     ):
         """
@@ -1766,44 +1768,26 @@ cdef class Doc:
             the prefixes hashed for "spaCy" would be "sp" and "spa".
         s_lengths: an Ints1d specifying the lengths of suffixes to be hashed. For example, if *s_lengths==[2, 3]* and
             *cs == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy".
-        ps_search: a byte array containing characters to search for within each token, starting at the beginning.
-        ps_lookup: a byte array containing characters that are added to the result string when a character at
-            the corresponding position in *ps_search* is matched. Having separate search and lookup arrays enables
-            case-insensitivity to be handled efficiently.
-        ps_search_l: the number of characters in *ps_search* and hence also in *ps_lookup*
+        ps_<n>byte_ch: a byte array containing, in order, the n-byte-wide characters to search for within each
+            token, starting at the beginning.
         ps_lengths: an Ints1d specifying the lengths of search results (from the beginning) to be hashed. For example, if
             *ps_lengths==[1, 2]*, *ps_search=="aC"* and *cs==False*, the searched strings hashed for
             "spaCy" would be "a" and "ac".
-        ss_search: a byte array containing characters to search for within each token, starting at the end.
-        ss_lookup: a byte array containing characters that are added to the result string when a character at
-            the corresponding position in *ss_search* is matched. Having separate search and lookup arrays enables
-            case-insensitivity to be handled efficiently.
-        ss_l: the number of characters in *ss_search* and hence also in *ss_lookup*
+        ss_<n>byte_ch: a byte array containing, in order, the n-byte-wide characters to search for within each
+            token, starting at the end.
         ss_lengths: an Ints1d specifying the lengths of search results (from the end) to be hashed. For example, if
             *ss_lengths==[1, 2]*, *ss_search=="aC"* and *cs==False*, the searched strings hashed for
             "spaCy" would be "c" and "ca".
-
-        For a document with tokens ["spaCy", "and", "Prodigy"], the NumPy array returned by
-        *get_character_combination_hashes(True, [2], [2, 4, 6], "", "", 0, [], "yC", "yC", 2, [1, 2])*
-        would correspond to
-
-        [[hash("sp"), hash("Cy"), hash("paCy"), hash(" spaCy"), hash("y"), hash("yC")],
-        [hash("an"), hash("nd"), hash(" and"), hash(" and"), hash(" "), hash(" ")],
-        [hash("Pr"), hash("gy"), hash("digy"), hash("rodigy"), hash("y"), hash("y ")]]
         """
-        # Encode the document text
-        cdef bytes encoded_text = self.text.encode("utf-32le")
-        cdef char* intermediate_text = encoded_text
-        cdef Py_UCS4* text_buf = <Py_UCS4*> intermediate_text

         # Define the result array and work out what is used for what in axis 1
         cdef int num_toks = len(self)
         cdef int p_h_num = p_lengths.shape[0]
         cdef int s_h_num = s_lengths.shape[0], s_h_end = p_h_num + s_h_num
         cdef int ps_h_num = ps_lengths.shape[0], ps_h_end = s_h_end + ps_h_num
         cdef int ss_h_num = ss_lengths.shape[0], ss_h_end = ps_h_end + ss_h_num
-        cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, ss_h_end), dtype="int64")
+        cdef np.ndarray[np.int64_t, ndim=2] hashes
+        hashes = numpy.empty((num_toks, ss_h_end), dtype="int64")

         # Determine the maximum possible lengths of the affixes to work out how big the buffers need to be
         cdef int p_max_l = max(p_lengths) if p_h_num > 0 else 0
@@ -1811,15 +1795,13 @@ cdef class Doc:
         cdef int ps_max_l = max(ps_lengths) if ps_h_num > 0 else 0
         cdef int ss_max_l = max(ss_lengths) if ss_h_num > 0 else 0

-        # Define / allocate buffer (pr/sr: result buffers)
-        cdef int aff_buf_l = p_max_l + s_max_l
-        cdef Py_UCS4* aff_buf = <Py_UCS4*> self.mem.alloc(aff_buf_l, sizeof(Py_UCS4))
-        cdef Py_UCS4* ps_s_buf = <Py_UCS4*> ps_search
-        cdef Py_UCS4* ps_l_buf = <Py_UCS4*> ps_lookup
-        cdef Py_UCS4* ps_r_buf = <Py_UCS4*> self.mem.alloc(ps_max_l, sizeof(Py_UCS4))
-        cdef Py_UCS4* ss_s_buf = <Py_UCS4*> ss_search
-        cdef Py_UCS4* ss_l_buf = <Py_UCS4*> ss_lookup
-        cdef Py_UCS4* ss_r_buf = <Py_UCS4*> self.mem.alloc(ss_max_l, sizeof(Py_UCS4))
+        # Define / allocate buffers
+        cdef int aff_l = p_max_l + s_max_l
+        cdef char* aff_len_buf = self.mem.alloc(aff_l, 1)
+        cdef char* ps_res_buf = self.mem.alloc(ps_max_l, 4)
+        cdef char* ps_len_buf = self.mem.alloc(ps_max_l, 1)
+        cdef char* ss_res_buf = self.mem.alloc(ss_max_l, 4)
+        cdef char* ss_len_buf = self.mem.alloc(ss_max_l, 1)

         # Define memory views on length arrays
         cdef int[:] p_lengths_v = p_lengths
@@ -1829,38 +1811,51 @@ cdef class Doc:
         # Define working variables
         cdef TokenC tok_c
-        cdef int tok_i, tok_idx, tok_len, aff_len
+        cdef int tok_i, offset
+        cdef uint64_t hash_val
+        cdef attr_t num_tok_attr
+        cdef const unsigned char[:] tok_str

         for tok_i in range(num_toks):
             tok_c = self.c[tok_i]
-            tok_idx = tok_c.idx
-            tok_len = tok_c.lex.length
+            num_tok_attr = tok_c.lex.orth if cs else tok_c.lex.lower
+            tok_str = self.vocab.strings.utf8_view(num_tok_attr)

-            if aff_buf_l > 0:
-                _set_affixes(text_buf, tok_idx, tok_len, aff_buf, p_max_l, s_max_l, not cs)
+            if aff_l > 0:
+                _set_affix_lengths(tok_str, aff_len_buf, p_max_l, s_max_l)

             for hash_idx in range(p_h_num):
-                hashes[tok_i, hash_idx] = hash32(aff_buf, p_lengths_v[hash_idx] * sizeof(Py_UCS4), 0)
+                offset = aff_len_buf[p_lengths_v[hash_idx]]
+                if offset > 0:
+                    hash_val = hash32(<void*> &tok_str[0], offset, 0)
+                    hashes[tok_i, hash_idx] = hash_val

             for hash_idx in range(p_h_num, s_h_end):
-                aff_len = s_lengths_v[hash_idx - p_h_num]
-                hashes[tok_i, hash_idx] = hash32(aff_buf + aff_buf_l - aff_len, aff_len * sizeof(Py_UCS4), 0)
+                offset = s_lengths_v[hash_idx - p_h_num]
+                if offset > 0:
+                    hash_val = hash32(<void*> &tok_str[len(tok_str) - offset], offset, 0)
+                    hashes[tok_i, hash_idx] = hash_val

-            if ps_h_num > 0:
-                _search_for_chars(text_buf, tok_idx, tok_len, ps_s_buf, ps_l_buf, ps_l, ps_r_buf, ps_max_l, False)
+            if (
+                ps_h_num > 0 and
+                _search_for_chars(tok_str, ps_1byte_ch, ps_2byte_ch, ps_3byte_ch, ps_4byte_ch, ps_res_buf, ps_len_buf, False)
+            ):
                 for hash_idx in range(s_h_end, ps_h_end):
                     aff_len = ps_lengths_v[hash_idx - s_h_end]
                     hashes[tok_i, hash_idx] = hash32(ps_r_buf, aff_len * sizeof(Py_UCS4), 0)

-            if ss_h_num > 0:
-                _search_for_chars(text_buf, tok_idx, tok_len, ss_s_buf, ss_l_buf, ss_l, ss_r_buf, ss_max_l, True)
+            if (
+                ss_h_num > 0 and
+                _search_for_chars(tok_str, ss_1byte_ch, ss_2byte_ch, ss_3byte_ch, ss_4byte_ch, ss_res_buf, ss_len_buf, True)
+            ):
                 for hash_idx in range(ps_h_end, ss_h_end):
                     aff_len = ss_lengths_v[hash_idx - ps_h_end]
                     hashes[tok_i, hash_idx] = hash32(ss_r_buf, aff_len * sizeof(Py_UCS4), 0)

-        self.mem.free(aff_buf)
-        self.mem.free(ps_r_buf)
-        self.mem.free(ss_r_buf)
+        self.mem.free(aff_len_buf)
+        self.mem.free(ps_res_buf)
+        self.mem.free(ps_len_buf)
+        self.mem.free(ss_res_buf)
+        self.mem.free(ss_len_buf)
         return hashes

     @staticmethod
@@ -2044,34 +2039,13 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
     return lca_matrix

-cdef void _copy_chars(
-    Py_UCS4* target,
-    const Py_UCS4* source,
-    const int length,
-    const bint to_lower
-):
-    """Copies *length* Py_UCS4 characters from *source* to *target*. If *to_lower==True*, converts
-    any upper-case characters to lower case within the target buffer.
-    """
-    cdef int idx
-    memcpy(target, source, length * sizeof(Py_UCS4))
-    if to_lower:
-        for idx in range(length):
-            if Py_UNICODE_ISUPPER(target[idx]):
-                target[idx] = Py_UNICODE_TOLOWER(target[idx])
-
-
-cdef void _set_affixes(
-    const Py_UCS4* text_buf,
-    const int tok_idx,
-    const int tok_len,
-    Py_UCS4* aff_buf,
+cdef void _set_affix_lengths(
+    const unsigned char[:] text_buf,
+    char* aff_len_buf,
     const int pref_len,
     const int suff_len,
-    const bint to_lower
-):
-    """ Populate a buffer of length pref+suff with the first pref and the last suff characters of a word within a string.
+) nogil:
+    """ TODO: Populate a buffer of length pref+suff with the first pref and the last suff characters of a word within a string.
     If the word is shorter than pref and/or suff, the empty character positions in the middle are filled with zeros.

     text_buf: a pointer to a UTF-32LE representation of the containing string.
@@ -2082,41 +2056,41 @@ cdef void _set_affixes(
     suff_len: the length of the suffix.
     to_lower: if *True*, any upper case characters in either affix are converted to lower case.
     """
-    cdef int aff_buf_idx = 0, aff_buf_len = pref_len + suff_len, in_word_idx, filled_pref_len
-    if pref_len > 0:
-        filled_pref_len = pref_len if pref_len < tok_len else tok_len
-        _copy_chars(aff_buf, text_buf + tok_idx, filled_pref_len, to_lower)
-        aff_buf_idx = filled_pref_len
-
-    if tok_len < pref_len:
-        memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (pref_len - tok_len))
-        aff_buf_idx = aff_buf_len - suff_len
-
-    if tok_len < suff_len:
-        memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (suff_len - tok_len))
-        aff_buf_idx = aff_buf_len - tok_len
-
-    if suff_len > 0:
-        # in_word_idx: the index within the token where the suffix starts
-        in_word_idx = aff_buf_idx + tok_len - aff_buf_len
-        if in_word_idx < pref_len:
-            memcpy(aff_buf + aff_buf_idx, aff_buf + in_word_idx, sizeof(Py_UCS4) * (filled_pref_len - in_word_idx))
-            aff_buf_idx += filled_pref_len - in_word_idx
-            in_word_idx = aff_buf_idx + tok_len - aff_buf_len
-        if aff_buf_idx < aff_buf_len:
-            _copy_chars(aff_buf + aff_buf_idx, text_buf + tok_idx + in_word_idx, aff_buf_len - aff_buf_idx, to_lower)
+    cdef int text_buf_idx = 0, aff_len_buf_idx = 0, text_buf_len = len(text_buf)
+
+    while aff_len_buf_idx < pref_len:
+        if (text_buf[text_buf_idx] >> 6) ^ 2 != 0:  # not a continuation byte
+            aff_len_buf[aff_len_buf_idx] = text_buf_idx + 1
+            aff_len_buf_idx += 1
+        text_buf_idx += 1
+        if text_buf_idx == len(text_buf):
+            break
+
+    if aff_len_buf_idx < pref_len:
+        memset(aff_len_buf + aff_len_buf_idx, 0, pref_len - aff_len_buf_idx)
+        aff_len_buf_idx = pref_len
+
+    text_buf_idx = 1
+    while aff_len_buf_idx < pref_len + suff_len:
+        if (text_buf[text_buf_len - text_buf_idx] >> 6) ^ 2 != 0:  # not a continuation byte
+            aff_len_buf[aff_len_buf_idx] = text_buf_len - text_buf_idx
+            aff_len_buf_idx += 1
+        text_buf_idx += 1
+        if text_buf_idx > text_buf_len:
+            break
+
+    if aff_len_buf_idx < pref_len + suff_len:
+        memset(aff_len_buf + aff_len_buf_idx, 0, pref_len + suff_len - aff_len_buf_idx)
-cdef void _search_for_chars(
-    const Py_UCS4* text_buf,
-    const int tok_idx,
-    const int tok_len,
-    Py_UCS4* search_buf,
-    Py_UCS4* lookup_buf,
-    const int search_buf_len,
-    Py_UCS4* result_buf,
-    const int result_buf_len,
+cdef bint _search_for_chars(
+    const unsigned char[:] tok_str,
+    const unsigned char[:] s_1byte_ch,
+    const unsigned char[:] s_2byte_ch,
+    const unsigned char[:] s_3byte_ch,
+    const unsigned char[:] s_4byte_ch,
+    unsigned char* res_buf,
+    unsigned char* len_buf,
     bint suffs_not_prefs
 ) nogil:
     """ Search a word within a string for characters within *search_buf*, starting at the beginning or
@@ -2133,6 +2107,8 @@ cdef void _search_for_chars(
     result_buf: the buffer in which to place the results.
     result_buf_len: the length of *result_buf*.
     suffs_not_prefs: if *True*, searching starts from the end of the word; if *False*, from the beginning.
+
+    Returns *True* if at least one character from *search_buf* was found in the word, otherwise *False*.
     """
     cdef int result_buf_idx = 0, text_string_idx = tok_idx + (tok_len - 1) if suffs_not_prefs else tok_idx
     cdef int search_buf_idx
@@ -2158,6 +2134,8 @@ cdef void _search_for_chars(
     # fill in any unused characters in the result buffer with zeros
     if result_buf_idx < result_buf_len:
         memset(result_buf + result_buf_idx, 0, (result_buf_len - result_buf_idx) * sizeof(Py_UCS4))
+
+    return result_buf_idx > 0


 def pickle_doc(doc):
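The column layout of the returned array is unchanged by the refactor: prefix-hash columns come first, then suffix, then prefix-search, then suffix-search columns. The bookkeeping above, restated in plain Python for one illustrative configuration:

    p_lengths, s_lengths = [2], [2, 4]
    ps_lengths, ss_lengths = [], [1, 2]
    p_h_num = len(p_lengths)
    s_h_end = p_h_num + len(s_lengths)
    ps_h_end = s_h_end + len(ps_lengths)
    ss_h_end = ps_h_end + len(ss_lengths)
    # hashes has shape (num_tokens, ss_h_end); here the five columns are
    # [pref(2) | suff(2), suff(4) | (no prefix searches) | ss(1), ss(2)]
    assert (p_h_num, s_h_end, ps_h_end, ss_h_end) == (1, 3, 3, 5)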

View File: spacy/util.py

@@ -1737,69 +1737,42 @@ def all_equal(iterable):
     return next(g, True) and not next(g, False)

-def get_arrays_for_search_chars(
+def get_search_char_byte_arrays(
     search_chars: str, case_sensitive: bool
-) -> Tuple[bytes, bytes]:
+) -> Tuple[bytes, bytes, bytes, bytes]:
     """
-    This function supports the rich feature extractor. It returns search byte arrays with
-    4-byte character width that are used for comparison when searching document texts
-    for search characters. The encoding is little-endian regardless of architecture, as
-    this is what is expected by the murmurhash library used downstream.
-
-    Alongside the "search array" against which words from document texts are compared
-    is the "lookup array". When a character from the search array is matched,
-    the character at the corresponding position in the lookup array is added to the
-    sequence that then goes on to be hashed. This enables case-sensitivity
-    to be handled without converting the case of the words being searched: if
-    *case_sensitive==False*, the lower- or uppercase counterparts of any characters that
-    have case are added to the search array, and both the original character and its
-    other-cased counterpart map to the lower-case version in the lookup array.
+    This function supports the rich feature extractor. It splits the UTF-8 representation
+    of *search_chars* into separate byte arrays containing 1-, 2-, 3-, and 4-byte
+    characters respectively. Any duplicates in *search_chars* are removed, and *search_chars*
+    is converted to lower case if *case_sensitive==False*.
     """
-
-    def encode(ch: str) -> bytes:
-        """
-        ch: a single character
-        """
-        return ch.encode("UTF-32LE")
-
-    def add_to_arrays(
-        search: List[bytes], lookup: List[bytes], ch: str
-    ) -> None:
-        """Add the byte representations of *ch* to the two byte array lists.
-        """
-        this_char_bytes = encode(ch)
-        if not case_sensitive and ch.islower():
-            if this_char_bytes not in search:
-                search.append(this_char_bytes)
-                lookup.append(this_char_bytes)
-            upper_char_bytes = encode(ch.upper())
-            if upper_char_bytes not in search:
-                search.append(upper_char_bytes)
-                lookup.append(this_char_bytes)
-        elif not case_sensitive and ch.isupper():
-            lower_char_bytes = encode(ch.lower())
-            if this_char_bytes not in search:
-                search.append(this_char_bytes)
-                lookup.append(lower_char_bytes)
-            if lower_char_bytes not in search:
-                search.append(lower_char_bytes)
-                lookup.append(lower_char_bytes)
-        elif this_char_bytes not in search:
-            search.append(this_char_bytes)
-            lookup.append(this_char_bytes)
-
-    def get_ordered_raw_bytes(
-        search: List[bytes], lookup: List[bytes]
-    ) -> Tuple[bytes, bytes]:
-        """Flatten the two lists, ordering both by the entries in *search*.
-        """
-        num_search = [list(entry) for entry in search]
-        search = [entry for _, entry in sorted(zip(num_search, search))]
-        lookup = [entry for _, entry in sorted(zip(num_search, lookup))]
-        return b"".join(search), b"".join(lookup)
-
-    search: List[bytes] = []
-    lookup: List[bytes] = []
-    for ch in search_chars:
-        add_to_arrays(search, lookup, ch)
-    return get_ordered_raw_bytes(search, lookup)
+    sc1 = bytearray()
+    sc2 = bytearray()
+    sc3 = bytearray()
+    sc4 = bytearray()
+    if not case_sensitive:
+        search_chars = search_chars.lower()
+    ordered_search_chars = "".join(sorted(set(search_chars)))
+    encoded_search_char_bytes = ordered_search_chars.encode("UTF-8")
+    working_start = 0
+    for idx in range(len(encoded_search_char_bytes) + 1):
+        if idx == 0:
+            continue
+        if (
+            idx == len(encoded_search_char_bytes)
+            or encoded_search_char_bytes[idx] & 0xC0 != 0x80  # not a continuation byte
+        ):
+            char_length = idx - working_start
+            if char_length == 1:
+                sc1.extend(encoded_search_char_bytes[working_start:idx])
+            elif char_length == 2:
+                sc2.extend(encoded_search_char_bytes[working_start:idx])
+            elif char_length == 3:
+                sc3.extend(encoded_search_char_bytes[working_start:idx])
+            elif char_length == 4:
+                sc4.extend(encoded_search_char_bytes[working_start:idx])
+            else:
+                raise RuntimeError(Errors.E1049)
+            working_start = idx
+    return bytes(sc1), bytes(sc2), bytes(sc3), bytes(sc4)
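Behaviour of the rewritten helper, as pinned down by the tests earlier in this commit:

    from spacy.util import get_search_char_byte_arrays

    sc1, sc2, sc3, sc4 = get_search_char_byte_arrays("zzaaEP", case_sensitive=False)
    assert (sc1, sc2, sc3, sc4) == (b"aepz", b"", b"", b"")  # deduplicated, lower-cased, sorted

    sc1, sc2, sc3, sc4 = get_search_char_byte_arrays("𐌞", case_sensitive=True)
    assert sc4 == "𐌞".encode("utf-8")  # a single 4-byte character lands in the fourth array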