Intermediate state

This commit is contained in:
richardpaulhudson 2022-10-20 21:48:53 +02:00
parent 2707d30ce0
commit f7d9942e7c
10 changed files with 377 additions and 329 deletions

View File

@ -946,7 +946,6 @@ class Errors(metaclass=ErrorsWithCodes):
"{value}.") "{value}.")
E1044 = ("Search characters for '{label}' may not contain upper-case chars where case_sensitive==False.") E1044 = ("Search characters for '{label}' may not contain upper-case chars where case_sensitive==False.")
E1045 = ("Invalid rich group config '{label}'.") E1045 = ("Invalid rich group config '{label}'.")
E1046 = ("Search characters may not contain characters that occupy four bytes in UTF-16.")
# Deprecated model shortcuts, only used in errors and warnings

View File

@ -260,27 +260,6 @@ def RichMultiHashEmbed(
prevents the alternation from occurring, e.g. an `ä` in a German plural noun does
not become `a` if it is the third or fourth vowel from the end of the word.
Internally, the model converts each token string to UTF-16 and assumes that each
character from the string occupies two bytes. This assumption holds for all
characters in the Basic Multilingual Plane, which encompasses all characters that
are ever likely to be of interest when extracting features. There are, however,
characters like emojis that are in the Extended Multilingual Plane and occupy
four bytes, although importantly neither of the two byte pairs that make up such
a representation can be a valid two-byte character in its own right. The
following considerations apply to the processing of four-byte characters:
- An exceptional four-byte character within a text consisting mostly of two-byte
characters will probably be ignored by the neural network accepting the
embedding layer as not matching any of the learned features.
- If anyone did want to train a model for a language like Lycian that is
generally written in four-byte characters, prefix and suffix features can
still be extracted, but the length specifications should all be doubled, i.e.
`[2,4,6]` to extract one-, two- and three-character affixes. In such a
situation length specifications that are odd numbers would serve no useful
purpose since they would refer to half-characters.
- Four-byte characters are not accepted within search character specification
strings and lead to an error being thrown.
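The behaviour described in the removed passage above can be illustrated with a small, self-contained Python sketch (editorial illustration only, not part of the commit): characters outside the Basic Multilingual Plane occupy two UTF-16 code units, which is why the length specifications would need to be doubled for scripts written in such characters.

```python
# Editorial illustration: count UTF-16 code units (two bytes each) per character.
def utf16_code_units(ch: str) -> int:
    # Characters in the Basic Multilingual Plane take two bytes in UTF-16LE;
    # characters outside it take four (a surrogate pair).
    return len(ch.encode("utf-16-le")) // 2

assert utf16_code_units("a") == 1    # BMP character
assert utf16_code_units("🙂") == 2   # emoji outside the BMP: a surrogate pair
assert utf16_code_units("𐌞") == 2    # Old Italic letter, also outside the BMP

# Hence a length specification of [2, 4, 6] corresponds to one-, two- and
# three-character affixes for a script written in four-byte characters.
```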
width (int): The output width. Also used as the width of the embedding tables.
Recommended values are between 64 and 300.
attrs (list of attr IDs): The token attributes to embed. A separate

View File

@ -1,7 +1,7 @@
from typing import List, Optional, Callable, Tuple from typing import List, Optional, Callable, Tuple
from ..util import get_byte_arrays_for_search_chars from ..util import get_arrays_for_search_chars
from thinc.types import Ints2d from thinc.types import Ints1d, Ints2d
from thinc.api import Model, registry from thinc.api import Model, registry, get_current_ops
from ..tokens import Doc from ..tokens import Doc
@ -17,33 +17,46 @@ def RichFeatureExtractor(
suff_search_chars: Optional[str] = None, suff_search_chars: Optional[str] = None,
suff_search_lengths: Optional[List[int]] = None, suff_search_lengths: Optional[List[int]] = None,
) -> Model[List[Doc], List[Ints2d]]: ) -> Model[List[Doc], List[Ints2d]]:
ops = get_current_ops()
if pref_search_chars is not None: if pref_search_chars is not None:
pref_search, pref_ref = get_byte_arrays_for_search_chars(pref_search_chars, case_sensitive) pref_search, pref_lookup = get_arrays_for_search_chars(
pref_search_chars, case_sensitive
)
else: else:
pref_search, pref_ref = bytes(), bytes() pref_search, pref_lookup = bytes(), bytes()
if suff_search_chars is not None: if suff_search_chars is not None:
suff_search, suff_ref = get_byte_arrays_for_search_chars(suff_search_chars, case_sensitive) suff_search, suff_lookup = get_arrays_for_search_chars(
suff_search_chars, case_sensitive
)
else: else:
suff_search, suff_ref = bytes(), bytes() suff_search, suff_lookup = bytes(), bytes()
return Model( return Model(
"extract_character_combination_hashes", "extract_character_combination_hashes",
forward, forward,
attrs={ attrs={
"case_sensitive": case_sensitive, "case_sensitive": case_sensitive,
"pref_lengths": pref_lengths if pref_lengths is not None else [], "pref_lengths": ops.asarray1i(pref_lengths)
"suff_lengths": suff_lengths if suff_lengths is not None else [], if pref_lengths is not None
else ops.asarray1i([]),
"suff_lengths": ops.asarray1i(suff_lengths)
if suff_lengths is not None
else ops.asarray1i([]),
"pref_search": pref_search, "pref_search": pref_search,
"pref_ref": pref_ref, "pref_lookup": pref_lookup,
"pref_s_char_l": len(pref_search) / 4 if pref_search_chars is not None else 0, "pref_search_char_len": len(pref_search) / 4
"pref_search_lengths": pref_search_lengths if pref_search_chars is not None
else 0,
"pref_search_lengths": ops.asarray1i(pref_search_lengths)
if pref_search_lengths is not None if pref_search_lengths is not None
else [], else ops.asarray1i([]),
"suff_search": suff_search, "suff_search": suff_search,
"suff_ref": suff_ref, "suff_lookup": suff_lookup,
"suff_s_char_l": len(suff_search) / 4 if suff_search_chars is not None else 0, "suff_search_char_len": len(suff_search) / 4
"suff_search_lengths": suff_search_lengths if suff_search_chars is not None
else 0,
"suff_search_lengths": ops.asarray1i(suff_search_lengths)
if suff_search_lengths is not None if suff_search_lengths is not None
else [], else ops.asarray1i([]),
}, },
) )
@ -53,30 +66,30 @@ def forward(
) -> Tuple[List[Ints2d], Callable]: ) -> Tuple[List[Ints2d], Callable]:
ops = model.ops ops = model.ops
case_sensitive: bool = model.attrs["case_sensitive"] case_sensitive: bool = model.attrs["case_sensitive"]
pref_lengths: List[int] = model.attrs["pref_lengths"] pref_lengths: Ints1d = model.attrs["pref_lengths"]
suff_lengths: List[int] = model.attrs["suff_lengths"] suff_lengths: Ints1d = model.attrs["suff_lengths"]
pref_search: bytes = model.attrs["pref_search"] pref_search: bytes = model.attrs["pref_search"]
pref_ref: bytes = model.attrs["pref_ref"] pref_lookup: bytes = model.attrs["pref_lookup"]
pref_s_char_l: int = model.attr["pref_s_char_l"] pref_search_char_len: int = model.attrs["pref_search_char_len"]
pref_search_lengths: List[int] = model.attrs["pref_search_lengths"] pref_search_lengths: Ints1d = model.attrs["pref_search_lengths"]
suff_search: bytes = model.attrs["suff_search"] suff_search: bytes = model.attrs["suff_search"]
suff_ref: bytes = model.attrs["suff_ref"] suff_lookup: bytes = model.attrs["suff_lookup"]
suff_s_char_l: int = model.attr["suff_s_char_l"] suff_search_char_len: int = model.attrs["suff_search_char_len"]
suff_search_lengths: List[int] = model.attrs["suff_search_lengths"] suff_search_lengths: Ints1d = model.attrs["suff_search_lengths"]
features: List[Ints2d] = [] features: List[Ints2d] = []
for doc in docs: for doc in docs:
hashes = doc.get_character_combination_hashes( hashes = doc.get_character_combination_hashes(
case_sensitive=case_sensitive, cs=case_sensitive,
pref_lengths=pref_lengths, p_lengths=pref_lengths,
suff_lengths=suff_lengths, s_lengths=suff_lengths,
pref_search=pref_search, ps_search=pref_search,
pref_ref=pref_ref, ps_lookup=pref_lookup,
pref_s_char_l=pref_s_char_l, ps_l=pref_search_char_len,
pref_search_lengths=pref_search_lengths, ps_lengths=pref_search_lengths,
suff_search=suff_search, ss_search=suff_search,
suff_ref=suff_ref, ss_lookup=suff_lookup,
suff_s_char_l=suff_s_char_l, ss_l=suff_search_char_len,
suff_search_lengths=suff_search_lengths, ss_lengths=suff_search_lengths,
) )
features.append(ops.asarray2i(hashes)) features.append(ops.asarray2i(hashes))

View File

@ -14,7 +14,7 @@ from spacy.lang.xx import MultiLanguage
from spacy.language import Language from spacy.language import Language
from spacy.lexeme import Lexeme from spacy.lexeme import Lexeme
from spacy.tokens import Doc, Span, SpanGroup, Token from spacy.tokens import Doc, Span, SpanGroup, Token
from spacy.util import get_byte_arrays_for_search_chars from spacy.util import get_arrays_for_search_chars
from spacy.vocab import Vocab from spacy.vocab import Vocab
from .test_underscore import clean_underscore # noqa: F401 from .test_underscore import clean_underscore # noqa: F401
@ -1004,21 +1004,22 @@ def _get_unsigned_32_bit_hash(input: str) -> int:
@pytest.mark.parametrize("case_sensitive", [True, False]) @pytest.mark.parametrize("case_sensitive", [True, False])
def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive): def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive):
doc = en_tokenizer("spaCy✨ and Prodigy") doc = en_tokenizer("spaCy✨ and Prodigy")
suff_search, suff_ref = get_byte_arrays_for_search_chars("xx✨rp", case_sensitive) ops = get_current_ops()
pref_search, pref_lookup = get_arrays_for_search_chars("Rp", case_sensitive)
suff_search, suff_lookup = get_arrays_for_search_chars("xx✨rp", case_sensitive)
hashes = doc.get_character_combination_hashes( hashes = doc.get_character_combination_hashes(
cs=case_sensitive, cs=case_sensitive,
pref_lengths=[1, 4, 3], p_lengths=ops.asarray1i([1, 4, 3]),
suff_lengths=[2, 3, 4, 5], s_lengths=ops.asarray1i([2, 3, 4, 5]),
pref_search=bytes(), ps_search=pref_search,
pref_ref=bytes(), ps_lookup=pref_lookup,
pref_s_char_l = 0, ps_l=2 if case_sensitive else 4,
pref_search_lengths=[2], ps_lengths=ops.asarray1i([2]),
suff_search=suff_search, ss_search=suff_search,
suff_ref=suff_ref, ss_lookup=suff_lookup,
suff_s_char_l=5 if case_sensitive else 9, ss_l=5 if case_sensitive else 9,
suff_search_lengths=[2,1], ss_lengths=ops.asarray1i([2, 1]),
) )
assert hashes[0][0] == _get_unsigned_32_bit_hash("s") assert hashes[0][0] == _get_unsigned_32_bit_hash("s")
@ -1035,7 +1036,7 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
"spaCy" if case_sensitive else "spacy" "spaCy" if case_sensitive else "spacy"
) )
assert hashes[0][7] == _get_unsigned_32_bit_hash(" ") assert hashes[0][7] == _get_unsigned_32_bit_hash("p ")
assert hashes[0][8] == _get_unsigned_32_bit_hash("p ") assert hashes[0][8] == _get_unsigned_32_bit_hash("p ")
assert hashes[0][9] == _get_unsigned_32_bit_hash("p") assert hashes[0][9] == _get_unsigned_32_bit_hash("p")
assert hashes[1][0] == _get_unsigned_32_bit_hash("") assert hashes[1][0] == _get_unsigned_32_bit_hash("")
@ -1067,7 +1068,7 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
assert hashes[3][4] == _get_unsigned_32_bit_hash("igy") assert hashes[3][4] == _get_unsigned_32_bit_hash("igy")
assert hashes[3][5] == _get_unsigned_32_bit_hash("digy") assert hashes[3][5] == _get_unsigned_32_bit_hash("digy")
assert hashes[3][6] == _get_unsigned_32_bit_hash("odigy") assert hashes[3][6] == _get_unsigned_32_bit_hash("odigy")
assert hashes[3][7] == _get_unsigned_32_bit_hash(" ") assert hashes[3][7] == _get_unsigned_32_bit_hash(" " if case_sensitive else "pr")
assert hashes[3][9] == _get_unsigned_32_bit_hash("r") assert hashes[3][9] == _get_unsigned_32_bit_hash("r")
@ -1077,73 +1078,93 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
assert hashes[3][8] == _get_unsigned_32_bit_hash("rp") assert hashes[3][8] == _get_unsigned_32_bit_hash("rp")
# check values are the same cross-platform
assert hashes[0][1] == 753329845 if case_sensitive else 18446744071614199016 if case_sensitive:
assert hashes[1][3] == 3425774424 assert hashes[0][1] == 3712103410
assert hashes[2][8] == 3076404432 else:
assert hashes[0][1] == 307339932
assert hashes[1][3] == 2414314354
assert hashes[2][8] == 1669671676
def test_get_character_combination_hashes_4_byte_char_at_end(en_tokenizer): def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
doc = en_tokenizer("and𐌞") doc = en_tokenizer("spaCy✨ and Prodigy")
suff_search, suff_ref = get_byte_arrays_for_search_chars("a", True) ops = get_current_ops()
pref_search, pref_lookup = get_arrays_for_search_chars("rp", False)
hashes = doc.get_character_combination_hashes( hashes = doc.get_character_combination_hashes(
cs=True, cs=False,
pref_lengths=[], p_lengths=ops.asarray1i([]),
suff_lengths=[1, 2, 3], s_lengths=ops.asarray1i([2, 3, 4, 5]),
pref_search=bytes(), ps_search=pref_search,
pref_ref=bytes(), ps_lookup=pref_lookup,
pref_s_char_l = 0, ps_l=4,
pref_search_lengths=[], ps_lengths=ops.asarray1i([2]),
suff_search=suff_search, ss_search=bytes(),
suff_ref=suff_ref, ss_lookup=bytes(),
suff_s_char_l=1, ss_l=0,
suff_search_lengths=[1], ss_lengths=ops.asarray1i([]),
) )
assert hashes[0][0] == _get_unsigned_32_bit_hash("𐌞")
assert hashes[0][1] == _get_unsigned_32_bit_hash("d𐌞") assert hashes[0][0] == _get_unsigned_32_bit_hash("cy")
assert hashes[0][2] == _get_unsigned_32_bit_hash("nd𐌞") assert hashes[0][1] == _get_unsigned_32_bit_hash("acy")
assert hashes[0][3] == _get_unsigned_32_bit_hash("a") assert hashes[0][2] == _get_unsigned_32_bit_hash("pacy")
assert hashes[0][3] == _get_unsigned_32_bit_hash("spacy")
assert hashes[0][4] == _get_unsigned_32_bit_hash("p ")
assert hashes[1][0] == _get_unsigned_32_bit_hash("")
assert hashes[1][1] == _get_unsigned_32_bit_hash("")
assert hashes[1][2] == _get_unsigned_32_bit_hash("")
assert hashes[1][3] == _get_unsigned_32_bit_hash("")
assert hashes[1][4] == _get_unsigned_32_bit_hash(" ")
assert hashes[2][0] == _get_unsigned_32_bit_hash("nd")
assert hashes[2][1] == _get_unsigned_32_bit_hash("and")
assert hashes[2][2] == _get_unsigned_32_bit_hash(" and")
assert hashes[2][3] == _get_unsigned_32_bit_hash(" and")
assert hashes[2][4] == _get_unsigned_32_bit_hash(" ")
assert hashes[3][0] == _get_unsigned_32_bit_hash("gy")
assert hashes[3][1] == _get_unsigned_32_bit_hash("igy")
assert hashes[3][2] == _get_unsigned_32_bit_hash("digy")
assert hashes[3][3] == _get_unsigned_32_bit_hash("odigy")
assert hashes[3][4] == _get_unsigned_32_bit_hash("pr")
def test_get_character_combination_hashes_4_byte_char_in_middle(en_tokenizer):
doc = en_tokenizer("and𐌞a")
hashes = doc.get_character_combination_hashes(
case_sensitive=False,
pref_lengths=[],
suff_lengths=[1, 2, 3, 4],
pref_search_chars="",
pref_search_lengths=[],
suff_search_chars="a",
suff_search_lengths=[1, 2],
)
assert hashes[0][0] == _get_unsigned_32_bit_hash("a")
assert hashes[0][2] == _get_unsigned_32_bit_hash("𐌞a")
assert hashes[0][3] == _get_unsigned_32_bit_hash("d𐌞a")
assert hashes[0][4] == _get_unsigned_32_bit_hash("a")
assert hashes[0][5] == _get_unsigned_32_bit_hash("aa")
def test_get_character_combination_hashes_4_byte_special_char(en_tokenizer): def test_get_character_combination_hashes_copying_in_middle(en_tokenizer):
doc = en_tokenizer("and𐌞") doc = en_tokenizer("sp𐌞Cé")
with pytest.raises(ValueError): ops = get_current_ops()
doc.get_character_combination_hashes(
case_sensitive=True, for p_length in range(1, 8):
pref_lengths=[], for s_length in range(1, 8):
suff_lengths=[2, 3, 4, 5], hashes = doc.get_character_combination_hashes(
pref_search_chars="", cs=False,
pref_search_lengths=[], p_lengths=ops.asarray1i([p_length]),
suff_search_chars="𐌞", s_lengths=ops.asarray1i([s_length]),
suff_search_lengths=[2], ps_search=bytes(),
) ps_lookup=bytes(),
ps_l=0,
ps_lengths=ops.asarray1i([]),
ss_search=bytes(),
ss_lookup=bytes(),
ss_l=0,
ss_lengths=ops.asarray1i([]),
)
assert hashes[0][0] == _get_unsigned_32_bit_hash("sp𐌞cé "[:p_length])
assert hashes[0][1] == _get_unsigned_32_bit_hash(" sp𐌞cé"[8 - s_length :])
def test_character_combination_hashes_empty_lengths(en_tokenizer): def test_character_combination_hashes_empty_lengths(en_tokenizer):
doc = en_tokenizer("and𐌞") doc = en_tokenizer("and𐌞")
assert doc.get_character_combination_hashes( ops = get_current_ops()
case_sensitive=True, hashes = doc.get_character_combination_hashes(
pref_lengths=[], cs=True,
suff_lengths=[], p_lengths=ops.asarray1i([]),
pref_search_chars="", s_lengths=ops.asarray1i([]),
pref_search_lengths=[], ps_search=bytes(),
suff_search_chars="", ps_lookup=bytes(),
suff_search_lengths=[], ps_l=0,
ps_lengths=ops.asarray1i([]),
ss_search=bytes(),
ss_lookup=bytes(),
ss_l=0,
ss_lengths=ops.asarray1i([]),
).shape == (1, 0) ).shape == (1, 0)

View File

@ -1,13 +1,13 @@
import spacy import spacy
def test_get_byte_arrays_for_search_chars_width_2_not_case_sensitive(): def test_get_arrays_for_search_chars_width_2_not_case_sensitive():
( (
search, search,
ref, lookup,
) = spacy.util.get_byte_arrays_for_search_chars("bféwfw", False) ) = spacy.util.get_arrays_for_search_chars("bféwfw", False)
assert ( assert (
ref lookup
== b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00" == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
) )
@ -17,39 +17,39 @@ def test_get_byte_arrays_for_search_chars_width_2_not_case_sensitive():
) )
def test_get_byte_arrays_for_search_chars_width_2_case_sensitive(): def test_get_arrays_for_search_chars_width_2_case_sensitive():
( (
search, search,
ref, lookup,
) = spacy.util.get_byte_arrays_for_search_chars("bféwfw", True) ) = spacy.util.get_arrays_for_search_chars("bféwfw", True)
assert ( assert (
ref == search == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00" lookup == search == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00"
) )
def test_get_byte_arrays_for_search_chars_width_4_not_case_sensitive(): def test_get_arrays_for_search_chars_width_4_not_case_sensitive():
( (
search, search,
ref, lookup,
) = spacy.util.get_byte_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", False) ) = spacy.util.get_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", False)
assert ( assert (
search search
== b"\x1e\x03\x01\x00B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00" == b"\x1e\x03\x01\x00B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
) )
assert ( assert (
ref lookup
== b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00" == b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00"
) )
def test_get_byte_arrays_for_search_chars_width_4_case_sensitive(): def test_get_arrays_for_search_chars_width_4_case_sensitive():
( (
search, search,
ref, lookup,
) = spacy.util.get_byte_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", True) ) = spacy.util.get_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", True)
assert search == ref assert search == lookup
assert ( assert (
ref lookup
== b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00" == b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00"
) )

View File

@ -18,6 +18,11 @@ ctypedef fused LexemeOrToken:
const_TokenC_ptr const_TokenC_ptr
cdef extern from "unicodeobject.h":
bint Py_UNICODE_ISUPPER(Py_UCS4 ch)
Py_UCS4 Py_UNICODE_TOLOWER(Py_UCS4 ch)
cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1 cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1
@ -33,25 +38,34 @@ cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
cdef int [:,:] _get_lca_matrix(Doc, int start, int end) cdef int [:,:] _get_lca_matrix(Doc, int start, int end)
cdef void _populate_aff_buf( cdef void _copy_chars(
Py_UCS4* target,
const Py_UCS4* source,
const int length,
const bint to_lower
)
cdef void _set_affixes(
const Py_UCS4* text_buf, const Py_UCS4* text_buf,
const int tok_idx, const int tok_idx,
const int tok_len, const int tok_len,
Py_UCS4* aff_buf, Py_UCS4* aff_buf,
const int pref_length, const int pref_len,
const int suff_length, const int suff_len,
const bint to_lower const bint to_lower
) )
cdef void _populate_search_buf(
cdef void _search_for_chars(
const Py_UCS4* text_buf, const Py_UCS4* text_buf,
const int tok_idx, const int tok_idx,
const int tok_len, const int tok_len,
Py_UCS4* search_buf, Py_UCS4* search_buf,
Py_UCS4* ref_buf, Py_UCS4* lookup_buf,
const int search_buf_len, const int search_buf_len,
Py_UCS4* finding_buf, Py_UCS4* result_buf,
const int finding_buf_len, const int result_buf_len,
bint suffs_not_prefs bint suffs_not_prefs
) )

View File

@ -1,7 +1,7 @@
from typing import Callable, Protocol, Iterable, Iterator, Optional from typing import Callable, Protocol, Iterable, Iterator, Optional
from typing import Union, Tuple, List, Dict, Any, overload from typing import Union, Tuple, List, Dict, Any, overload
from cymem.cymem import Pool from cymem.cymem import Pool
from thinc.types import Floats1d, Floats2d, Ints2d from thinc.types import Floats1d, Floats2d, Ints1d, Ints2d
from .span import Span from .span import Span
from .token import Token from .token import Token
from ._dict_proxies import SpanGroups from ._dict_proxies import SpanGroups
@ -177,17 +177,17 @@ class Doc:
def get_character_combination_hashes( def get_character_combination_hashes(
self, self,
*, *,
case_sensitive: bool, cs: bool,
pref_lengths: List[int], pref_lengths: Ints1d,
suff_lengths: List[int], suff_lengths: Ints1d,
pref_search_chars: str, pref_search_chars: str,
pref_ref_chars: str, pref_lookup_chars: str,
pref_search_char_length: int, pref_search_char_length: int,
pref_search_lengths: List[int], pref_search_lengths: Ints1d,
suff_search_chars: str, suff_search_chars: str,
suff_ref_chars: str, suff_lookup_chars: str,
suff_search_char_length: int, suff_search_char_length: int,
suff_search_lengths: List[int], suff_search_lengths: Ints1d,
) -> Ints2d: ... ) -> Ints2d: ...
@staticmethod @staticmethod
def _get_array_attrs() -> Tuple[Any]: ... def _get_array_attrs() -> Tuple[Any]: ...

View File

@ -3,6 +3,7 @@ from typing import Set, List
cimport cython cimport cython
cimport numpy as np cimport numpy as np
from cpython cimport array
from libc.string cimport memcpy, memcmp, memset from libc.string cimport memcpy, memcmp, memset
from libc.math cimport sqrt from libc.math cimport sqrt
from libc.stdint cimport int32_t, uint64_t from libc.stdint cimport int32_t, uint64_t
@ -105,16 +106,6 @@ class SetEntsDefault(str, Enum):
return list(cls.__members__.keys()) return list(cls.__members__.keys())
cdef extern from "unicodeobject.h":
Py_UCS4 PyUnicode_READ(int kind, void *data, int index)
void* PyUnicode_DATA(void* o)
void PyUnicode_READY(void * o)
int PyUnicode_KIND(void *data)
int PyUnicode_IS_COMPACT(void *data)
Py_UCS4 Py_UNICODE_TOLOWER(Py_UCS4 ch)
cdef class Doc: cdef class Doc:
"""A sequence of Token objects. Access sentences and named entities, export """A sequence of Token objects. Access sentences and named entities, export
annotations to numpy arrays, losslessly serialize to compressed binary annotations to numpy arrays, losslessly serialize to compressed binary
@ -1745,103 +1736,129 @@ cdef class Doc:
return output return output
def get_character_combination_hashes( def get_character_combination_hashes(self,
self,
*, *,
bint cs, const bint cs,
pref_lengths: List[int], np.ndarray p_lengths,
suff_lengths: List[int], np.ndarray s_lengths,
char* pref_search, const char* ps_search,
char* pref_ref, const char* ps_lookup,
int pref_s_char_l, const int ps_l,
pref_search_lengths: List[int], np.ndarray ps_lengths,
char* suff_search, const char* ss_search,
char* suff_ref, const char* ss_lookup,
int suff_s_char_l, const int ss_l,
suff_search_lengths: List[int], np.ndarray ss_lengths,
): ):
""" """
Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations
derived from the string (text/orth) of each token. derived from the raw text of each token.
Generally:
p_ variables relate to prefixes (affixes starting at the beginning of the word)
s_ variables relate to suffixes (affixes starting at the end of the word)
ps_ variables relate to searches starting at the beginning of the word
ss_ variables relate to searches starting at the end of the word
cs: if *False*, the lower-case version of each token string is used as the basis for generating hashes. Note that cs: if *False*, hashes are generated based on the lower-case version of each token.
if *cs==False*, upper-case characters in *search_chars* will not be found in token strings. p_lengths: an Ints1d specifying the lengths of prefixes to be hashed. For example, if *p_lengths==[2, 3]*,
pref_lengths: an integer list specifying the lengths of prefixes to be hashed. For example, if *pref_lengths==[2, 3]*,
the prefixes hashed for "spaCy" would be "sp" and "spa". the prefixes hashed for "spaCy" would be "sp" and "spa".
suff_lengths: an integer list specifying the lengths of suffixes to be hashed. For example, if *suff_lengths==[2, 3]* and s_lengths: an Ints1d specifying the lengths of suffixes to be hashed. For example, if *s_lengths==[2, 3]* and
*case_sensitive == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy". *cs == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy".
pref_search_chars: a string containing characters to search for within each token, starting at the beginning. ps_search: a byte array containing characters to search for within each token, starting at the beginning.
pref_search_lengths: an integer list specifying the lengths of search results to be hashed. For example if ps_lookup: a byte array containing characters that are added to the result string when a character at
*pref_search_lengths==[1, 2]*, *pref_search_chars=="aC" and *cs==False*, the searched strings hashed for the corresponding position in *ps_search* is matched. Having separate search and lookup arrays enables
case-insensitivity to be handled efficiently.
ps_l: the number of characters in *ps_search* and hence also in *ps_lookup*
ps_lengths: an Ints1d specifying the lengths of search results to be hashed. For example if
*ps_lengths==[1, 2]*, *ps_search=="aC" and *cs==False*, the searched strings hashed for
"spaCy" would be "a" and "ac". "spaCy" would be "a" and "ac".
suff_search_chars: a string containing characters to search for within each token, starting at the end. ss_search: a byte array containing characters to search for within each token, starting at the end.
suff_search_lengths: an integer list specifying the lengths of search results to be hashed. For example if ss_lookup: a byte array containing characters that are added to the result string when a character at
the corresponding position in *ss_search* is matched. Having separate search and lookup arrays enables
case-insensitivity to be handled efficiently.
ss_l: the number of characters in *ss_search* and hence also in *ss_lookup*
ss_lengths: an integer list specifying the lengths of search results to be hashed. For example if
*suff_search_lengths==[1, 2]*, *suff_search_chars=="aC" and *cs==False*, the searched strings hashed for *suff_search_lengths==[1, 2]*, *suff_search_chars=="aC" and *cs==False*, the searched strings hashed for
"spaCy" would be "c" and "ca". "spaCy" would be "c" and "ca".
For a document with tokens ["spaCy", "and", "Prodigy"], the NumPy array returned by For a document with tokens ["spaCy", "and", "Prodigy"], the NumPy array returned by
*get_character_combination_hashes(True, [2], [2, 4, 6], "yC", [1], [2])* would correspond to *get_character_combination_hashes(True, [2], [2, 4, 6], "", "", 0, [], "yC", "yC", 2, [1, 2])* would correspond to
[[hash("sp"), [hash("Cy"), hash("paCy"), hash("spaCy"), hash("y"), hash("yC")], [[hash("sp"), [hash("Cy"), hash("paCy"), hash(" spaCy"), hash("y"), hash("yC")],
[hash("an"), hash("nd"), hash("and", hash("and"), hash(" "), hash(" "))], [hash("an"), hash("nd"), hash(" and", hash(" and"), hash(" "), hash(" "))],
[hash("Pr") ,hash("gy"), hash("digy"), hash("rodigy"), hash("y"), hash("y ")]] [hash("Pr") ,hash("gy"), hash("digy"), hash("rodigy"), hash("y"), hash("y ")]]
""" """
cdef int max_pref_l = max(pref_lengths) if len(pref_lengths) > 0 else 0 # Encode the document text
cdef int max_suff_l = max(suff_lengths) if len(suff_lengths) > 0 else 0
cdef int aff_buf_l = max_pref_l + max_suff_l
cdef int max_s_pref_l = max(pref_search_lengths) if len(pref_search_lengths) > 0 else 0
cdef int max_s_suff_l = max(suff_search_lengths) if len(suff_search_lengths) > 0 else 0
cdef Py_UCS4* aff_buf = <Py_UCS4*>self.mem.alloc(4, aff_buf_l)
cdef Py_UCS4* pref_s_buf = <Py_UCS4*>pref_search
cdef Py_UCS4* pref_r_buf = <Py_UCS4*>pref_ref
cdef Py_UCS4* pref_f_buf = <Py_UCS4*>self.mem.alloc(4, max_s_pref_l)
cdef Py_UCS4* suff_s_buf = <Py_UCS4*>suff_search
cdef Py_UCS4* suff_r_buf = <Py_UCS4*>suff_ref
cdef Py_UCS4* suff_f_buf = <Py_UCS4*>self.mem.alloc(4, max_s_suff_l)
cdef bytes encoded_text = self.text.encode("utf-32le") cdef bytes encoded_text = self.text.encode("utf-32le")
cdef char* intermediate_text = encoded_text cdef char* intermediate_text = encoded_text
cdef Py_UCS4* text_buf = <Py_UCS4*> intermediate_text cdef Py_UCS4* text_buf = <Py_UCS4*> intermediate_text
cdef unsigned int num_toks = len(self), aff_len # Define the result array and work out what is used for what in axis 1
cdef unsigned int h_pref_n = len(pref_lengths) cdef int num_toks = len(self)
cdef unsigned int h_suff_n = len(suff_lengths), h_suff_end_idx = len(pref_lengths) + len(suff_lengths) cdef int p_h_num = p_lengths.shape[0]
cdef unsigned int h_pref_s_n = len(pref_search_lengths), h_pref_s_end_idx = h_suff_end_idx + h_pref_s_n cdef int s_h_num = s_lengths.shape[0], s_h_end = p_h_num + s_h_num
cdef unsigned int h_suff_s_n = len(suff_search_lengths), h_suff_s_end_idx = h_pref_s_end_idx + h_suff_s_n cdef int ps_h_num = ps_lengths.shape[0], ps_h_end = s_h_end + ps_h_num
cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, h_suff_s_end_idx), dtype="int64") cdef int ss_h_num = ss_lengths.shape[0], ss_h_end = ps_h_end + ss_h_num
cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, ss_h_end), dtype="int64")
# Determine the maximum possible lengths of the affixes to work out how big the buffers need to be
cdef int p_max_l = max(p_lengths) if p_h_num > 0 else 0
cdef int s_max_l = max(s_lengths) if s_h_num > 0 else 0
cdef int ps_max_l = max(ps_lengths) if ps_h_num > 0 else 0
cdef int ss_max_l = max(ss_lengths) if ss_h_num > 0 else 0
# Define / allocate buffer (pr/sr: result buffers)
cdef int aff_buf_l = p_max_l + s_max_l
cdef Py_UCS4* aff_buf = <Py_UCS4*> self.mem.alloc(aff_buf_l, sizeof(Py_UCS4))
cdef Py_UCS4* ps_buf = <Py_UCS4*> ps_search
cdef Py_UCS4* pl_buf = <Py_UCS4*> ps_lookup
cdef Py_UCS4* pr_buf = <Py_UCS4*> self.mem.alloc(ps_max_l, sizeof(Py_UCS4))
cdef Py_UCS4* ss_buf = <Py_UCS4*> ss_search
cdef Py_UCS4* sl_buf = <Py_UCS4*> ss_lookup
cdef Py_UCS4* sr_buf = <Py_UCS4*> self.mem.alloc(ss_max_l, sizeof(Py_UCS4))
# Define memory views on length arrays
cdef int[:] p_v = p_lengths
cdef int[:] s_v = s_lengths
cdef int[:] ps_v = ps_lengths
cdef int[:] ss_v = ss_lengths
# Define working variables
cdef TokenC tok_c cdef TokenC tok_c
cdef int tok_i, tok_idx, tok_len, aff_len
for tok_i in range(num_toks): for tok_i in range(num_toks):
tok_c = self.c[tok_i] tok_c = self.c[tok_i]
tok_idx = tok_c.idx tok_idx = tok_c.idx
tok_len = tok_c.lex.length tok_len = tok_c.lex.length
_populate_aff_buf(text_buf, tok_idx, tok_len, aff_buf, max_pref_l, max_suff_l, not cs) if aff_buf_l > 0:
_populate_search_buf(text_buf, tok_idx, tok_len, pref_s_buf, pref_r_buf, pref_s_char_l, pref_f_buf, max_s_pref_l, False) _set_affixes(text_buf, tok_idx, tok_len, aff_buf, p_max_l, s_max_l, not cs)
_populate_search_buf(text_buf, tok_idx, tok_len, suff_s_buf, suff_r_buf, suff_s_char_l, suff_f_buf, max_s_suff_l, True)
for hash_idx in range(h_pref_n):
aff_len = pref_lengths[hash_idx]
hashes[tok_i, hash_idx] = hash32(aff_buf, aff_len * 4, 0)
for hash_idx in range(h_pref_n, h_suff_end_idx):
aff_len = suff_lengths[hash_idx - h_pref_n]
hashes[tok_i, hash_idx] = hash32(aff_buf + aff_buf_l - aff_len, aff_len * 4, 0)
for hash_idx in range(h_suff_end_idx, h_pref_s_end_idx): for hash_idx in range(p_h_num):
aff_len = pref_search_lengths[hash_idx - h_suff_end_idx] hashes[tok_i, hash_idx] = hash32(aff_buf, p_v[hash_idx] * sizeof(Py_UCS4), 0)
hashes[tok_i, hash_idx] = hash32(pref_f_buf, aff_len * 4, 0)
for hash_idx in range(p_h_num, s_h_end):
aff_len = s_v[hash_idx - p_h_num]
hashes[tok_i, hash_idx] = hash32(aff_buf + aff_buf_l - aff_len, aff_len * sizeof(Py_UCS4), 0)
if ps_h_num > 0:
_search_for_chars(text_buf, tok_idx, tok_len, ps_buf, pl_buf, ps_l, pr_buf, ps_max_l, False)
for hash_idx in range(s_h_end, ps_h_end):
aff_len = ps_v[hash_idx - s_h_end]
hashes[tok_i, hash_idx] = hash32(pr_buf, aff_len * sizeof(Py_UCS4), 0)
for hash_idx in range(h_pref_s_end_idx, h_suff_s_end_idx): if ss_h_num > 0:
aff_len = suff_search_lengths[hash_idx - h_pref_s_end_idx] _search_for_chars(text_buf, tok_idx, tok_len, ss_buf, sl_buf, ss_l, sr_buf, ss_max_l, True)
hashes[tok_i, hash_idx] = hash32(suff_f_buf, aff_len * 4, 0) for hash_idx in range(ps_h_end, ss_h_end):
aff_len = ss_v[hash_idx - ps_h_end]
hashes[tok_i, hash_idx] = hash32(sr_buf, aff_len * sizeof(Py_UCS4), 0)
self.mem.free(aff_buf) self.mem.free(aff_buf)
self.mem.free(pref_f_buf) self.mem.free(pr_buf)
self.mem.free(suff_f_buf) self.mem.free(sr_buf)
return hashes return hashes
@staticmethod @staticmethod
@ -2025,76 +2042,103 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
return lca_matrix return lca_matrix
cdef void _populate_aff_buf( cdef void _copy_chars(
Py_UCS4* target,
const Py_UCS4* source,
const int length,
const bint to_lower
):
"""Copies *length* Py_UCS4 characters from *source* to *target*. If *to_lower==True*, converts
any upper-case characters to lower case within the target buffer.
"""
memcpy(target, source, length * sizeof(Py_UCS4))
cdef int idx
if to_lower:
for idx in range(length):
if Py_UNICODE_ISUPPER(target[idx]):
target[idx] = Py_UNICODE_TOLOWER(target[idx])
cdef void _set_affixes(
const Py_UCS4* text_buf, const Py_UCS4* text_buf,
const int tok_idx, const int tok_idx,
const int tok_len, const int tok_len,
Py_UCS4* aff_buf, Py_UCS4* aff_buf,
const int pref_length, const int pref_len,
const int suff_length, const int suff_len,
const bint to_lower const bint to_lower
): ):
""" Populate a buffer of length p+s with the first p and the last s characters of a word within a string. """ Populate a buffer of length pref+suff with the first pref and the last suff characters of a word within a string.
If the word is shorter than p and/or s, the empty character positions in the middle are filled with zeros. If the word is shorter than pref and/or suff, the empty character positions in the middle are filled with zeros.
str_data_ptr: a pointer to the raw data in the containing string, which must be in canonical text_buf: a pointer to a UTF-32LE representation of the containing string.
Unicode form (see PEP 393). tok_idx: the index of the first character of the word within the containing string.
kind: the number of bytes occupied by each character in the containing string. tok_len: the length of the word.
word_idx: the index of the first character of the word within the containing string.
word_len: the length of the word.
aff_buf: the buffer to populate. aff_buf: the buffer to populate.
pref_length: the length of the prefix. pref_len: the length of the prefix.
suff_length: the length of the suffix. suff_len: the length of the suffix.
to_lower: if *True*, any upper case characters in either affix are converted to lower case. to_lower: if *True*, any upper case characters in either affix are converted to lower case.
""" """
cdef int aff_buf_idx = 0, buf_size = pref_length + suff_length, in_word_idx cdef int aff_buf_idx = 0, aff_buf_len = pref_len + suff_len, in_word_idx, filled_pref_len
while aff_buf_idx < pref_length and aff_buf_idx < tok_len: if pref_len > 0:
filled_pref_len = pref_len if pref_len < tok_len else tok_len
_copy_chars(aff_buf, text_buf + tok_idx, filled_pref_len, to_lower)
aff_buf_idx = filled_pref_len
memcpy(aff_buf + aff_buf_idx, text_buf + tok_idx + aff_buf_idx, 4) if tok_len < pref_len:
if to_lower: memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (pref_len - tok_len))
aff_buf[aff_buf_idx] = Py_UNICODE_TOLOWER(aff_buf[aff_buf_idx]) aff_buf_idx = aff_buf_len - suff_len
aff_buf_idx += 1 if tok_len < suff_len:
memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (suff_len - tok_len))
aff_buf_idx = aff_buf_len - tok_len
if aff_buf_idx < buf_size - tok_len: if suff_len > 0:
# fill out the empty middle part of the buffer with zeros in_word_idx = aff_buf_idx + tok_len - aff_buf_len
memset(aff_buf, 0, buf_size - suff_length - aff_buf_idx) if in_word_idx < pref_len:
memcpy(aff_buf + aff_buf_idx, aff_buf + in_word_idx, sizeof(Py_UCS4) * (filled_pref_len - in_word_idx))
aff_buf_idx += filled_pref_len - in_word_idx
if aff_buf_idx < aff_buf_len:
_copy_chars(aff_buf + aff_buf_idx, text_buf + tok_idx + in_word_idx, aff_buf_len - aff_buf_idx, to_lower)
while aff_buf_idx < buf_size:
in_word_idx = aff_buf_idx + tok_len - buf_size
# for suffixes we have to track the in-word index separately from the in-buffer index
if in_word_idx < pref_length:
# we've already retrieved this character as part of the prefix, so copy it from there
# as that's quicker than retrieving it from the input string a second time
memcpy(aff_buf + aff_buf_idx, aff_buf + in_word_idx, 4)
else:
memcpy(aff_buf + aff_buf_idx, text_buf + tok_idx + in_word_idx, 4)
if to_lower:
aff_buf[aff_buf_idx] = Py_UNICODE_TOLOWER(aff_buf[aff_buf_idx])
aff_buf_idx += 1
cdef void _populate_search_buf( cdef void _search_for_chars(
const Py_UCS4* text_buf, const Py_UCS4* text_buf,
const int tok_idx, const int tok_idx,
const int tok_len, const int tok_len,
Py_UCS4* search_buf, Py_UCS4* search_buf,
Py_UCS4* ref_buf, Py_UCS4* lookup_buf,
const int search_buf_len, const int search_buf_len,
Py_UCS4* finding_buf, Py_UCS4* result_buf,
const int finding_buf_len, const int result_buf_len,
bint suffs_not_prefs bint suffs_not_prefs
): ):
cdef unsigned int finding_buf_idx = 0, text_string_idx = tok_idx + (tok_len - 1) if suffs_not_prefs else tok_idx """ Search a word within a string for characters within *search_buf*, starting at the beginning or
cdef unsigned int search_buf_idx end depending on the value of *suffs_not_prefs*. Wherever a character from *search_buf* matches,
cdef int cmp_res the corresponding character from *lookup_buf* is added to *result_buf*.
while finding_buf_idx < finding_buf_len: text_buf: a pointer to a UTF-32LE representation of the containing string.
tok_idx: the index of the first character of the word within the containing string.
tok_len: the length of the word.
search_buf: the characters to search for (ordered).
lookup_buf: characters corresponding to *search_buf* to add to *result_buf* in the case of a match.
Having separate search and lookup arrays enables case-insensitivity to be handled efficiently.
search_buf_len: the length of *search_buf* and hence also of *lookup_buf*.
result_buf: the buffer in which to place the results.
result_buf_len: the length of *result_buf*.
suffs_not_prefs: if *True*, searching starts from the end of the word; if *False*, from the beginning.
"""
cdef int result_buf_idx = 0, text_string_idx = tok_idx + (tok_len - 1) if suffs_not_prefs else tok_idx
cdef int search_buf_idx
cdef int cmp_result
while result_buf_idx < result_buf_len:
for search_buf_idx in range (search_buf_len): for search_buf_idx in range (search_buf_len):
cmp_res = memcmp(search_buf + search_buf_idx, text_buf + text_string_idx, 4) cmp_result = memcmp(search_buf + search_buf_idx, text_buf + text_string_idx, sizeof(Py_UCS4))
if cmp_res == 0: if cmp_result == 0:
memcpy(finding_buf + finding_buf_idx, ref_buf + search_buf_idx, 4) memcpy(result_buf + result_buf_idx, lookup_buf + search_buf_idx, sizeof(Py_UCS4))
finding_buf_idx += 1 result_buf_idx += 1
if cmp_res >= 0: if cmp_result >= 0:
break break
if suffs_not_prefs: if suffs_not_prefs:
if text_string_idx <= tok_idx: if text_string_idx <= tok_idx:
@ -2105,11 +2149,11 @@ cdef void _populate_search_buf(
if text_string_idx >= tok_idx + tok_len: if text_string_idx >= tok_idx + tok_len:
break break
if finding_buf_idx < finding_buf_len: # fill in any unused characters in the result buffer with zeros
memset(finding_buf + finding_buf_idx, 0, finding_buf_len - finding_buf_idx) if result_buf_idx < result_buf_len:
memset(result_buf + result_buf_idx, 0, (result_buf_len - result_buf_idx) * sizeof(Py_UCS4))
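The search/lookup mechanism implemented by `_search_for_chars` and described in its docstring can be sketched in plain Python as follows (illustrative only; buffer management and the sorted-array early exit are omitted):

```python
# Editorial sketch of the search/lookup mechanism in plain Python.
def search_for_chars(word: str, search: str, lookup: str, max_len: int,
                     suffs_not_prefs: bool) -> str:
    # Scan the word (from the end if suffs_not_prefs); whenever a character from
    # *search* matches, emit its counterpart from *lookup*, up to max_len results.
    result = []
    for ch in (reversed(word) if suffs_not_prefs else word):
        if len(result) >= max_len:
            break
        if ch in search:
            result.append(lookup[search.index(ch)])
    return "".join(result)

# With search "aC" and lookup "ac" (matches on either character map to lower case):
# search_for_chars("spaCy", "aC", "ac", 2, False) -> "ac"
# search_for_chars("spaCy", "aC", "ac", 2, True)  -> "ca"
```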
def pickle_doc(doc): def pickle_doc(doc):
bytes_data = doc.to_bytes(exclude=["vocab", "user_data", "user_hooks"]) bytes_data = doc.to_bytes(exclude=["vocab", "user_data", "user_hooks"])
hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks, hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks,

View File

@ -1737,7 +1737,7 @@ def all_equal(iterable):
return next(g, True) and not next(g, False) return next(g, True) and not next(g, False)
def get_byte_arrays_for_search_chars( def get_arrays_for_search_chars(
search_chars: str, case_sensitive: bool search_chars: str, case_sensitive: bool
) -> Tuple[bytes, bytes]: ) -> Tuple[bytes, bytes]:
""" """
@ -1746,14 +1746,14 @@ def get_byte_arrays_for_search_chars(
for search characters. The encoding is little-endian regardless of architecture, as for search characters. The encoding is little-endian regardless of architecture, as
this is what is expected by the murmurhash library used downstream. this is what is expected by the murmurhash library used downstream.
Alongside the "search byte array" against which words from document texts are compared Alongside the "search array" against which words from document texts are compared
is the "ref byte array". When a character from the search byte array is matched, is the "lookup array". When a character from the search array is matched,
the character at the corresponding position in the ref byte array is added to the the character at the corresponding position in the lookup array is added to the
byte sequence of the configured length that is then hashed. This enables case-sensitivity sequence that then goes on to be hashed. This enables case-sensitivity
to be handled without converting the case of the words being searched: if to be handled without converting the case of the words being searched: if
*case_sensitive==False*, the lower- or uppercase counterparts of any characters that *case_sensitive==False*, the lower- or uppercase counterparts of any characters that
have case are added to the search byte arrays, and both the original character and its have case are added to the search array, and both the original character and its
other-cased counterpart map to the lower-case version in the ref byte array. other-cased counterpart map to the lower-case version in the lookup array.
""" """
def encode(ch: str) -> bytes: def encode(ch: str) -> bytes:
@ -1762,8 +1762,8 @@ def get_byte_arrays_for_search_chars(
""" """
return ch.encode("UTF-32LE") return ch.encode("UTF-32LE")
def add_to_byte_arrays( def add_to_arrays(
search: List[bytes], ref: List[bytes], ch: str search: List[bytes], lookup: List[bytes], ch: str
) -> None: ) -> None:
"""Add the byte representations of *ch* to the two byte array lists. """Add the byte representations of *ch* to the two byte array lists.
""" """
@ -1771,36 +1771,36 @@ def get_byte_arrays_for_search_chars(
if not case_sensitive and ch.islower(): if not case_sensitive and ch.islower():
if this_char_bytes not in search: if this_char_bytes not in search:
search.append(this_char_bytes) search.append(this_char_bytes)
ref.append(this_char_bytes) lookup.append(this_char_bytes)
upper_char_bytes = encode(ch.upper()) upper_char_bytes = encode(ch.upper())
if upper_char_bytes not in search: if upper_char_bytes not in search:
search.append(upper_char_bytes) search.append(upper_char_bytes)
ref.append(this_char_bytes) lookup.append(this_char_bytes)
elif not case_sensitive and ch.isupper(): elif not case_sensitive and ch.isupper():
lower_char_bytes = encode(ch.lower()) lower_char_bytes = encode(ch.lower())
if this_char_bytes not in search: if this_char_bytes not in search:
search.append(this_char_bytes) search.append(this_char_bytes)
ref.append(lower_char_bytes) lookup.append(lower_char_bytes)
if lower_char_bytes not in search: if lower_char_bytes not in search:
search.append(lower_char_bytes) search.append(lower_char_bytes)
ref.append(lower_char_bytes) lookup.append(lower_char_bytes)
elif this_char_bytes not in search: elif this_char_bytes not in search:
search.append(this_char_bytes) search.append(this_char_bytes)
ref.append(this_char_bytes) lookup.append(this_char_bytes)
def get_ordered_raw_bytes( def get_ordered_raw_bytes(
search: List[bytes], ref: List[bytes] search: List[bytes], lookup: List[bytes]
) -> Tuple[bytes, bytes]: ) -> Tuple[bytes, bytes]:
"""Flatten the two lists, ordering both by the entries in *search* """Flatten the two lists, ordering both by the entries in *search*
using the native endianness of the platform. using the native endianness of the platform.
""" """
num_search = [list(entry) for entry in search] num_search = [list(entry) for entry in search]
search = [entry for _, entry in sorted(zip(num_search, search))] search = [entry for _, entry in sorted(zip(num_search, search))]
ref = [entry for _, entry in sorted(zip(num_search, ref))] lookup = [entry for _, entry in sorted(zip(num_search, lookup))]
return b"".join(search), b"".join(ref) return b"".join(search), b"".join(lookup)
search: List[bytes] = [] search: List[bytes] = []
ref: List[bytes] = [] lookup: List[bytes] = []
for ch in search_chars: for ch in search_chars:
add_to_byte_arrays(search, ref, ch) add_to_arrays(search, lookup, ch)
return get_ordered_raw_bytes(search, ref) return get_ordered_raw_bytes(search, lookup)
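Based on the behaviour described in the docstring of `get_arrays_for_search_chars`, a quick illustrative check (editorial example, decoding the returned UTF-32LE byte arrays back into characters) might look like this:

```python
# Editorial example: with case_sensitive=False, both cases of each character are
# searchable, and every match maps to the lower-case character in the lookup array.
import spacy

search, lookup = spacy.util.get_arrays_for_search_chars("aC", False)
assert search.decode("utf-32-le") == "ACac"
assert lookup.decode("utf-32-le") == "acac"
```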

View File

@ -218,28 +218,6 @@ whose presence before or after characters that would otherwise alternate
prevents the alternation from occurring, e.g. an `ä` in a German plural noun
does not become `a` if it is the third or fourth vowel from the end of the word.
Internally, the model converts each token string to
[UTF-16](https://www.ietf.org/rfc/rfc2781.txt) and assumes that each character
from the string occupies two bytes. This assumption holds for all characters in
the Basic Multilingual Plane, which encompasses all characters that are ever
likely to be of interest when extracting features. There are, however,
characters like emojis that are in the Extended Multilingual Plane and occupy
four bytes, although importantly neither of the two byte pairs that make up such
a representation can be a valid two-byte character in its own right. The
following considerations apply to the processing of four-byte characters:
- An exceptional four-byte character within a text consisting mostly of two-byte
characters will probably be ignored by the neural network accepting the
embedding layer as not matching any of the learned features.
- If anyone did want to train a model for a language like Lycian that is
generally written in four-byte characters, prefix and suffix features can
still be extracted, but the length specifications should all be doubled, i.e.
`[2,4,6]` to extract one-, two- and three-character affixes. In such a
situation length specifications that are odd numbers would serve no useful
purpose since they would refer to half-characters.
- Four-byte characters are not accepted within search character specification
strings and lead to an error being thrown.
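As an editorial illustration of the claim about surrogate pairs above (not part of the documentation itself), the following Python snippet shows that a character outside the Basic Multilingual Plane encodes as two UTF-16 code units, neither of which is a valid character on its own:

```python
# Editorial illustration: a character outside the Basic Multilingual Plane becomes
# a UTF-16 surrogate pair; neither half is a valid stand-alone character.
units = "𐌞".encode("utf-16-le")
assert len(units) == 4
high = int.from_bytes(units[:2], "little")
low = int.from_bytes(units[2:], "little")
assert 0xD800 <= high <= 0xDBFF  # high surrogate
assert 0xDC00 <= low <= 0xDFFF   # low surrogate
```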
| Name | Description |
| ------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `width` | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. If static vectors are included, a learned linear layer is used to map the vectors to the specified width before concatenating it with the other embedding outputs. A single maxout layer is then used to reduce the concatenated vectors to the final width. ~~int~~ |