Mirror of https://github.com/explosion/spaCy.git (synced 2025-08-02 19:30:19 +03:00)

Remove search functionality

Commit e95dd432d1 (parent 2116a71962)
@@ -193,23 +193,14 @@ def _verify_rich_config_group(
     label: str,
     lengths: Optional[List[int]],
     rows: Optional[List[int]],
-    search_chars: Optional[str],
-    is_search_char_group: bool,
 ) -> None:
     if lengths is not None or rows is not None:
-        if is_search_char_group and (search_chars is None or len(search_chars) == 0):
-            raise ValueError(Errors.E1048.format(label=label))
-        if search_chars is not None and len(search_chars) > 63:
-            raise ValueError(Errors.E1049.format(label=label))
         if lengths is None or rows is None:
             raise ValueError(Errors.E1048.format(label=label))
         if len(lengths) != len(rows):
             raise ValueError(Errors.E1048.format(label=label))
         if any([length < 1 for length in lengths]):
             raise ValueError(Errors.E1048.format(label=label))
-    elif search_chars is not None:
-        raise ValueError(Errors.E1048.format(label=label))
-    if lengths is not None:
         if lengths[-1] > 63:
             raise ValueError(Errors.E1049.format(label=label))
         if len(lengths) != len(set(lengths)) or lengths != sorted(lengths):
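(Note: the surviving checks boil down to a handful of constraints on each lengths/rows pair. The following is a minimal standalone sketch of that validation with generic error messages instead of spaCy's `Errors.E1048`/`E1049`; it is an illustration, not the project code.)

```python
from typing import List, Optional


def verify_group(label: str, lengths: Optional[List[int]], rows: Optional[List[int]]) -> None:
    # Sketch of what _verify_rich_config_group enforces after this commit.
    if lengths is None and rows is None:
        return  # the whole group may be omitted
    if lengths is None or rows is None:
        raise ValueError(f"{label}: lengths and rows must be specified together")
    if len(lengths) != len(rows):
        raise ValueError(f"{label}: lengths and rows must have the same number of entries")
    if any(length < 1 for length in lengths):
        raise ValueError(f"{label}: lengths must be positive")
    if lengths[-1] > 63:
        raise ValueError(f"{label}: lengths may not exceed 63")
    if len(lengths) != len(set(lengths)) or lengths != sorted(lengths):
        raise ValueError(f"{label}: lengths must be unique and in ascending order")


verify_group("prefix", [1, 3, 4], [10000, 10000, 10000])  # passes silently
```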
@@ -227,12 +218,6 @@ def RichMultiHashEmbed(
     pref_rows: Optional[List[int]] = None,
     suff_lengths: Optional[List[int]] = None,
     suff_rows: Optional[List[int]] = None,
-    pref_search_chars: Optional[str] = None,
-    pref_search_lengths: Optional[List[int]] = None,
-    pref_search_rows: Optional[List[int]] = None,
-    suff_search_chars: Optional[str] = None,
-    suff_search_lengths: Optional[List[int]] = None,
-    suff_search_rows: Optional[List[int]] = None,
 ) -> Model[List[Doc], List[Floats2d]]:
     """
     Construct an embedding layer with the features of `MultiHashEmbed` (see above)
@@ -240,37 +225,12 @@ def RichMultiHashEmbed(
     The fixed-length `PREFIX` and `SUFFIX` features used in `MultiHashEmbed`
     are sometimes not rich enough when working with languages with complex morphology,
     and this layer allows the specification of multiple prefixes and suffixes
-    of any lengths.
-
-    Additionally, it is possible to use as features the results of character
-    searches of specified lengths. A list of search characters is specified; the
-    characters in each word are examined in order starting at the beginning or at
-    the end; and each character that matches one of the search characters is added,
-    in order, to the string to be used as a feature. The search continues until
-    either the search result string is full or the whole word has been examined.
-    This is useful because some languages exhibit morphological alternations where
-    one letter or letters regularly alternate with another letter or letters
-    depending on the presence of some other letter before or after it, e.g. German
-    plural nouns where the final two vowels are `ä-e` regularly correspond to
-    singular lemmas where the `e` is no longer present and the `ä` has become `a`,
-    e.g. `die Bäche` (plural) vs. `der Bach` (singular).
-
-    For most languages used with spaCy, searching is likely to be useful starting
-    at the end (`suff_*`), but the ability to search from the beginning (`pref_*`)
-    is also offered for completeness. Search characters should consist of all
-    characters that regularly alternate with other characters in the language in
-    question or whose presence before or after characters that would otherwise
-    alternate prevents the alternation from occurring, e.g. an `ä` in a German
-    plural noun does not become `a` if it is the third or fourth vowel from the
-    end of the word.
+    of any lengths. Arrays specifying lengths must be in ascending order.

     There are a few rare situations where a graphical character is expressed as
     more than one UTF-8 character, e.g. *i* when representing the lower-case form
     of the Turkish letter *İ*. Such situations are supported, but the lengths of
-    prefixes, suffixes and character search results may need to be increased
-    accordingly.
-
-    All arrays specifying lengths must be in ascending order.
+    prefixes and suffixes may need to be increased accordingly.

     width (int): The output width. Also used as the width of the embedding tables.
         Recommended values are between 64 and 300.
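(Note: the removed paragraphs describe the character-search feature in prose. As a plain illustration of that documented behaviour — scan the word from one end, keep each character that belongs to the search set, stop when the result string is full — here is a small sketch with hypothetical names; it is not the deleted Cython implementation.)

```python
def char_search(word: str, search_chars: str, max_len: int, from_end: bool = True) -> str:
    # Collect, in the order encountered, characters of `word` that occur in
    # `search_chars`, scanning from the end (suffix search) or the beginning.
    scan = reversed(word) if from_end else iter(word)
    found = [ch for ch in scan if ch in search_chars]
    return "".join(found[:max_len])


# German alternation from the removed docstring, using the vowel set from the
# removed documentation example ("aeiouäöüß"):
print(char_search("Bäche".lower(), "aeiouäöüß", max_len=2))  # "eä" (plural)
print(char_search("Bach".lower(), "aeiouäöüß", max_len=2))   # "a"  (singular)
```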
@@ -290,39 +250,13 @@ def RichMultiHashEmbed(
         for each word, e.g. for the word `spaCy`:
         `[1, 3]` would lead to `y` and `yCa` being used as features.
     suff_rows (Optional[List[int]]): The number of rows for each of `suff_lengths`.
-    pref_search_chars (Optional[str]): A string containing characters to search for
-        starting from the beginning of each word.
-    pref_search_lengths (Optional[List[int]]): The lengths of search result strings
-        to use as features, where the searches start from the beginning of each word.
-    pref_search_rows (Optional[List[int]]): The number of rows for each of
-        `pref_search_lengths`.
-    suff_search_chars (Optional[str]): A string containing characters to search for
-        starting from the end of each word.
-    suff_search_lengths (Optional[List[int]]): The lengths of search result strings
-        to use as features, where the searches start from the end of each word.
-    suff_search_rows (Optional[List[int]]): The number of rows for each of
-        `suff_search_lengths`.
     """

     if len(rows) != len(attrs):
         raise ValueError(f"Mismatched lengths: {len(rows)} vs {len(attrs)}")

-    _verify_rich_config_group("prefix", pref_lengths, pref_rows, None, False)
-    _verify_rich_config_group("suffix", suff_lengths, suff_rows, None, False)
-    _verify_rich_config_group(
-        "prefix search",
-        pref_search_lengths,
-        pref_search_rows,
-        pref_search_chars,
-        True,
-    )
-    _verify_rich_config_group(
-        "suffix search",
-        suff_search_lengths,
-        suff_search_rows,
-        suff_search_chars,
-        True,
-    )
+    _verify_rich_config_group("prefix", pref_lengths, pref_rows)
+    _verify_rich_config_group("suffix", suff_lengths, suff_rows)

     if "PREFIX" in attrs or "SUFFIX" in attrs:
         warnings.warn(Warnings.W124)
@@ -331,10 +265,6 @@ def RichMultiHashEmbed(
         rows.extend(pref_rows)
     if suff_rows is not None:
         rows.extend(suff_rows)
-    if pref_search_rows is not None:
-        rows.extend(pref_search_rows)
-    if suff_search_rows is not None:
-        rows.extend(suff_search_rows)

     embeddings: List[Model[Ints2d, Floats2d]] = [
         HashEmbed(width, row, column=i, seed=i + 7, dropout=0.0)
@@ -350,10 +280,6 @@ def RichMultiHashEmbed(
             case_sensitive=case_sensitive,
             pref_lengths=pref_lengths,
             suff_lengths=suff_lengths,
-            pref_search_chars=pref_search_chars,
-            pref_search_lengths=pref_search_lengths,
-            suff_search_chars=suff_search_chars,
-            suff_search_lengths=suff_search_lengths,
         ),
     )

@@ -2,7 +2,6 @@ from typing import List, Optional, Callable, Tuple
 from thinc.types import Ints2d
 from thinc.api import Model, registry, get_current_ops
 from ..tokens import Doc
-from ..util import get_search_char_byte_arrays


 @registry.layers("spacy.RichFeatureExtractor.v1")
@@ -11,27 +10,7 @@ def RichFeatureExtractor(
     case_sensitive: bool,
     pref_lengths: Optional[List[int]] = None,
     suff_lengths: Optional[List[int]] = None,
-    pref_search_chars: Optional[str] = None,
-    pref_search_lengths: Optional[List[int]] = None,
-    suff_search_chars: Optional[str] = None,
-    suff_search_lengths: Optional[List[int]] = None,
 ) -> Model[List[Doc], List[Ints2d]]:
-    ops = get_current_ops()
-    if pref_search_chars is not None:
-        ps_search_chars, ps_width_offsets = get_search_char_byte_arrays(
-            pref_search_chars, case_sensitive
-        )
-    else:
-        ps_search_chars = bytes()
-        ps_width_offsets = bytes()
-    if suff_search_chars is not None:
-
-        ss_search_chars, ss_width_offsets = get_search_char_byte_arrays(
-            suff_search_chars, case_sensitive
-        )
-    else:
-        ss_search_chars = bytes()
-        ss_width_offsets = bytes()
     return Model(
         "extract_character_combination_hashes",
         forward,
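(Note: a rough usage sketch of the slimmed-down layer. It assumes the registered name from the decorator above and that the remaining keyword arguments behave as shown in the signature; the construction path itself is not part of the diff, and the layer only exists on this feature branch.)

```python
import spacy
from thinc.api import registry

# Look the factory up by the name registered above.
make_extractor = registry.layers.get("spacy.RichFeatureExtractor.v1")
extractor = make_extractor(case_sensitive=False, pref_lengths=[1, 3], suff_lengths=[2, 4])

nlp = spacy.blank("en")
doc = nlp("spaCy rocks")
(hashes,) = extractor.predict([doc])  # one row per token, one uint64 column per requested length
print(hashes.shape)                   # expected: (2, 4)
```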
@@ -39,16 +18,6 @@ def RichFeatureExtractor(
             "case_sensitive": case_sensitive,
             "p_lengths": bytes(pref_lengths) if pref_lengths is not None else bytes(),
             "s_lengths": bytes(suff_lengths) if suff_lengths is not None else bytes(),
-            "ps_search_chars": ps_search_chars,
-            "ps_width_offsets": ps_width_offsets,
-            "ps_lengths": bytes(pref_search_lengths)
-            if pref_search_lengths is not None
-            else bytes(),
-            "ss_search_chars": ss_search_chars,
-            "ss_width_offsets": ss_width_offsets,
-            "ss_lengths": bytes(suff_search_lengths)
-            if suff_search_lengths is not None
-            else bytes(),
         },
     )

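(Note on the `bytes(...)` conversions kept above: passing a list of small integers to `bytes()` produces a buffer with one byte per requested length, which is what the Cython side later measures with `strlen` — presumably also why a length of 0 is rejected by the validation. Plain Python, independent of spaCy:)

```python
p_lengths = bytes([1, 3, 5])
print(p_lengths)       # b'\x01\x03\x05'
print(len(p_lengths))  # 3 -> three prefix lengths, one byte each
```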
@@ -60,24 +29,12 @@ def forward(
     case_sensitive: bool = model.attrs["case_sensitive"]
     p_lengths: bytes = model.attrs["p_lengths"]
     s_lengths: bytes = model.attrs["s_lengths"]
-    ps_search_chars: bytes = model.attrs["ps_search_chars"]
-    ps_width_offsets: bytes = model.attrs["ps_width_offsets"]
-    ps_lengths: bytes = model.attrs["ps_lengths"]
-    ss_search_chars: bytes = model.attrs["ss_search_chars"]
-    ss_width_offsets: bytes = model.attrs["ss_width_offsets"]
-    ss_lengths: bytes = model.attrs["ss_lengths"]
     features: List[Ints2d] = []
     for doc in docs:
         hashes = doc.get_character_combination_hashes(
             case_sensitive=case_sensitive,
             p_lengths=p_lengths,
             s_lengths=s_lengths,
-            ps_search_chars=ps_search_chars,
-            ps_width_offsets=ps_width_offsets,
-            ps_lengths=ps_lengths,
-            ss_search_chars=ss_search_chars,
-            ss_width_offsets=ss_width_offsets,
-            ss_lengths=ss_lengths,
         )
         features.append(ops.asarray2i(hashes, dtype="uint64"))

@@ -15,7 +15,6 @@ from spacy.lang.xx import MultiLanguage
 from spacy.language import Language
 from spacy.lexeme import Lexeme
 from spacy.tokens import Doc, Span, SpanGroup, Token
-from spacy.util import get_search_char_byte_arrays
 from spacy.vocab import Vocab

 from .test_underscore import clean_underscore  # noqa: F401
@@ -1450,12 +1449,6 @@ def _encode_and_hash(input: str, *, reverse: bool = False) -> int:
 def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive):
     doc = en_tokenizer("spaCy✨ and Prodigy")

-    ps_search_chars, ps_width_offsets = get_search_char_byte_arrays(
-        "Rp", case_sensitive
-    )
-    ss_search_chars, ss_width_offsets = get_search_char_byte_arrays(
-        "xx✨rp", case_sensitive
-    )
     hashes = doc.get_character_combination_hashes(
         case_sensitive=case_sensitive,
         p_lengths=bytes(
@@ -1473,17 +1466,6 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
                 5,
             )
         ),
-        ps_search_chars=ps_search_chars,
-        ps_width_offsets=ps_width_offsets,
-        ps_lengths=bytes((2,)),
-        ss_search_chars=ss_search_chars,
-        ss_width_offsets=ss_width_offsets,
-        ss_lengths=bytes(
-            (
-                1,
-                2,
-            )
-        ),
     )
     assert hashes[0][0] == _encode_and_hash("s")
     assert hashes[0][1] == _encode_and_hash("spa")
@@ -1492,9 +1474,6 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
     assert hashes[0][4] == _encode_and_hash("yCa" if case_sensitive else "yca")
     assert hashes[0][5] == _encode_and_hash("yCap" if case_sensitive else "ycap")
     assert hashes[0][6] == _encode_and_hash("yCaps" if case_sensitive else "ycaps")
-    assert hashes[0][7] == _encode_and_hash("p")
-    assert hashes[0][8] == _encode_and_hash("p")
-    assert hashes[0][9] == _encode_and_hash("p")
     assert hashes[1][0] == _encode_and_hash("✨")
     assert hashes[1][1] == _encode_and_hash("✨")
     assert hashes[1][2] == _encode_and_hash("✨")
@@ -1502,9 +1481,6 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
     assert hashes[1][4] == _encode_and_hash("✨", reverse=True)
     assert hashes[1][5] == _encode_and_hash("✨", reverse=True)
     assert hashes[1][6] == _encode_and_hash("✨", reverse=True)
-    assert hashes[1][7] == EMPTY_HASH_VALUE
-    assert hashes[1][8] == _encode_and_hash("✨")
-    assert hashes[1][9] == _encode_and_hash("✨")
     assert hashes[2][0] == _encode_and_hash("a")
     assert hashes[2][1] == _encode_and_hash("and")
     assert hashes[2][2] == _encode_and_hash("and")
@@ -1512,9 +1488,6 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
     assert hashes[2][4] == _encode_and_hash("dna")
     assert hashes[2][5] == _encode_and_hash("dna")
     assert hashes[2][6] == _encode_and_hash("dna")
-    assert hashes[2][7] == EMPTY_HASH_VALUE
-    assert hashes[2][8] == EMPTY_HASH_VALUE
-    assert hashes[2][9] == EMPTY_HASH_VALUE
     assert hashes[3][0] == _encode_and_hash("P" if case_sensitive else "p")
     assert hashes[3][1] == _encode_and_hash("Pro" if case_sensitive else "pro")
     assert hashes[3][2] == _encode_and_hash("Prod" if case_sensitive else "prod")
@@ -1522,21 +1495,10 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
     assert hashes[3][4] == _encode_and_hash("ygi")
     assert hashes[3][5] == _encode_and_hash("ygid")
     assert hashes[3][6] == _encode_and_hash("ygido")
-    assert (
-        hashes[3][7] == EMPTY_HASH_VALUE if case_sensitive else _encode_and_hash("pr")
-    )
-
-    assert hashes[3][8] == _encode_and_hash("r")
-
-    if case_sensitive:
-        assert hashes[3][9] == _encode_and_hash("r")
-    else:
-        assert hashes[3][9] == _encode_and_hash("rp")


 def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
     doc = en_tokenizer("spaCy✨ and Prodigy")
-    ps_search_chars, ps_width_offsets = get_search_char_byte_arrays("rp", False)
     hashes = doc.get_character_combination_hashes(
         case_sensitive=False,
         p_lengths=bytes(),
@@ -1548,34 +1510,24 @@ def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
                 5,
             )
         ),
-        ps_search_chars=ps_search_chars,
-        ps_width_offsets=ps_width_offsets,
-        ps_lengths=bytes((2,)),
-        ss_search_chars=bytes(),
-        ss_width_offsets=bytes(),
-        ss_lengths=bytes(),
     )

     assert hashes[0][0] == _encode_and_hash("yc")
     assert hashes[0][1] == _encode_and_hash("yca")
     assert hashes[0][2] == _encode_and_hash("ycap")
     assert hashes[0][3] == _encode_and_hash("ycaps")
-    assert hashes[0][4] == _encode_and_hash("p")
     assert hashes[1][0] == _encode_and_hash("✨", reverse=True)
     assert hashes[1][1] == _encode_and_hash("✨", reverse=True)
     assert hashes[1][2] == _encode_and_hash("✨", reverse=True)
     assert hashes[1][3] == _encode_and_hash("✨", reverse=True)
-    assert hashes[1][4] == EMPTY_HASH_VALUE
     assert hashes[2][0] == _encode_and_hash("dn")
     assert hashes[2][1] == _encode_and_hash("dna")
     assert hashes[2][2] == _encode_and_hash("dna")
     assert hashes[2][3] == _encode_and_hash("dna")
-    assert hashes[2][4] == EMPTY_HASH_VALUE
     assert hashes[3][0] == _encode_and_hash("yg")
     assert hashes[3][1] == _encode_and_hash("ygi")
     assert hashes[3][2] == _encode_and_hash("ygid")
     assert hashes[3][3] == _encode_and_hash("ygido")
-    assert hashes[3][4] == _encode_and_hash("pr")


 def test_get_character_combination_hashes_various_lengths(en_tokenizer):
@@ -1588,12 +1540,6 @@ def test_get_character_combination_hashes_various_lengths(en_tokenizer):
             case_sensitive=False,
             p_lengths=bytes((p_length,)),
             s_lengths=bytes((s_length,)),
-            ps_search_chars=bytes(),
-            ps_width_offsets=bytes(),
-            ps_lengths=bytes(),
-            ss_search_chars=bytes(),
-            ss_width_offsets=bytes(),
-            ss_lengths=bytes(),
         )

         assert hashes[0][0] == _encode_and_hash("sp𐌞cé"[:p_length])
@@ -1605,7 +1551,6 @@ def test_get_character_combination_hashes_turkish_i_with_dot(
     en_tokenizer, case_sensitive
 ):
     doc = en_tokenizer("İ".lower() + "İ")
-    search_chars, width_offsets = get_search_char_byte_arrays("İ", case_sensitive)
     hashes = doc.get_character_combination_hashes(
         case_sensitive=case_sensitive,
         p_lengths=bytes(
@@ -1624,26 +1569,6 @@ def test_get_character_combination_hashes_turkish_i_with_dot(
                 4,
             )
         ),
-        ps_search_chars=search_chars,
-        ps_width_offsets=width_offsets,
-        ps_lengths=bytes(
-            (
-                1,
-                2,
-                3,
-                4,
-            )
-        ),
-        ss_search_chars=search_chars,
-        ss_width_offsets=width_offsets,
-        ss_lengths=bytes(
-            (
-                1,
-                2,
-                3,
-                4,
-            )
-        ),
     )

     COMBINING_DOT_ABOVE = b"\xcc\x87".decode("UTF-8")
@@ -1656,10 +1581,6 @@ def test_get_character_combination_hashes_turkish_i_with_dot(
         assert hashes[0][5] == _encode_and_hash(COMBINING_DOT_ABOVE + "İ", reverse=True)
         assert hashes[0][6] == _encode_and_hash("İ".lower() + "İ", reverse=True)
         assert hashes[0][7] == _encode_and_hash("İ".lower() + "İ", reverse=True)
-        assert hashes[0][8] == _encode_and_hash("İ")
-        assert hashes[0][9] == _encode_and_hash("İ")
-        assert hashes[0][12] == _encode_and_hash("İ")
-        assert hashes[0][13] == _encode_and_hash("İ")

     else:
         assert hashes[0][2] == _encode_and_hash("İ".lower() + "i")
@@ -1670,16 +1591,6 @@ def test_get_character_combination_hashes_turkish_i_with_dot(
             COMBINING_DOT_ABOVE + "İ".lower(), reverse=True
         )
         assert hashes[0][7] == _encode_and_hash("İ".lower() * 2, reverse=True)
-        assert hashes[0][8] == _encode_and_hash("i")
-        assert hashes[0][9] == _encode_and_hash("İ".lower())
-        assert hashes[0][10] == _encode_and_hash("İ".lower() + "i")
-        assert hashes[0][11] == _encode_and_hash("İ".lower() * 2)
-        assert hashes[0][12] == _encode_and_hash(COMBINING_DOT_ABOVE)
-        assert hashes[0][13] == _encode_and_hash(COMBINING_DOT_ABOVE + "i")
-        assert hashes[0][14] == _encode_and_hash(
-            COMBINING_DOT_ABOVE + "i" + COMBINING_DOT_ABOVE
-        )
-        assert hashes[0][15] == _encode_and_hash((COMBINING_DOT_ABOVE + "i") * 2)


 @pytest.mark.parametrize("case_sensitive", [True, False])
@@ -1693,33 +1604,17 @@ def test_get_character_combination_hashes_string_store_spec_cases(
     assert len(long_word) > 255
     doc = en_tokenizer(" ".join((symbol, short_word, normal_word, long_word)))
     assert len(doc) == 4
-    ps_search_chars, ps_width_offsets = get_search_char_byte_arrays("E", case_sensitive)
     hashes = doc.get_character_combination_hashes(
         case_sensitive=case_sensitive,
         p_lengths=bytes((2,)),
         s_lengths=bytes((2,)),
-        ps_search_chars=ps_search_chars,
-        ps_width_offsets=ps_width_offsets,
-        ps_lengths=bytes((2,)),
-        ss_search_chars=bytes(),
-        ss_width_offsets=bytes(),
-        ss_lengths=bytes(),
     )
     assert hashes[0][0] == _encode_and_hash("FL" if case_sensitive else "fl")
     assert hashes[0][1] == _encode_and_hash("91")
-    assert hashes[0][2] == EMPTY_HASH_VALUE
     assert hashes[1][0] == _encode_and_hash("be")
     assert hashes[1][1] == _encode_and_hash("ee")
-    if case_sensitive:
-        assert hashes[1][2] == EMPTY_HASH_VALUE
-    else:
-        assert hashes[1][2] == _encode_and_hash("ee")
     assert hashes[2][0] == hashes[3][0] == _encode_and_hash("se")
     assert hashes[2][1] == hashes[3][1] == _encode_and_hash("yt")
-    if case_sensitive:
-        assert hashes[2][2] == hashes[3][2] == EMPTY_HASH_VALUE
-    else:
-        assert hashes[2][2] == hashes[3][2] == _encode_and_hash("ee")


 def test_character_combination_hashes_empty_lengths(en_tokenizer):
@@ -1728,10 +1623,4 @@ def test_character_combination_hashes_empty_lengths(en_tokenizer):
         case_sensitive=True,
         p_lengths=bytes(),
         s_lengths=bytes(),
-        ps_search_chars=bytes(),
-        ps_width_offsets=bytes(),
-        ps_lengths=bytes(),
-        ss_search_chars=bytes(),
-        ss_width_offsets=bytes(),
-        ss_lengths=bytes(),
     ).shape == (1, 0)
@@ -1,86 +0,0 @@
-import spacy
-import pytest
-
-
-@pytest.mark.parametrize("case_sensitive", [True, False])
-def test_get_search_char_byte_arrays_1_width_only(case_sensitive):
-    search_chars, width_offsets = spacy.util.get_search_char_byte_arrays(
-        "zzaaEP", case_sensitive
-    )
-    if case_sensitive:
-        assert search_chars == b"EPaz"
-    else:
-        assert search_chars == b"aepz"
-    assert width_offsets == b"\x00\x04\x04\x04\x04"
-
-
-@pytest.mark.parametrize("case_sensitive", [True, False])
-def test_get_search_char_byte_arrays_4_width_only(case_sensitive):
-    search_chars, width_offsets = spacy.util.get_search_char_byte_arrays(
-        "𐌞", case_sensitive
-    )
-    assert search_chars == "𐌞".encode("utf-8")
-    assert width_offsets == b"\x00\x00\x00\x00\x04"
-
-
-@pytest.mark.parametrize("case_sensitive", [True, False])
-def test_get_search_char_byte_arrays_all_widths(case_sensitive):
-    search_chars, width_offsets = spacy.util.get_search_char_byte_arrays(
-        "𐌞Éabé—B𐌞", case_sensitive
-    )
-    if case_sensitive:
-        assert search_chars == "BabÉé—𐌞".encode("utf-8")
-        assert width_offsets == b"\x00\x03\x07\x0a\x0e"
-    else:
-        assert search_chars == "abé—𐌞".encode("utf-8")
-        assert width_offsets == b"\x00\x02\x04\x07\x0b"
-
-
-@pytest.mark.parametrize("case_sensitive", [True, False])
-def test_get_search_char_byte_arrays_widths_1_and_3(case_sensitive):
-    search_chars, width_offsets = spacy.util.get_search_char_byte_arrays(
-        "B—", case_sensitive
-    )
-    if case_sensitive:
-        assert search_chars == "B—".encode("utf-8")
-    else:
-        assert search_chars == "b—".encode("utf-8")
-    assert width_offsets == b"\x00\x01\x01\x04\x04"
-
-
-@pytest.mark.parametrize("case_sensitive", [True, False])
-def test_get_search_char_byte_arrays_widths_1_and_4(case_sensitive):
-    search_chars, width_offsets = spacy.util.get_search_char_byte_arrays(
-        "B𐌞", case_sensitive
-    )
-    if case_sensitive:
-        assert search_chars == "B𐌞".encode("utf-8")
-    else:
-        assert search_chars == "b𐌞".encode("utf-8")
-    assert width_offsets == b"\x00\x01\x01\x01\x05"
-
-
-@pytest.mark.parametrize("case_sensitive", [True, False])
-def test_turkish_i_with_dot(case_sensitive):
-    search_chars, width_offsets = spacy.util.get_search_char_byte_arrays(
-        "İ", case_sensitive
-    )
-    if case_sensitive:
-        assert search_chars == "İ".encode("utf-8")
-        assert width_offsets == b"\x00\x00\x02\x02\x02"
-    else:
-        assert search_chars == b"i\xcc\x87"
-        assert width_offsets == b"\x00\x01\x03\x03\x03"
-
-
-@pytest.mark.parametrize("case_sensitive", [True, False])
-def test_turkish_i_with_dot_and_normal_i(case_sensitive):
-    search_chars, width_offsets = spacy.util.get_search_char_byte_arrays(
-        "İI", case_sensitive
-    )
-    if case_sensitive:
-        assert search_chars == "Iİ".encode("utf-8")
-        assert width_offsets == b"\x00\x01\x03\x03\x03"
-    else:
-        assert search_chars == b"i\xcc\x87"
-        assert width_offsets == b"\x00\x01\x03\x03\x03"
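(Note: the deleted tests above pin down the `(search_chars, width_offsets)` format: the characters are sorted and UTF-8 encoded, and `width_offsets[i]` marks where the (i+1)-byte-wide characters start, with `width_offsets[4]` as the end. A self-contained sketch of decoding such a pair, using the expected values from the case-insensitive "all widths" test:)

```python
search_chars = "abé—𐌞".encode("utf-8")
width_offsets = b"\x00\x02\x04\x07\x0b"  # from the deleted test above

for width in range(1, 5):
    start, end = width_offsets[width - 1], width_offsets[width]
    chunk = search_chars[start:end]
    chars = [chunk[i:i + width].decode("utf-8") for i in range(0, len(chunk), width)]
    print(width, chars)
# 1 ['a', 'b']
# 2 ['é']
# 3 ['—']
# 4 ['𐌞']
```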
@@ -50,18 +50,6 @@ cdef void _set_suffix_lengths(
 ) nogil


-cdef void _search_for_chars(
-    const unsigned char* tok_str,
-    const int tok_str_l,
-    const unsigned char* search_chars,
-    const unsigned char* width_offsets,
-    const int max_res_l,
-    const bint suffs_not_prefs,
-    unsigned char* res_buf,
-    unsigned char* l_buf,
-) nogil
-
-
 cdef int _write_hashes(
     const unsigned char* res_buf,
     const unsigned char* aff_l_buf,
@@ -180,12 +180,6 @@ class Doc:
         case_sensitive: bool,
         p_lengths: bytes,
         s_lengths: bytes,
-        ps_search_chars: bytes,
-        ps_width_offsets: bytes,
-        ps_lengths: bytes,
-        ss_search_chars: bytes,
-        ss_width_offsets: bytes,
-        ss_lengths: bytes,
     ) -> Ints2d: ...
     @staticmethod
     def _get_array_attrs() -> Tuple[Any]: ...
@@ -1751,43 +1751,16 @@ cdef class Doc:
         const bint case_sensitive,
         const unsigned char* p_lengths,
         const unsigned char* s_lengths,
-        const unsigned char* ps_search_chars,
-        const unsigned char* ps_width_offsets,
-        const unsigned char* ps_lengths,
-        const unsigned char* ss_search_chars,
-        const unsigned char* ss_width_offsets,
-        const unsigned char* ss_lengths,
     ):
         """
         Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations
        derived from the raw text of each token.

-        Generally:
-
-        p_ variables relate to prefixes (affixes starting at the beginning of the word)
-        s_ variables relate to suffixes (affixes starting at the end of the word)
-        ps_ variables relate to searches starting at the beginning of the word
-        ss_ variables relate to searches starting at the end of the word
-
-        cs: if *False*, hashes are generated based on the lower-case version of each token.
+        case_sensitive: if *False*, hashes are generated based on the lower-case version of each token.
         p_lengths: an array of single-byte values specifying the lengths of prefixes to be hashed in ascending order.
             For example, if *p_lengths==[2, 3]*, the prefixes hashed for "spaCy" would be "sp" and "spa".
         s_lengths: an array of single-byte values specifying the lengths of suffixes to be hashed in ascending order.
             For example, if *s_lengths==[2, 3]* and *cs == True*, the suffixes hashed for "spaCy" would be "yC" and "yCa".
-        ps_search_chars: a byte array containing, in numerical order, UTF-8 characters to search for within each token,
-            starting at the beginning.
-        ps_width_offsets: an array of single-byte values [1-char-start, 2-char-start, 3-char-start, 4-char-start, 4-char-end]
-            specifying the offsets within *ps_search_chars* that contain UTF-8 characters with the specified widths.
-        ps_lengths: an array of single-byte values specifying the lengths of search results (from the beginning) to be hashed
-            in ascending order. For example, if *ps_lengths==[1, 2]*, *ps_search=="aC" and *cs==False*, the searched strings
-            hashed for "spaCy" would be "a" and "ac".
-        ss_search_chars: a byte array containing, in numerical order, UTF-8 characters to search for within each token,
-            starting at the end.
-        ss_width_offsets: an array of single-byte values [1-char-start, 2-char-start, 3-char-start, 4-char-start, 4-char-end]
-            specifying the offsets within *ss_search_chars* that contain UTF-8 characters with the specified widths.
-        ss_lengths: an array of single-byte values specifying the lengths of search results (from the end) to be hashed
-            in ascending order. For example, if *ss_lengths==[1, 2]*, *ss_search=="aC" and *cs==False*, the searched strings
-            hashed for "spaCy" would be "c" and "ca".

         Many of the buffers passed into and used by this method contain single-byte numerical values. This takes advantage of
         the fact that we are hashing short affixes and searching for small groups of characters. The calling code is responsible
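(Note: with the search parameters gone, a call to the method documented above reduces to the two length buffers. A sketch based on the docstring examples — it assumes a build of this feature branch, since `get_character_combination_hashes` is not part of released spaCy:)

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("spaCy")
hashes = doc.get_character_combination_hashes(
    case_sensitive=True,
    p_lengths=bytes((2, 3)),  # hash the prefixes "sp" and "spa"
    s_lengths=bytes((2, 3)),  # hash the suffixes "yC" and "yCa"
)
print(hashes.shape)  # (1, 4): one token, four uint64 hash columns
```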
@@ -1801,27 +1774,16 @@ cdef class Doc:
         # Work out lengths
         cdef int p_lengths_l = strlen(<char*> p_lengths)
         cdef int s_lengths_l = strlen(<char*> s_lengths)
-        cdef int ps_lengths_l = strlen(<char*> ps_lengths)
-        cdef int ss_lengths_l = strlen(<char*> ss_lengths)
-        cdef int hashes_per_tok = p_lengths_l + s_lengths_l + ps_lengths_l + ss_lengths_l
         cdef int p_max_l = p_lengths[p_lengths_l - 1] if p_lengths_l > 0 else 0
         cdef int s_max_l = s_lengths[s_lengths_l - 1] if s_lengths_l > 0 else 0
-        cdef int ps_max_l = ps_lengths[ps_lengths_l - 1] if ps_lengths_l > 0 else 0
-        cdef int ss_max_l = ss_lengths[ss_lengths_l - 1] if ss_lengths_l > 0 else 0

         # Define / allocate buffers
         cdef Pool mem = Pool()
         cdef unsigned char* pref_l_buf = <unsigned char*> mem.alloc(p_max_l, sizeof(char))
         cdef unsigned char* suff_l_buf = <unsigned char*> mem.alloc(s_max_l, sizeof(char))
-        cdef unsigned char* ps_res_buf = <unsigned char*> mem.alloc(ps_max_l,
-            MAX_UTF8_CHAR_BYTE_WIDTH * sizeof(char))
-        cdef unsigned char* ps_l_buf = <unsigned char*> mem.alloc(ps_max_l, sizeof(char))
-        cdef unsigned char* ss_res_buf = <unsigned char*> mem.alloc(ss_max_l,
-            MAX_UTF8_CHAR_BYTE_WIDTH * sizeof(char))
-        cdef unsigned char* ss_l_buf = <unsigned char*> mem.alloc(ss_max_l, sizeof(char))
         cdef int doc_l = self.length
         cdef np.ndarray[np.uint64_t, ndim=2] hashes = numpy.empty(
-            (doc_l, hashes_per_tok), dtype="uint64")
+            (doc_l, p_lengths_l + s_lengths_l), dtype="uint64")
         cdef np.uint64_t* hashes_ptr = <np.uint64_t*> hashes.data

         # Define working variables
@@ -1852,16 +1814,6 @@ cdef class Doc:
                 _set_suffix_lengths(tok_str, tok_str_l, s_max_l, suff_l_buf)
                 hashes_ptr += _write_hashes(tok_str, s_lengths, suff_l_buf, tok_str_l - 1, hashes_ptr)
-
-            if ps_max_l > 0:
-                _search_for_chars(tok_str, tok_str_l, ps_search_chars, ps_width_offsets,
-                    ps_max_l, False, ps_res_buf, ps_l_buf)
-                hashes_ptr += _write_hashes(ps_res_buf, ps_lengths, ps_l_buf, 0, hashes_ptr)
-
-            if ss_max_l > 0:
-                _search_for_chars(tok_str, tok_str_l, ss_search_chars, ss_width_offsets,
-                    ss_max_l, True, ss_res_buf, ss_l_buf)
-                hashes_ptr += _write_hashes(ss_res_buf, ss_lengths, ss_l_buf, 0, hashes_ptr)

         return hashes


@@ -2111,73 +2063,6 @@ cdef void _set_suffix_lengths(
     memset(suff_l_buf + suff_l_buf_idx, suff_l_buf[suff_l_buf_idx - 1], s_max_l - suff_l_buf_idx)


-cdef void _search_for_chars(
-    const unsigned char* tok_str,
-    const int tok_str_l,
-    const unsigned char* search_chars,
-    const unsigned char* width_offsets,
-    const int max_res_l,
-    const bint suffs_not_prefs,
-    unsigned char* res_buf,
-    unsigned char* l_buf,
-) nogil:
-    """ Search *tok_str* within a string for characters within *search_chars*, starting at the
-    beginning or end depending on the value of *suffs_not_prefs*. Wherever a character matches,
-    it is added to *res_buf* and the byte length up to that point is added to *l_buf*. When nothing
-    more is found, the remainder of *l_buf* is populated wth the byte length from the last result,
-    which may be *0* if the search was not successful.
-
-    tok_str: a UTF-8 representation of a string.
-    tok_str_l: the length of *tok_str*.
-    search_chars: a byte array containing, in numerical order, UTF-8 characters to search for within *tok_str*.
-    width_offsets: an array of single-byte values [1-char-start, 2-char-start, 3-char-start, 4-char-start, 4-char-end]
-        specifying the offsets within *search_chars* that contain UTF-8 characters with the specified widths.
-    max_res_l: the maximum number of found characters to place in *res_buf*.
-    suffs_not_prefs: if *True*, searching starts from the end of the word;
-        if *False*, from the beginning.
-    res_buf: the buffer in which to place the search results.
-    l_buf: a buffer of length *max_res_l* in which to store the end byte offsets of the found characters.
-        The calling code ensures that lengths greater than 255 cannot occur.
-    """
-    cdef int res_buf_idx = 0, l_buf_idx = 0, ch_wdth, tok_start_idx, search_char_idx, end_search_idx
-    cdef int last_tok_str_idx = tok_str_l if suffs_not_prefs else 0
-    cdef int this_tok_str_idx = tok_str_l - 1 if suffs_not_prefs else 1
-
-    while this_tok_str_idx >= 0 and this_tok_str_idx <= tok_str_l:
-        if (
-            (this_tok_str_idx == tok_str_l) or
-            ((tok_str[this_tok_str_idx] & 0xc0) != 0x80) # not continuation character, always applies to [0].
-        ):
-            if this_tok_str_idx > last_tok_str_idx:
-                ch_wdth = this_tok_str_idx - last_tok_str_idx
-            else:
-                ch_wdth = last_tok_str_idx - this_tok_str_idx
-
-            tok_start_idx = this_tok_str_idx if suffs_not_prefs else last_tok_str_idx
-            search_char_idx = width_offsets[ch_wdth - 1]
-            end_search_idx = width_offsets[ch_wdth]
-            while search_char_idx < end_search_idx:
-                cmp_result = memcmp(&tok_str[tok_start_idx], &search_chars[search_char_idx], ch_wdth)
-                if cmp_result == 0:
-                    memcpy(res_buf + res_buf_idx, &search_chars[search_char_idx], ch_wdth)
-                    res_buf_idx += ch_wdth
-                    l_buf[l_buf_idx] = res_buf_idx
-                    l_buf_idx += 1
-                    if l_buf_idx == max_res_l:
-                        return
-                if cmp_result <= 0:
-                    break
-                search_char_idx += ch_wdth
-            last_tok_str_idx = this_tok_str_idx
-        if suffs_not_prefs:
-            this_tok_str_idx -= 1
-        else:
-            this_tok_str_idx += 1
-
-    # fill in unused characters in the length buffer
-    memset(l_buf + l_buf_idx, res_buf_idx, max_res_l - l_buf_idx)
-
-
 cdef uint64_t FNV1A_OFFSET_BASIS = 0xcbf29ce484222325
 cdef uint64_t FNV1A_PRIME = 0x00000100000001B3

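(Note: the two constants kept in the context lines above are the standard 64-bit FNV-1a parameters. For reference, this is how FNV-1a combines bytes into a hash — a generic sketch, not the deleted Cython hashing code, though the affix hashes are presumably computed this way over each affix's UTF-8 bytes:)

```python
FNV1A_OFFSET_BASIS = 0xCBF29CE484222325
FNV1A_PRIME = 0x00000100000001B3


def fnv1a_64(data: bytes) -> int:
    # XOR each byte into the hash, then multiply by the prime, modulo 2**64.
    h = FNV1A_OFFSET_BASIS
    for byte in data:
        h ^= byte
        h = (h * FNV1A_PRIME) % (1 << 64)
    return h


print(hex(fnv1a_64("spa".encode("utf8"))))
```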
@@ -1736,44 +1736,3 @@ def all_equal(iterable):
     (or if the input is an empty sequence), False otherwise."""
     g = itertools.groupby(iterable)
     return next(g, True) and not next(g, False)
-
-
-def get_search_char_byte_arrays(
-    search_char_string: str, case_sensitive: bool
-) -> Tuple[bytes, bytes]:
-    """
-    This function supports *RichMultiHashEmbed*. It orders the characters in
-    *search_char_string*, removing any duplicates, encodes them as UTF-8, and
-    returns the result bufer together with a byte array containing the offsets
-    where the characters of various byte lengths start within the result buffer,
-    i.e.
-
-    <1-byte-start>, <2-byte-start>, <3-byte-start>, <4-byte-start>, <4-byte-end>.
-
-    If the result buffer does not contain any characters of length *n*,
-    <n_byte_start> == <n+1_byte_start>.
-    """
-
-    if not case_sensitive:
-        search_char_string = search_char_string.lower()
-    ordered_search_char_string = "".join(sorted(set(search_char_string)))
-    search_chars = ordered_search_char_string.encode("UTF-8")
-    width_offsets = [-1] * 5
-    working_start = 0
-    working_width = 0
-    for idx in range(1, len(search_chars) + 1):
-        if (
-            idx == len(search_chars)
-            or search_chars[idx] & 0xC0 != 0x80  # not continuation byte
-        ):
-            this_width = idx - working_start
-            if this_width > 4 or this_width < working_width:
-                raise RuntimeError(Errors.E1051)
-            if this_width > working_width:
-                for i in range(working_width, 5):
-                    width_offsets[i] = working_start
-                working_width = this_width
-            working_start = idx
-            for i in range(this_width, 5):
-                width_offsets[i] = idx
-    return search_chars, bytes((width_offsets))
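(Note: for reference, the deleted helper's behaviour is fixed by the tests removed earlier in this commit. On the parent commit 2116a71962 the following held — shown as an illustration only, since `get_search_char_byte_arrays` no longer exists after this change:)

```python
import spacy

search_chars, width_offsets = spacy.util.get_search_char_byte_arrays("zzaaEP", False)
assert search_chars == b"aepz"                   # deduplicated, lower-cased, sorted
assert width_offsets == b"\x00\x04\x04\x04\x04"  # all four characters are one byte wide

search_chars, width_offsets = spacy.util.get_search_char_byte_arrays("İ", False)
assert search_chars == b"i\xcc\x87"              # "İ".lower() is "i" + combining dot above
assert width_offsets == b"\x00\x01\x03\x03\x03"  # one 1-byte char, then one 2-byte char
```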
@@ -186,9 +186,6 @@ updated).
 > pref_rows = [10000,10000,10000]
 > suff_lengths = [2, 3, 4, 5]
 > suff_rows = [10000,10000,10000,10000]
-> suff_search_chars = "aeiouäöüß"
-> suff_search_lengths = [2, 3]
-> suff_search_rows = [10000,10000]
 > ```

 Construct an embedding layer with the features of
@@ -198,35 +195,12 @@ features extracted from various positions in each token string. The fixed-length
 [MultiHashEmbed](#spacymultihashembedv2-multihashembed) are sometimes not rich
 enough when working with languages with complex morphology, and this layer
 allows the specification of multiple prefixes and suffixes of any lengths.
-
-Additionally, it is possible to use as features the results of character
-searches of specified lengths. A list of search characters is specified; the
-characters in each word are examined in order starting at the beginning or at
-the end; and each character that matches one of the search characters is added,
-in order, to the string to be used as a feature. The search continues until
-either the search result string is full or the whole word has been examined.
-This is useful because some languages exhibit morphological alternations where
-one letter or letters regularly alternate with another letter or letters
-depending on the presence of some other letter before or after it, e.g. German
-plural nouns where the final two vowels are `ä-e` regularly correspond to
-singular lemmas where the `e` is no longer present and the `ä` has become `a`,
-e.g. `die Bäche` (plural) vs. `der Bach` (singular).
-
-For most languages used with spaCy, searching is likely to be useful starting at
-the end (`suff_*`), but the ability to search from the beginning (`pref_*`) is
-also offered for completeness. Search characters should consist of all
-characters that regularly alternate with other characters in the language in
-question or whose presence before or after characters that would otherwise
-alternate prevents the alternation from occurring, e.g. an `ä` in a German
-plural noun does not become `a` if it is the third or fourth vowel from the end
-of the word.
+Arrays specifying lengths must be in ascending order.

 There are a few rare situations where a graphical character is expressed as more
 than one UTF-8 character, e.g. _i_ when representing the lower-case form of the
-Turkish letter _İ_. Such situations are supported, but the lengths of prefixes,
-suffixes and character search results may need to be increased accordingly.
-
-All arrays specifying lengths must be in ascending order.
+Turkish letter _İ_. Such situations are supported, but the lengths of prefixes
+and suffixes may need to be increased accordingly.

 | Name | Description |
 | ------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------- |
@@ -239,12 +213,6 @@ All arrays specifying lengths must be in ascending order.
 | `pref_rows` | The number of rows for each of `pref_lengths`. ~~Optional[List[int]~~ |
 | `suff_lengths` | The lengths of suffixes to use as features for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to `y` and `yCa` being used as features. ~~Optional[List[int]~~ |
 | `suff_rows` | The number of rows for each of `suff_lengths`. ~~Optional[List[int]~~ |
-| `pref_search_chars` | A string containing characters to search for starting from the beginning of each word. ~~Optional[str]~~ |
-| `pref_search_lengths` | The lengths of search result strings to use as features, where the searches start from the beginning of each word. ~~Optional[List[int]]~~ |
-| `pref_search_rows` | The number of rows for each of `pref_search_lengths`. ~~Optional[List[int]~~ |
-| `suff_search_chars` | A string containing characters to search for starting from the end of each word. ~~Optional[str]~~ |
-| `suff_search_lengths` | The lengths of search result strings to use as features, where the searches start from the end of each word. ~~Optional[List[int]]~~ |
-| `suff_search_rows` | The number of rows for each of `suff_search_lengths`. ~~Optional[List[int]~~ |
 | **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |

 ### spacy.CharacterEmbed.v2 {#CharacterEmbed}