Move functionality to richfeatureextractor

richard@explosion.ai 2023-02-01 18:04:06 +01:00
parent 5a08596f92
commit 5d24934cf5
12 changed files with 920 additions and 897 deletions

View File

@ -34,6 +34,7 @@ MOD_NAMES = [
"spacy.kb.kb",
"spacy.kb.kb_in_memory",
"spacy.ml.parser_model",
"spacy.ml.richfeatureextractor",
"spacy.morphology",
"spacy.pipeline.dep_parser",
"spacy.pipeline._edit_tree_internals.edit_trees",

View File

@ -973,6 +973,7 @@ class Errors(metaclass=ErrorsWithCodes):
E1051 = ("Invalid rich group config '{label}'.")
E1052 = ("Length > 63 in rich group config '{label}'.")
E1053 = ("Rich group config {label} specifies lengths that are not in ascending order.")
E1054 = ("Mismatched lengths in hash embed config: {len_rows} rows, {len_attrs} attrs.")
# Deprecated model shortcuts, only used in errors and warnings
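For illustration only (not part of the commit): the new E1054 template renders roughly as follows once formatted, with made-up counts; spaCy prefixes the error code in square brackets.

    template = "Mismatched lengths in hash embed config: {len_rows} rows, {len_attrs} attrs."
    print("[E1054] " + template.format(len_rows=3, len_attrs=2))
    # [E1054] Mismatched lengths in hash embed config: 3 rows, 2 attrs.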

View File

@ -151,7 +151,7 @@ def MultiHashEmbed(
Requires a vectors table to be loaded in the Doc objects' vocab.
"""
if len(rows) != len(attrs):
raise ValueError(f"Mismatched lengths: {len(rows)} vs {len(attrs)}")
raise ValueError(Errors.E1054.format(len_rows=len(rows), len_attrs=len(attrs)))
seed = 7
def make_hash_embed(index):
@ -253,7 +253,7 @@ def RichMultiHashEmbed(
"""
if len(rows) != len(attrs):
raise ValueError(f"Mismatched lengths: {len(rows)} vs {len(attrs)}")
raise ValueError(Errors.E1054.format(len_rows=len(rows), len_attrs=len(attrs)))
_verify_rich_config_group("prefix", pref_lengths, pref_rows)
_verify_rich_config_group("suffix", suff_lengths, suff_rows)
@ -296,7 +296,7 @@ def RichMultiHashEmbed(
),
max_out,
ragged2list(),
Dropout(0.0)
Dropout(0.0),
)
else:
model = chain(
@ -305,7 +305,7 @@ def RichMultiHashEmbed(
with_array(concatenate(*embeddings)),
max_out,
ragged2list(),
Dropout(0.0)
Dropout(0.0),
)
return model
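The body of _verify_rich_config_group is not shown in this diff. For illustration only, the following hypothetical sketch reconstructs the checks it presumably performs, inferred from error codes E1051-E1053 above; the real implementation may differ.

    from typing import List, Optional

    def _verify_rich_config_group_sketch(
        label: str, lengths: Optional[List[int]], rows: Optional[List[int]]
    ) -> None:
        # Hypothetical condition for E1051: lengths and rows must be supplied
        # together and have matching lengths.
        if (lengths is None) != (rows is None) or (
            lengths is not None and rows is not None and len(lengths) != len(rows)
        ):
            raise ValueError(f"Invalid rich group config '{label}'.")
        if lengths:
            # E1052: affix lengths are stored in single bytes downstream, hence the cap.
            if max(lengths) > 63:
                raise ValueError(f"Length > 63 in rich group config '{label}'.")
            # E1053: lengths must be in ascending order.
            if any(a >= b for a, b in zip(lengths, lengths[1:])):
                raise ValueError(
                    f"Rich group config {label} specifies lengths that are not in ascending order."
                )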

View File

@ -0,0 +1,27 @@
cimport numpy as np
cdef void _set_prefix_lengths(
const unsigned char* tok_str,
const int tok_str_l,
const int p_max_l,
unsigned char* pref_l_buf,
) nogil
cdef void _set_suffix_lengths(
const unsigned char* tok_str,
const int tok_str_l,
const int s_max_l,
unsigned char* suff_l_buf,
) nogil
cdef int _write_hashes(
const unsigned char* res_buf,
const unsigned char* aff_l_buf,
const unsigned char* offset_buf,
const int res_buf_last,
np.uint64_t* hashes_ptr,
) nogil

View File

@ -1,42 +0,0 @@
from typing import List, Optional, Callable, Tuple
from thinc.types import Ints2d
from thinc.api import Model, registry, get_current_ops
from ..tokens import Doc
@registry.layers("spacy.RichFeatureExtractor.v1")
def RichFeatureExtractor(
*,
case_sensitive: bool,
pref_lengths: Optional[List[int]] = None,
suff_lengths: Optional[List[int]] = None,
) -> Model[List[Doc], List[Ints2d]]:
return Model(
"extract_character_combination_hashes",
forward,
attrs={
"case_sensitive": case_sensitive,
"p_lengths": bytes(pref_lengths) if pref_lengths is not None else bytes(),
"s_lengths": bytes(suff_lengths) if suff_lengths is not None else bytes(),
},
)
def forward(
model: Model[List[Doc], List[Ints2d]], docs, is_train: bool
) -> Tuple[List[Ints2d], Callable]:
ops = model.ops
case_sensitive: bool = model.attrs["case_sensitive"]
p_lengths: bytes = model.attrs["p_lengths"]
s_lengths: bytes = model.attrs["s_lengths"]
features: List[Ints2d] = []
for doc in docs:
hashes = doc.get_character_combination_hashes(
case_sensitive=case_sensitive,
p_lengths=p_lengths,
s_lengths=s_lengths,
)
features.append(ops.asarray2i(hashes, dtype="uint64"))
backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
return features, backprop

View File

@ -0,0 +1,9 @@
from ..tokens import Doc
def get_character_combination_hashes(
*,
doc: Doc,
case_sensitive: bool,
p_lengths: bytes,
s_lengths: bytes,
):

View File

@ -0,0 +1,233 @@
from typing import List, Optional, Callable, Tuple
import numpy
from thinc.types import Ints2d
from thinc.api import Model, registry, get_current_ops
from ..symbols import NAMES as SYMBOLS_BY_INT
cimport numpy as np
from cymem.cymem cimport Pool
from libc.string cimport memset, strlen
from libc.stdint cimport uint64_t
from ..tokens.doc cimport Doc
from ..structs cimport TokenC
from ..typedefs cimport attr_t
@registry.layers("spacy.RichFeatureExtractor.v1")
def RichFeatureExtractor(
*,
case_sensitive: bool,
pref_lengths: Optional[List[int]] = None,
suff_lengths: Optional[List[int]] = None,
) -> Model[List[Doc], List[Ints2d]]:
# Because the calling code guarantees that the integers in the list are each less than 256,
# the integer list can be converted into *bytes*.
return Model(
"extract_character_combination_hashes",
forward,
attrs={
"case_sensitive": case_sensitive,
"p_lengths": bytes(pref_lengths) if pref_lengths is not None else bytes(),
"s_lengths": bytes(suff_lengths) if suff_lengths is not None else bytes(),
},
)
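For illustration only: the comment above relies on Python's bytes() constructor accepting a list of integers below 256, so a list of affix lengths round-trips losslessly.

    # Lengths are capped at 63 by the config validation, so each one fits in a byte:
    assert bytes([1, 3, 4]) == b"\x01\x03\x04"
    assert list(b"\x01\x03\x04") == [1, 3, 4]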
def forward(
model: Model[List[Doc], List[Ints2d]], docs, is_train: bool
) -> Tuple[List[Ints2d], Callable]:
ops = model.ops
case_sensitive: bool = model.attrs["case_sensitive"]
p_lengths: bytes = model.attrs["p_lengths"]
s_lengths: bytes = model.attrs["s_lengths"]
features: List[Ints2d] = []
for doc in docs:
hashes = get_character_combination_hashes(
doc=doc,
case_sensitive=case_sensitive,
p_lengths=p_lengths,
s_lengths=s_lengths,
)
features.append(ops.asarray2i(hashes, dtype="uint64"))
backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
return features, backprop
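For illustration only, a minimal usage sketch of the layer above; the blank English pipeline and the length values are assumptions, not part of the commit.

    import spacy

    nlp = spacy.blank("en")
    extractor = RichFeatureExtractor(
        case_sensitive=False, pref_lengths=[1, 3], suff_lengths=[2, 3]
    )
    features, backprop = extractor([nlp("spaCy and Prodigy")], is_train=False)
    assert features[0].shape == (3, 4)  # 3 tokens, 2 prefix + 2 suffix hashes each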
def get_character_combination_hashes(
*,
Doc doc,
const bint case_sensitive,
const unsigned char* p_lengths,
const unsigned char* s_lengths,
):
"""
Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations
derived from the raw text of each token.
doc: the document
case_sensitive: if *False*, hashes are generated based on the lower-case version of each token.
p_lengths: an array of single-byte values specifying the lengths of prefixes to be hashed in ascending order.
For example, if *p_lengths==[2, 3]*, the prefixes hashed for "spaCy" would be "sp" and "spa".
s_lengths: an array of single-byte values specifying the lengths of suffixes to be hashed in ascending order.
For example, if *s_lengths==[2, 3]* and *case_sensitive == True*, the suffixes hashed for "spaCy" would be "yC" and "yCa".
Many of the buffers passed into and used by this method contain single-byte numerical values. This takes advantage of
the fact that we are hashing short affixes and searching for small groups of characters. The calling code is responsible
for ensuring that lengths being passed in cannot exceed 63 and hence, with maximally four-byte
character widths, that individual values within buffers can never exceed the capacity of a single byte (255).
Note that this method performs no data validation itself, as it expects the calling code to have validated its input already;
the behaviour of the code may be erratic if the supplied parameters do not conform to expectations.
"""
# Work out lengths
cdef int p_lengths_l = strlen(<char*> p_lengths)
cdef int s_lengths_l = strlen(<char*> s_lengths)
cdef int p_max_l = p_lengths[p_lengths_l - 1] if p_lengths_l > 0 else 0
cdef int s_max_l = s_lengths[s_lengths_l - 1] if s_lengths_l > 0 else 0
# Define / allocate buffers
cdef Pool mem = Pool()
cdef unsigned char* pref_l_buf = <unsigned char*> mem.alloc(p_max_l, sizeof(char))
cdef unsigned char* suff_l_buf = <unsigned char*> mem.alloc(s_max_l, sizeof(char))
cdef int doc_l = doc.length
cdef np.ndarray[np.uint64_t, ndim=2] hashes = numpy.empty(
(doc_l, p_lengths_l + s_lengths_l), dtype="uint64")
cdef np.uint64_t* hashes_ptr = <np.uint64_t*> hashes.data
# Define working variables
cdef TokenC tok_c
cdef int tok_i, tok_str_l
cdef attr_t num_tok_attr
cdef bytes tok_str_bytes
cdef const unsigned char* tok_str
for tok_i in range(doc_l):
tok_c = <TokenC> doc.c[tok_i]
num_tok_attr = tok_c.lex.orth if case_sensitive else tok_c.lex.lower
if num_tok_attr < len(SYMBOLS_BY_INT): # hardly ever happens
if num_tok_attr == 0:
tok_str_bytes = b""
else:
tok_str_bytes = SYMBOLS_BY_INT[num_tok_attr].encode("UTF-8")
tok_str = tok_str_bytes
tok_str_l = len(tok_str_bytes)
else:
tok_str, tok_str_l = doc.vocab.strings.utf8_ptr(num_tok_attr)
if p_max_l > 0:
_set_prefix_lengths(tok_str, tok_str_l, p_max_l, pref_l_buf)
hashes_ptr += _write_hashes(tok_str, p_lengths, pref_l_buf, 0, hashes_ptr)
if s_max_l > 0:
_set_suffix_lengths(tok_str, tok_str_l, s_max_l, suff_l_buf)
hashes_ptr += _write_hashes(tok_str, s_lengths, suff_l_buf, tok_str_l - 1, hashes_ptr)
return hashes
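For illustration only, a sketch of what the returned array contains, mirroring the unit tests added later in this commit (en_tokenizer stands in for any English tokenizer fixture):

    doc = en_tokenizer("spaCy✨ and Prodigy")
    hashes = get_character_combination_hashes(
        doc=doc,
        case_sensitive=False,
        p_lengths=bytes((1, 3, 4)),     # hash the 1-, 3- and 4-character prefixes
        s_lengths=bytes((2, 3, 4, 5)),  # hash the 2- to 5-character suffixes
    )
    # One row per token, one column per configured affix length:
    assert hashes.shape == (4, 7)
    # Row 0, column 1 is the FNV-1A hash of the UTF-8 bytes of "spa"; row 0,
    # column 3 is the hash of the last two characters read back-to-front ("yc").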
cdef void _set_prefix_lengths(
const unsigned char* tok_str,
const int tok_str_l,
const int p_max_l,
unsigned char* pref_l_buf,
) nogil:
""" Populate *pref_l_buf*, which has length *p_max_l*, with the byte lengths of each of the substrings terminated by the first *p_max_l*
characters within *tok_str*. Entries corresponding to character lengths greater than the length of the whole word are populated
with the byte length of the whole word.
tok_str: a UTF-8 representation of a string.
tok_str_l: the length of *tok_str*.
p_max_l: the number of characters to process at the beginning of the word.
pref_l_buf: a buffer of length *p_max_l* in which to store the lengths. The code calling *get_character_combination_hashes()* is
responsible for ensuring that *p_max_l* cannot exceed 63 and hence, with maximally four-byte character widths, that individual values
within the buffer can never exceed the capacity of a single byte (255).
"""
cdef int tok_str_idx = 1, pref_l_buf_idx = 0
while pref_l_buf_idx < p_max_l:
if (tok_str_idx >= tok_str_l
or
((tok_str[tok_str_idx] & 0xc0) != 0x80) # not a continuation character
):
pref_l_buf[pref_l_buf_idx] = tok_str_idx
pref_l_buf_idx += 1
if tok_str_idx >= tok_str_l:
break
tok_str_idx += 1
if pref_l_buf_idx < p_max_l:
memset(pref_l_buf + pref_l_buf_idx, pref_l_buf[pref_l_buf_idx - 1], p_max_l - pref_l_buf_idx)
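For illustration only, a pure-Python rendering of the helper above (not part of the commit); the worked value uses the multi-byte token from the tests.

    def prefix_byte_lengths(tok: bytes, p_max_l: int) -> list:
        # Mirror of _set_prefix_lengths: record the UTF-8 byte length of the first
        # 1..p_max_l characters, padding with the whole-word length for short words.
        lengths, idx = [], 1
        while len(lengths) < p_max_l:
            if idx >= len(tok) or (tok[idx] & 0xC0) != 0x80:  # not a continuation byte
                lengths.append(idx)
                if idx >= len(tok):
                    break
            idx += 1
        if lengths and len(lengths) < p_max_l:
            lengths += [lengths[-1]] * (p_max_l - len(lengths))
        return lengths

    assert prefix_byte_lengths("sp𐌞cé".encode("utf8"), 3) == [1, 2, 6]  # "s", "sp", "sp𐌞"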
cdef void _set_suffix_lengths(
const unsigned char* tok_str,
const int tok_str_l,
const int s_max_l,
unsigned char* suff_l_buf,
) nogil:
""" Populate *suff_l_buf*, which has length *s_max_l*, with the byte lengths of each of the substrings started by the last *s_max_l*
characters within *tok_str*. Entries corresponding to character lengths greater than the length of the whole word are populated
with the byte length of the whole word.
tok_str: a UTF-8 representation of a string.
tok_str_l: the length of *tok_str*.
s_max_l: the number of characters to process at the end of the word.
suff_l_buf: a buffer of length *s_max_l* in which to store the lengths. The code calling *get_character_combination_hashes()* is
responsible for ensuring that *s_max_l* cannot exceed 63 and hence, with maximally four-byte character widths, that individual values
within the buffer can never exceed the capacity of a single byte (255).
"""
cdef int tok_str_idx = tok_str_l - 1, suff_l_buf_idx = 0
while suff_l_buf_idx < s_max_l:
if (tok_str[tok_str_idx] & 0xc0) != 0x80: # not a continuation character
suff_l_buf[suff_l_buf_idx] = tok_str_l - tok_str_idx
suff_l_buf_idx += 1
tok_str_idx -= 1
if tok_str_idx < 0:
break
if suff_l_buf_idx < s_max_l:
memset(suff_l_buf + suff_l_buf_idx, suff_l_buf[suff_l_buf_idx - 1], s_max_l - suff_l_buf_idx)
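For illustration only, the suffix analogue as pure Python (again not part of the commit); it walks the UTF-8 bytes backwards from the end of a non-empty token.

    def suffix_byte_lengths(tok: bytes, s_max_l: int) -> list:
        # Mirror of _set_suffix_lengths: byte lengths of the last 1..s_max_l characters.
        lengths, idx = [], len(tok) - 1
        while len(lengths) < s_max_l:
            if (tok[idx] & 0xC0) != 0x80:  # start of a character
                lengths.append(len(tok) - idx)
            idx -= 1
            if idx < 0:
                break
        if lengths and len(lengths) < s_max_l:
            lengths += [lengths[-1]] * (s_max_l - len(lengths))
        return lengths

    assert suffix_byte_lengths("sp𐌞cé".encode("utf8"), 3) == [2, 3, 7]  # "é", "cé", "𐌞cé"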
cdef uint64_t FNV1A_OFFSET_BASIS = 0xcbf29ce484222325
cdef uint64_t FNV1A_PRIME = 0x00000100000001B3
cdef int _write_hashes(
const unsigned char* res_buf,
const unsigned char* aff_l_buf,
const unsigned char* offset_buf,
const int res_buf_last,
np.uint64_t* hashes_ptr,
) nogil:
""" Write 64-bit FNV1A hashes for a token/rich property group combination.
res_buf: the string from which to generate the hash values.
aff_l_buf: one-byte lengths describing how many characters to hash.
offset_buf: one-byte lengths specifying the byte offset of each character within *res_buf*.
res_buf_last: if affixes should start at the end of *res_buf*, the offset of the last byte in
*res_buf*; if affixes should start at the beginning of *res_buf*, *0*.
hashes_ptr: a pointer starting from which the new hashes should be written.
Returns: the number of hashes written.
"""
cdef int last_offset = 0, hash_idx = 0, offset, aff_l
cdef uint64_t hash_val = FNV1A_OFFSET_BASIS
while True:
aff_l = aff_l_buf[hash_idx]
if aff_l == 0:
return hash_idx
offset = offset_buf[aff_l - 1]
while last_offset < offset:
if res_buf_last > 0:
hash_val ^= res_buf[res_buf_last - last_offset]
else:
hash_val ^= res_buf[last_offset]
hash_val *= FNV1A_PRIME
last_offset += 1
hashes_ptr[hash_idx] = hash_val
hash_idx += 1
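For reference and illustration only: a pure-Python FNV-1A with the same constants, matching the _get_fnv1a_hash helper in the tests added by this commit. When res_buf_last > 0 the loop above reads the token bytes back-to-front, which is why the suffix assertions in the tests hash reversed strings.

    FNV1A_OFFSET_BASIS = 0xCBF29CE484222325
    FNV1A_PRIME = 0x00000100000001B3

    def fnv1a(data: bytes) -> int:
        # 64-bit FNV-1A over the given bytes.
        hash_val = FNV1A_OFFSET_BASIS
        for byte in data:
            hash_val = ((hash_val ^ byte) * FNV1A_PRIME) % 2**64
        return hash_val

    # For "spaCy" with case_sensitive=False, the prefix column for length 3 equals
    # fnv1a(b"spa") and the suffix column for length 2 equals fnv1a(b"yc") -- the
    # last two characters read in reverse.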

View File

@ -1,8 +1,6 @@
from pickle import EMPTY_DICT
import weakref
import numpy
from time import time
from numpy.testing import assert_array_equal
import pytest
import warnings
@ -992,635 +990,3 @@ def test_doc_spans_setdefault(en_tokenizer):
assert len(doc.spans["key2"]) == 1
doc.spans.setdefault("key3", default=SpanGroup(doc, spans=[doc[0:1], doc[1:2]]))
assert len(doc.spans["key3"]) == 2
EMPTY_HASH_VALUE = 0xCBF29CE484222325
FNV1A_OFFSET_BASIS = 0xCBF29CE484222325
FNV1A_PRIME = 0x00000100000001B3
def _get_fnv1a_hash(input: bytes) -> int:
hash_val = FNV1A_OFFSET_BASIS
length = len(input)
offset = 0
while offset < length:
hash_val ^= input[offset]
hash_val *= FNV1A_PRIME
hash_val %= 2**64
offset += 1
return hash_val
def test_fnv1a_hash():
"""Checks the conformity of the 64-bit FNV1A implementation with
http://www.isthe.com/chongo/src/fnv/test_fnv.c.
The method called here, _get_fnv1a_hash(), is only used in testing;
in production code, the hashing is performed in a fashion that is interweaved
with other logic. The conformity of the production code is demonstrated by the
character combination hash tests, where hashes produced by the production code
are tested for equality against hashes produced by _get_fnv1a_hash().
"""
INPUTS = [
b"",
b"a",
b"b",
b"c",
b"d",
b"e",
b"f",
b"fo",
b"foo",
b"foob",
b"fooba",
b"foobar",
b"\x00",
b"a\x00",
b"b\x00",
b"c\x00",
b"d\x00",
b"e\x00",
b"f\x00",
b"fo\x00",
b"foo\x00",
b"foob\x00",
b"fooba\x00",
b"foobar\x00",
b"ch",
b"cho",
b"chon",
b"chong",
b"chongo",
b"chongo ",
b"chongo w",
b"chongo wa",
b"chongo was",
b"chongo was ",
b"chongo was h",
b"chongo was he",
b"chongo was her",
b"chongo was here",
b"chongo was here!",
b"chongo was here!\n",
b"ch\x00",
b"cho\x00",
b"chon\x00",
b"chong\x00",
b"chongo\x00",
b"chongo \x00",
b"chongo w\x00",
b"chongo wa\x00",
b"chongo was\x00",
b"chongo was \x00",
b"chongo was h\x00",
b"chongo was he\x00",
b"chongo was her\x00",
b"chongo was here\x00",
b"chongo was here!\x00",
b"chongo was here!\n\x00",
b"cu",
b"cur",
b"curd",
b"curds",
b"curds ",
b"curds a",
b"curds an",
b"curds and",
b"curds and ",
b"curds and w",
b"curds and wh",
b"curds and whe",
b"curds and whey",
b"curds and whey\n",
b"cu\x00",
b"cur\x00",
b"curd\x00",
b"curds\x00",
b"curds \x00",
b"curds a\x00",
b"curds an\x00",
b"curds and\x00",
b"curds and \x00",
b"curds and w\x00",
b"curds and wh\x00",
b"curds and whe\x00",
b"curds and whey\x00",
b"curds and whey\n\x00",
b"hi",
b"hi\x00",
b"hello",
b"hello\x00",
b"\xff\x00\x00\x01",
b"\x01\x00\x00\xff",
b"\xff\x00\x00\x02",
b"\x02\x00\x00\xff",
b"\xff\x00\x00\x03",
b"\x03\x00\x00\xff",
b"\xff\x00\x00\x04",
b"\x04\x00\x00\xff",
b"\x40\x51\x4e\x44",
b"\x44\x4e\x51\x40",
b"\x40\x51\x4e\x4a",
b"\x4a\x4e\x51\x40",
b"\x40\x51\x4e\x54",
b"\x54\x4e\x51\x40",
b"127.0.0.1",
b"127.0.0.1\x00",
b"127.0.0.2",
b"127.0.0.2\x00",
b"127.0.0.3",
b"127.0.0.3\x00",
b"64.81.78.68",
b"64.81.78.68\x00",
b"64.81.78.74",
b"64.81.78.74\x00",
b"64.81.78.84",
b"64.81.78.84\x00",
b"feedface",
b"feedface\x00",
b"feedfacedaffdeed",
b"feedfacedaffdeed\x00",
b"feedfacedeadbeef",
b"feedfacedeadbeef\x00",
b"line 1\nline 2\nline 3",
b"chongo <Landon Curt Noll> /\\../\\",
b"chongo <Landon Curt Noll> /\\../\\\x00",
b"chongo (Landon Curt Noll) /\\../\\",
b"chongo (Landon Curt Noll) /\\../\\\x00",
b"http://antwrp.gsfc.nasa.gov/apod/astropix.html",
b"http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash",
b"http://epod.usra.edu/",
b"http://exoplanet.eu/",
b"http://hvo.wr.usgs.gov/cam3/",
b"http://hvo.wr.usgs.gov/cams/HMcam/",
b"http://hvo.wr.usgs.gov/kilauea/update/deformation.html",
b"http://hvo.wr.usgs.gov/kilauea/update/images.html",
b"http://hvo.wr.usgs.gov/kilauea/update/maps.html",
b"http://hvo.wr.usgs.gov/volcanowatch/current_issue.html",
b"http://neo.jpl.nasa.gov/risk/",
b"http://norvig.com/21-days.html",
b"http://primes.utm.edu/curios/home.php",
b"http://slashdot.org/",
b"http://tux.wr.usgs.gov/Maps/155.25-19.5.html",
b"http://volcano.wr.usgs.gov/kilaueastatus.php",
b"http://www.avo.alaska.edu/activity/Redoubt.php",
b"http://www.dilbert.com/fast/",
b"http://www.fourmilab.ch/gravitation/orbits/",
b"http://www.fpoa.net/",
b"http://www.ioccc.org/index.html",
b"http://www.isthe.com/cgi-bin/number.cgi",
b"http://www.isthe.com/chongo/bio.html",
b"http://www.isthe.com/chongo/index.html",
b"http://www.isthe.com/chongo/src/calc/lucas-calc",
b"http://www.isthe.com/chongo/tech/astro/venus2004.html",
b"http://www.isthe.com/chongo/tech/astro/vita.html",
b"http://www.isthe.com/chongo/tech/comp/c/expert.html",
b"http://www.isthe.com/chongo/tech/comp/calc/index.html",
b"http://www.isthe.com/chongo/tech/comp/fnv/index.html",
b"http://www.isthe.com/chongo/tech/math/number/howhigh.html",
b"http://www.isthe.com/chongo/tech/math/number/number.html",
b"http://www.isthe.com/chongo/tech/math/prime/mersenne.html",
b"http://www.isthe.com/chongo/tech/math/prime/mersenne.html#largest",
b"http://www.lavarnd.org/cgi-bin/corpspeak.cgi",
b"http://www.lavarnd.org/cgi-bin/haiku.cgi",
b"http://www.lavarnd.org/cgi-bin/rand-none.cgi",
b"http://www.lavarnd.org/cgi-bin/randdist.cgi",
b"http://www.lavarnd.org/index.html",
b"http://www.lavarnd.org/what/nist-test.html",
b"http://www.macosxhints.com/",
b"http://www.mellis.com/",
b"http://www.nature.nps.gov/air/webcams/parks/havoso2alert/havoalert.cfm",
b"http://www.nature.nps.gov/air/webcams/parks/havoso2alert/timelines_24.cfm",
b"http://www.paulnoll.com/",
b"http://www.pepysdiary.com/",
b"http://www.sciencenews.org/index/home/activity/view",
b"http://www.skyandtelescope.com/",
b"http://www.sput.nl/~rob/sirius.html",
b"http://www.systemexperts.com/",
b"http://www.tq-international.com/phpBB3/index.php",
b"http://www.travelquesttours.com/index.htm",
b"http://www.wunderground.com/global/stations/89606.html",
b"21701" * 10,
b"M21701" * 10,
b"2^21701-1" * 10,
b"\x54\xc5" * 10,
b"\xc5\x54" * 10,
b"23209" * 10,
b"M23209" * 10,
b"2^23209-1" * 10,
b"\x5a\xa9" * 10,
b"\xa9\x5a" * 10,
b"391581216093" * 10,
b"391581*2^216093-1" * 10,
b"\x05\xf9\x9d\x03\x4c\x81" * 10,
b"FEDCBA9876543210" * 10,
b"\xfe\xdc\xba\x98\x76\x54\x32\x10" * 10,
b"EFCDAB8967452301" * 10,
b"\xef\xcd\xab\x89\x67\x45\x23\x01" * 10,
b"0123456789ABCDEF" * 10,
b"\x01\x23\x45\x67\x89\xab\xcd\xef" * 10,
b"1032547698BADCFE" * 10,
b"\x10\x32\x54\x76\x98\xba\xdc\xfe" * 10,
b"\x00" * 500,
b"\x07" * 500,
b"~" * 500,
b"\x7f" * 500,
]
OUTPUTS = [
EMPTY_HASH_VALUE,
0xAF63DC4C8601EC8C,
0xAF63DF4C8601F1A5,
0xAF63DE4C8601EFF2,
0xAF63D94C8601E773,
0xAF63D84C8601E5C0,
0xAF63DB4C8601EAD9,
0x08985907B541D342,
0xDCB27518FED9D577,
0xDD120E790C2512AF,
0xCAC165AFA2FEF40A,
0x85944171F73967E8,
0xAF63BD4C8601B7DF,
0x089BE207B544F1E4,
0x08A61407B54D9B5F,
0x08A2AE07B54AB836,
0x0891B007B53C4869,
0x088E4A07B5396540,
0x08987C07B5420EBB,
0xDCB28A18FED9F926,
0xDD1270790C25B935,
0xCAC146AFA2FEBF5D,
0x8593D371F738ACFE,
0x34531CA7168B8F38,
0x08A25607B54A22AE,
0xF5FAF0190CF90DF3,
0xF27397910B3221C7,
0x2C8C2B76062F22E0,
0xE150688C8217B8FD,
0xF35A83C10E4F1F87,
0xD1EDD10B507344D0,
0x2A5EE739B3DDB8C3,
0xDCFB970CA1C0D310,
0x4054DA76DAA6DA90,
0xF70A2FF589861368,
0x4C628B38AED25F17,
0x9DD1F6510F78189F,
0xA3DE85BD491270CE,
0x858E2FA32A55E61D,
0x46810940EFF5F915,
0xF5FADD190CF8EDAA,
0xF273ED910B32B3E9,
0x2C8C5276062F6525,
0xE150B98C821842A0,
0xF35AA3C10E4F55E7,
0xD1ED680B50729265,
0x2A5F0639B3DDED70,
0xDCFBAA0CA1C0F359,
0x4054BA76DAA6A430,
0xF709C7F5898562B0,
0x4C62E638AED2F9B8,
0x9DD1A8510F779415,
0xA3DE2ABD4911D62D,
0x858E0EA32A55AE0A,
0x46810F40EFF60347,
0xC33BCE57BEF63EAF,
0x08A24307B54A0265,
0xF5B9FD190CC18D15,
0x4C968290ACE35703,
0x07174BD5C64D9350,
0x5A294C3FF5D18750,
0x05B3C1AEB308B843,
0xB92A48DA37D0F477,
0x73CDDDCCD80EBC49,
0xD58C4C13210A266B,
0xE78B6081243EC194,
0xB096F77096A39F34,
0xB425C54FF807B6A3,
0x23E520E2751BB46E,
0x1A0B44CCFE1385EC,
0xF5BA4B190CC2119F,
0x4C962690ACE2BAAF,
0x0716DED5C64CDA19,
0x5A292C3FF5D150F0,
0x05B3E0AEB308ECF0,
0xB92A5EDA37D119D9,
0x73CE41CCD80F6635,
0xD58C2C132109F00B,
0xE78BAF81243F47D1,
0xB0968F7096A2EE7C,
0xB425A84FF807855C,
0x23E4E9E2751B56F9,
0x1A0B4ECCFE1396EA,
0x54ABD453BB2C9004,
0x08BA5F07B55EC3DA,
0x337354193006CB6E,
0xA430D84680AABD0B,
0xA9BC8ACCA21F39B1,
0x6961196491CC682D,
0xAD2BB1774799DFE9,
0x6961166491CC6314,
0x8D1BB3904A3B1236,
0x6961176491CC64C7,
0xED205D87F40434C7,
0x6961146491CC5FAE,
0xCD3BAF5E44F8AD9C,
0xE3B36596127CD6D8,
0xF77F1072C8E8A646,
0xE3B36396127CD372,
0x6067DCE9932AD458,
0xE3B37596127CF208,
0x4B7B10FA9FE83936,
0xAABAFE7104D914BE,
0xF4D3180B3CDE3EDA,
0xAABAFD7104D9130B,
0xF4CFB20B3CDB5BB1,
0xAABAFC7104D91158,
0xF4CC4C0B3CD87888,
0xE729BAC5D2A8D3A7,
0x74BC0524F4DFA4C5,
0xE72630C5D2A5B352,
0x6B983224EF8FB456,
0xE73042C5D2AE266D,
0x8527E324FDEB4B37,
0x0A83C86FEE952ABC,
0x7318523267779D74,
0x3E66D3D56B8CACA1,
0x956694A5C0095593,
0xCAC54572BB1A6FC8,
0xA7A4C9F3EDEBF0D8,
0x7829851FAC17B143,
0x2C8F4C9AF81BCF06,
0xD34E31539740C732,
0x3605A2AC253D2DB1,
0x08C11B8346F4A3C3,
0x6BE396289CE8A6DA,
0xD9B957FB7FE794C5,
0x05BE33DA04560A93,
0x0957F1577BA9747C,
0xDA2CC3ACC24FBA57,
0x74136F185B29E7F0,
0xB2F2B4590EDB93B2,
0xB3608FCE8B86AE04,
0x4A3A865079359063,
0x5B3A7EF496880A50,
0x48FAE3163854C23B,
0x07AAA640476E0B9A,
0x2F653656383A687D,
0xA1031F8E7599D79C,
0xA31908178FF92477,
0x097EDF3C14C3FB83,
0xB51CA83FEAA0971B,
0xDD3C0D96D784F2E9,
0x86CD26A9EA767D78,
0xE6B215FF54A30C18,
0xEC5B06A1C5531093,
0x45665A929F9EC5E5,
0x8C7609B4A9F10907,
0x89AAC3A491F0D729,
0x32CE6B26E0F4A403,
0x614AB44E02B53E01,
0xFA6472EB6EEF3290,
0x9E5D75EB1948EB6A,
0xB6D12AD4A8671852,
0x88826F56EBA07AF1,
0x44535BF2645BC0FD,
0x169388FFC21E3728,
0xF68AAC9E396D8224,
0x8E87D7E7472B3883,
0x295C26CAA8B423DE,
0x322C814292E72176,
0x8A06550EB8AF7268,
0xEF86D60E661BCF71,
0x9E5426C87F30EE54,
0xF1EA8AA826FD047E,
0x0BABAF9A642CB769,
0x4B3341D4068D012E,
0xD15605CBC30A335C,
0x5B21060AED8412E5,
0x45E2CDA1CE6F4227,
0x50AE3745033AD7D4,
0xAA4588CED46BF414,
0xC1B0056C4A95467E,
0x56576A71DE8B4089,
0xBF20965FA6DC927E,
0x569F8383C2040882,
0xE1E772FBA08FECA0,
0x4CED94AF97138AC4,
0xC4112FFB337A82FB,
0xD64A4FD41DE38B7D,
0x4CFC32329EDEBCBB,
0x0803564445050395,
0xAA1574ECF4642FFD,
0x694BC4E54CC315F9,
0xA3D7CB273B011721,
0x577C2F8B6115BFA5,
0xB7EC8C1A769FB4C1,
0x5D5CFCE63359AB19,
0x33B96C3CD65B5F71,
0xD845097780602BB9,
0x84D47645D02DA3D5,
0x83544F33B58773A5,
0x9175CBB2160836C5,
0xC71B3BC175E72BC5,
0x636806AC222EC985,
0xB6EF0E6950F52ED5,
0xEAD3D8A0F3DFDAA5,
0x922908FE9A861BA5,
0x6D4821DE275FD5C5,
0x1FE3FCE62BD816B5,
0xC23E9FCCD6F70591,
0xC1AF12BDFE16B5B5,
0x39E9F18F2F85E221,
]
assert len(INPUTS) == len(OUTPUTS)
for i in range(len(INPUTS)):
assert _get_fnv1a_hash(INPUTS[i]) == OUTPUTS[i]
def _encode_and_hash(input: str, *, reverse: bool = False) -> int:
encoded_input = input.encode("UTF-8")
if reverse:
encoded_input = encoded_input[::-1]
return _get_fnv1a_hash(encoded_input)
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive):
doc = en_tokenizer("spaCy✨ and Prodigy")
hashes = doc.get_character_combination_hashes(
case_sensitive=case_sensitive,
p_lengths=bytes(
(
1,
3,
4,
)
),
s_lengths=bytes(
(
2,
3,
4,
5,
)
),
)
assert hashes[0][0] == _encode_and_hash("s")
assert hashes[0][1] == _encode_and_hash("spa")
assert hashes[0][2] == _encode_and_hash("spaC" if case_sensitive else "spac")
assert hashes[0][3] == _encode_and_hash("yC" if case_sensitive else "yc")
assert hashes[0][4] == _encode_and_hash("yCa" if case_sensitive else "yca")
assert hashes[0][5] == _encode_and_hash("yCap" if case_sensitive else "ycap")
assert hashes[0][6] == _encode_and_hash("yCaps" if case_sensitive else "ycaps")
assert hashes[1][0] == _encode_and_hash("")
assert hashes[1][1] == _encode_and_hash("")
assert hashes[1][2] == _encode_and_hash("")
assert hashes[1][3] == _encode_and_hash("", reverse=True)
assert hashes[1][4] == _encode_and_hash("", reverse=True)
assert hashes[1][5] == _encode_and_hash("", reverse=True)
assert hashes[1][6] == _encode_and_hash("", reverse=True)
assert hashes[2][0] == _encode_and_hash("a")
assert hashes[2][1] == _encode_and_hash("and")
assert hashes[2][2] == _encode_and_hash("and")
assert hashes[2][3] == _encode_and_hash("dn")
assert hashes[2][4] == _encode_and_hash("dna")
assert hashes[2][5] == _encode_and_hash("dna")
assert hashes[2][6] == _encode_and_hash("dna")
assert hashes[3][0] == _encode_and_hash("P" if case_sensitive else "p")
assert hashes[3][1] == _encode_and_hash("Pro" if case_sensitive else "pro")
assert hashes[3][2] == _encode_and_hash("Prod" if case_sensitive else "prod")
assert hashes[3][3] == _encode_and_hash("yg")
assert hashes[3][4] == _encode_and_hash("ygi")
assert hashes[3][5] == _encode_and_hash("ygid")
assert hashes[3][6] == _encode_and_hash("ygido")
def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
doc = en_tokenizer("spaCy✨ and Prodigy")
hashes = doc.get_character_combination_hashes(
case_sensitive=False,
p_lengths=bytes(),
s_lengths=bytes(
(
2,
3,
4,
5,
)
),
)
assert hashes[0][0] == _encode_and_hash("yc")
assert hashes[0][1] == _encode_and_hash("yca")
assert hashes[0][2] == _encode_and_hash("ycap")
assert hashes[0][3] == _encode_and_hash("ycaps")
assert hashes[1][0] == _encode_and_hash("", reverse=True)
assert hashes[1][1] == _encode_and_hash("", reverse=True)
assert hashes[1][2] == _encode_and_hash("", reverse=True)
assert hashes[1][3] == _encode_and_hash("", reverse=True)
assert hashes[2][0] == _encode_and_hash("dn")
assert hashes[2][1] == _encode_and_hash("dna")
assert hashes[2][2] == _encode_and_hash("dna")
assert hashes[2][3] == _encode_and_hash("dna")
assert hashes[3][0] == _encode_and_hash("yg")
assert hashes[3][1] == _encode_and_hash("ygi")
assert hashes[3][2] == _encode_and_hash("ygid")
assert hashes[3][3] == _encode_and_hash("ygido")
def test_get_character_combination_hashes_various_lengths(en_tokenizer):
doc = en_tokenizer("sp𐌞Cé")
for p_length in range(1, 8):
for s_length in range(1, 8):
hashes = doc.get_character_combination_hashes(
case_sensitive=False,
p_lengths=bytes((p_length,)),
s_lengths=bytes((s_length,)),
)
assert hashes[0][0] == _encode_and_hash("sp𐌞cé"[:p_length])
assert hashes[0][1] == _encode_and_hash("sp𐌞cé"[-s_length:], reverse=True)
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_get_character_combination_hashes_turkish_i_with_dot(
en_tokenizer, case_sensitive
):
doc = en_tokenizer("İ".lower() + "İ")
hashes = doc.get_character_combination_hashes(
case_sensitive=case_sensitive,
p_lengths=bytes(
(
1,
2,
3,
4,
)
),
s_lengths=bytes(
(
1,
2,
3,
4,
)
),
)
COMBINING_DOT_ABOVE = b"\xcc\x87".decode("UTF-8")
assert hashes[0][0] == _encode_and_hash("i")
assert hashes[0][1] == _encode_and_hash("İ".lower())
if case_sensitive:
assert hashes[0][2] == _encode_and_hash("İ".lower() + "İ")
assert hashes[0][3] == _encode_and_hash("İ".lower() + "İ")
assert hashes[0][4] == _encode_and_hash("İ", reverse=True)
assert hashes[0][5] == _encode_and_hash(COMBINING_DOT_ABOVE + "İ", reverse=True)
assert hashes[0][6] == _encode_and_hash("İ".lower() + "İ", reverse=True)
assert hashes[0][7] == _encode_and_hash("İ".lower() + "İ", reverse=True)
else:
assert hashes[0][2] == _encode_and_hash("İ".lower() + "i")
assert hashes[0][3] == _encode_and_hash("İ".lower() * 2)
assert hashes[0][4] == _encode_and_hash(COMBINING_DOT_ABOVE, reverse=True)
assert hashes[0][5] == _encode_and_hash("İ".lower(), reverse=True)
assert hashes[0][6] == _encode_and_hash(
COMBINING_DOT_ABOVE + "İ".lower(), reverse=True
)
assert hashes[0][7] == _encode_and_hash("İ".lower() * 2, reverse=True)
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_get_character_combination_hashes_string_store_spec_cases(
en_tokenizer, case_sensitive
):
symbol = "FLAG19"
short_word = "bee"
normal_word = "serendipity"
long_word = "serendipity" * 50
assert len(long_word) > 255
doc = en_tokenizer(" ".join((symbol, short_word, normal_word, long_word)))
assert len(doc) == 4
hashes = doc.get_character_combination_hashes(
case_sensitive=case_sensitive,
p_lengths=bytes((2,)),
s_lengths=bytes((2,)),
)
assert hashes[0][0] == _encode_and_hash("FL" if case_sensitive else "fl")
assert hashes[0][1] == _encode_and_hash("91")
assert hashes[1][0] == _encode_and_hash("be")
assert hashes[1][1] == _encode_and_hash("ee")
assert hashes[2][0] == hashes[3][0] == _encode_and_hash("se")
assert hashes[2][1] == hashes[3][1] == _encode_and_hash("yt")
def test_character_combination_hashes_empty_lengths(en_tokenizer):
doc = en_tokenizer("and𐌞")
assert doc.get_character_combination_hashes(
case_sensitive=True,
p_lengths=bytes(),
s_lengths=bytes(),
).shape == (1, 0)

View File

@ -0,0 +1,639 @@
import pytest
from ...ml.richfeatureextractor import get_character_combination_hashes
EMPTY_HASH_VALUE = 0xCBF29CE484222325
FNV1A_OFFSET_BASIS = 0xCBF29CE484222325
FNV1A_PRIME = 0x00000100000001B3
def _get_fnv1a_hash(input: bytes) -> int:
hash_val = FNV1A_OFFSET_BASIS
length = len(input)
offset = 0
while offset < length:
hash_val ^= input[offset]
hash_val *= FNV1A_PRIME
hash_val %= 2**64
offset += 1
return hash_val
def test_fnv1a_hash():
"""Checks the conformity of the 64-bit FNV1A implementation with
http://www.isthe.com/chongo/src/fnv/test_fnv.c.
The method called here, _get_fnv1a_hash(), is only used in testing;
in production code, the hashing is performed in a fashion that is interwoven
with other logic. The conformity of the production code is demonstrated by the
character combination hash tests, where hashes produced by the production code
are tested for equality against hashes produced by _get_fnv1a_hash().
"""
INPUTS = [
b"",
b"a",
b"b",
b"c",
b"d",
b"e",
b"f",
b"fo",
b"foo",
b"foob",
b"fooba",
b"foobar",
b"\x00",
b"a\x00",
b"b\x00",
b"c\x00",
b"d\x00",
b"e\x00",
b"f\x00",
b"fo\x00",
b"foo\x00",
b"foob\x00",
b"fooba\x00",
b"foobar\x00",
b"ch",
b"cho",
b"chon",
b"chong",
b"chongo",
b"chongo ",
b"chongo w",
b"chongo wa",
b"chongo was",
b"chongo was ",
b"chongo was h",
b"chongo was he",
b"chongo was her",
b"chongo was here",
b"chongo was here!",
b"chongo was here!\n",
b"ch\x00",
b"cho\x00",
b"chon\x00",
b"chong\x00",
b"chongo\x00",
b"chongo \x00",
b"chongo w\x00",
b"chongo wa\x00",
b"chongo was\x00",
b"chongo was \x00",
b"chongo was h\x00",
b"chongo was he\x00",
b"chongo was her\x00",
b"chongo was here\x00",
b"chongo was here!\x00",
b"chongo was here!\n\x00",
b"cu",
b"cur",
b"curd",
b"curds",
b"curds ",
b"curds a",
b"curds an",
b"curds and",
b"curds and ",
b"curds and w",
b"curds and wh",
b"curds and whe",
b"curds and whey",
b"curds and whey\n",
b"cu\x00",
b"cur\x00",
b"curd\x00",
b"curds\x00",
b"curds \x00",
b"curds a\x00",
b"curds an\x00",
b"curds and\x00",
b"curds and \x00",
b"curds and w\x00",
b"curds and wh\x00",
b"curds and whe\x00",
b"curds and whey\x00",
b"curds and whey\n\x00",
b"hi",
b"hi\x00",
b"hello",
b"hello\x00",
b"\xff\x00\x00\x01",
b"\x01\x00\x00\xff",
b"\xff\x00\x00\x02",
b"\x02\x00\x00\xff",
b"\xff\x00\x00\x03",
b"\x03\x00\x00\xff",
b"\xff\x00\x00\x04",
b"\x04\x00\x00\xff",
b"\x40\x51\x4e\x44",
b"\x44\x4e\x51\x40",
b"\x40\x51\x4e\x4a",
b"\x4a\x4e\x51\x40",
b"\x40\x51\x4e\x54",
b"\x54\x4e\x51\x40",
b"127.0.0.1",
b"127.0.0.1\x00",
b"127.0.0.2",
b"127.0.0.2\x00",
b"127.0.0.3",
b"127.0.0.3\x00",
b"64.81.78.68",
b"64.81.78.68\x00",
b"64.81.78.74",
b"64.81.78.74\x00",
b"64.81.78.84",
b"64.81.78.84\x00",
b"feedface",
b"feedface\x00",
b"feedfacedaffdeed",
b"feedfacedaffdeed\x00",
b"feedfacedeadbeef",
b"feedfacedeadbeef\x00",
b"line 1\nline 2\nline 3",
b"chongo <Landon Curt Noll> /\\../\\",
b"chongo <Landon Curt Noll> /\\../\\\x00",
b"chongo (Landon Curt Noll) /\\../\\",
b"chongo (Landon Curt Noll) /\\../\\\x00",
b"http://antwrp.gsfc.nasa.gov/apod/astropix.html",
b"http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash",
b"http://epod.usra.edu/",
b"http://exoplanet.eu/",
b"http://hvo.wr.usgs.gov/cam3/",
b"http://hvo.wr.usgs.gov/cams/HMcam/",
b"http://hvo.wr.usgs.gov/kilauea/update/deformation.html",
b"http://hvo.wr.usgs.gov/kilauea/update/images.html",
b"http://hvo.wr.usgs.gov/kilauea/update/maps.html",
b"http://hvo.wr.usgs.gov/volcanowatch/current_issue.html",
b"http://neo.jpl.nasa.gov/risk/",
b"http://norvig.com/21-days.html",
b"http://primes.utm.edu/curios/home.php",
b"http://slashdot.org/",
b"http://tux.wr.usgs.gov/Maps/155.25-19.5.html",
b"http://volcano.wr.usgs.gov/kilaueastatus.php",
b"http://www.avo.alaska.edu/activity/Redoubt.php",
b"http://www.dilbert.com/fast/",
b"http://www.fourmilab.ch/gravitation/orbits/",
b"http://www.fpoa.net/",
b"http://www.ioccc.org/index.html",
b"http://www.isthe.com/cgi-bin/number.cgi",
b"http://www.isthe.com/chongo/bio.html",
b"http://www.isthe.com/chongo/index.html",
b"http://www.isthe.com/chongo/src/calc/lucas-calc",
b"http://www.isthe.com/chongo/tech/astro/venus2004.html",
b"http://www.isthe.com/chongo/tech/astro/vita.html",
b"http://www.isthe.com/chongo/tech/comp/c/expert.html",
b"http://www.isthe.com/chongo/tech/comp/calc/index.html",
b"http://www.isthe.com/chongo/tech/comp/fnv/index.html",
b"http://www.isthe.com/chongo/tech/math/number/howhigh.html",
b"http://www.isthe.com/chongo/tech/math/number/number.html",
b"http://www.isthe.com/chongo/tech/math/prime/mersenne.html",
b"http://www.isthe.com/chongo/tech/math/prime/mersenne.html#largest",
b"http://www.lavarnd.org/cgi-bin/corpspeak.cgi",
b"http://www.lavarnd.org/cgi-bin/haiku.cgi",
b"http://www.lavarnd.org/cgi-bin/rand-none.cgi",
b"http://www.lavarnd.org/cgi-bin/randdist.cgi",
b"http://www.lavarnd.org/index.html",
b"http://www.lavarnd.org/what/nist-test.html",
b"http://www.macosxhints.com/",
b"http://www.mellis.com/",
b"http://www.nature.nps.gov/air/webcams/parks/havoso2alert/havoalert.cfm",
b"http://www.nature.nps.gov/air/webcams/parks/havoso2alert/timelines_24.cfm",
b"http://www.paulnoll.com/",
b"http://www.pepysdiary.com/",
b"http://www.sciencenews.org/index/home/activity/view",
b"http://www.skyandtelescope.com/",
b"http://www.sput.nl/~rob/sirius.html",
b"http://www.systemexperts.com/",
b"http://www.tq-international.com/phpBB3/index.php",
b"http://www.travelquesttours.com/index.htm",
b"http://www.wunderground.com/global/stations/89606.html",
b"21701" * 10,
b"M21701" * 10,
b"2^21701-1" * 10,
b"\x54\xc5" * 10,
b"\xc5\x54" * 10,
b"23209" * 10,
b"M23209" * 10,
b"2^23209-1" * 10,
b"\x5a\xa9" * 10,
b"\xa9\x5a" * 10,
b"391581216093" * 10,
b"391581*2^216093-1" * 10,
b"\x05\xf9\x9d\x03\x4c\x81" * 10,
b"FEDCBA9876543210" * 10,
b"\xfe\xdc\xba\x98\x76\x54\x32\x10" * 10,
b"EFCDAB8967452301" * 10,
b"\xef\xcd\xab\x89\x67\x45\x23\x01" * 10,
b"0123456789ABCDEF" * 10,
b"\x01\x23\x45\x67\x89\xab\xcd\xef" * 10,
b"1032547698BADCFE" * 10,
b"\x10\x32\x54\x76\x98\xba\xdc\xfe" * 10,
b"\x00" * 500,
b"\x07" * 500,
b"~" * 500,
b"\x7f" * 500,
]
OUTPUTS = [
EMPTY_HASH_VALUE,
0xAF63DC4C8601EC8C,
0xAF63DF4C8601F1A5,
0xAF63DE4C8601EFF2,
0xAF63D94C8601E773,
0xAF63D84C8601E5C0,
0xAF63DB4C8601EAD9,
0x08985907B541D342,
0xDCB27518FED9D577,
0xDD120E790C2512AF,
0xCAC165AFA2FEF40A,
0x85944171F73967E8,
0xAF63BD4C8601B7DF,
0x089BE207B544F1E4,
0x08A61407B54D9B5F,
0x08A2AE07B54AB836,
0x0891B007B53C4869,
0x088E4A07B5396540,
0x08987C07B5420EBB,
0xDCB28A18FED9F926,
0xDD1270790C25B935,
0xCAC146AFA2FEBF5D,
0x8593D371F738ACFE,
0x34531CA7168B8F38,
0x08A25607B54A22AE,
0xF5FAF0190CF90DF3,
0xF27397910B3221C7,
0x2C8C2B76062F22E0,
0xE150688C8217B8FD,
0xF35A83C10E4F1F87,
0xD1EDD10B507344D0,
0x2A5EE739B3DDB8C3,
0xDCFB970CA1C0D310,
0x4054DA76DAA6DA90,
0xF70A2FF589861368,
0x4C628B38AED25F17,
0x9DD1F6510F78189F,
0xA3DE85BD491270CE,
0x858E2FA32A55E61D,
0x46810940EFF5F915,
0xF5FADD190CF8EDAA,
0xF273ED910B32B3E9,
0x2C8C5276062F6525,
0xE150B98C821842A0,
0xF35AA3C10E4F55E7,
0xD1ED680B50729265,
0x2A5F0639B3DDED70,
0xDCFBAA0CA1C0F359,
0x4054BA76DAA6A430,
0xF709C7F5898562B0,
0x4C62E638AED2F9B8,
0x9DD1A8510F779415,
0xA3DE2ABD4911D62D,
0x858E0EA32A55AE0A,
0x46810F40EFF60347,
0xC33BCE57BEF63EAF,
0x08A24307B54A0265,
0xF5B9FD190CC18D15,
0x4C968290ACE35703,
0x07174BD5C64D9350,
0x5A294C3FF5D18750,
0x05B3C1AEB308B843,
0xB92A48DA37D0F477,
0x73CDDDCCD80EBC49,
0xD58C4C13210A266B,
0xE78B6081243EC194,
0xB096F77096A39F34,
0xB425C54FF807B6A3,
0x23E520E2751BB46E,
0x1A0B44CCFE1385EC,
0xF5BA4B190CC2119F,
0x4C962690ACE2BAAF,
0x0716DED5C64CDA19,
0x5A292C3FF5D150F0,
0x05B3E0AEB308ECF0,
0xB92A5EDA37D119D9,
0x73CE41CCD80F6635,
0xD58C2C132109F00B,
0xE78BAF81243F47D1,
0xB0968F7096A2EE7C,
0xB425A84FF807855C,
0x23E4E9E2751B56F9,
0x1A0B4ECCFE1396EA,
0x54ABD453BB2C9004,
0x08BA5F07B55EC3DA,
0x337354193006CB6E,
0xA430D84680AABD0B,
0xA9BC8ACCA21F39B1,
0x6961196491CC682D,
0xAD2BB1774799DFE9,
0x6961166491CC6314,
0x8D1BB3904A3B1236,
0x6961176491CC64C7,
0xED205D87F40434C7,
0x6961146491CC5FAE,
0xCD3BAF5E44F8AD9C,
0xE3B36596127CD6D8,
0xF77F1072C8E8A646,
0xE3B36396127CD372,
0x6067DCE9932AD458,
0xE3B37596127CF208,
0x4B7B10FA9FE83936,
0xAABAFE7104D914BE,
0xF4D3180B3CDE3EDA,
0xAABAFD7104D9130B,
0xF4CFB20B3CDB5BB1,
0xAABAFC7104D91158,
0xF4CC4C0B3CD87888,
0xE729BAC5D2A8D3A7,
0x74BC0524F4DFA4C5,
0xE72630C5D2A5B352,
0x6B983224EF8FB456,
0xE73042C5D2AE266D,
0x8527E324FDEB4B37,
0x0A83C86FEE952ABC,
0x7318523267779D74,
0x3E66D3D56B8CACA1,
0x956694A5C0095593,
0xCAC54572BB1A6FC8,
0xA7A4C9F3EDEBF0D8,
0x7829851FAC17B143,
0x2C8F4C9AF81BCF06,
0xD34E31539740C732,
0x3605A2AC253D2DB1,
0x08C11B8346F4A3C3,
0x6BE396289CE8A6DA,
0xD9B957FB7FE794C5,
0x05BE33DA04560A93,
0x0957F1577BA9747C,
0xDA2CC3ACC24FBA57,
0x74136F185B29E7F0,
0xB2F2B4590EDB93B2,
0xB3608FCE8B86AE04,
0x4A3A865079359063,
0x5B3A7EF496880A50,
0x48FAE3163854C23B,
0x07AAA640476E0B9A,
0x2F653656383A687D,
0xA1031F8E7599D79C,
0xA31908178FF92477,
0x097EDF3C14C3FB83,
0xB51CA83FEAA0971B,
0xDD3C0D96D784F2E9,
0x86CD26A9EA767D78,
0xE6B215FF54A30C18,
0xEC5B06A1C5531093,
0x45665A929F9EC5E5,
0x8C7609B4A9F10907,
0x89AAC3A491F0D729,
0x32CE6B26E0F4A403,
0x614AB44E02B53E01,
0xFA6472EB6EEF3290,
0x9E5D75EB1948EB6A,
0xB6D12AD4A8671852,
0x88826F56EBA07AF1,
0x44535BF2645BC0FD,
0x169388FFC21E3728,
0xF68AAC9E396D8224,
0x8E87D7E7472B3883,
0x295C26CAA8B423DE,
0x322C814292E72176,
0x8A06550EB8AF7268,
0xEF86D60E661BCF71,
0x9E5426C87F30EE54,
0xF1EA8AA826FD047E,
0x0BABAF9A642CB769,
0x4B3341D4068D012E,
0xD15605CBC30A335C,
0x5B21060AED8412E5,
0x45E2CDA1CE6F4227,
0x50AE3745033AD7D4,
0xAA4588CED46BF414,
0xC1B0056C4A95467E,
0x56576A71DE8B4089,
0xBF20965FA6DC927E,
0x569F8383C2040882,
0xE1E772FBA08FECA0,
0x4CED94AF97138AC4,
0xC4112FFB337A82FB,
0xD64A4FD41DE38B7D,
0x4CFC32329EDEBCBB,
0x0803564445050395,
0xAA1574ECF4642FFD,
0x694BC4E54CC315F9,
0xA3D7CB273B011721,
0x577C2F8B6115BFA5,
0xB7EC8C1A769FB4C1,
0x5D5CFCE63359AB19,
0x33B96C3CD65B5F71,
0xD845097780602BB9,
0x84D47645D02DA3D5,
0x83544F33B58773A5,
0x9175CBB2160836C5,
0xC71B3BC175E72BC5,
0x636806AC222EC985,
0xB6EF0E6950F52ED5,
0xEAD3D8A0F3DFDAA5,
0x922908FE9A861BA5,
0x6D4821DE275FD5C5,
0x1FE3FCE62BD816B5,
0xC23E9FCCD6F70591,
0xC1AF12BDFE16B5B5,
0x39E9F18F2F85E221,
]
assert len(INPUTS) == len(OUTPUTS)
for i in range(len(INPUTS)):
assert _get_fnv1a_hash(INPUTS[i]) == OUTPUTS[i]
def _encode_and_hash(input: str, *, reverse: bool = False) -> int:
encoded_input = input.encode("UTF-8")
if reverse:
encoded_input = encoded_input[::-1]
return _get_fnv1a_hash(encoded_input)
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive):
doc = en_tokenizer("spaCy✨ and Prodigy")
hashes = get_character_combination_hashes(
doc=doc,
case_sensitive=case_sensitive,
p_lengths=bytes(
(
1,
3,
4,
)
),
s_lengths=bytes(
(
2,
3,
4,
5,
)
),
)
assert hashes[0][0] == _encode_and_hash("s")
assert hashes[0][1] == _encode_and_hash("spa")
assert hashes[0][2] == _encode_and_hash("spaC" if case_sensitive else "spac")
assert hashes[0][3] == _encode_and_hash("yC" if case_sensitive else "yc")
assert hashes[0][4] == _encode_and_hash("yCa" if case_sensitive else "yca")
assert hashes[0][5] == _encode_and_hash("yCap" if case_sensitive else "ycap")
assert hashes[0][6] == _encode_and_hash("yCaps" if case_sensitive else "ycaps")
assert hashes[1][0] == _encode_and_hash("")
assert hashes[1][1] == _encode_and_hash("")
assert hashes[1][2] == _encode_and_hash("")
assert hashes[1][3] == _encode_and_hash("", reverse=True)
assert hashes[1][4] == _encode_and_hash("", reverse=True)
assert hashes[1][5] == _encode_and_hash("", reverse=True)
assert hashes[1][6] == _encode_and_hash("", reverse=True)
assert hashes[2][0] == _encode_and_hash("a")
assert hashes[2][1] == _encode_and_hash("and")
assert hashes[2][2] == _encode_and_hash("and")
assert hashes[2][3] == _encode_and_hash("dn")
assert hashes[2][4] == _encode_and_hash("dna")
assert hashes[2][5] == _encode_and_hash("dna")
assert hashes[2][6] == _encode_and_hash("dna")
assert hashes[3][0] == _encode_and_hash("P" if case_sensitive else "p")
assert hashes[3][1] == _encode_and_hash("Pro" if case_sensitive else "pro")
assert hashes[3][2] == _encode_and_hash("Prod" if case_sensitive else "prod")
assert hashes[3][3] == _encode_and_hash("yg")
assert hashes[3][4] == _encode_and_hash("ygi")
assert hashes[3][5] == _encode_and_hash("ygid")
assert hashes[3][6] == _encode_and_hash("ygido")
def test_get_character_combination_hashes_good_case_no_prefixes(en_tokenizer):
doc = en_tokenizer("spaCy✨ and Prodigy")
hashes = get_character_combination_hashes(
doc=doc,
case_sensitive=False,
p_lengths=bytes(),
s_lengths=bytes(
(
2,
3,
4,
5,
)
),
)
assert hashes[0][0] == _encode_and_hash("yc")
assert hashes[0][1] == _encode_and_hash("yca")
assert hashes[0][2] == _encode_and_hash("ycap")
assert hashes[0][3] == _encode_and_hash("ycaps")
assert hashes[1][0] == _encode_and_hash("", reverse=True)
assert hashes[1][1] == _encode_and_hash("", reverse=True)
assert hashes[1][2] == _encode_and_hash("", reverse=True)
assert hashes[1][3] == _encode_and_hash("", reverse=True)
assert hashes[2][0] == _encode_and_hash("dn")
assert hashes[2][1] == _encode_and_hash("dna")
assert hashes[2][2] == _encode_and_hash("dna")
assert hashes[2][3] == _encode_and_hash("dna")
assert hashes[3][0] == _encode_and_hash("yg")
assert hashes[3][1] == _encode_and_hash("ygi")
assert hashes[3][2] == _encode_and_hash("ygid")
assert hashes[3][3] == _encode_and_hash("ygido")
def test_get_character_combination_hashes_loop_through_lengths(en_tokenizer):
doc = en_tokenizer("sp𐌞Cé")
for p_length in range(1, 8):
for s_length in range(1, 8):
hashes = get_character_combination_hashes(
doc=doc,
case_sensitive=False,
p_lengths=bytes((p_length,)),
s_lengths=bytes((s_length,)),
)
assert hashes[0][0] == _encode_and_hash("sp𐌞cé"[:p_length])
assert hashes[0][1] == _encode_and_hash("sp𐌞cé"[-s_length:], reverse=True)
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_get_character_combination_hashes_turkish_i_with_dot(
en_tokenizer, case_sensitive
):
doc = en_tokenizer("İ".lower() + "İ")
hashes = get_character_combination_hashes(
doc=doc,
case_sensitive=case_sensitive,
p_lengths=bytes(
(
1,
2,
3,
4,
)
),
s_lengths=bytes(
(
1,
2,
3,
4,
)
),
)
COMBINING_DOT_ABOVE = b"\xcc\x87".decode("UTF-8")
assert hashes[0][0] == _encode_and_hash("i")
assert hashes[0][1] == _encode_and_hash("İ".lower())
if case_sensitive:
assert hashes[0][2] == _encode_and_hash("İ".lower() + "İ")
assert hashes[0][3] == _encode_and_hash("İ".lower() + "İ")
assert hashes[0][4] == _encode_and_hash("İ", reverse=True)
assert hashes[0][5] == _encode_and_hash(COMBINING_DOT_ABOVE + "İ", reverse=True)
assert hashes[0][6] == _encode_and_hash("İ".lower() + "İ", reverse=True)
assert hashes[0][7] == _encode_and_hash("İ".lower() + "İ", reverse=True)
else:
assert hashes[0][2] == _encode_and_hash("İ".lower() + "i")
assert hashes[0][3] == _encode_and_hash("İ".lower() * 2)
assert hashes[0][4] == _encode_and_hash(COMBINING_DOT_ABOVE, reverse=True)
assert hashes[0][5] == _encode_and_hash("İ".lower(), reverse=True)
assert hashes[0][6] == _encode_and_hash(
COMBINING_DOT_ABOVE + "İ".lower(), reverse=True
)
assert hashes[0][7] == _encode_and_hash("İ".lower() * 2, reverse=True)
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_get_character_combination_hashes_string_store_spec_cases(
en_tokenizer, case_sensitive
):
symbol = "FLAG19"
short_word = "bee"
normal_word = "serendipity"
long_word = "serendipity" * 50
assert len(long_word) > 255
doc = en_tokenizer(" ".join((symbol, short_word, normal_word, long_word)))
assert len(doc) == 4
hashes = get_character_combination_hashes(
doc=doc,
case_sensitive=case_sensitive,
p_lengths=bytes((2,)),
s_lengths=bytes((2,)),
)
assert hashes[0][0] == _encode_and_hash("FL" if case_sensitive else "fl")
assert hashes[0][1] == _encode_and_hash("91")
assert hashes[1][0] == _encode_and_hash("be")
assert hashes[1][1] == _encode_and_hash("ee")
assert hashes[2][0] == hashes[3][0] == _encode_and_hash("se")
assert hashes[2][1] == hashes[3][1] == _encode_and_hash("yt")
def test_character_combination_hashes_empty_lengths(en_tokenizer):
doc = en_tokenizer("and𐌞")
assert get_character_combination_hashes(
doc=doc,
case_sensitive=True,
p_lengths=bytes(),
s_lengths=bytes(),
).shape == (1, 0)

View File

@ -8,7 +8,8 @@ from ..attrs cimport attr_id_t
cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil
cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name) nogil
cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name) nogil
ctypedef const LexemeC* const_Lexeme_ptr
ctypedef const TokenC* const_TokenC_ptr
@ -18,7 +19,6 @@ ctypedef fused LexemeOrToken:
const_TokenC_ptr
cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1
@ -34,31 +34,6 @@ cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
cdef int [:,:] _get_lca_matrix(Doc, int start, int end)
cdef void _set_prefix_lengths(
const unsigned char* tok_str,
const int tok_str_l,
const int p_max_l,
unsigned char* pref_l_buf,
) nogil
cdef void _set_suffix_lengths(
const unsigned char* tok_str,
const int tok_str_l,
const int s_max_l,
unsigned char* suff_l_buf,
) nogil
cdef int _write_hashes(
const unsigned char* res_buf,
const unsigned char* aff_l_buf,
const unsigned char* offset_buf,
const int res_buf_last,
np.uint64_t* hashes_ptr,
) nogil
cdef class Doc:
cdef readonly Pool mem
cdef readonly Vocab vocab

View File

@ -126,7 +126,7 @@ class Doc:
blocked: Optional[List[Span]] = ...,
missing: Optional[List[Span]] = ...,
outside: Optional[List[Span]] = ...,
default: str = ...,
default: str = ...
) -> None: ...
@property
def noun_chunks(self) -> Iterator[Span]: ...
@ -174,12 +174,5 @@ class Doc:
self, doc_json: Dict[str, Any] = ..., validate: bool = False
) -> Doc: ...
def to_utf8_array(self, nr_char: int = ...) -> Ints2d: ...
def get_character_combination_hashes(
self,
*,
case_sensitive: bool,
p_lengths: bytes,
s_lengths: bytes,
) -> Ints2d: ...
@staticmethod
def _get_array_attrs() -> Tuple[Any]: ...

View File

@ -1,9 +1,9 @@
# cython: infer_types=True, bounds_check=False, profile=True
from typing import Set, List
from typing import Set
cimport cython
cimport numpy as np
from libc.string cimport memcpy, memcmp, memset, strlen
from libc.string cimport memcpy
from libc.math cimport sqrt
from libc.stdint cimport int32_t, uint64_t
@ -21,7 +21,6 @@ from .span cimport Span
from .token cimport MISSING_DEP
from ._dict_proxies import SpanGroups
from .token cimport Token
from ..symbols import NAMES as SYMBOLS_BY_INT
from ..lexeme cimport Lexeme, EMPTY_LEXEME
from ..typedefs cimport attr_t, flags_t
from ..attrs cimport attr_id_t
@ -41,7 +40,7 @@ from ._serialize import ALL_ATTRS as DOCBIN_ALL_ATTRS
from ..util import get_words_and_spaces
DEF PADDING = 5
MAX_UTF8_CHAR_BYTE_WIDTH = 4
cdef int bounds_check(int i, int length, int padding) except -1:
if (i + padding) < 0:
@ -1746,77 +1745,6 @@ cdef class Doc:
j += 1
return output
def get_character_combination_hashes(self,
*,
const bint case_sensitive,
const unsigned char* p_lengths,
const unsigned char* s_lengths,
):
"""
Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations
derived from the raw text of each token.
case_sensitive: if *False*, hashes are generated based on the lower-case version of each token.
p_lengths: an array of single-byte values specifying the lengths of prefixes to be hashed in ascending order.
For example, if *p_lengths==[2, 3]*, the prefixes hashed for "spaCy" would be "sp" and "spa".
s_lengths: an array of single-byte values specifying the lengths of suffixes to be hashed in ascending order.
For example, if *s_lengths==[2, 3]* and *cs == True*, the suffixes hashed for "spaCy" would be "yC" and "yCa".
Many of the buffers passed into and used by this method contain single-byte numerical values. This takes advantage of
the fact that we are hashing short affixes and searching for small groups of characters. The calling code is responsible
for ensuring that lengths being passed in cannot exceed 63 and hence, with maximally four-byte
character widths, that individual values within buffers can never exceed the capacity of a single byte (255).
Note that this method performs no data validation itself as it expects the calling code will already have done so, and
that the behaviour of the code may be erratic if the supplied parameters do not conform to expectations.
"""
# Work out lengths
cdef int p_lengths_l = strlen(<char*> p_lengths)
cdef int s_lengths_l = strlen(<char*> s_lengths)
cdef int p_max_l = p_lengths[p_lengths_l - 1] if p_lengths_l > 0 else 0
cdef int s_max_l = s_lengths[s_lengths_l - 1] if s_lengths_l > 0 else 0
# Define / allocate buffers
cdef Pool mem = Pool()
cdef unsigned char* pref_l_buf = <unsigned char*> mem.alloc(p_max_l, sizeof(char))
cdef unsigned char* suff_l_buf = <unsigned char*> mem.alloc(s_max_l, sizeof(char))
cdef int doc_l = self.length
cdef np.ndarray[np.uint64_t, ndim=2] hashes = numpy.empty(
(doc_l, p_lengths_l + s_lengths_l), dtype="uint64")
cdef np.uint64_t* hashes_ptr = <np.uint64_t*> hashes.data
# Define working variables
cdef TokenC tok_c
cdef int tok_i, tok_str_l
cdef attr_t num_tok_attr
cdef bytes tok_str_bytes
cdef const unsigned char* tok_str
for tok_i in range(doc_l):
tok_c = self.c[tok_i]
num_tok_attr = tok_c.lex.orth if case_sensitive else tok_c.lex.lower
if num_tok_attr < len(SYMBOLS_BY_INT): # hardly ever happens
if num_tok_attr == 0:
tok_str_bytes = b""
else:
tok_str_bytes = SYMBOLS_BY_INT[num_tok_attr].encode("UTF-8")
tok_str = tok_str_bytes
tok_str_l = len(tok_str_bytes)
else:
tok_str, tok_str_l = self.vocab.strings.utf8_ptr(num_tok_attr)
if p_max_l > 0:
_set_prefix_lengths(tok_str, tok_str_l, p_max_l, pref_l_buf)
hashes_ptr += _write_hashes(tok_str, p_lengths, pref_l_buf, 0, hashes_ptr)
if s_max_l > 0:
_set_suffix_lengths(tok_str, tok_str_l, s_max_l, suff_l_buf)
hashes_ptr += _write_hashes(tok_str, s_lengths, suff_l_buf, tok_str_l - 1, hashes_ptr)
return hashes
@staticmethod
def _get_array_attrs():
attrs = [LENGTH, SPACY]
@ -1998,113 +1926,6 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
return lca_matrix
cdef void _set_prefix_lengths(
const unsigned char* tok_str,
const int tok_str_l,
const int p_max_l,
unsigned char* pref_l_buf,
) nogil:
""" Populate *pref_l_buf*, which has length *p_max_l*, with the byte lengths of each of the substrings terminated by the first *p_max_l*
characters within *tok_str*. Lengths that are greater than the character length of the whole word are populated with the byte length
of the whole word.
tok_str: a UTF-8 representation of a string.
tok_str_l: the length of *tok_str*.
p_max_l: the number of characters to process at the beginning of the word.
pref_l_buf: a buffer of length *p_max_l* in which to store the lengths. The code calling *get_character_combination_hashes()* is
responsible for ensuring that *p_max_l* cannot exceed 63 and hence, with maximally four-byte character widths, that individual values
within the buffer can never exceed the capacity of a single byte (255).
"""
cdef int tok_str_idx = 1, pref_l_buf_idx = 0
while pref_l_buf_idx < p_max_l:
if (tok_str_idx >= tok_str_l
or
((tok_str[tok_str_idx] & 0xc0) != 0x80) # not a continuation character
):
pref_l_buf[pref_l_buf_idx] = tok_str_idx
pref_l_buf_idx += 1
if tok_str_idx >= tok_str_l:
break
tok_str_idx += 1
if pref_l_buf_idx < p_max_l:
memset(pref_l_buf + pref_l_buf_idx, pref_l_buf[pref_l_buf_idx - 1], p_max_l - pref_l_buf_idx)
cdef void _set_suffix_lengths(
const unsigned char* tok_str,
const int tok_str_l,
const int s_max_l,
unsigned char* suff_l_buf,
) nogil:
""" Populate *suff_l_buf*, which has length *s_max_l*, with the byte lengths of each of the substrings started by the last *s_max_l*
characters within *tok_str*. Lengths that are greater than the character length of the whole word are populated with the byte length
of the whole word.
tok_str: a UTF-8 representation of a string.
tok_str_l: the length of *tok_str*.
s_max_l: the number of characters to process at the end of the word.
suff_l_buf: a buffer of length *s_max_l* in which to store the lengths. The code calling *get_character_combination_hashes()* is
responsible for ensuring that *s_max_l* cannot exceed 63 and hence, with maximally four-byte character widths, that individual values
within the buffer can never exceed the capacity of a single byte (255).
"""
cdef int tok_str_idx = tok_str_l - 1, suff_l_buf_idx = 0
while suff_l_buf_idx < s_max_l:
if (tok_str[tok_str_idx] & 0xc0) != 0x80: # not a continuation character
suff_l_buf[suff_l_buf_idx] = tok_str_l - tok_str_idx
suff_l_buf_idx += 1
tok_str_idx -= 1
if tok_str_idx < 0:
break
if suff_l_buf_idx < s_max_l:
memset(suff_l_buf + suff_l_buf_idx, suff_l_buf[suff_l_buf_idx - 1], s_max_l - suff_l_buf_idx)
cdef uint64_t FNV1A_OFFSET_BASIS = 0xcbf29ce484222325
cdef uint64_t FNV1A_PRIME = 0x00000100000001B3
cdef int _write_hashes(
const unsigned char* res_buf,
const unsigned char* aff_l_buf,
const unsigned char* offset_buf,
const int res_buf_last,
np.uint64_t* hashes_ptr,
) nogil:
""" Write 64-bit FNV1A hashes for a token/rich property group combination.
res_buf: the string from which to generate the hash values.
aff_l_buf: one-byte lengths describing how many characters to hash.
offset_buf: one-byte lengths specifying the byte offset of each character within *res_buf*.
res_buf_last: if affixes should start at the end of *res_buf*, the offset of the last byte in
*res_buf*; if affixes should start at the beginning of *res_buf*, *0*.
hashes_ptr: a pointer starting from which the new hashes should be written.
Returns: the number of hashes written.
"""
cdef int last_offset = 0, hash_idx = 0, offset, aff_l
cdef uint64_t hash_val = FNV1A_OFFSET_BASIS
while True:
aff_l = aff_l_buf[hash_idx]
if aff_l == 0:
return hash_idx
offset = offset_buf[aff_l - 1]
while last_offset < offset:
if res_buf_last > 0:
hash_val ^= res_buf[res_buf_last - last_offset]
else:
hash_val ^= res_buf[last_offset]
hash_val *= FNV1A_PRIME
last_offset += 1
hashes_ptr[hash_idx] = hash_val
hash_idx += 1
def pickle_doc(doc):
bytes_data = doc.to_bytes(exclude=["vocab", "user_data", "user_hooks"])
hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks,