Switch to 64-bit hashes

This commit is contained in:
richardpaulhudson 2022-11-04 10:17:25 +01:00
parent 7f1873ad81
commit f0dc60691a
3 changed files with 235 additions and 227 deletions

View File

@ -998,18 +998,18 @@ def test_doc_spans_setdefault(en_tokenizer):
assert len(doc.spans["key3"]) == 2 assert len(doc.spans["key3"]) == 2
EMPTY_HASH_VALUE = 0x811C9DC5 EMPTY_HASH_VALUE = 0xCBF29CE484222325
def test_fnv1a_hash(): def test_fnv1a_hash():
"""Checks the conformity of the FNV1A implementation with """Checks the conformity of the 64-bit FNV1A implementation with
http://www.isthe.com/chongo/src/fnv/test_fnv.c. http://www.isthe.com/chongo/src/fnv/test_fnv.c.
The method called here is only used in testing; in production The method called here is only used in testing; in production
code, the hashing is performed in a fashion that is interweaved code, the hashing is performed in a fashion that is interweaved
with other logic. The conformity of the production code is with other logic. The conformity of the production code is
demonstrated by the character combination hash tests, where demonstrated by the character combination hash tests, where
hashes produced by the production code are tested for equality hashes produced by the production code are tested for equality
against hashes prodduced by the test code. against hashes produced by the test code.
s""" s"""
INPUTS = [ INPUTS = [
b"", b"",
@ -1219,208 +1219,208 @@ def test_fnv1a_hash():
OUTPUTS = [ OUTPUTS = [
EMPTY_HASH_VALUE, EMPTY_HASH_VALUE,
0xE40C292C, 0xAF63DC4C8601EC8C,
0xE70C2DE5, 0xAF63DF4C8601F1A5,
0xE60C2C52, 0xAF63DE4C8601EFF2,
0xE10C2473, 0xAF63D94C8601E773,
0xE00C22E0, 0xAF63D84C8601E5C0,
0xE30C2799, 0xAF63DB4C8601EAD9,
0x6222E842, 0x08985907B541D342,
0xA9F37ED7, 0xDCB27518FED9D577,
0x3F5076EF, 0xDD120E790C2512AF,
0x39AAA18A, 0xCAC165AFA2FEF40A,
0xBF9CF968, 0x85944171F73967E8,
0x050C5D1F, 0xAF63BD4C8601B7DF,
0x2B24D044, 0x089BE207B544F1E4,
0x9D2C3F7F, 0x08A61407B54D9B5F,
0x7729C516, 0x08A2AE07B54AB836,
0xB91D6109, 0x0891B007B53C4869,
0x931AE6A0, 0x088E4A07B5396540,
0x052255DB, 0x08987C07B5420EBB,
0xBEF39FE6, 0xDCB28A18FED9F926,
0x6150AC75, 0xDD1270790C25B935,
0x9AAB3A3D, 0xCAC146AFA2FEBF5D,
0x519C4C3E, 0x8593D371F738ACFE,
0x0C1C9EB8, 0x34531CA7168B8F38,
0x5F299F4E, 0x08A25607B54A22AE,
0xEF8580F3, 0xF5FAF0190CF90DF3,
0xAC297727, 0xF27397910B3221C7,
0x4546B9C0, 0x2C8C2B76062F22E0,
0xBD564E7D, 0xE150688C8217B8FD,
0x6BDD5C67, 0xF35A83C10E4F1F87,
0xDD77ED30, 0xD1EDD10B507344D0,
0xF4CA9683, 0x2A5EE739B3DDB8C3,
0x4AEB9BD0, 0xDCFB970CA1C0D310,
0xE0E67AD0, 0x4054DA76DAA6DA90,
0xC2D32FA8, 0xF70A2FF589861368,
0x7F743FB7, 0x4C628B38AED25F17,
0x6900631F, 0x9DD1F6510F78189F,
0xC59C990E, 0xA3DE85BD491270CE,
0x448524FD, 0x858E2FA32A55E61D,
0xD49930D5, 0x46810940EFF5F915,
0x1C85C7CA, 0xF5FADD190CF8EDAA,
0x0229FE89, 0xF273ED910B32B3E9,
0x2C469265, 0x2C8C5276062F6525,
0xCE566940, 0xE150B98C821842A0,
0x8BDD8EC7, 0xF35AA3C10E4F55E7,
0x34787625, 0xD1ED680B50729265,
0xD3CA6290, 0x2A5F0639B3DDED70,
0xDDEAF039, 0xDCFBAA0CA1C0F359,
0xC0E64870, 0x4054BA76DAA6A430,
0xDAD35570, 0xF709C7F5898562B0,
0x5A740578, 0x4C62E638AED2F9B8,
0x5B004D15, 0x9DD1A8510F779415,
0x6A9C09CD, 0xA3DE2ABD4911D62D,
0x2384F10A, 0x858E0EA32A55AE0A,
0xDA993A47, 0x46810F40EFF60347,
0x8227DF4F, 0xC33BCE57BEF63EAF,
0x4C298165, 0x08A24307B54A0265,
0xFC563735, 0xF5B9FD190CC18D15,
0x8CB91483, 0x4C968290ACE35703,
0x775BF5D0, 0x07174BD5C64D9350,
0xD5C428D0, 0x5A294C3FF5D18750,
0x34CC0EA3, 0x05B3C1AEB308B843,
0xEA3B4CB7, 0xB92A48DA37D0F477,
0x8E59F029, 0x73CDDDCCD80EBC49,
0x2094DE2B, 0xD58C4C13210A266B,
0xA65A0AD4, 0xE78B6081243EC194,
0x9BBEE5F4, 0xB096F77096A39F34,
0xBE836343, 0xB425C54FF807B6A3,
0x22D5344E, 0x23E520E2751BB46E,
0x19A1470C, 0x1A0B44CCFE1385EC,
0x4A56B1FF, 0xF5BA4B190CC2119F,
0x70B8E86F, 0x4C962690ACE2BAAF,
0x0A5B4A39, 0x0716DED5C64CDA19,
0xB5C3F670, 0x5A292C3FF5D150F0,
0x53CC3F70, 0x05B3E0AEB308ECF0,
0xC03B0A99, 0xB92A5EDA37D119D9,
0x7259C415, 0x73CE41CCD80F6635,
0x4095108B, 0xD58C2C132109F00B,
0x7559BDB1, 0xE78BAF81243F47D1,
0xB3BF0BBC, 0xB0968F7096A2EE7C,
0x2183FF1C, 0xB425A84FF807855C,
0x2BD54279, 0x23E4E9E2751B56F9,
0x23A156CA, 0x1A0B4ECCFE1396EA,
0x64E2D7E4, 0x54ABD453BB2C9004,
0x683AF69A, 0x08BA5F07B55EC3DA,
0xAED2346E, 0x337354193006CB6E,
0x4F9F2CAB, 0xA430D84680AABD0B,
0x02935131, 0xA9BC8ACCA21F39B1,
0xC48FB86D, 0x6961196491CC682D,
0x2269F369, 0xAD2BB1774799DFE9,
0xC18FB3B4, 0x6961166491CC6314,
0x50EF1236, 0x8D1BB3904A3B1236,
0xC28FB547, 0x6961176491CC64C7,
0x96C3BF47, 0xED205D87F40434C7,
0xBF8FB08E, 0x6961146491CC5FAE,
0xF3E4D49C, 0xCD3BAF5E44F8AD9C,
0x32179058, 0xE3B36596127CD6D8,
0x280BFEE6, 0xF77F1072C8E8A646,
0x30178D32, 0xE3B36396127CD372,
0x21ADDAF8, 0x6067DCE9932AD458,
0x4217A988, 0xE3B37596127CF208,
0x772633D6, 0x4B7B10FA9FE83936,
0x08A3D11E, 0xAABAFE7104D914BE,
0xB7E2323A, 0xF4D3180B3CDE3EDA,
0x07A3CF8B, 0xAABAFD7104D9130B,
0x91DFB7D1, 0xF4CFB20B3CDB5BB1,
0x06A3CDF8, 0xAABAFC7104D91158,
0x6BDD3D68, 0xF4CC4C0B3CD87888,
0x1D5636A7, 0xE729BAC5D2A8D3A7,
0xD5B808E5, 0x74BC0524F4DFA4C5,
0x1353E852, 0xE72630C5D2A5B352,
0xBF16B916, 0x6B983224EF8FB456,
0xA55B89ED, 0xE73042C5D2AE266D,
0x3C1A2017, 0x8527E324FDEB4B37,
0x0588B13C, 0x0A83C86FEE952ABC,
0xF22F0174, 0x7318523267779D74,
0xE83641E1, 0x3E66D3D56B8CACA1,
0x6E69B533, 0x956694A5C0095593,
0xF1760448, 0xCAC54572BB1A6FC8,
0x64C8BD58, 0xA7A4C9F3EDEBF0D8,
0x97B4EA23, 0x7829851FAC17B143,
0x9A4E92E6, 0x2C8F4C9AF81BCF06,
0xCFB14012, 0xD34E31539740C732,
0xF01B2511, 0x3605A2AC253D2DB1,
0x0BBB59C3, 0x08C11B8346F4A3C3,
0xCE524AFA, 0x6BE396289CE8A6DA,
0xDD16EF45, 0xD9B957FB7FE794C5,
0x60648BB3, 0x05BE33DA04560A93,
0x7FA4BCFC, 0x0957F1577BA9747C,
0x5053AE17, 0xDA2CC3ACC24FBA57,
0xC9302890, 0x74136F185B29E7F0,
0x956DED32, 0xB2F2B4590EDB93B2,
0x9136DB84, 0xB3608FCE8B86AE04,
0xDF9D3323, 0x4A3A865079359063,
0x32BB6CD0, 0x5B3A7EF496880A50,
0xC8F8385B, 0x48FAE3163854C23B,
0xEB08BFBA, 0x07AAA640476E0B9A,
0x62CC8E3D, 0x2F653656383A687D,
0xC3E20F5C, 0xA1031F8E7599D79C,
0x39E97F17, 0xA31908178FF92477,
0x7837B203, 0x097EDF3C14C3FB83,
0x319E877B, 0xB51CA83FEAA0971B,
0xD3E63F89, 0xDD3C0D96D784F2E9,
0x29B50B38, 0x86CD26A9EA767D78,
0x5ED678B8, 0xE6B215FF54A30C18,
0xB0D5B793, 0xEC5B06A1C5531093,
0x52450BE5, 0x45665A929F9EC5E5,
0xFA72D767, 0x8C7609B4A9F10907,
0x95066709, 0x89AAC3A491F0D729,
0x7F52E123, 0x32CE6B26E0F4A403,
0x76966481, 0x614AB44E02B53E01,
0x063258B0, 0xFA6472EB6EEF3290,
0x2DED6E8A, 0x9E5D75EB1948EB6A,
0xB07D7C52, 0xB6D12AD4A8671852,
0xD0C71B71, 0x88826F56EBA07AF1,
0xF684F1BD, 0x44535BF2645BC0FD,
0x868ECFA8, 0x169388FFC21E3728,
0xF794F684, 0xF68AAC9E396D8224,
0xD19701C3, 0x8E87D7E7472B3883,
0x346E171E, 0x295C26CAA8B423DE,
0x91F8F676, 0x322C814292E72176,
0x0BF58848, 0x8A06550EB8AF7268,
0x6317B6D1, 0xEF86D60E661BCF71,
0xAFAD4C54, 0x9E5426C87F30EE54,
0x0F25681E, 0xF1EA8AA826FD047E,
0x91B18D49, 0x0BABAF9A642CB769,
0x7D61C12E, 0x4B3341D4068D012E,
0x5147D25C, 0xD15605CBC30A335C,
0x9A8B6805, 0x5B21060AED8412E5,
0x4CD2A447, 0x45E2CDA1CE6F4227,
0x1E549B14, 0x50AE3745033AD7D4,
0x2FE1B574, 0xAA4588CED46BF414,
0xCF0CD31E, 0xC1B0056C4A95467E,
0x6C471669, 0x56576A71DE8B4089,
0x0E5EEF1E, 0xBF20965FA6DC927E,
0x2BED3602, 0x569F8383C2040882,
0xB26249E0, 0xE1E772FBA08FECA0,
0x2C9B86A4, 0x4CED94AF97138AC4,
0xE415E2BB, 0xC4112FFB337A82FB,
0x18A98D1D, 0xD64A4FD41DE38B7D,
0xB7DF8B7B, 0x4CFC32329EDEBCBB,
0x241E9075, 0x0803564445050395,
0x063F70DD, 0xAA1574ECF4642FFD,
0x0295AED9, 0x694BC4E54CC315F9,
0x56A7F781, 0xA3D7CB273B011721,
0x253BC645, 0x577C2F8B6115BFA5,
0x46610921, 0xB7EC8C1A769FB4C1,
0x7C1577F9, 0x5D5CFCE63359AB19,
0x512B2851, 0x33B96C3CD65B5F71,
0x76823999, 0xD845097780602BB9,
0xC0586935, 0x84D47645D02DA3D5,
0xF3415C85, 0x83544F33B58773A5,
0x0AE4FF65, 0x9175CBB2160836C5,
0x58B79725, 0xC71B3BC175E72BC5,
0xDEA43AA5, 0x636806AC222EC985,
0x2BB3BE35, 0xB6EF0E6950F52ED5,
0xEA777A45, 0xEAD3D8A0F3DFDAA5,
0x8F21C305, 0x922908FE9A861BA5,
0x5C9D0865, 0x6D4821DE275FD5C5,
0xFA823DD5, 0x1FE3FCE62BD816B5,
0x21A27271, 0xC23E9FCCD6F70591,
0x83C5C6D5, 0xC1AF12BDFE16B5B5,
0x813B0881, 0x39E9F18F2F85E221,
] ]
assert len(INPUTS) == len(OUTPUTS) assert len(INPUTS) == len(OUTPUTS)
@ -1713,17 +1713,14 @@ def test_get_character_combination_hashes_string_store_spec_cases(
def test_character_combination_hashes_empty_lengths(en_tokenizer): def test_character_combination_hashes_empty_lengths(en_tokenizer):
doc = en_tokenizer("and𐌞") doc = en_tokenizer("and𐌞")
assert ( assert doc.get_character_combination_hashes(
doc.get_character_combination_hashes( cs=True,
cs=True, p_lengths=bytes(),
p_lengths=bytes(), s_lengths=bytes(),
s_lengths=bytes(), ps_search_chars=bytes(),
ps_search_chars=bytes(), ps_width_offsets=bytes(),
ps_width_offsets=bytes(), ps_lengths=bytes(),
ps_lengths=bytes(), ss_search_chars=bytes(),
ss_search_chars=bytes(), ss_width_offsets=bytes(),
ss_width_offsets=bytes(), ss_lengths=bytes(),
ss_lengths=bytes(), ).shape == (1, 0)
).shape
== (1, 0)
)

View File

@ -6,7 +6,7 @@ cimport numpy as np
from cpython cimport array from cpython cimport array
from libc.string cimport memcpy, memcmp, memset, strlen from libc.string cimport memcpy, memcmp, memset, strlen
from libc.math cimport sqrt from libc.math cimport sqrt
from libc.stdint cimport int32_t, uint64_t, uint32_t from libc.stdint cimport int32_t, uint64_t
import copy import copy
from collections import Counter, defaultdict from collections import Counter, defaultdict
@ -1779,8 +1779,9 @@ cdef class Doc:
hashed for "spaCy" would be "c" and "ca". hashed for "spaCy" would be "c" and "ca".
Many of the buffers passed into and used by this method contain single-byte numerical values. This takes advantage of Many of the buffers passed into and used by this method contain single-byte numerical values. This takes advantage of
the fact that we are hashing short affixes and searching for small groups of characters; the calling code is responsible the fact that we are hashing short affixes and searching for small groups of characters. The calling code is responsible
lengths being passed in cannot exceed 63 and that *_search_chars buffers are never longer than 255. for ensuring that lengths being passed in cannot exceed 63 and hence that resulting values with a maximum of four-byte
character widths can never exceed 255.
""" """
# Work out lengths # Work out lengths
@ -2155,14 +2156,17 @@ cdef void _search_for_chars(
memset(l_buf + l_buf_idx, res_buf_idx, max_res_l - l_buf_idx) memset(l_buf + l_buf_idx, res_buf_idx, max_res_l - l_buf_idx)
cdef uint64_t FNV1A_OFFSET_BASIS = 0xcbf29ce484222325
cdef uint64_t FNV1A_PRIME = 0x00000100000001B3
def get_fnv1a_hash(input: bytes): def get_fnv1a_hash(input: bytes):
""" Python-callable method to facilitate testing. """ """ Python-callable method to facilitate testing. """
cdef uint32_t hash_val = 0x811c9dc5 cdef uint64_t hash_val = FNV1A_OFFSET_BASIS
cdef int length = len(input), offset = 0 cdef int length = len(input), offset = 0
while offset < length: while offset < length:
hash_val ^= input[offset] hash_val ^= input[offset]
hash_val *= 0x01000193 hash_val *= FNV1A_PRIME
offset += 1 offset += 1
return hash_val return hash_val
@ -2175,7 +2179,7 @@ cdef int _write_hashes(
const int res_buf_last, const int res_buf_last,
np.uint64_t* hashes_ptr, np.uint64_t* hashes_ptr,
) nogil: ) nogil:
""" Write FNV1A hashes for a token/rich property group combination. """ Write 64-bit FNV1A hashes for a token/rich property group combination.
res_buf: the string from which to generate the hash values. res_buf: the string from which to generate the hash values.
aff_l_buf: one-byte lengths describing how many characters to hash. aff_l_buf: one-byte lengths describing how many characters to hash.
@ -2188,7 +2192,7 @@ cdef int _write_hashes(
""" """
cdef int last_offset = 0, hash_idx = 0, offset, aff_l cdef int last_offset = 0, hash_idx = 0, offset, aff_l
cdef uint32_t hash_val = 0x811c9dc5 cdef uint64_t hash_val = FNV1A_OFFSET_BASIS
while True: while True:
aff_l = aff_l_buf[hash_idx] aff_l = aff_l_buf[hash_idx]
@ -2200,7 +2204,7 @@ cdef int _write_hashes(
hash_val ^= res_buf[res_buf_last - last_offset] hash_val ^= res_buf[res_buf_last - last_offset]
else: else:
hash_val ^= res_buf[last_offset] hash_val ^= res_buf[last_offset]
hash_val *= 0x01000193 hash_val *= FNV1A_PRIME
last_offset += 1 last_offset += 1
hashes_ptr[hash_idx] = hash_val hashes_ptr[hash_idx] = hash_val
hash_idx += 1 hash_idx += 1

View File

@ -219,6 +219,13 @@ alternate prevents the alternation from occurring, e.g. an `ä` in a German
plural noun does not become `a` if it is the third or fourth vowel from the end plural noun does not become `a` if it is the third or fourth vowel from the end
of the word. of the word.
There are a few rare situations where a graphical character is expressed as more
than one Unicode code point, e.g. the lower-case form of the Turkish letter _İ_,
which is _i_ followed by a combining dot above. Such situations are supported, but
the lengths of prefixes, suffixes and character search results may need to be
increased accordingly.
All lengths must be specified in ascending order.
| Name | Description | | Name | Description |
| ------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `width` | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. If static vectors are included, a learned linear layer is used to map the vectors to the specified width before concatenating it with the other embedding outputs. A single maxout layer is then used to reduce the concatenated vectors to the final width. ~~int~~ | | `width` | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. If static vectors are included, a learned linear layer is used to map the vectors to the specified width before concatenating it with the other embedding outputs. A single maxout layer is then used to reduce the concatenated vectors to the final width. ~~int~~ |