mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-02 19:30:19 +03:00
Switch to 64-bit hashes
This commit is contained in:
parent
7f1873ad81
commit
f0dc60691a
|
@ -998,18 +998,18 @@ def test_doc_spans_setdefault(en_tokenizer):
|
|||
assert len(doc.spans["key3"]) == 2
|
||||
|
||||
|
||||
EMPTY_HASH_VALUE = 0x811C9DC5
|
||||
EMPTY_HASH_VALUE = 0xCBF29CE484222325
|
||||
|
||||
|
||||
def test_fnv1a_hash():
|
||||
"""Checks the conformity of the FNV1A implementation with
|
||||
"""Checks the conformity of the 64-bit FNV1A implementation with
|
||||
http://www.isthe.com/chongo/src/fnv/test_fnv.c.
|
||||
The method called here is only used in testing; in production
|
||||
code, the hashing is performed in a fashion that is interweaved
|
||||
with other logic. The conformity of the production code is
|
||||
demonstrated by the character combination hash tests, where
|
||||
hashes produced by the production code are tested for equality
|
||||
against hashes prodduced by the test code.
|
||||
against hashes produced by the test code.
|
||||
s"""
|
||||
INPUTS = [
|
||||
b"",
|
||||
|
@ -1219,208 +1219,208 @@ def test_fnv1a_hash():
|
|||
|
||||
OUTPUTS = [
|
||||
EMPTY_HASH_VALUE,
|
||||
0xE40C292C,
|
||||
0xE70C2DE5,
|
||||
0xE60C2C52,
|
||||
0xE10C2473,
|
||||
0xE00C22E0,
|
||||
0xE30C2799,
|
||||
0x6222E842,
|
||||
0xA9F37ED7,
|
||||
0x3F5076EF,
|
||||
0x39AAA18A,
|
||||
0xBF9CF968,
|
||||
0x050C5D1F,
|
||||
0x2B24D044,
|
||||
0x9D2C3F7F,
|
||||
0x7729C516,
|
||||
0xB91D6109,
|
||||
0x931AE6A0,
|
||||
0x052255DB,
|
||||
0xBEF39FE6,
|
||||
0x6150AC75,
|
||||
0x9AAB3A3D,
|
||||
0x519C4C3E,
|
||||
0x0C1C9EB8,
|
||||
0x5F299F4E,
|
||||
0xEF8580F3,
|
||||
0xAC297727,
|
||||
0x4546B9C0,
|
||||
0xBD564E7D,
|
||||
0x6BDD5C67,
|
||||
0xDD77ED30,
|
||||
0xF4CA9683,
|
||||
0x4AEB9BD0,
|
||||
0xE0E67AD0,
|
||||
0xC2D32FA8,
|
||||
0x7F743FB7,
|
||||
0x6900631F,
|
||||
0xC59C990E,
|
||||
0x448524FD,
|
||||
0xD49930D5,
|
||||
0x1C85C7CA,
|
||||
0x0229FE89,
|
||||
0x2C469265,
|
||||
0xCE566940,
|
||||
0x8BDD8EC7,
|
||||
0x34787625,
|
||||
0xD3CA6290,
|
||||
0xDDEAF039,
|
||||
0xC0E64870,
|
||||
0xDAD35570,
|
||||
0x5A740578,
|
||||
0x5B004D15,
|
||||
0x6A9C09CD,
|
||||
0x2384F10A,
|
||||
0xDA993A47,
|
||||
0x8227DF4F,
|
||||
0x4C298165,
|
||||
0xFC563735,
|
||||
0x8CB91483,
|
||||
0x775BF5D0,
|
||||
0xD5C428D0,
|
||||
0x34CC0EA3,
|
||||
0xEA3B4CB7,
|
||||
0x8E59F029,
|
||||
0x2094DE2B,
|
||||
0xA65A0AD4,
|
||||
0x9BBEE5F4,
|
||||
0xBE836343,
|
||||
0x22D5344E,
|
||||
0x19A1470C,
|
||||
0x4A56B1FF,
|
||||
0x70B8E86F,
|
||||
0x0A5B4A39,
|
||||
0xB5C3F670,
|
||||
0x53CC3F70,
|
||||
0xC03B0A99,
|
||||
0x7259C415,
|
||||
0x4095108B,
|
||||
0x7559BDB1,
|
||||
0xB3BF0BBC,
|
||||
0x2183FF1C,
|
||||
0x2BD54279,
|
||||
0x23A156CA,
|
||||
0x64E2D7E4,
|
||||
0x683AF69A,
|
||||
0xAED2346E,
|
||||
0x4F9F2CAB,
|
||||
0x02935131,
|
||||
0xC48FB86D,
|
||||
0x2269F369,
|
||||
0xC18FB3B4,
|
||||
0x50EF1236,
|
||||
0xC28FB547,
|
||||
0x96C3BF47,
|
||||
0xBF8FB08E,
|
||||
0xF3E4D49C,
|
||||
0x32179058,
|
||||
0x280BFEE6,
|
||||
0x30178D32,
|
||||
0x21ADDAF8,
|
||||
0x4217A988,
|
||||
0x772633D6,
|
||||
0x08A3D11E,
|
||||
0xB7E2323A,
|
||||
0x07A3CF8B,
|
||||
0x91DFB7D1,
|
||||
0x06A3CDF8,
|
||||
0x6BDD3D68,
|
||||
0x1D5636A7,
|
||||
0xD5B808E5,
|
||||
0x1353E852,
|
||||
0xBF16B916,
|
||||
0xA55B89ED,
|
||||
0x3C1A2017,
|
||||
0x0588B13C,
|
||||
0xF22F0174,
|
||||
0xE83641E1,
|
||||
0x6E69B533,
|
||||
0xF1760448,
|
||||
0x64C8BD58,
|
||||
0x97B4EA23,
|
||||
0x9A4E92E6,
|
||||
0xCFB14012,
|
||||
0xF01B2511,
|
||||
0x0BBB59C3,
|
||||
0xCE524AFA,
|
||||
0xDD16EF45,
|
||||
0x60648BB3,
|
||||
0x7FA4BCFC,
|
||||
0x5053AE17,
|
||||
0xC9302890,
|
||||
0x956DED32,
|
||||
0x9136DB84,
|
||||
0xDF9D3323,
|
||||
0x32BB6CD0,
|
||||
0xC8F8385B,
|
||||
0xEB08BFBA,
|
||||
0x62CC8E3D,
|
||||
0xC3E20F5C,
|
||||
0x39E97F17,
|
||||
0x7837B203,
|
||||
0x319E877B,
|
||||
0xD3E63F89,
|
||||
0x29B50B38,
|
||||
0x5ED678B8,
|
||||
0xB0D5B793,
|
||||
0x52450BE5,
|
||||
0xFA72D767,
|
||||
0x95066709,
|
||||
0x7F52E123,
|
||||
0x76966481,
|
||||
0x063258B0,
|
||||
0x2DED6E8A,
|
||||
0xB07D7C52,
|
||||
0xD0C71B71,
|
||||
0xF684F1BD,
|
||||
0x868ECFA8,
|
||||
0xF794F684,
|
||||
0xD19701C3,
|
||||
0x346E171E,
|
||||
0x91F8F676,
|
||||
0x0BF58848,
|
||||
0x6317B6D1,
|
||||
0xAFAD4C54,
|
||||
0x0F25681E,
|
||||
0x91B18D49,
|
||||
0x7D61C12E,
|
||||
0x5147D25C,
|
||||
0x9A8B6805,
|
||||
0x4CD2A447,
|
||||
0x1E549B14,
|
||||
0x2FE1B574,
|
||||
0xCF0CD31E,
|
||||
0x6C471669,
|
||||
0x0E5EEF1E,
|
||||
0x2BED3602,
|
||||
0xB26249E0,
|
||||
0x2C9B86A4,
|
||||
0xE415E2BB,
|
||||
0x18A98D1D,
|
||||
0xB7DF8B7B,
|
||||
0x241E9075,
|
||||
0x063F70DD,
|
||||
0x0295AED9,
|
||||
0x56A7F781,
|
||||
0x253BC645,
|
||||
0x46610921,
|
||||
0x7C1577F9,
|
||||
0x512B2851,
|
||||
0x76823999,
|
||||
0xC0586935,
|
||||
0xF3415C85,
|
||||
0x0AE4FF65,
|
||||
0x58B79725,
|
||||
0xDEA43AA5,
|
||||
0x2BB3BE35,
|
||||
0xEA777A45,
|
||||
0x8F21C305,
|
||||
0x5C9D0865,
|
||||
0xFA823DD5,
|
||||
0x21A27271,
|
||||
0x83C5C6D5,
|
||||
0x813B0881,
|
||||
0xAF63DC4C8601EC8C,
|
||||
0xAF63DF4C8601F1A5,
|
||||
0xAF63DE4C8601EFF2,
|
||||
0xAF63D94C8601E773,
|
||||
0xAF63D84C8601E5C0,
|
||||
0xAF63DB4C8601EAD9,
|
||||
0x08985907B541D342,
|
||||
0xDCB27518FED9D577,
|
||||
0xDD120E790C2512AF,
|
||||
0xCAC165AFA2FEF40A,
|
||||
0x85944171F73967E8,
|
||||
0xAF63BD4C8601B7DF,
|
||||
0x089BE207B544F1E4,
|
||||
0x08A61407B54D9B5F,
|
||||
0x08A2AE07B54AB836,
|
||||
0x0891B007B53C4869,
|
||||
0x088E4A07B5396540,
|
||||
0x08987C07B5420EBB,
|
||||
0xDCB28A18FED9F926,
|
||||
0xDD1270790C25B935,
|
||||
0xCAC146AFA2FEBF5D,
|
||||
0x8593D371F738ACFE,
|
||||
0x34531CA7168B8F38,
|
||||
0x08A25607B54A22AE,
|
||||
0xF5FAF0190CF90DF3,
|
||||
0xF27397910B3221C7,
|
||||
0x2C8C2B76062F22E0,
|
||||
0xE150688C8217B8FD,
|
||||
0xF35A83C10E4F1F87,
|
||||
0xD1EDD10B507344D0,
|
||||
0x2A5EE739B3DDB8C3,
|
||||
0xDCFB970CA1C0D310,
|
||||
0x4054DA76DAA6DA90,
|
||||
0xF70A2FF589861368,
|
||||
0x4C628B38AED25F17,
|
||||
0x9DD1F6510F78189F,
|
||||
0xA3DE85BD491270CE,
|
||||
0x858E2FA32A55E61D,
|
||||
0x46810940EFF5F915,
|
||||
0xF5FADD190CF8EDAA,
|
||||
0xF273ED910B32B3E9,
|
||||
0x2C8C5276062F6525,
|
||||
0xE150B98C821842A0,
|
||||
0xF35AA3C10E4F55E7,
|
||||
0xD1ED680B50729265,
|
||||
0x2A5F0639B3DDED70,
|
||||
0xDCFBAA0CA1C0F359,
|
||||
0x4054BA76DAA6A430,
|
||||
0xF709C7F5898562B0,
|
||||
0x4C62E638AED2F9B8,
|
||||
0x9DD1A8510F779415,
|
||||
0xA3DE2ABD4911D62D,
|
||||
0x858E0EA32A55AE0A,
|
||||
0x46810F40EFF60347,
|
||||
0xC33BCE57BEF63EAF,
|
||||
0x08A24307B54A0265,
|
||||
0xF5B9FD190CC18D15,
|
||||
0x4C968290ACE35703,
|
||||
0x07174BD5C64D9350,
|
||||
0x5A294C3FF5D18750,
|
||||
0x05B3C1AEB308B843,
|
||||
0xB92A48DA37D0F477,
|
||||
0x73CDDDCCD80EBC49,
|
||||
0xD58C4C13210A266B,
|
||||
0xE78B6081243EC194,
|
||||
0xB096F77096A39F34,
|
||||
0xB425C54FF807B6A3,
|
||||
0x23E520E2751BB46E,
|
||||
0x1A0B44CCFE1385EC,
|
||||
0xF5BA4B190CC2119F,
|
||||
0x4C962690ACE2BAAF,
|
||||
0x0716DED5C64CDA19,
|
||||
0x5A292C3FF5D150F0,
|
||||
0x05B3E0AEB308ECF0,
|
||||
0xB92A5EDA37D119D9,
|
||||
0x73CE41CCD80F6635,
|
||||
0xD58C2C132109F00B,
|
||||
0xE78BAF81243F47D1,
|
||||
0xB0968F7096A2EE7C,
|
||||
0xB425A84FF807855C,
|
||||
0x23E4E9E2751B56F9,
|
||||
0x1A0B4ECCFE1396EA,
|
||||
0x54ABD453BB2C9004,
|
||||
0x08BA5F07B55EC3DA,
|
||||
0x337354193006CB6E,
|
||||
0xA430D84680AABD0B,
|
||||
0xA9BC8ACCA21F39B1,
|
||||
0x6961196491CC682D,
|
||||
0xAD2BB1774799DFE9,
|
||||
0x6961166491CC6314,
|
||||
0x8D1BB3904A3B1236,
|
||||
0x6961176491CC64C7,
|
||||
0xED205D87F40434C7,
|
||||
0x6961146491CC5FAE,
|
||||
0xCD3BAF5E44F8AD9C,
|
||||
0xE3B36596127CD6D8,
|
||||
0xF77F1072C8E8A646,
|
||||
0xE3B36396127CD372,
|
||||
0x6067DCE9932AD458,
|
||||
0xE3B37596127CF208,
|
||||
0x4B7B10FA9FE83936,
|
||||
0xAABAFE7104D914BE,
|
||||
0xF4D3180B3CDE3EDA,
|
||||
0xAABAFD7104D9130B,
|
||||
0xF4CFB20B3CDB5BB1,
|
||||
0xAABAFC7104D91158,
|
||||
0xF4CC4C0B3CD87888,
|
||||
0xE729BAC5D2A8D3A7,
|
||||
0x74BC0524F4DFA4C5,
|
||||
0xE72630C5D2A5B352,
|
||||
0x6B983224EF8FB456,
|
||||
0xE73042C5D2AE266D,
|
||||
0x8527E324FDEB4B37,
|
||||
0x0A83C86FEE952ABC,
|
||||
0x7318523267779D74,
|
||||
0x3E66D3D56B8CACA1,
|
||||
0x956694A5C0095593,
|
||||
0xCAC54572BB1A6FC8,
|
||||
0xA7A4C9F3EDEBF0D8,
|
||||
0x7829851FAC17B143,
|
||||
0x2C8F4C9AF81BCF06,
|
||||
0xD34E31539740C732,
|
||||
0x3605A2AC253D2DB1,
|
||||
0x08C11B8346F4A3C3,
|
||||
0x6BE396289CE8A6DA,
|
||||
0xD9B957FB7FE794C5,
|
||||
0x05BE33DA04560A93,
|
||||
0x0957F1577BA9747C,
|
||||
0xDA2CC3ACC24FBA57,
|
||||
0x74136F185B29E7F0,
|
||||
0xB2F2B4590EDB93B2,
|
||||
0xB3608FCE8B86AE04,
|
||||
0x4A3A865079359063,
|
||||
0x5B3A7EF496880A50,
|
||||
0x48FAE3163854C23B,
|
||||
0x07AAA640476E0B9A,
|
||||
0x2F653656383A687D,
|
||||
0xA1031F8E7599D79C,
|
||||
0xA31908178FF92477,
|
||||
0x097EDF3C14C3FB83,
|
||||
0xB51CA83FEAA0971B,
|
||||
0xDD3C0D96D784F2E9,
|
||||
0x86CD26A9EA767D78,
|
||||
0xE6B215FF54A30C18,
|
||||
0xEC5B06A1C5531093,
|
||||
0x45665A929F9EC5E5,
|
||||
0x8C7609B4A9F10907,
|
||||
0x89AAC3A491F0D729,
|
||||
0x32CE6B26E0F4A403,
|
||||
0x614AB44E02B53E01,
|
||||
0xFA6472EB6EEF3290,
|
||||
0x9E5D75EB1948EB6A,
|
||||
0xB6D12AD4A8671852,
|
||||
0x88826F56EBA07AF1,
|
||||
0x44535BF2645BC0FD,
|
||||
0x169388FFC21E3728,
|
||||
0xF68AAC9E396D8224,
|
||||
0x8E87D7E7472B3883,
|
||||
0x295C26CAA8B423DE,
|
||||
0x322C814292E72176,
|
||||
0x8A06550EB8AF7268,
|
||||
0xEF86D60E661BCF71,
|
||||
0x9E5426C87F30EE54,
|
||||
0xF1EA8AA826FD047E,
|
||||
0x0BABAF9A642CB769,
|
||||
0x4B3341D4068D012E,
|
||||
0xD15605CBC30A335C,
|
||||
0x5B21060AED8412E5,
|
||||
0x45E2CDA1CE6F4227,
|
||||
0x50AE3745033AD7D4,
|
||||
0xAA4588CED46BF414,
|
||||
0xC1B0056C4A95467E,
|
||||
0x56576A71DE8B4089,
|
||||
0xBF20965FA6DC927E,
|
||||
0x569F8383C2040882,
|
||||
0xE1E772FBA08FECA0,
|
||||
0x4CED94AF97138AC4,
|
||||
0xC4112FFB337A82FB,
|
||||
0xD64A4FD41DE38B7D,
|
||||
0x4CFC32329EDEBCBB,
|
||||
0x0803564445050395,
|
||||
0xAA1574ECF4642FFD,
|
||||
0x694BC4E54CC315F9,
|
||||
0xA3D7CB273B011721,
|
||||
0x577C2F8B6115BFA5,
|
||||
0xB7EC8C1A769FB4C1,
|
||||
0x5D5CFCE63359AB19,
|
||||
0x33B96C3CD65B5F71,
|
||||
0xD845097780602BB9,
|
||||
0x84D47645D02DA3D5,
|
||||
0x83544F33B58773A5,
|
||||
0x9175CBB2160836C5,
|
||||
0xC71B3BC175E72BC5,
|
||||
0x636806AC222EC985,
|
||||
0xB6EF0E6950F52ED5,
|
||||
0xEAD3D8A0F3DFDAA5,
|
||||
0x922908FE9A861BA5,
|
||||
0x6D4821DE275FD5C5,
|
||||
0x1FE3FCE62BD816B5,
|
||||
0xC23E9FCCD6F70591,
|
||||
0xC1AF12BDFE16B5B5,
|
||||
0x39E9F18F2F85E221,
|
||||
]
|
||||
|
||||
assert len(INPUTS) == len(OUTPUTS)
|
||||
|
@ -1713,8 +1713,7 @@ def test_get_character_combination_hashes_string_store_spec_cases(
|
|||
|
||||
def test_character_combination_hashes_empty_lengths(en_tokenizer):
|
||||
doc = en_tokenizer("and𐌞")
|
||||
assert (
|
||||
doc.get_character_combination_hashes(
|
||||
assert doc.get_character_combination_hashes(
|
||||
cs=True,
|
||||
p_lengths=bytes(),
|
||||
s_lengths=bytes(),
|
||||
|
@ -1724,6 +1723,4 @@ def test_character_combination_hashes_empty_lengths(en_tokenizer):
|
|||
ss_search_chars=bytes(),
|
||||
ss_width_offsets=bytes(),
|
||||
ss_lengths=bytes(),
|
||||
).shape
|
||||
== (1, 0)
|
||||
)
|
||||
).shape == (1, 0)
|
||||
|
|
|
@ -6,7 +6,7 @@ cimport numpy as np
|
|||
from cpython cimport array
|
||||
from libc.string cimport memcpy, memcmp, memset, strlen
|
||||
from libc.math cimport sqrt
|
||||
from libc.stdint cimport int32_t, uint64_t, uint32_t
|
||||
from libc.stdint cimport int32_t, uint64_t
|
||||
|
||||
import copy
|
||||
from collections import Counter, defaultdict
|
||||
|
@ -1779,8 +1779,9 @@ cdef class Doc:
|
|||
hashed for "spaCy" would be "c" and "ca".
|
||||
|
||||
Many of the buffers passed into and used by this method contain single-byte numerical values. This takes advantage of
|
||||
the fact that we are hashing short affixes and searching for small groups of characters; the calling code is responsible
|
||||
lengths being passed in cannot exceed 63 and that *_search_chars buffers are never longer than 255.
|
||||
the fact that we are hashing short affixes and searching for small groups of characters. The calling code is responsible
|
||||
for ensuring that lengths being passed in cannot exceed 63 and hence that resulting values with a maximum of four-byte
|
||||
character widths can never exceed 255.
|
||||
"""
|
||||
|
||||
# Work out lengths
|
||||
|
@ -2155,14 +2156,17 @@ cdef void _search_for_chars(
|
|||
memset(l_buf + l_buf_idx, res_buf_idx, max_res_l - l_buf_idx)
|
||||
|
||||
|
||||
cdef uint64_t FNV1A_OFFSET_BASIS = 0xcbf29ce484222325
|
||||
cdef uint64_t FNV1A_PRIME = 0x00000100000001B3
|
||||
|
||||
def get_fnv1a_hash(input: bytes):
|
||||
""" Python-callable method to facilitate testing. """
|
||||
cdef uint32_t hash_val = 0x811c9dc5
|
||||
cdef uint64_t hash_val = FNV1A_OFFSET_BASIS
|
||||
cdef int length = len(input), offset = 0
|
||||
|
||||
while offset < length:
|
||||
hash_val ^= input[offset]
|
||||
hash_val *= 0x01000193
|
||||
hash_val *= FNV1A_PRIME
|
||||
offset += 1
|
||||
return hash_val
|
||||
|
||||
|
@ -2175,7 +2179,7 @@ cdef int _write_hashes(
|
|||
const int res_buf_last,
|
||||
np.uint64_t* hashes_ptr,
|
||||
) nogil:
|
||||
""" Write FNV1A hashes for a token/rich property group combination.
|
||||
""" Write 64-bit FNV1A hashes for a token/rich property group combination.
|
||||
|
||||
res_buf: the string from which to generate the hash values.
|
||||
aff_l_buf: one-byte lengths describing how many characters to hash.
|
||||
|
@ -2188,7 +2192,7 @@ cdef int _write_hashes(
|
|||
"""
|
||||
|
||||
cdef int last_offset = 0, hash_idx = 0, offset, aff_l
|
||||
cdef uint32_t hash_val = 0x811c9dc5
|
||||
cdef uint64_t hash_val = FNV1A_OFFSET_BASIS
|
||||
|
||||
while True:
|
||||
aff_l = aff_l_buf[hash_idx]
|
||||
|
@ -2200,7 +2204,7 @@ cdef int _write_hashes(
|
|||
hash_val ^= res_buf[res_buf_last - last_offset]
|
||||
else:
|
||||
hash_val ^= res_buf[last_offset]
|
||||
hash_val *= 0x01000193
|
||||
hash_val *= FNV1A_PRIME
|
||||
last_offset += 1
|
||||
hashes_ptr[hash_idx] = hash_val
|
||||
hash_idx += 1
|
||||
|
|
|
@ -219,6 +219,13 @@ alternate prevents the alternation from occurring, e.g. an `ä` in a German
|
|||
plural noun does not become `a` if it is the third or fourth vowel from the end
|
||||
of the word.
|
||||
|
||||
There are a few rare situations where a graphical character is expressed as more
|
||||
than one Unicode code point, e.g. _i_ followed by a combining dot above when representing the lower-case form of the
|
||||
Turkish letter _İ_. Such situations are supported, but the lengths of prefixes,
|
||||
suffixes and character search results may need to be increased accordingly.
|
||||
|
||||
All lengths must be specified in ascending order.
|
||||
|
||||
| Name | Description |
|
||||
| ------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `width` | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. If static vectors are included, a learned linear layer is used to map the vectors to the specified width before concatenating it with the other embedding outputs. A single maxout layer is then used to reduce the concatenated vectors to the final width. ~~int~~ |
|
||||
|
|
Loading…
Reference in New Issue
Block a user