Add FNV1A conformity tests

This commit is contained in:
richard@explosion.ai 2022-11-03 10:19:38 +01:00
parent 557799358c
commit deba504173
2 changed files with 436 additions and 10 deletions

View File

@ -996,6 +996,433 @@ def test_doc_spans_setdefault(en_tokenizer):
assert len(doc.spans["key3"]) == 2
def test_fnv1a_hash():
    """Checks the conformity of the FNV1A implementation with
    http://www.isthe.com/chongo/src/fnv/test_fnv.c.

    The method called here is only used in testing; in production
    code, the hashing is performed in a fashion that is interwoven
    with other logic. The conformity of the production code is
    demonstrated by the character combination hash tests, where
    hashes produced by the production code are tested for equality
    against hashes produced by the test code.
    """
    # Input byte strings; INPUTS[i] is expected to hash to OUTPUTS[i].
    INPUTS = [
        b"",
        b"a",
        b"b",
        b"c",
        b"d",
        b"e",
        b"f",
        b"fo",
        b"foo",
        b"foob",
        b"fooba",
        b"foobar",
        b"\x00",
        b"a\x00",
        b"b\x00",
        b"c\x00",
        b"d\x00",
        b"e\x00",
        b"f\x00",
        b"fo\x00",
        b"foo\x00",
        b"foob\x00",
        b"fooba\x00",
        b"foobar\x00",
        b"ch",
        b"cho",
        b"chon",
        b"chong",
        b"chongo",
        b"chongo ",
        b"chongo w",
        b"chongo wa",
        b"chongo was",
        b"chongo was ",
        b"chongo was h",
        b"chongo was he",
        b"chongo was her",
        b"chongo was here",
        b"chongo was here!",
        b"chongo was here!\n",
        b"ch\x00",
        b"cho\x00",
        b"chon\x00",
        b"chong\x00",
        b"chongo\x00",
        b"chongo \x00",
        b"chongo w\x00",
        b"chongo wa\x00",
        b"chongo was\x00",
        b"chongo was \x00",
        b"chongo was h\x00",
        b"chongo was he\x00",
        b"chongo was her\x00",
        b"chongo was here\x00",
        b"chongo was here!\x00",
        b"chongo was here!\n\x00",
        b"cu",
        b"cur",
        b"curd",
        b"curds",
        b"curds ",
        b"curds a",
        b"curds an",
        b"curds and",
        b"curds and ",
        b"curds and w",
        b"curds and wh",
        b"curds and whe",
        b"curds and whey",
        b"curds and whey\n",
        b"cu\x00",
        b"cur\x00",
        b"curd\x00",
        b"curds\x00",
        b"curds \x00",
        b"curds a\x00",
        b"curds an\x00",
        b"curds and\x00",
        b"curds and \x00",
        b"curds and w\x00",
        b"curds and wh\x00",
        b"curds and whe\x00",
        b"curds and whey\x00",
        b"curds and whey\n\x00",
        b"hi",
        b"hi\x00",
        b"hello",
        b"hello\x00",
        b"\xff\x00\x00\x01",
        b"\x01\x00\x00\xff",
        b"\xff\x00\x00\x02",
        b"\x02\x00\x00\xff",
        b"\xff\x00\x00\x03",
        b"\x03\x00\x00\xff",
        b"\xff\x00\x00\x04",
        b"\x04\x00\x00\xff",
        b"\x40\x51\x4e\x44",
        b"\x44\x4e\x51\x40",
        b"\x40\x51\x4e\x4a",
        b"\x4a\x4e\x51\x40",
        b"\x40\x51\x4e\x54",
        b"\x54\x4e\x51\x40",
        b"127.0.0.1",
        b"127.0.0.1\x00",
        b"127.0.0.2",
        b"127.0.0.2\x00",
        b"127.0.0.3",
        b"127.0.0.3\x00",
        b"64.81.78.68",
        b"64.81.78.68\x00",
        b"64.81.78.74",
        b"64.81.78.74\x00",
        b"64.81.78.84",
        b"64.81.78.84\x00",
        b"feedface",
        b"feedface\x00",
        b"feedfacedaffdeed",
        b"feedfacedaffdeed\x00",
        b"feedfacedeadbeef",
        b"feedfacedeadbeef\x00",
        b"line 1\nline 2\nline 3",
        b"chongo <Landon Curt Noll> /\\../\\",
        b"chongo <Landon Curt Noll> /\\../\\\x00",
        b"chongo (Landon Curt Noll) /\\../\\",
        b"chongo (Landon Curt Noll) /\\../\\\x00",
        b"http://antwrp.gsfc.nasa.gov/apod/astropix.html",
        b"http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash",
        b"http://epod.usra.edu/",
        b"http://exoplanet.eu/",
        b"http://hvo.wr.usgs.gov/cam3/",
        b"http://hvo.wr.usgs.gov/cams/HMcam/",
        b"http://hvo.wr.usgs.gov/kilauea/update/deformation.html",
        b"http://hvo.wr.usgs.gov/kilauea/update/images.html",
        b"http://hvo.wr.usgs.gov/kilauea/update/maps.html",
        b"http://hvo.wr.usgs.gov/volcanowatch/current_issue.html",
        b"http://neo.jpl.nasa.gov/risk/",
        b"http://norvig.com/21-days.html",
        b"http://primes.utm.edu/curios/home.php",
        b"http://slashdot.org/",
        b"http://tux.wr.usgs.gov/Maps/155.25-19.5.html",
        b"http://volcano.wr.usgs.gov/kilaueastatus.php",
        b"http://www.avo.alaska.edu/activity/Redoubt.php",
        b"http://www.dilbert.com/fast/",
        b"http://www.fourmilab.ch/gravitation/orbits/",
        b"http://www.fpoa.net/",
        b"http://www.ioccc.org/index.html",
        b"http://www.isthe.com/cgi-bin/number.cgi",
        b"http://www.isthe.com/chongo/bio.html",
        b"http://www.isthe.com/chongo/index.html",
        b"http://www.isthe.com/chongo/src/calc/lucas-calc",
        b"http://www.isthe.com/chongo/tech/astro/venus2004.html",
        b"http://www.isthe.com/chongo/tech/astro/vita.html",
        b"http://www.isthe.com/chongo/tech/comp/c/expert.html",
        b"http://www.isthe.com/chongo/tech/comp/calc/index.html",
        b"http://www.isthe.com/chongo/tech/comp/fnv/index.html",
        b"http://www.isthe.com/chongo/tech/math/number/howhigh.html",
        b"http://www.isthe.com/chongo/tech/math/number/number.html",
        b"http://www.isthe.com/chongo/tech/math/prime/mersenne.html",
        b"http://www.isthe.com/chongo/tech/math/prime/mersenne.html#largest",
        b"http://www.lavarnd.org/cgi-bin/corpspeak.cgi",
        b"http://www.lavarnd.org/cgi-bin/haiku.cgi",
        b"http://www.lavarnd.org/cgi-bin/rand-none.cgi",
        b"http://www.lavarnd.org/cgi-bin/randdist.cgi",
        b"http://www.lavarnd.org/index.html",
        b"http://www.lavarnd.org/what/nist-test.html",
        b"http://www.macosxhints.com/",
        b"http://www.mellis.com/",
        b"http://www.nature.nps.gov/air/webcams/parks/havoso2alert/havoalert.cfm",
        b"http://www.nature.nps.gov/air/webcams/parks/havoso2alert/timelines_24.cfm",
        b"http://www.paulnoll.com/",
        b"http://www.pepysdiary.com/",
        b"http://www.sciencenews.org/index/home/activity/view",
        b"http://www.skyandtelescope.com/",
        b"http://www.sput.nl/~rob/sirius.html",
        b"http://www.systemexperts.com/",
        b"http://www.tq-international.com/phpBB3/index.php",
        b"http://www.travelquesttours.com/index.htm",
        b"http://www.wunderground.com/global/stations/89606.html",
        b"21701" * 10,
        b"M21701" * 10,
        b"2^21701-1" * 10,
        b"\x54\xc5" * 10,
        b"\xc5\x54" * 10,
        b"23209" * 10,
        b"M23209" * 10,
        b"2^23209-1" * 10,
        b"\x5a\xa9" * 10,
        b"\xa9\x5a" * 10,
        b"391581216093" * 10,
        b"391581*2^216093-1" * 10,
        b"\x05\xf9\x9d\x03\x4c\x81" * 10,
        b"FEDCBA9876543210" * 10,
        b"\xfe\xdc\xba\x98\x76\x54\x32\x10" * 10,
        b"EFCDAB8967452301" * 10,
        b"\xef\xcd\xab\x89\x67\x45\x23\x01" * 10,
        b"0123456789ABCDEF" * 10,
        b"\x01\x23\x45\x67\x89\xab\xcd\xef" * 10,
        b"1032547698BADCFE" * 10,
        b"\x10\x32\x54\x76\x98\xba\xdc\xfe" * 10,
        b"\x00" * 500,
        b"\x07" * 500,
        b"~" * 500,
        b"\x7f" * 500,
    ]
    # Expected 32-bit hash values, in the same order as INPUTS.
    OUTPUTS = [
        0x811C9DC5,
        0xE40C292C,
        0xE70C2DE5,
        0xE60C2C52,
        0xE10C2473,
        0xE00C22E0,
        0xE30C2799,
        0x6222E842,
        0xA9F37ED7,
        0x3F5076EF,
        0x39AAA18A,
        0xBF9CF968,
        0x050C5D1F,
        0x2B24D044,
        0x9D2C3F7F,
        0x7729C516,
        0xB91D6109,
        0x931AE6A0,
        0x052255DB,
        0xBEF39FE6,
        0x6150AC75,
        0x9AAB3A3D,
        0x519C4C3E,
        0x0C1C9EB8,
        0x5F299F4E,
        0xEF8580F3,
        0xAC297727,
        0x4546B9C0,
        0xBD564E7D,
        0x6BDD5C67,
        0xDD77ED30,
        0xF4CA9683,
        0x4AEB9BD0,
        0xE0E67AD0,
        0xC2D32FA8,
        0x7F743FB7,
        0x6900631F,
        0xC59C990E,
        0x448524FD,
        0xD49930D5,
        0x1C85C7CA,
        0x0229FE89,
        0x2C469265,
        0xCE566940,
        0x8BDD8EC7,
        0x34787625,
        0xD3CA6290,
        0xDDEAF039,
        0xC0E64870,
        0xDAD35570,
        0x5A740578,
        0x5B004D15,
        0x6A9C09CD,
        0x2384F10A,
        0xDA993A47,
        0x8227DF4F,
        0x4C298165,
        0xFC563735,
        0x8CB91483,
        0x775BF5D0,
        0xD5C428D0,
        0x34CC0EA3,
        0xEA3B4CB7,
        0x8E59F029,
        0x2094DE2B,
        0xA65A0AD4,
        0x9BBEE5F4,
        0xBE836343,
        0x22D5344E,
        0x19A1470C,
        0x4A56B1FF,
        0x70B8E86F,
        0x0A5B4A39,
        0xB5C3F670,
        0x53CC3F70,
        0xC03B0A99,
        0x7259C415,
        0x4095108B,
        0x7559BDB1,
        0xB3BF0BBC,
        0x2183FF1C,
        0x2BD54279,
        0x23A156CA,
        0x64E2D7E4,
        0x683AF69A,
        0xAED2346E,
        0x4F9F2CAB,
        0x02935131,
        0xC48FB86D,
        0x2269F369,
        0xC18FB3B4,
        0x50EF1236,
        0xC28FB547,
        0x96C3BF47,
        0xBF8FB08E,
        0xF3E4D49C,
        0x32179058,
        0x280BFEE6,
        0x30178D32,
        0x21ADDAF8,
        0x4217A988,
        0x772633D6,
        0x08A3D11E,
        0xB7E2323A,
        0x07A3CF8B,
        0x91DFB7D1,
        0x06A3CDF8,
        0x6BDD3D68,
        0x1D5636A7,
        0xD5B808E5,
        0x1353E852,
        0xBF16B916,
        0xA55B89ED,
        0x3C1A2017,
        0x0588B13C,
        0xF22F0174,
        0xE83641E1,
        0x6E69B533,
        0xF1760448,
        0x64C8BD58,
        0x97B4EA23,
        0x9A4E92E6,
        0xCFB14012,
        0xF01B2511,
        0x0BBB59C3,
        0xCE524AFA,
        0xDD16EF45,
        0x60648BB3,
        0x7FA4BCFC,
        0x5053AE17,
        0xC9302890,
        0x956DED32,
        0x9136DB84,
        0xDF9D3323,
        0x32BB6CD0,
        0xC8F8385B,
        0xEB08BFBA,
        0x62CC8E3D,
        0xC3E20F5C,
        0x39E97F17,
        0x7837B203,
        0x319E877B,
        0xD3E63F89,
        0x29B50B38,
        0x5ED678B8,
        0xB0D5B793,
        0x52450BE5,
        0xFA72D767,
        0x95066709,
        0x7F52E123,
        0x76966481,
        0x063258B0,
        0x2DED6E8A,
        0xB07D7C52,
        0xD0C71B71,
        0xF684F1BD,
        0x868ECFA8,
        0xF794F684,
        0xD19701C3,
        0x346E171E,
        0x91F8F676,
        0x0BF58848,
        0x6317B6D1,
        0xAFAD4C54,
        0x0F25681E,
        0x91B18D49,
        0x7D61C12E,
        0x5147D25C,
        0x9A8B6805,
        0x4CD2A447,
        0x1E549B14,
        0x2FE1B574,
        0xCF0CD31E,
        0x6C471669,
        0x0E5EEF1E,
        0x2BED3602,
        0xB26249E0,
        0x2C9B86A4,
        0xE415E2BB,
        0x18A98D1D,
        0xB7DF8B7B,
        0x241E9075,
        0x063F70DD,
        0x0295AED9,
        0x56A7F781,
        0x253BC645,
        0x46610921,
        0x7C1577F9,
        0x512B2851,
        0x76823999,
        0xC0586935,
        0xF3415C85,
        0x0AE4FF65,
        0x58B79725,
        0xDEA43AA5,
        0x2BB3BE35,
        0xEA777A45,
        0x8F21C305,
        0x5C9D0865,
        0xFA823DD5,
        0x21A27271,
        0x83C5C6D5,
        0x813B0881,
    ]
    # Guard against the two tables drifting out of step; zip() alone would
    # silently stop at the shorter one.
    assert len(INPUTS) == len(OUTPUTS)
    for input_bytes, expected_hash in zip(INPUTS, OUTPUTS):
        # The message identifies the offending input when a vector fails.
        assert get_fnv1a_hash(input_bytes) == expected_hash, input_bytes
def _encode_and_hash(input: str) -> int:
    """Encode *input* as UTF-8 and return the FNV-1a hash of the resulting bytes."""
    utf8_bytes = input.encode("UTF-8")
    return get_fnv1a_hash(utf8_bytes)
@ -1099,14 +1526,6 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
else:
assert hashes[3][9] == _encode_and_hash("rp")
# check values are the same cross-platform
if case_sensitive:
assert hashes[0][2] == 1140960578
else:
assert hashes[0][2] == 604076770
assert hashes[1][3] == 3384544169
assert hashes[3][8] == 4144776981
def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
doc = en_tokenizer("spaCy✨ and Prodigy")

View File

@ -2202,8 +2202,15 @@ cdef uint32_t fnv1a_hash(
def get_fnv1a_hash(input: bytes):
    """Python-callable method to facilitate testing.

    Computes a 32-bit FNV-1a hash over the bytes of *input*, one byte at a
    time: XOR the byte into the running value, then multiply by the prime.

    input (bytes): the byte string to hash.
    RETURNS (int): the hash as an unsigned 32-bit value.
    """
    # 32-bit FNV offset basis. *hash_val* is a uint32_t, so the
    # multiplication below wraps modulo 2**32 without explicit masking.
    cdef uint32_t hash_val = 0x811c9dc5
    # Py_ssize_t is the type len() yields, so the length is not narrowed.
    cdef Py_ssize_t length = len(input), offset = 0
    while offset < length:
        hash_val ^= input[offset]
        hash_val *= 0x01000193  # 32-bit FNV prime
        offset += 1
    return hash_val
@cython.boundscheck(False) # Deactivate bounds checking