mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-02 19:30:19 +03:00
Add FNV1A conformity tests
This commit is contained in:
parent
557799358c
commit
deba504173
|
@ -996,6 +996,433 @@ def test_doc_spans_setdefault(en_tokenizer):
|
|||
assert len(doc.spans["key3"]) == 2
|
||||
|
||||
|
||||
def test_fnv1a_hash():
    """Check the conformity of the FNV1A implementation with the reference
    test vectors in http://www.isthe.com/chongo/src/fnv/test_fnv.c.

    The method called here is only used in testing; in production
    code, the hashing is performed in a fashion that is interwoven
    with other logic. The conformity of the production code is
    demonstrated by the character combination hash tests, where
    hashes produced by the production code are tested for equality
    against hashes produced by the test code.
    """
    # Byte strings to hash; INPUTS[i] is expected to hash to OUTPUTS[i].
    INPUTS = [
        b"",
        b"a",
        b"b",
        b"c",
        b"d",
        b"e",
        b"f",
        b"fo",
        b"foo",
        b"foob",
        b"fooba",
        b"foobar",
        b"\x00",
        b"a\x00",
        b"b\x00",
        b"c\x00",
        b"d\x00",
        b"e\x00",
        b"f\x00",
        b"fo\x00",
        b"foo\x00",
        b"foob\x00",
        b"fooba\x00",
        b"foobar\x00",
        b"ch",
        b"cho",
        b"chon",
        b"chong",
        b"chongo",
        b"chongo ",
        b"chongo w",
        b"chongo wa",
        b"chongo was",
        b"chongo was ",
        b"chongo was h",
        b"chongo was he",
        b"chongo was her",
        b"chongo was here",
        b"chongo was here!",
        b"chongo was here!\n",
        b"ch\x00",
        b"cho\x00",
        b"chon\x00",
        b"chong\x00",
        b"chongo\x00",
        b"chongo \x00",
        b"chongo w\x00",
        b"chongo wa\x00",
        b"chongo was\x00",
        b"chongo was \x00",
        b"chongo was h\x00",
        b"chongo was he\x00",
        b"chongo was her\x00",
        b"chongo was here\x00",
        b"chongo was here!\x00",
        b"chongo was here!\n\x00",
        b"cu",
        b"cur",
        b"curd",
        b"curds",
        b"curds ",
        b"curds a",
        b"curds an",
        b"curds and",
        b"curds and ",
        b"curds and w",
        b"curds and wh",
        b"curds and whe",
        b"curds and whey",
        b"curds and whey\n",
        b"cu\x00",
        b"cur\x00",
        b"curd\x00",
        b"curds\x00",
        b"curds \x00",
        b"curds a\x00",
        b"curds an\x00",
        b"curds and\x00",
        b"curds and \x00",
        b"curds and w\x00",
        b"curds and wh\x00",
        b"curds and whe\x00",
        b"curds and whey\x00",
        b"curds and whey\n\x00",
        b"hi",
        b"hi\x00",
        b"hello",
        b"hello\x00",
        b"\xff\x00\x00\x01",
        b"\x01\x00\x00\xff",
        b"\xff\x00\x00\x02",
        b"\x02\x00\x00\xff",
        b"\xff\x00\x00\x03",
        b"\x03\x00\x00\xff",
        b"\xff\x00\x00\x04",
        b"\x04\x00\x00\xff",
        b"\x40\x51\x4e\x44",
        b"\x44\x4e\x51\x40",
        b"\x40\x51\x4e\x4a",
        b"\x4a\x4e\x51\x40",
        b"\x40\x51\x4e\x54",
        b"\x54\x4e\x51\x40",
        b"127.0.0.1",
        b"127.0.0.1\x00",
        b"127.0.0.2",
        b"127.0.0.2\x00",
        b"127.0.0.3",
        b"127.0.0.3\x00",
        b"64.81.78.68",
        b"64.81.78.68\x00",
        b"64.81.78.74",
        b"64.81.78.74\x00",
        b"64.81.78.84",
        b"64.81.78.84\x00",
        b"feedface",
        b"feedface\x00",
        b"feedfacedaffdeed",
        b"feedfacedaffdeed\x00",
        b"feedfacedeadbeef",
        b"feedfacedeadbeef\x00",
        b"line 1\nline 2\nline 3",
        b"chongo <Landon Curt Noll> /\\../\\",
        b"chongo <Landon Curt Noll> /\\../\\\x00",
        b"chongo (Landon Curt Noll) /\\../\\",
        b"chongo (Landon Curt Noll) /\\../\\\x00",
        b"http://antwrp.gsfc.nasa.gov/apod/astropix.html",
        b"http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash",
        b"http://epod.usra.edu/",
        b"http://exoplanet.eu/",
        b"http://hvo.wr.usgs.gov/cam3/",
        b"http://hvo.wr.usgs.gov/cams/HMcam/",
        b"http://hvo.wr.usgs.gov/kilauea/update/deformation.html",
        b"http://hvo.wr.usgs.gov/kilauea/update/images.html",
        b"http://hvo.wr.usgs.gov/kilauea/update/maps.html",
        b"http://hvo.wr.usgs.gov/volcanowatch/current_issue.html",
        b"http://neo.jpl.nasa.gov/risk/",
        b"http://norvig.com/21-days.html",
        b"http://primes.utm.edu/curios/home.php",
        b"http://slashdot.org/",
        b"http://tux.wr.usgs.gov/Maps/155.25-19.5.html",
        b"http://volcano.wr.usgs.gov/kilaueastatus.php",
        b"http://www.avo.alaska.edu/activity/Redoubt.php",
        b"http://www.dilbert.com/fast/",
        b"http://www.fourmilab.ch/gravitation/orbits/",
        b"http://www.fpoa.net/",
        b"http://www.ioccc.org/index.html",
        b"http://www.isthe.com/cgi-bin/number.cgi",
        b"http://www.isthe.com/chongo/bio.html",
        b"http://www.isthe.com/chongo/index.html",
        b"http://www.isthe.com/chongo/src/calc/lucas-calc",
        b"http://www.isthe.com/chongo/tech/astro/venus2004.html",
        b"http://www.isthe.com/chongo/tech/astro/vita.html",
        b"http://www.isthe.com/chongo/tech/comp/c/expert.html",
        b"http://www.isthe.com/chongo/tech/comp/calc/index.html",
        b"http://www.isthe.com/chongo/tech/comp/fnv/index.html",
        b"http://www.isthe.com/chongo/tech/math/number/howhigh.html",
        b"http://www.isthe.com/chongo/tech/math/number/number.html",
        b"http://www.isthe.com/chongo/tech/math/prime/mersenne.html",
        b"http://www.isthe.com/chongo/tech/math/prime/mersenne.html#largest",
        b"http://www.lavarnd.org/cgi-bin/corpspeak.cgi",
        b"http://www.lavarnd.org/cgi-bin/haiku.cgi",
        b"http://www.lavarnd.org/cgi-bin/rand-none.cgi",
        b"http://www.lavarnd.org/cgi-bin/randdist.cgi",
        b"http://www.lavarnd.org/index.html",
        b"http://www.lavarnd.org/what/nist-test.html",
        b"http://www.macosxhints.com/",
        b"http://www.mellis.com/",
        b"http://www.nature.nps.gov/air/webcams/parks/havoso2alert/havoalert.cfm",
        b"http://www.nature.nps.gov/air/webcams/parks/havoso2alert/timelines_24.cfm",
        b"http://www.paulnoll.com/",
        b"http://www.pepysdiary.com/",
        b"http://www.sciencenews.org/index/home/activity/view",
        b"http://www.skyandtelescope.com/",
        b"http://www.sput.nl/~rob/sirius.html",
        b"http://www.systemexperts.com/",
        b"http://www.tq-international.com/phpBB3/index.php",
        b"http://www.travelquesttours.com/index.htm",
        b"http://www.wunderground.com/global/stations/89606.html",
        b"21701" * 10,
        b"M21701" * 10,
        b"2^21701-1" * 10,
        b"\x54\xc5" * 10,
        b"\xc5\x54" * 10,
        b"23209" * 10,
        b"M23209" * 10,
        b"2^23209-1" * 10,
        b"\x5a\xa9" * 10,
        b"\xa9\x5a" * 10,
        b"391581216093" * 10,
        b"391581*2^216093-1" * 10,
        b"\x05\xf9\x9d\x03\x4c\x81" * 10,
        b"FEDCBA9876543210" * 10,
        b"\xfe\xdc\xba\x98\x76\x54\x32\x10" * 10,
        b"EFCDAB8967452301" * 10,
        b"\xef\xcd\xab\x89\x67\x45\x23\x01" * 10,
        b"0123456789ABCDEF" * 10,
        b"\x01\x23\x45\x67\x89\xab\xcd\xef" * 10,
        b"1032547698BADCFE" * 10,
        b"\x10\x32\x54\x76\x98\xba\xdc\xfe" * 10,
        b"\x00" * 500,
        b"\x07" * 500,
        b"~" * 500,
        b"\x7f" * 500,
    ]

    # Expected hash for the input at the same index in INPUTS.
    OUTPUTS = [
        0x811C9DC5,
        0xE40C292C,
        0xE70C2DE5,
        0xE60C2C52,
        0xE10C2473,
        0xE00C22E0,
        0xE30C2799,
        0x6222E842,
        0xA9F37ED7,
        0x3F5076EF,
        0x39AAA18A,
        0xBF9CF968,
        0x050C5D1F,
        0x2B24D044,
        0x9D2C3F7F,
        0x7729C516,
        0xB91D6109,
        0x931AE6A0,
        0x052255DB,
        0xBEF39FE6,
        0x6150AC75,
        0x9AAB3A3D,
        0x519C4C3E,
        0x0C1C9EB8,
        0x5F299F4E,
        0xEF8580F3,
        0xAC297727,
        0x4546B9C0,
        0xBD564E7D,
        0x6BDD5C67,
        0xDD77ED30,
        0xF4CA9683,
        0x4AEB9BD0,
        0xE0E67AD0,
        0xC2D32FA8,
        0x7F743FB7,
        0x6900631F,
        0xC59C990E,
        0x448524FD,
        0xD49930D5,
        0x1C85C7CA,
        0x0229FE89,
        0x2C469265,
        0xCE566940,
        0x8BDD8EC7,
        0x34787625,
        0xD3CA6290,
        0xDDEAF039,
        0xC0E64870,
        0xDAD35570,
        0x5A740578,
        0x5B004D15,
        0x6A9C09CD,
        0x2384F10A,
        0xDA993A47,
        0x8227DF4F,
        0x4C298165,
        0xFC563735,
        0x8CB91483,
        0x775BF5D0,
        0xD5C428D0,
        0x34CC0EA3,
        0xEA3B4CB7,
        0x8E59F029,
        0x2094DE2B,
        0xA65A0AD4,
        0x9BBEE5F4,
        0xBE836343,
        0x22D5344E,
        0x19A1470C,
        0x4A56B1FF,
        0x70B8E86F,
        0x0A5B4A39,
        0xB5C3F670,
        0x53CC3F70,
        0xC03B0A99,
        0x7259C415,
        0x4095108B,
        0x7559BDB1,
        0xB3BF0BBC,
        0x2183FF1C,
        0x2BD54279,
        0x23A156CA,
        0x64E2D7E4,
        0x683AF69A,
        0xAED2346E,
        0x4F9F2CAB,
        0x02935131,
        0xC48FB86D,
        0x2269F369,
        0xC18FB3B4,
        0x50EF1236,
        0xC28FB547,
        0x96C3BF47,
        0xBF8FB08E,
        0xF3E4D49C,
        0x32179058,
        0x280BFEE6,
        0x30178D32,
        0x21ADDAF8,
        0x4217A988,
        0x772633D6,
        0x08A3D11E,
        0xB7E2323A,
        0x07A3CF8B,
        0x91DFB7D1,
        0x06A3CDF8,
        0x6BDD3D68,
        0x1D5636A7,
        0xD5B808E5,
        0x1353E852,
        0xBF16B916,
        0xA55B89ED,
        0x3C1A2017,
        0x0588B13C,
        0xF22F0174,
        0xE83641E1,
        0x6E69B533,
        0xF1760448,
        0x64C8BD58,
        0x97B4EA23,
        0x9A4E92E6,
        0xCFB14012,
        0xF01B2511,
        0x0BBB59C3,
        0xCE524AFA,
        0xDD16EF45,
        0x60648BB3,
        0x7FA4BCFC,
        0x5053AE17,
        0xC9302890,
        0x956DED32,
        0x9136DB84,
        0xDF9D3323,
        0x32BB6CD0,
        0xC8F8385B,
        0xEB08BFBA,
        0x62CC8E3D,
        0xC3E20F5C,
        0x39E97F17,
        0x7837B203,
        0x319E877B,
        0xD3E63F89,
        0x29B50B38,
        0x5ED678B8,
        0xB0D5B793,
        0x52450BE5,
        0xFA72D767,
        0x95066709,
        0x7F52E123,
        0x76966481,
        0x063258B0,
        0x2DED6E8A,
        0xB07D7C52,
        0xD0C71B71,
        0xF684F1BD,
        0x868ECFA8,
        0xF794F684,
        0xD19701C3,
        0x346E171E,
        0x91F8F676,
        0x0BF58848,
        0x6317B6D1,
        0xAFAD4C54,
        0x0F25681E,
        0x91B18D49,
        0x7D61C12E,
        0x5147D25C,
        0x9A8B6805,
        0x4CD2A447,
        0x1E549B14,
        0x2FE1B574,
        0xCF0CD31E,
        0x6C471669,
        0x0E5EEF1E,
        0x2BED3602,
        0xB26249E0,
        0x2C9B86A4,
        0xE415E2BB,
        0x18A98D1D,
        0xB7DF8B7B,
        0x241E9075,
        0x063F70DD,
        0x0295AED9,
        0x56A7F781,
        0x253BC645,
        0x46610921,
        0x7C1577F9,
        0x512B2851,
        0x76823999,
        0xC0586935,
        0xF3415C85,
        0x0AE4FF65,
        0x58B79725,
        0xDEA43AA5,
        0x2BB3BE35,
        0xEA777A45,
        0x8F21C305,
        0x5C9D0865,
        0xFA823DD5,
        0x21A27271,
        0x83C5C6D5,
        0x813B0881,
    ]

    # zip() silently truncates to the shorter list, so guard the pairing first.
    assert len(INPUTS) == len(OUTPUTS)
    for data, expected in zip(INPUTS, OUTPUTS):
        assert get_fnv1a_hash(data) == expected
|
||||
|
||||
|
||||
def _encode_and_hash(input: str) -> int:
    """Encode *input* as UTF-8 and return the hash that ``get_fnv1a_hash``
    computes for the resulting bytes.
    """
    utf8_bytes = input.encode("UTF-8")
    return get_fnv1a_hash(utf8_bytes)
|
||||
|
||||
|
@ -1099,14 +1526,6 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive
|
|||
else:
|
||||
assert hashes[3][9] == _encode_and_hash("rp")
|
||||
|
||||
# check values are the same cross-platform
|
||||
if case_sensitive:
|
||||
assert hashes[0][2] == 1140960578
|
||||
else:
|
||||
assert hashes[0][2] == 604076770
|
||||
assert hashes[1][3] == 3384544169
|
||||
assert hashes[3][8] == 4144776981
|
||||
|
||||
|
||||
def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
|
||||
doc = en_tokenizer("spaCy✨ and Prodigy")
|
||||
|
|
|
@ -2202,8 +2202,15 @@ cdef uint32_t fnv1a_hash(
|
|||
|
||||
|
||||
def get_fnv1a_hash(input: bytes):
    """Python-callable method to facilitate testing.

    Computes the 32-bit FNV-1a hash of *input* one byte at a time: each
    byte is XORed into the running hash, which is then multiplied by the
    FNV prime.

    input (bytes): The bytes to hash.
    RETURNS: The hash value held in an unsigned 32-bit integer.
    """
    # 32-bit FNV offset basis; this is also the hash of empty input.
    cdef uint32_t hash_val = 0x811c9dc5
    cdef int length = len(input), offset = 0

    while offset < length:
        hash_val ^= input[offset]
        # 32-bit FNV prime; hash_val is a uint32_t, so the product is
        # kept to 32 bits.
        hash_val *= 0x01000193
        offset += 1
    return hash_val
|
||||
|
||||
|
||||
@cython.boundscheck(False) # Deactivate bounds checking
|
||||
|
|
Loading…
Reference in New Issue
Block a user