def test_fnv1a_hash():
    """Check the FNV-1a implementation against the reference test vectors.

    The vectors and expected 32-bit hashes follow
    http://www.isthe.com/chongo/src/fnv/test_fnv.c.
    The method called here is only used in testing; in production
    code, the hashing is performed in a fashion that is interwoven
    with other logic. The conformity of the production code is
    demonstrated by the character combination hash tests, where
    hashes produced by the production code are tested for equality
    against hashes produced by the test code.
    """
    INPUTS = [
        b"",
        b"a",
        b"b",
        b"c",
        b"d",
        b"e",
        b"f",
        b"fo",
        b"foo",
        b"foob",
        b"fooba",
        b"foobar",
        b"\x00",
        b"a\x00",
        b"b\x00",
        b"c\x00",
        b"d\x00",
        b"e\x00",
        b"f\x00",
        b"fo\x00",
        b"foo\x00",
        b"foob\x00",
        b"fooba\x00",
        b"foobar\x00",
        b"ch",
        b"cho",
        b"chon",
        b"chong",
        b"chongo",
        b"chongo ",
        b"chongo w",
        b"chongo wa",
        b"chongo was",
        b"chongo was ",
        b"chongo was h",
        b"chongo was he",
        b"chongo was her",
        b"chongo was here",
        b"chongo was here!",
        b"chongo was here!\n",
        b"ch\x00",
        b"cho\x00",
        b"chon\x00",
        b"chong\x00",
        b"chongo\x00",
        b"chongo \x00",
        b"chongo w\x00",
        b"chongo wa\x00",
        b"chongo was\x00",
        b"chongo was \x00",
        b"chongo was h\x00",
        b"chongo was he\x00",
        b"chongo was her\x00",
        b"chongo was here\x00",
        b"chongo was here!\x00",
        b"chongo was here!\n\x00",
        b"cu",
        b"cur",
        b"curd",
        b"curds",
        b"curds ",
        b"curds a",
        b"curds an",
        b"curds and",
        b"curds and ",
        b"curds and w",
        b"curds and wh",
        b"curds and whe",
        b"curds and whey",
        b"curds and whey\n",
        b"cu\x00",
        b"cur\x00",
        b"curd\x00",
        b"curds\x00",
        b"curds \x00",
        b"curds a\x00",
        b"curds an\x00",
        b"curds and\x00",
        b"curds and \x00",
        b"curds and w\x00",
        b"curds and wh\x00",
        b"curds and whe\x00",
        b"curds and whey\x00",
        b"curds and whey\n\x00",
        b"hi",
        b"hi\x00",
        b"hello",
        b"hello\x00",
        b"\xff\x00\x00\x01",
        b"\x01\x00\x00\xff",
        b"\xff\x00\x00\x02",
        b"\x02\x00\x00\xff",
        b"\xff\x00\x00\x03",
        b"\x03\x00\x00\xff",
        b"\xff\x00\x00\x04",
        b"\x04\x00\x00\xff",
        b"\x40\x51\x4e\x44",
        b"\x44\x4e\x51\x40",
        b"\x40\x51\x4e\x4a",
        b"\x4a\x4e\x51\x40",
        b"\x40\x51\x4e\x54",
        b"\x54\x4e\x51\x40",
        b"127.0.0.1",
        b"127.0.0.1\x00",
        b"127.0.0.2",
        b"127.0.0.2\x00",
        b"127.0.0.3",
        b"127.0.0.3\x00",
        b"64.81.78.68",
        b"64.81.78.68\x00",
        b"64.81.78.74",
        b"64.81.78.74\x00",
        b"64.81.78.84",
        b"64.81.78.84\x00",
        b"feedface",
        b"feedface\x00",
        b"feedfacedaffdeed",
        b"feedfacedaffdeed\x00",
        b"feedfacedeadbeef",
        b"feedfacedeadbeef\x00",
        b"line 1\nline 2\nline 3",
        b"chongo /\\../\\",
        b"chongo /\\../\\\x00",
        b"chongo (Landon Curt Noll) /\\../\\",
        b"chongo (Landon Curt Noll) /\\../\\\x00",
        b"http://antwrp.gsfc.nasa.gov/apod/astropix.html",
        b"http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash",
        b"http://epod.usra.edu/",
        b"http://exoplanet.eu/",
        b"http://hvo.wr.usgs.gov/cam3/",
        b"http://hvo.wr.usgs.gov/cams/HMcam/",
        b"http://hvo.wr.usgs.gov/kilauea/update/deformation.html",
        b"http://hvo.wr.usgs.gov/kilauea/update/images.html",
        b"http://hvo.wr.usgs.gov/kilauea/update/maps.html",
        b"http://hvo.wr.usgs.gov/volcanowatch/current_issue.html",
        b"http://neo.jpl.nasa.gov/risk/",
        b"http://norvig.com/21-days.html",
        b"http://primes.utm.edu/curios/home.php",
        b"http://slashdot.org/",
        b"http://tux.wr.usgs.gov/Maps/155.25-19.5.html",
        b"http://volcano.wr.usgs.gov/kilaueastatus.php",
        b"http://www.avo.alaska.edu/activity/Redoubt.php",
        b"http://www.dilbert.com/fast/",
        b"http://www.fourmilab.ch/gravitation/orbits/",
        b"http://www.fpoa.net/",
        b"http://www.ioccc.org/index.html",
        b"http://www.isthe.com/cgi-bin/number.cgi",
        b"http://www.isthe.com/chongo/bio.html",
        b"http://www.isthe.com/chongo/index.html",
        b"http://www.isthe.com/chongo/src/calc/lucas-calc",
        b"http://www.isthe.com/chongo/tech/astro/venus2004.html",
        b"http://www.isthe.com/chongo/tech/astro/vita.html",
        b"http://www.isthe.com/chongo/tech/comp/c/expert.html",
        b"http://www.isthe.com/chongo/tech/comp/calc/index.html",
        b"http://www.isthe.com/chongo/tech/comp/fnv/index.html",
        b"http://www.isthe.com/chongo/tech/math/number/howhigh.html",
        b"http://www.isthe.com/chongo/tech/math/number/number.html",
        b"http://www.isthe.com/chongo/tech/math/prime/mersenne.html",
        b"http://www.isthe.com/chongo/tech/math/prime/mersenne.html#largest",
        b"http://www.lavarnd.org/cgi-bin/corpspeak.cgi",
        b"http://www.lavarnd.org/cgi-bin/haiku.cgi",
        b"http://www.lavarnd.org/cgi-bin/rand-none.cgi",
        b"http://www.lavarnd.org/cgi-bin/randdist.cgi",
        b"http://www.lavarnd.org/index.html",
        b"http://www.lavarnd.org/what/nist-test.html",
        b"http://www.macosxhints.com/",
        b"http://www.mellis.com/",
        b"http://www.nature.nps.gov/air/webcams/parks/havoso2alert/havoalert.cfm",
        b"http://www.nature.nps.gov/air/webcams/parks/havoso2alert/timelines_24.cfm",
        b"http://www.paulnoll.com/",
        b"http://www.pepysdiary.com/",
        b"http://www.sciencenews.org/index/home/activity/view",
        b"http://www.skyandtelescope.com/",
        b"http://www.sput.nl/~rob/sirius.html",
        b"http://www.systemexperts.com/",
        b"http://www.tq-international.com/phpBB3/index.php",
        b"http://www.travelquesttours.com/index.htm",
        b"http://www.wunderground.com/global/stations/89606.html",
        b"21701" * 10,
        b"M21701" * 10,
        b"2^21701-1" * 10,
        b"\x54\xc5" * 10,
        b"\xc5\x54" * 10,
        b"23209" * 10,
        b"M23209" * 10,
        b"2^23209-1" * 10,
        b"\x5a\xa9" * 10,
        b"\xa9\x5a" * 10,
        b"391581216093" * 10,
        b"391581*2^216093-1" * 10,
        b"\x05\xf9\x9d\x03\x4c\x81" * 10,
        b"FEDCBA9876543210" * 10,
        b"\xfe\xdc\xba\x98\x76\x54\x32\x10" * 10,
        b"EFCDAB8967452301" * 10,
        b"\xef\xcd\xab\x89\x67\x45\x23\x01" * 10,
        b"0123456789ABCDEF" * 10,
        b"\x01\x23\x45\x67\x89\xab\xcd\xef" * 10,
        b"1032547698BADCFE" * 10,
        b"\x10\x32\x54\x76\x98\xba\xdc\xfe" * 10,
        b"\x00" * 500,
        b"\x07" * 500,
        b"~" * 500,
        b"\x7f" * 500,
    ]

    # Expected 32-bit hashes, positionally matched with INPUTS.
    OUTPUTS = [
        0x811C9DC5,
        0xE40C292C,
        0xE70C2DE5,
        0xE60C2C52,
        0xE10C2473,
        0xE00C22E0,
        0xE30C2799,
        0x6222E842,
        0xA9F37ED7,
        0x3F5076EF,
        0x39AAA18A,
        0xBF9CF968,
        0x050C5D1F,
        0x2B24D044,
        0x9D2C3F7F,
        0x7729C516,
        0xB91D6109,
        0x931AE6A0,
        0x052255DB,
        0xBEF39FE6,
        0x6150AC75,
        0x9AAB3A3D,
        0x519C4C3E,
        0x0C1C9EB8,
        0x5F299F4E,
        0xEF8580F3,
        0xAC297727,
        0x4546B9C0,
        0xBD564E7D,
        0x6BDD5C67,
        0xDD77ED30,
        0xF4CA9683,
        0x4AEB9BD0,
        0xE0E67AD0,
        0xC2D32FA8,
        0x7F743FB7,
        0x6900631F,
        0xC59C990E,
        0x448524FD,
        0xD49930D5,
        0x1C85C7CA,
        0x0229FE89,
        0x2C469265,
        0xCE566940,
        0x8BDD8EC7,
        0x34787625,
        0xD3CA6290,
        0xDDEAF039,
        0xC0E64870,
        0xDAD35570,
        0x5A740578,
        0x5B004D15,
        0x6A9C09CD,
        0x2384F10A,
        0xDA993A47,
        0x8227DF4F,
        0x4C298165,
        0xFC563735,
        0x8CB91483,
        0x775BF5D0,
        0xD5C428D0,
        0x34CC0EA3,
        0xEA3B4CB7,
        0x8E59F029,
        0x2094DE2B,
        0xA65A0AD4,
        0x9BBEE5F4,
        0xBE836343,
        0x22D5344E,
        0x19A1470C,
        0x4A56B1FF,
        0x70B8E86F,
        0x0A5B4A39,
        0xB5C3F670,
        0x53CC3F70,
        0xC03B0A99,
        0x7259C415,
        0x4095108B,
        0x7559BDB1,
        0xB3BF0BBC,
        0x2183FF1C,
        0x2BD54279,
        0x23A156CA,
        0x64E2D7E4,
        0x683AF69A,
        0xAED2346E,
        0x4F9F2CAB,
        0x02935131,
        0xC48FB86D,
        0x2269F369,
        0xC18FB3B4,
        0x50EF1236,
        0xC28FB547,
        0x96C3BF47,
        0xBF8FB08E,
        0xF3E4D49C,
        0x32179058,
        0x280BFEE6,
        0x30178D32,
        0x21ADDAF8,
        0x4217A988,
        0x772633D6,
        0x08A3D11E,
        0xB7E2323A,
        0x07A3CF8B,
        0x91DFB7D1,
        0x06A3CDF8,
        0x6BDD3D68,
        0x1D5636A7,
        0xD5B808E5,
        0x1353E852,
        0xBF16B916,
        0xA55B89ED,
        0x3C1A2017,
        0x0588B13C,
        0xF22F0174,
        0xE83641E1,
        0x6E69B533,
        0xF1760448,
        0x64C8BD58,
        0x97B4EA23,
        0x9A4E92E6,
        0xCFB14012,
        0xF01B2511,
        0x0BBB59C3,
        0xCE524AFA,
        0xDD16EF45,
        0x60648BB3,
        0x7FA4BCFC,
        0x5053AE17,
        0xC9302890,
        0x956DED32,
        0x9136DB84,
        0xDF9D3323,
        0x32BB6CD0,
        0xC8F8385B,
        0xEB08BFBA,
        0x62CC8E3D,
        0xC3E20F5C,
        0x39E97F17,
        0x7837B203,
        0x319E877B,
        0xD3E63F89,
        0x29B50B38,
        0x5ED678B8,
        0xB0D5B793,
        0x52450BE5,
        0xFA72D767,
        0x95066709,
        0x7F52E123,
        0x76966481,
        0x063258B0,
        0x2DED6E8A,
        0xB07D7C52,
        0xD0C71B71,
        0xF684F1BD,
        0x868ECFA8,
        0xF794F684,
        0xD19701C3,
        0x346E171E,
        0x91F8F676,
        0x0BF58848,
        0x6317B6D1,
        0xAFAD4C54,
        0x0F25681E,
        0x91B18D49,
        0x7D61C12E,
        0x5147D25C,
        0x9A8B6805,
        0x4CD2A447,
        0x1E549B14,
        0x2FE1B574,
        0xCF0CD31E,
        0x6C471669,
        0x0E5EEF1E,
        0x2BED3602,
        0xB26249E0,
        0x2C9B86A4,
        0xE415E2BB,
        0x18A98D1D,
        0xB7DF8B7B,
        0x241E9075,
        0x063F70DD,
        0x0295AED9,
        0x56A7F781,
        0x253BC645,
        0x46610921,
        0x7C1577F9,
        0x512B2851,
        0x76823999,
        0xC0586935,
        0xF3415C85,
        0x0AE4FF65,
        0x58B79725,
        0xDEA43AA5,
        0x2BB3BE35,
        0xEA777A45,
        0x8F21C305,
        0x5C9D0865,
        0xFA823DD5,
        0x21A27271,
        0x83C5C6D5,
        0x813B0881,
    ]

    # zip() stops at the shorter list, so keep the explicit length check to
    # catch a vector that was added without its expected hash (or vice versa).
    assert len(INPUTS) == len(OUTPUTS)
    for data, expected in zip(INPUTS, OUTPUTS):
        assert get_fnv1a_hash(data) == expected


def _encode_and_hash(input: str) -> int:
    """Return the test-helper FNV-1a hash of the UTF-8 encoding of *input*."""
    return get_fnv1a_hash(input.encode("UTF-8"))
def get_fnv1a_hash(input: bytes):
    """Return the 32-bit FNV-1a hash of *input*.

    Python-callable method to facilitate testing: it hashes a complete byte
    string in one call so that tests can compare the result against
    reference values.

    input: The bytes to hash.
    RETURNS: The hash value, in the range of an unsigned 32-bit integer.
    """
    # Starting value of the hash; this is also the result for empty input.
    cdef uint32_t hash_val = 0x811c9dc5
    # len() yields a Py_ssize_t; keeping that width for the length and the
    # index avoids narrowing to a C int for very large inputs.
    cdef Py_ssize_t length = len(input), offset

    for offset in range(length):
        # FNV-1a order: XOR the byte in first, then multiply. The uint32_t
        # arithmetic wraps modulo 2**32.
        hash_val ^= input[offset]
        hash_val *= 0x01000193
    return hash_val