diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index f76dc45b5..c54d38338 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -998,18 +998,18 @@ def test_doc_spans_setdefault(en_tokenizer): assert len(doc.spans["key3"]) == 2 -EMPTY_HASH_VALUE = 0x811C9DC5 +EMPTY_HASH_VALUE = 0xCBF29CE484222325 def test_fnv1a_hash(): - """Checks the conformity of the FNV1A implementation with + """Checks the conformity of the 64-bit FNV1A implementation with http://www.isthe.com/chongo/src/fnv/test_fnv.c. The method called here is only used in testing; in production code, the hashing is performed in a fashion that is interweaved with other logic. The conformity of the production code is demonstrated by the character combination hash tests, where hashes produced by the production code are tested for equality - against hashes prodduced by the test code. + against hashes produced by the test code. s""" INPUTS = [ b"", @@ -1219,208 +1219,208 @@ def test_fnv1a_hash(): OUTPUTS = [ EMPTY_HASH_VALUE, - 0xE40C292C, - 0xE70C2DE5, - 0xE60C2C52, - 0xE10C2473, - 0xE00C22E0, - 0xE30C2799, - 0x6222E842, - 0xA9F37ED7, - 0x3F5076EF, - 0x39AAA18A, - 0xBF9CF968, - 0x050C5D1F, - 0x2B24D044, - 0x9D2C3F7F, - 0x7729C516, - 0xB91D6109, - 0x931AE6A0, - 0x052255DB, - 0xBEF39FE6, - 0x6150AC75, - 0x9AAB3A3D, - 0x519C4C3E, - 0x0C1C9EB8, - 0x5F299F4E, - 0xEF8580F3, - 0xAC297727, - 0x4546B9C0, - 0xBD564E7D, - 0x6BDD5C67, - 0xDD77ED30, - 0xF4CA9683, - 0x4AEB9BD0, - 0xE0E67AD0, - 0xC2D32FA8, - 0x7F743FB7, - 0x6900631F, - 0xC59C990E, - 0x448524FD, - 0xD49930D5, - 0x1C85C7CA, - 0x0229FE89, - 0x2C469265, - 0xCE566940, - 0x8BDD8EC7, - 0x34787625, - 0xD3CA6290, - 0xDDEAF039, - 0xC0E64870, - 0xDAD35570, - 0x5A740578, - 0x5B004D15, - 0x6A9C09CD, - 0x2384F10A, - 0xDA993A47, - 0x8227DF4F, - 0x4C298165, - 0xFC563735, - 0x8CB91483, - 0x775BF5D0, - 0xD5C428D0, - 0x34CC0EA3, - 0xEA3B4CB7, - 0x8E59F029, - 0x2094DE2B, - 0xA65A0AD4, - 0x9BBEE5F4, - 0xBE836343, - 0x22D5344E, - 0x19A1470C, - 0x4A56B1FF, - 0x70B8E86F, - 0x0A5B4A39, - 0xB5C3F670, - 0x53CC3F70, - 0xC03B0A99, - 0x7259C415, - 0x4095108B, - 0x7559BDB1, - 0xB3BF0BBC, - 0x2183FF1C, - 0x2BD54279, - 0x23A156CA, - 0x64E2D7E4, - 0x683AF69A, - 0xAED2346E, - 0x4F9F2CAB, - 0x02935131, - 0xC48FB86D, - 0x2269F369, - 0xC18FB3B4, - 0x50EF1236, - 0xC28FB547, - 0x96C3BF47, - 0xBF8FB08E, - 0xF3E4D49C, - 0x32179058, - 0x280BFEE6, - 0x30178D32, - 0x21ADDAF8, - 0x4217A988, - 0x772633D6, - 0x08A3D11E, - 0xB7E2323A, - 0x07A3CF8B, - 0x91DFB7D1, - 0x06A3CDF8, - 0x6BDD3D68, - 0x1D5636A7, - 0xD5B808E5, - 0x1353E852, - 0xBF16B916, - 0xA55B89ED, - 0x3C1A2017, - 0x0588B13C, - 0xF22F0174, - 0xE83641E1, - 0x6E69B533, - 0xF1760448, - 0x64C8BD58, - 0x97B4EA23, - 0x9A4E92E6, - 0xCFB14012, - 0xF01B2511, - 0x0BBB59C3, - 0xCE524AFA, - 0xDD16EF45, - 0x60648BB3, - 0x7FA4BCFC, - 0x5053AE17, - 0xC9302890, - 0x956DED32, - 0x9136DB84, - 0xDF9D3323, - 0x32BB6CD0, - 0xC8F8385B, - 0xEB08BFBA, - 0x62CC8E3D, - 0xC3E20F5C, - 0x39E97F17, - 0x7837B203, - 0x319E877B, - 0xD3E63F89, - 0x29B50B38, - 0x5ED678B8, - 0xB0D5B793, - 0x52450BE5, - 0xFA72D767, - 0x95066709, - 0x7F52E123, - 0x76966481, - 0x063258B0, - 0x2DED6E8A, - 0xB07D7C52, - 0xD0C71B71, - 0xF684F1BD, - 0x868ECFA8, - 0xF794F684, - 0xD19701C3, - 0x346E171E, - 0x91F8F676, - 0x0BF58848, - 0x6317B6D1, - 0xAFAD4C54, - 0x0F25681E, - 0x91B18D49, - 0x7D61C12E, - 0x5147D25C, - 0x9A8B6805, - 0x4CD2A447, - 0x1E549B14, - 0x2FE1B574, - 0xCF0CD31E, - 0x6C471669, - 0x0E5EEF1E, - 0x2BED3602, - 0xB26249E0, - 0x2C9B86A4, - 0xE415E2BB, - 0x18A98D1D, - 0xB7DF8B7B, - 0x241E9075, - 0x063F70DD, - 0x0295AED9, - 0x56A7F781, - 0x253BC645, - 0x46610921, - 0x7C1577F9, - 0x512B2851, - 0x76823999, - 0xC0586935, - 0xF3415C85, - 0x0AE4FF65, - 0x58B79725, - 0xDEA43AA5, - 0x2BB3BE35, - 0xEA777A45, - 0x8F21C305, - 0x5C9D0865, - 0xFA823DD5, - 0x21A27271, - 0x83C5C6D5, - 0x813B0881, + 0xAF63DC4C8601EC8C, + 0xAF63DF4C8601F1A5, + 0xAF63DE4C8601EFF2, + 0xAF63D94C8601E773, + 0xAF63D84C8601E5C0, + 0xAF63DB4C8601EAD9, + 0x08985907B541D342, + 0xDCB27518FED9D577, + 0xDD120E790C2512AF, + 0xCAC165AFA2FEF40A, + 0x85944171F73967E8, + 0xAF63BD4C8601B7DF, + 0x089BE207B544F1E4, + 0x08A61407B54D9B5F, + 0x08A2AE07B54AB836, + 0x0891B007B53C4869, + 0x088E4A07B5396540, + 0x08987C07B5420EBB, + 0xDCB28A18FED9F926, + 0xDD1270790C25B935, + 0xCAC146AFA2FEBF5D, + 0x8593D371F738ACFE, + 0x34531CA7168B8F38, + 0x08A25607B54A22AE, + 0xF5FAF0190CF90DF3, + 0xF27397910B3221C7, + 0x2C8C2B76062F22E0, + 0xE150688C8217B8FD, + 0xF35A83C10E4F1F87, + 0xD1EDD10B507344D0, + 0x2A5EE739B3DDB8C3, + 0xDCFB970CA1C0D310, + 0x4054DA76DAA6DA90, + 0xF70A2FF589861368, + 0x4C628B38AED25F17, + 0x9DD1F6510F78189F, + 0xA3DE85BD491270CE, + 0x858E2FA32A55E61D, + 0x46810940EFF5F915, + 0xF5FADD190CF8EDAA, + 0xF273ED910B32B3E9, + 0x2C8C5276062F6525, + 0xE150B98C821842A0, + 0xF35AA3C10E4F55E7, + 0xD1ED680B50729265, + 0x2A5F0639B3DDED70, + 0xDCFBAA0CA1C0F359, + 0x4054BA76DAA6A430, + 0xF709C7F5898562B0, + 0x4C62E638AED2F9B8, + 0x9DD1A8510F779415, + 0xA3DE2ABD4911D62D, + 0x858E0EA32A55AE0A, + 0x46810F40EFF60347, + 0xC33BCE57BEF63EAF, + 0x08A24307B54A0265, + 0xF5B9FD190CC18D15, + 0x4C968290ACE35703, + 0x07174BD5C64D9350, + 0x5A294C3FF5D18750, + 0x05B3C1AEB308B843, + 0xB92A48DA37D0F477, + 0x73CDDDCCD80EBC49, + 0xD58C4C13210A266B, + 0xE78B6081243EC194, + 0xB096F77096A39F34, + 0xB425C54FF807B6A3, + 0x23E520E2751BB46E, + 0x1A0B44CCFE1385EC, + 0xF5BA4B190CC2119F, + 0x4C962690ACE2BAAF, + 0x0716DED5C64CDA19, + 0x5A292C3FF5D150F0, + 0x05B3E0AEB308ECF0, + 0xB92A5EDA37D119D9, + 0x73CE41CCD80F6635, + 0xD58C2C132109F00B, + 0xE78BAF81243F47D1, + 0xB0968F7096A2EE7C, + 0xB425A84FF807855C, + 0x23E4E9E2751B56F9, + 0x1A0B4ECCFE1396EA, + 0x54ABD453BB2C9004, + 0x08BA5F07B55EC3DA, + 0x337354193006CB6E, + 0xA430D84680AABD0B, + 0xA9BC8ACCA21F39B1, + 0x6961196491CC682D, + 0xAD2BB1774799DFE9, + 0x6961166491CC6314, + 0x8D1BB3904A3B1236, + 0x6961176491CC64C7, + 0xED205D87F40434C7, + 0x6961146491CC5FAE, + 0xCD3BAF5E44F8AD9C, + 0xE3B36596127CD6D8, + 0xF77F1072C8E8A646, + 0xE3B36396127CD372, + 0x6067DCE9932AD458, + 0xE3B37596127CF208, + 0x4B7B10FA9FE83936, + 0xAABAFE7104D914BE, + 0xF4D3180B3CDE3EDA, + 0xAABAFD7104D9130B, + 0xF4CFB20B3CDB5BB1, + 0xAABAFC7104D91158, + 0xF4CC4C0B3CD87888, + 0xE729BAC5D2A8D3A7, + 0x74BC0524F4DFA4C5, + 0xE72630C5D2A5B352, + 0x6B983224EF8FB456, + 0xE73042C5D2AE266D, + 0x8527E324FDEB4B37, + 0x0A83C86FEE952ABC, + 0x7318523267779D74, + 0x3E66D3D56B8CACA1, + 0x956694A5C0095593, + 0xCAC54572BB1A6FC8, + 0xA7A4C9F3EDEBF0D8, + 0x7829851FAC17B143, + 0x2C8F4C9AF81BCF06, + 0xD34E31539740C732, + 0x3605A2AC253D2DB1, + 0x08C11B8346F4A3C3, + 0x6BE396289CE8A6DA, + 0xD9B957FB7FE794C5, + 0x05BE33DA04560A93, + 0x0957F1577BA9747C, + 0xDA2CC3ACC24FBA57, + 0x74136F185B29E7F0, + 0xB2F2B4590EDB93B2, + 0xB3608FCE8B86AE04, + 0x4A3A865079359063, + 0x5B3A7EF496880A50, + 0x48FAE3163854C23B, + 0x07AAA640476E0B9A, + 0x2F653656383A687D, + 0xA1031F8E7599D79C, + 0xA31908178FF92477, + 0x097EDF3C14C3FB83, + 0xB51CA83FEAA0971B, + 0xDD3C0D96D784F2E9, + 0x86CD26A9EA767D78, + 0xE6B215FF54A30C18, + 0xEC5B06A1C5531093, + 0x45665A929F9EC5E5, + 0x8C7609B4A9F10907, + 0x89AAC3A491F0D729, + 0x32CE6B26E0F4A403, + 0x614AB44E02B53E01, + 0xFA6472EB6EEF3290, + 0x9E5D75EB1948EB6A, + 0xB6D12AD4A8671852, + 0x88826F56EBA07AF1, + 0x44535BF2645BC0FD, + 0x169388FFC21E3728, + 0xF68AAC9E396D8224, + 0x8E87D7E7472B3883, + 0x295C26CAA8B423DE, + 0x322C814292E72176, + 0x8A06550EB8AF7268, + 0xEF86D60E661BCF71, + 0x9E5426C87F30EE54, + 0xF1EA8AA826FD047E, + 0x0BABAF9A642CB769, + 0x4B3341D4068D012E, + 0xD15605CBC30A335C, + 0x5B21060AED8412E5, + 0x45E2CDA1CE6F4227, + 0x50AE3745033AD7D4, + 0xAA4588CED46BF414, + 0xC1B0056C4A95467E, + 0x56576A71DE8B4089, + 0xBF20965FA6DC927E, + 0x569F8383C2040882, + 0xE1E772FBA08FECA0, + 0x4CED94AF97138AC4, + 0xC4112FFB337A82FB, + 0xD64A4FD41DE38B7D, + 0x4CFC32329EDEBCBB, + 0x0803564445050395, + 0xAA1574ECF4642FFD, + 0x694BC4E54CC315F9, + 0xA3D7CB273B011721, + 0x577C2F8B6115BFA5, + 0xB7EC8C1A769FB4C1, + 0x5D5CFCE63359AB19, + 0x33B96C3CD65B5F71, + 0xD845097780602BB9, + 0x84D47645D02DA3D5, + 0x83544F33B58773A5, + 0x9175CBB2160836C5, + 0xC71B3BC175E72BC5, + 0x636806AC222EC985, + 0xB6EF0E6950F52ED5, + 0xEAD3D8A0F3DFDAA5, + 0x922908FE9A861BA5, + 0x6D4821DE275FD5C5, + 0x1FE3FCE62BD816B5, + 0xC23E9FCCD6F70591, + 0xC1AF12BDFE16B5B5, + 0x39E9F18F2F85E221, ] assert len(INPUTS) == len(OUTPUTS) @@ -1713,17 +1713,14 @@ def test_get_character_combination_hashes_string_store_spec_cases( def test_character_combination_hashes_empty_lengths(en_tokenizer): doc = en_tokenizer("and𐌞") - assert ( - doc.get_character_combination_hashes( - cs=True, - p_lengths=bytes(), - s_lengths=bytes(), - ps_search_chars=bytes(), - ps_width_offsets=bytes(), - ps_lengths=bytes(), - ss_search_chars=bytes(), - ss_width_offsets=bytes(), - ss_lengths=bytes(), - ).shape - == (1, 0) - ) + assert doc.get_character_combination_hashes( + cs=True, + p_lengths=bytes(), + s_lengths=bytes(), + ps_search_chars=bytes(), + ps_width_offsets=bytes(), + ps_lengths=bytes(), + ss_search_chars=bytes(), + ss_width_offsets=bytes(), + ss_lengths=bytes(), + ).shape == (1, 0) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 711436a0f..20880e528 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -6,7 +6,7 @@ cimport numpy as np from cpython cimport array from libc.string cimport memcpy, memcmp, memset, strlen from libc.math cimport sqrt -from libc.stdint cimport int32_t, uint64_t, uint32_t +from libc.stdint cimport int32_t, uint64_t import copy from collections import Counter, defaultdict @@ -1779,8 +1779,9 @@ cdef class Doc: hashed for "spaCy" would be "c" and "ca". Many of the buffers passed into and used by this method contain single-byte numerical values. This takes advantage of - the fact that we are hashing short affixes and searching for small groups of characters; the calling code is responsible - lengths being passed in cannot exceed 63 and that *_search_chars buffers are never longer than 255. + the fact that we are hashing short affixes and searching for small groups of characters. The calling code is responsible + for ensuring that lengths being passed in cannot exceed 63 and hence that resulting values with a maximum of four-byte + character widths can never exceed 255. """ # Work out lengths @@ -2155,14 +2156,17 @@ cdef void _search_for_chars( memset(l_buf + l_buf_idx, res_buf_idx, max_res_l - l_buf_idx) +cdef uint64_t FNV1A_OFFSET_BASIS = 0xcbf29ce484222325 +cdef uint64_t FNV1A_PRIME = 0x00000100000001B3 + def get_fnv1a_hash(input: bytes): """ Python-callable method to facilitate testing. """ - cdef uint32_t hash_val = 0x811c9dc5 + cdef uint64_t hash_val = FNV1A_OFFSET_BASIS cdef int length = len(input), offset = 0 while offset < length: hash_val ^= input[offset] - hash_val *= 0x01000193 + hash_val *= FNV1A_PRIME offset += 1 return hash_val @@ -2175,7 +2179,7 @@ cdef int _write_hashes( const int res_buf_last, np.uint64_t* hashes_ptr, ) nogil: - """ Write FNV1A hashes for a token/rich property group combination. + """ Write 64-bit FNV1A hashes for a token/rich property group combination. res_buf: the string from which to generate the hash values. aff_l_buf: one-byte lengths describing how many characters to hash. @@ -2188,7 +2192,7 @@ cdef int _write_hashes( """ cdef int last_offset = 0, hash_idx = 0, offset, aff_l - cdef uint32_t hash_val = 0x811c9dc5 + cdef uint64_t hash_val = FNV1A_OFFSET_BASIS while True: aff_l = aff_l_buf[hash_idx] @@ -2200,7 +2204,7 @@ cdef int _write_hashes( hash_val ^= res_buf[res_buf_last - last_offset] else: hash_val ^= res_buf[last_offset] - hash_val *= 0x01000193 + hash_val *= FNV1A_PRIME last_offset += 1 hashes_ptr[hash_idx] = hash_val hash_idx += 1 diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index a76c099de..78c1ad401 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -219,6 +219,13 @@ alternate prevents the alternation from occurring, e.g. an `ä` in a German plural noun does not become `a` if it is the third or fourth vowel from the end of the word. +There are a few rare situations where a graphical character is expressed as more +than one UTF-8 character, e.g. _i_ when representing the lower-case form of the +Turkish letter _İ_. Such situations are supported, but the lengths of prefixes, +suffixes and character search results may need to be increased accordingly. + +All lengths must be specified in ascending order. + | Name | Description | | ------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `width` | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. If static vectors are included, a learned linear layer is used to map the vectors to the specified width before concatenating it with the other embedding outputs. A single maxout layer is then used to reduce the concatenated vectors to the final width. ~~int~~ |