mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	
							parent
							
								
									6977a2b8cd
								
							
						
					
					
						commit
						62fc6b1afa
					
				|  | @ -5,11 +5,12 @@ cimport cython | |||
| from libc.string cimport memcpy | ||||
| from libc.stdint cimport uint64_t | ||||
| 
 | ||||
| from murmurhash.mrmr cimport hash64 | ||||
| from murmurhash.mrmr cimport hash64, hash32 | ||||
| 
 | ||||
| from preshed.maps cimport map_iter, key_t | ||||
| 
 | ||||
| from .typedefs cimport hash_t | ||||
| from libc.stdint cimport uint32_t | ||||
| 
 | ||||
| try: | ||||
|     import ujson as json | ||||
|  | @ -26,6 +27,10 @@ cdef hash_t _hash_utf8(char* utf8_string, int length): | |||
|     return hash64(utf8_string, length, 1) | ||||
| 
 | ||||
| 
 | ||||
| cdef uint32_t _hash32_utf8(char* utf8_string, int length): | ||||
|     return hash32(utf8_string, length, 1) | ||||
| 
 | ||||
| 
 | ||||
| cdef unicode _decode(const Utf8Str* string): | ||||
|     cdef int i, length | ||||
|     if string.s[0] < sizeof(string.s) and string.s[0] != 0: | ||||
|  | @ -84,7 +89,7 @@ cdef class StringStore: | |||
|         self._resize_at = 10000 | ||||
|         self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str)) | ||||
|         self.size = 1 | ||||
|         self.is_frozen = False | ||||
|         self.is_frozen = freeze | ||||
|         if strings is not None: | ||||
|             for string in strings: | ||||
|                 _ = self[string] | ||||
|  | @ -118,12 +123,14 @@ cdef class StringStore: | |||
|         cdef bytes byte_string | ||||
|         cdef const Utf8Str* utf8str | ||||
|         cdef uint64_t int_id | ||||
|         cdef uint32_t oov_id | ||||
|         if isinstance(string_or_id, (int, long)): | ||||
|             int_id = string_or_id | ||||
|             oov_id = string_or_id | ||||
|             if int_id < <uint64_t>self.size: | ||||
|                 return _decode(&self.c[int_id]) | ||||
|             else: | ||||
|                 utf8str = <Utf8Str*>self._oov.get(int_id) | ||||
|                 utf8str = <Utf8Str*>self._oov.get(oov_id) | ||||
|                 if utf8str is not NULL: | ||||
|                     return _decode(utf8str) | ||||
|                 else: | ||||
|  | @ -137,10 +144,12 @@ cdef class StringStore: | |||
|                 raise TypeError(type(string_or_id)) | ||||
|             utf8str = self._intern_utf8(byte_string, len(byte_string)) | ||||
|             if utf8str is NULL: | ||||
|                 # TODO: We could get unlucky here, and hash into a value that | ||||
|                 # collides with the 'real' strings. All we have to do is offset | ||||
|                 # I think? | ||||
|                 return _hash_utf8(byte_string, len(byte_string)) | ||||
|                 # TODO: We need to use 32 bit here, for compatibility with the  | ||||
|                 # vocabulary values. This makes birthday paradox probabilities | ||||
|                 # pretty bad. | ||||
|                 # We could also get unlucky here, and hash into a value that | ||||
|                 # collides with the 'real' strings.  | ||||
|                 return _hash32_utf8(byte_string, len(byte_string)) | ||||
|             else: | ||||
|                 return utf8str - self.c | ||||
| 
 | ||||
|  | @ -199,11 +208,13 @@ cdef class StringStore: | |||
|         if value is not NULL: | ||||
|             return value | ||||
|         if self.is_frozen: | ||||
|             # OOV store uses 32 bit hashes. Pretty ugly :( | ||||
|             key32 = _hash32_utf8(utf8_string, length) | ||||
|             # Important: Make the OOV store own the memory. That way it's trivial | ||||
|             # to flush them all. | ||||
|             value = <Utf8Str*>self._oov.mem.alloc(1, sizeof(Utf8Str)) | ||||
|             value[0] = _allocate(self._oov.mem, <unsigned char*>utf8_string, length) | ||||
|             self._oov.set(key, value) | ||||
|             self._oov.set(key32, value) | ||||
|             return NULL | ||||
| 
 | ||||
|         if self.size == self._resize_at: | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user