mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	
							parent
							
								
									6977a2b8cd
								
							
						
					
					
						commit
						62fc6b1afa
					
				|  | @ -5,11 +5,12 @@ cimport cython | ||||||
| from libc.string cimport memcpy | from libc.string cimport memcpy | ||||||
| from libc.stdint cimport uint64_t | from libc.stdint cimport uint64_t | ||||||
| 
 | 
 | ||||||
| from murmurhash.mrmr cimport hash64 | from murmurhash.mrmr cimport hash64, hash32 | ||||||
| 
 | 
 | ||||||
| from preshed.maps cimport map_iter, key_t | from preshed.maps cimport map_iter, key_t | ||||||
| 
 | 
 | ||||||
| from .typedefs cimport hash_t | from .typedefs cimport hash_t | ||||||
|  | from libc.stdint cimport uint32_t | ||||||
| 
 | 
 | ||||||
| try: | try: | ||||||
|     import ujson as json |     import ujson as json | ||||||
|  | @ -26,6 +27,10 @@ cdef hash_t _hash_utf8(char* utf8_string, int length): | ||||||
|     return hash64(utf8_string, length, 1) |     return hash64(utf8_string, length, 1) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | cdef uint32_t _hash32_utf8(char* utf8_string, int length): | ||||||
|  |     return hash32(utf8_string, length, 1) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| cdef unicode _decode(const Utf8Str* string): | cdef unicode _decode(const Utf8Str* string): | ||||||
|     cdef int i, length |     cdef int i, length | ||||||
|     if string.s[0] < sizeof(string.s) and string.s[0] != 0: |     if string.s[0] < sizeof(string.s) and string.s[0] != 0: | ||||||
|  | @ -84,7 +89,7 @@ cdef class StringStore: | ||||||
|         self._resize_at = 10000 |         self._resize_at = 10000 | ||||||
|         self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str)) |         self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str)) | ||||||
|         self.size = 1 |         self.size = 1 | ||||||
|         self.is_frozen = False |         self.is_frozen = freeze | ||||||
|         if strings is not None: |         if strings is not None: | ||||||
|             for string in strings: |             for string in strings: | ||||||
|                 _ = self[string] |                 _ = self[string] | ||||||
|  | @ -118,12 +123,14 @@ cdef class StringStore: | ||||||
|         cdef bytes byte_string |         cdef bytes byte_string | ||||||
|         cdef const Utf8Str* utf8str |         cdef const Utf8Str* utf8str | ||||||
|         cdef uint64_t int_id |         cdef uint64_t int_id | ||||||
|  |         cdef uint32_t oov_id | ||||||
|         if isinstance(string_or_id, (int, long)): |         if isinstance(string_or_id, (int, long)): | ||||||
|             int_id = string_or_id |             int_id = string_or_id | ||||||
|  |             oov_id = string_or_id | ||||||
|             if int_id < <uint64_t>self.size: |             if int_id < <uint64_t>self.size: | ||||||
|                 return _decode(&self.c[int_id]) |                 return _decode(&self.c[int_id]) | ||||||
|             else: |             else: | ||||||
|                 utf8str = <Utf8Str*>self._oov.get(int_id) |                 utf8str = <Utf8Str*>self._oov.get(oov_id) | ||||||
|                 if utf8str is not NULL: |                 if utf8str is not NULL: | ||||||
|                     return _decode(utf8str) |                     return _decode(utf8str) | ||||||
|                 else: |                 else: | ||||||
|  | @ -137,10 +144,12 @@ cdef class StringStore: | ||||||
|                 raise TypeError(type(string_or_id)) |                 raise TypeError(type(string_or_id)) | ||||||
|             utf8str = self._intern_utf8(byte_string, len(byte_string)) |             utf8str = self._intern_utf8(byte_string, len(byte_string)) | ||||||
|             if utf8str is NULL: |             if utf8str is NULL: | ||||||
|                 # TODO: We could get unlucky here, and hash into a value that |                 # TODO: We need to use 32 bit here, for compatibility with the  | ||||||
|                 # collides with the 'real' strings. All we have to do is offset |                 # vocabulary values. This makes birthday paradox probabilities | ||||||
|                 # I think? |                 # pretty bad. | ||||||
|                 return _hash_utf8(byte_string, len(byte_string)) |                 # We could also get unlucky here, and hash into a value that | ||||||
|  |                 # collides with the 'real' strings.  | ||||||
|  |                 return _hash32_utf8(byte_string, len(byte_string)) | ||||||
|             else: |             else: | ||||||
|                 return utf8str - self.c |                 return utf8str - self.c | ||||||
| 
 | 
 | ||||||
|  | @ -199,11 +208,13 @@ cdef class StringStore: | ||||||
|         if value is not NULL: |         if value is not NULL: | ||||||
|             return value |             return value | ||||||
|         if self.is_frozen: |         if self.is_frozen: | ||||||
|  |             # OOV store uses 32 bit hashes. Pretty ugly :( | ||||||
|  |             key32 = _hash32_utf8(utf8_string, length) | ||||||
|             # Important: Make the OOV store own the memory. That way it's trivial |             # Important: Make the OOV store own the memory. That way it's trivial | ||||||
|             # to flush them all. |             # to flush them all. | ||||||
|             value = <Utf8Str*>self._oov.mem.alloc(1, sizeof(Utf8Str)) |             value = <Utf8Str*>self._oov.mem.alloc(1, sizeof(Utf8Str)) | ||||||
|             value[0] = _allocate(self._oov.mem, <unsigned char*>utf8_string, length) |             value[0] = _allocate(self._oov.mem, <unsigned char*>utf8_string, length) | ||||||
|             self._oov.set(key, value) |             self._oov.set(key32, value) | ||||||
|             return NULL |             return NULL | ||||||
| 
 | 
 | ||||||
|         if self.size == self._resize_at: |         if self.size == self._resize_at: | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user