mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	A long time ago we went to some trouble to try to clean up "unused" strings, to avoid the `StringStore` growing in long-running processes. This never really worked reliably, and I think it was a really wrong approach. It's much better to let the user reload the `nlp` object as necessary, now that the string encoding is stable (in v1, the string IDs were sequential integers, making reloading the NLP object really annoying.) The extra book-keeping does make some performance difference, and the feature is unused, so it's past time we killed it.
		
			
				
	
	
		
			30 lines
		
	
	
		
			711 B
		
	
	
	
		
			Cython
		
	
	
	
	
	
			
		
		
	
	
			30 lines
		
	
	
		
			711 B
		
	
	
	
		
			Cython
		
	
	
	
	
	
| from libc.stdint cimport int64_t
 | |
| from libcpp.vector cimport vector
 | |
| from libcpp.set cimport set
 | |
| from cymem.cymem cimport Pool
 | |
| from preshed.maps cimport PreshMap
 | |
| from murmurhash.mrmr cimport hash64
 | |
| 
 | |
| from .typedefs cimport attr_t, hash_t
 | |
| 
 | |
| 
 | |
| cpdef hash_t hash_string(unicode string) except 0  # Takes a Python unicode string and returns a hash_t; per Cython's `except 0`, a return value of 0 signals that an exception was raised.
 | |
| cdef hash_t hash_utf8(char* utf8_string, int length) nogil  # Takes a char* buffer plus an int length and returns a hash_t; declared nogil, so it may be called without holding the GIL.
 | |
| 
 | |
| cdef unicode decode_Utf8Str(const Utf8Str* string)  # Takes a pointer to a const Utf8Str and returns a Python unicode object.
 | |
| 
 | |
| 
 | |
| ctypedef union Utf8Str:  # Union: the two members below share the same storage.
 | |
|     unsigned char[8] s  # View of the storage as an 8-byte array. NOTE(review): presumably holds short strings inline -- confirm in the implementation.
 | |
|     unsigned char* p  # View of the storage as a pointer. NOTE(review): presumably points at out-of-line data for longer strings -- confirm in the implementation.
 | |
| 
 | |
| 
 | |
| cdef class StringStore:  # Extension type declaration: C-level attributes and method signatures only; the method bodies are not part of this file.
 | |
|     cdef Pool mem  # cymem Pool. NOTE(review): presumably owns the memory backing the stored strings -- confirm in the implementation.
 | |
| 
 | |
|     cdef vector[hash_t] keys  # C++ vector of hash_t values. NOTE(review): presumably the hashes of the stored strings -- confirm in the implementation.
 | |
|     cdef public PreshMap _map  # PreshMap; `public` makes the attribute accessible from Python. NOTE(review): presumably maps a string hash to its Utf8Str -- confirm.
 | |
| 
 | |
|     cdef const Utf8Str* intern_unicode(self, unicode py_string)  # Takes a Python unicode string and returns a pointer to a const Utf8Str.
 | |
|     cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)  # Takes a char* buffer plus an int length and returns a pointer to a const Utf8Str.
 |