Mirror of https://github.com/explosion/spaCy.git, synced 2025-10-30 23:47:31 +03:00
	Remove 'cleanup' of strings (#6007)
A long time ago we went to some trouble to try to clean up "unused" strings, to avoid the `StringStore` growing in long-running processes. This never really worked reliably, and I think it was really the wrong approach. It's much better to let the user reload the `nlp` object as necessary, now that the string encoding is stable (in v1, the string IDs were sequential integers, which made reloading the `nlp` object really annoying). The extra book-keeping does make some performance difference, and the feature is unused, so it's past time we killed it.
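The replacement pattern is simple enough to sketch. The following is illustrative only, not part of this commit; the model name and reload interval are placeholders:

import spacy

def process_stream(texts, reload_every=100_000):
    # In a long-running process, periodically reload the pipeline instead
    # of relying on string cleanup: a fresh pipeline starts with a small
    # StringStore, and hash-based string IDs stay stable across loads.
    nlp = spacy.load("en_core_web_sm")
    for i, doc in enumerate(nlp.pipe(texts)):
        yield doc
        if (i + 1) % reload_every == 0:
            nlp = spacy.load("en_core_web_sm")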
This commit is contained in:
parent 690bd77669
commit 046c38bd26
spacy/language.py
@@ -1314,7 +1314,6 @@ class Language:
         as_tuples: bool = False,
         batch_size: int = 1000,
         disable: Iterable[str] = SimpleFrozenList(),
-        cleanup: bool = False,
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
         n_process: int = 1,
     ):
@@ -1326,8 +1325,6 @@ class Language:
             (doc, context) tuples. Defaults to False.
         batch_size (int): The number of texts to buffer.
         disable (List[str]): Names of the pipeline components to disable.
-        cleanup (bool): If True, unneeded strings are freed to control memory
-            use. Experimental.
         component_cfg (Dict[str, Dict]): An optional dictionary with extra keyword
             arguments for specific components.
         n_process (int): Number of processors to process texts. If -1, set `multiprocessing.cpu_count()`.
@@ -1378,35 +1375,9 @@ class Language:
         for pipe in pipes:
             docs = pipe(docs)

-        # Track weakrefs of "recent" documents, so that we can see when they
-        # expire from memory. When they do, we know we don't need old strings.
-        # This way, we avoid maintaining an unbounded growth in string entries
-        # in the string store.
-        recent_refs = weakref.WeakSet()
-        old_refs = weakref.WeakSet()
-        # Keep track of the original string data, so that if we flush old strings,
-        # we can recover the original ones. However, we only want to do this if we're
-        # really adding strings, to save up-front costs.
-        original_strings_data = None
-        nr_seen = 0
         for doc in docs:
             yield doc
-            if cleanup:
-                recent_refs.add(doc)
-                if nr_seen < 10000:
-                    old_refs.add(doc)
-                    nr_seen += 1
-                elif len(old_refs) == 0:
-                    old_refs, recent_refs = recent_refs, old_refs
-                    if original_strings_data is None:
-                        original_strings_data = list(self.vocab.strings)
-                    else:
-                        keys, strings = self.vocab.strings._cleanup_stale_strings(
-                            original_strings_data
-                        )
-                        self.vocab._reset_cache(keys, strings)
-                        self.tokenizer._reset_cache(keys)
-                    nr_seen = 0

     def _multiprocessing_pipe(
         self,
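The deleted logic above hinged on one property of weakref.WeakSet: members disappear once they are garbage-collected. A minimal standalone sketch of just that property (the Doc class here is a stand-in, not spaCy's):

import weakref

class Doc:
    # Stand-in for spacy.tokens.Doc; any weakref-able object works.
    pass

refs = weakref.WeakSet()
doc = Doc()
refs.add(doc)
assert len(refs) == 1
del doc
# With no strong references left, CPython reclaims the object and the
# WeakSet empties -- the signal the removed code used to conclude that
# strings seen only by expired docs were candidates for cleanup.
assert len(refs) == 0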
spacy/strings.pxd
@@ -23,7 +23,6 @@ cdef class StringStore:
     cdef Pool mem

     cdef vector[hash_t] keys
-    cdef set[hash_t] hits
     cdef public PreshMap _map

     cdef const Utf8Str* intern_unicode(self, unicode py_string)
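The hits set declared above fed the _cleanup_stale_strings sweep removed in the next file. A rough pure-Python analogue of that bookkeeping, to show the cost the commit message alludes to (hypothetical class, illustration only):

class TrackedStore:
    def __init__(self):
        self._map = {}
        self.hits = set()

    def __getitem__(self, key):
        # Every single lookup pays for an extra set insertion -- the
        # book-keeping overhead the commit removes.
        self.hits.add(key)
        return self._map[key]

    def cleanup_stale(self, excepted=()):
        # Drop entries not hit since the last sweep, unless excepted.
        dropped = [k for k, v in self._map.items()
                   if k not in self.hits and v not in excepted]
        for k in dropped:
            del self._map[k]
        self.hits.clear()
        return dropped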
spacy/strings.pyx
@@ -127,7 +127,6 @@ cdef class StringStore:
             return SYMBOLS_BY_INT[string_or_id]
         else:
             key = string_or_id
-            self.hits.insert(key)
             utf8str = <Utf8Str*>self._map.get(key)
             if utf8str is NULL:
                 raise KeyError(Errors.E018.format(hash_value=string_or_id))
@@ -198,7 +197,6 @@
         if key < len(SYMBOLS_BY_INT):
             return True
         else:
-            self.hits.insert(key)
             return self._map.get(key) is not NULL

     def __iter__(self):
@@ -210,7 +208,6 @@
         cdef hash_t key
         for i in range(self.keys.size()):
             key = self.keys[i]
-            self.hits.insert(key)
             utf8str = <Utf8Str*>self._map.get(key)
             yield decode_Utf8Str(utf8str)
         # TODO: Iterate OOV here?
@@ -269,41 +266,9 @@
         self.mem = Pool()
         self._map = PreshMap()
         self.keys.clear()
-        self.hits.clear()
         for string in strings:
             self.add(string)

-    def _cleanup_stale_strings(self, excepted):
-        """
-        excepted (list): Strings that should not be removed.
-        RETURNS (keys, strings): Dropped strings and keys that can be dropped from other places
-        """
-        if self.hits.size() == 0:
-            # If we don't have any hits, just skip cleanup
-            return
-
-        cdef vector[hash_t] tmp
-        dropped_strings = []
-        dropped_keys = []
-        for i in range(self.keys.size()):
-            key = self.keys[i]
-            # Here we cannot use __getitem__ because it also set hit.
-            utf8str = <Utf8Str*>self._map.get(key)
-            value = decode_Utf8Str(utf8str)
-            if self.hits.count(key) != 0 or value in excepted:
-                tmp.push_back(key)
-            else:
-                dropped_keys.append(key)
-                dropped_strings.append(value)
-
-        self.keys.swap(tmp)
-        strings = list(self)
-        self._reset_and_load(strings)
-        # Here we have strings but hits to it should be reseted
-        self.hits.clear()
-
-        return dropped_keys, dropped_strings
-
     cdef const Utf8Str* intern_unicode(self, unicode py_string):
         # 0 means missing, but we don't bother offsetting the index.
         cdef bytes byte_string = py_string.encode("utf8")
@@ -319,6 +284,5 @@
             return value
         value = _allocate(self.mem, <unsigned char*>utf8_string, length)
         self._map.set(key, value)
-        self.hits.insert(key)
         self.keys.push_back(key)
         return value
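What makes removing the feature safe is the stable, hash-based encoding the commit message mentions: StringStore keys are content hashes, so any two stores (or a reloaded pipeline) agree on them. A quick illustration, assuming spaCy is installed:

from spacy.strings import StringStore

s1 = StringStore()
s2 = StringStore()
key = s1.add("coffee")
# The same string maps to the same 64-bit key in any store, so IDs held
# across a reload still resolve to the same text.
assert s2.add("coffee") == key
assert s1[key] == "coffee"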