Remove 'cleanup' of strings (#6007)
A long time ago we went to some trouble to try to clean up "unused" strings, to avoid the `StringStore` growing in long-running processes. This never really worked reliably, and I think it was a really wrong approach. It's much better to let the user reload the `nlp` object as necessary, now that the string encoding is stable (in v1, the string IDs were sequential integers, which made reloading the `nlp` object really annoying). The extra book-keeping does make some performance difference, and the feature is unused, so it's past time we killed it.
This commit is contained in:
parent 690bd77669
commit 046c38bd26
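The recommendation in the commit message, reloading the `nlp` object once the `StringStore` has grown too large, can be sketched roughly as follows. This is an illustration only, not part of the commit: the pipeline name, the threshold, and the sample batches are placeholders.

import spacy

MODEL_NAME = "en_core_web_sm"   # placeholder: any installed pipeline package
MAX_STRINGS = 100_000           # arbitrary threshold chosen for this sketch

nlp = spacy.load(MODEL_NAME)
batches = [["One batch of texts."], ["Another batch, with new words."]]

for texts in batches:
    for doc in nlp.pipe(texts):
        print(doc[0].text, len(nlp.vocab.strings))
    # String IDs are stable hashes (since v2), so a freshly loaded pipeline
    # assigns the same IDs to the same strings; reloading simply drops the
    # string entries this process has accumulated.
    if len(nlp.vocab.strings) > MAX_STRINGS:
        nlp = spacy.load(MODEL_NAME)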
spacy/language.py

@@ -1314,7 +1314,6 @@ class Language:
         as_tuples: bool = False,
         batch_size: int = 1000,
         disable: Iterable[str] = SimpleFrozenList(),
-        cleanup: bool = False,
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
         n_process: int = 1,
     ):
@@ -1326,8 +1325,6 @@ class Language:
             (doc, context) tuples. Defaults to False.
         batch_size (int): The number of texts to buffer.
         disable (List[str]): Names of the pipeline components to disable.
-        cleanup (bool): If True, unneeded strings are freed to control memory
-            use. Experimental.
         component_cfg (Dict[str, Dict]): An optional dictionary with extra keyword
             arguments for specific components.
         n_process (int): Number of processors to process texts. If -1, set `multiprocessing.cpu_count()`.
@@ -1378,35 +1375,9 @@ class Language:
         for pipe in pipes:
             docs = pipe(docs)
-        # Track weakrefs of "recent" documents, so that we can see when they
-        # expire from memory. When they do, we know we don't need old strings.
-        # This way, we avoid maintaining an unbounded growth in string entries
-        # in the string store.
-        recent_refs = weakref.WeakSet()
-        old_refs = weakref.WeakSet()
-        # Keep track of the original string data, so that if we flush old strings,
-        # we can recover the original ones. However, we only want to do this if we're
-        # really adding strings, to save up-front costs.
-        original_strings_data = None
         nr_seen = 0
         for doc in docs:
             yield doc
-            if cleanup:
-                recent_refs.add(doc)
-                if nr_seen < 10000:
-                    old_refs.add(doc)
-                    nr_seen += 1
-                elif len(old_refs) == 0:
-                    old_refs, recent_refs = recent_refs, old_refs
-                    if original_strings_data is None:
-                        original_strings_data = list(self.vocab.strings)
-                    else:
-                        keys, strings = self.vocab.strings._cleanup_stale_strings(
-                            original_strings_data
-                        )
-                        self.vocab._reset_cache(keys, strings)
-                        self.tokenizer._reset_cache(keys)
-                    nr_seen = 0
 
     def _multiprocessing_pipe(
         self,
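For context on the "extra book-keeping" mentioned in the commit message: the block removed from `Language.pipe` tracked documents in two generations of weak references and only flushed strings once the older generation had been garbage-collected. A rough standalone sketch of that pattern, with a made-up stand-in for `Doc` and no actual string flushing, looks like this:

import weakref

class FakeDoc:
    """Stand-in for a Doc; it only needs to be weak-referenceable."""

recent_refs = weakref.WeakSet()   # docs yielded since the last swap
old_refs = weakref.WeakSet()      # the previous "generation" of docs
nr_seen = 0

for doc in (FakeDoc() for _ in range(30000)):
    recent_refs.add(doc)
    if nr_seen < 10000:
        old_refs.add(doc)
        nr_seen += 1
    elif len(old_refs) == 0:
        # Every doc in the old generation has been garbage-collected, so no
        # caller can still rely on strings used only by those docs; this is
        # the point where the removed code called _cleanup_stale_strings().
        old_refs, recent_refs = recent_refs, old_refs
        nr_seen = 0

One way this could misbehave, visible in the sketch: if the caller keeps every Doc alive, old_refs never empties and the cleanup never triggers at all.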
spacy/strings.pxd

@@ -23,7 +23,6 @@ cdef class StringStore:
     cdef Pool mem
 
     cdef vector[hash_t] keys
-    cdef set[hash_t] hits
     cdef public PreshMap _map
 
     cdef const Utf8Str* intern_unicode(self, unicode py_string)

spacy/strings.pyx

@@ -127,7 +127,6 @@ cdef class StringStore:
             return SYMBOLS_BY_INT[string_or_id]
         else:
             key = string_or_id
-            self.hits.insert(key)
             utf8str = <Utf8Str*>self._map.get(key)
             if utf8str is NULL:
                 raise KeyError(Errors.E018.format(hash_value=string_or_id))
@@ -198,7 +197,6 @@ cdef class StringStore:
         if key < len(SYMBOLS_BY_INT):
             return True
         else:
-            self.hits.insert(key)
             return self._map.get(key) is not NULL
 
     def __iter__(self):
@@ -210,7 +208,6 @@ cdef class StringStore:
         cdef hash_t key
         for i in range(self.keys.size()):
             key = self.keys[i]
-            self.hits.insert(key)
             utf8str = <Utf8Str*>self._map.get(key)
             yield decode_Utf8Str(utf8str)
         # TODO: Iterate OOV here?
@@ -269,41 +266,9 @@ cdef class StringStore:
         self.mem = Pool()
         self._map = PreshMap()
         self.keys.clear()
-        self.hits.clear()
         for string in strings:
             self.add(string)
 
-    def _cleanup_stale_strings(self, excepted):
-        """
-        excepted (list): Strings that should not be removed.
-        RETURNS (keys, strings): Dropped strings and keys that can be dropped from other places
-        """
-        if self.hits.size() == 0:
-            # If we don't have any hits, just skip cleanup
-            return
-
-        cdef vector[hash_t] tmp
-        dropped_strings = []
-        dropped_keys = []
-        for i in range(self.keys.size()):
-            key = self.keys[i]
-            # Here we cannot use __getitem__ because it also set hit.
-            utf8str = <Utf8Str*>self._map.get(key)
-            value = decode_Utf8Str(utf8str)
-            if self.hits.count(key) != 0 or value in excepted:
-                tmp.push_back(key)
-            else:
-                dropped_keys.append(key)
-                dropped_strings.append(value)
-
-        self.keys.swap(tmp)
-        strings = list(self)
-        self._reset_and_load(strings)
-        # Here we have strings but hits to it should be reseted
-        self.hits.clear()
-
-        return dropped_keys, dropped_strings
-
     cdef const Utf8Str* intern_unicode(self, unicode py_string):
         # 0 means missing, but we don't bother offsetting the index.
         cdef bytes byte_string = py_string.encode("utf8")
@@ -319,6 +284,5 @@ cdef class StringStore:
             return value
         value = _allocate(self.mem, <unsigned char*>utf8_string, length)
         self._map.set(key, value)
-        self.hits.insert(key)
         self.keys.push_back(key)
         return value
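The "stable string encoding" the commit message relies on can be seen directly in the public StringStore API. A small usage sketch, not part of the diff:

from spacy.strings import StringStore

store_a = StringStore()
store_b = StringStore()

# Identical strings hash to identical 64-bit keys in any store, which is
# what makes "reload the nlp object when the store gets big" safe: a fresh
# store reproduces the same IDs on demand.
key_a = store_a.add("coffee")
key_b = store_b.add("coffee")
assert key_a == key_b
assert store_a[key_a] == "coffee"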