Remove 'cleanup' of strings (#6007)
A long time ago we went to some trouble to try to clean up "unused" strings, to avoid the `StringStore` growing in long-running processes. This never really worked reliably, and I think it was a really wrong approach. It's much better to let the user reload the `nlp` object as necessary, now that the string encoding is stable (in v1, the string IDs were sequential integers, which made reloading the `nlp` object really annoying). The extra book-keeping does make some performance difference, and the feature is unused, so it's past time we killed it.
This commit is contained in:
parent 690bd77669
commit 046c38bd26
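The recommendation in the commit message, reloading the `nlp` object once the `StringStore` has grown too large, can be sketched roughly as follows. This is an illustration only, not part of the commit: the pipeline name, the threshold, and the sample batches are placeholders.

import spacy

MODEL_NAME = "en_core_web_sm"   # placeholder: any installed pipeline package
MAX_STRINGS = 100_000           # arbitrary threshold chosen for this sketch

nlp = spacy.load(MODEL_NAME)
batches = [["One batch of texts."], ["Another batch, with new words."]]

for texts in batches:
    for doc in nlp.pipe(texts):
        print(doc[0].text, len(nlp.vocab.strings))
    # String IDs are stable hashes (since v2), so a freshly loaded pipeline
    # assigns the same IDs to the same strings; reloading simply drops the
    # string entries this process has accumulated.
    if len(nlp.vocab.strings) > MAX_STRINGS:
        nlp = spacy.load(MODEL_NAME)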
spacy/language.py

@@ -1314,7 +1314,6 @@ class Language:
         as_tuples: bool = False,
         batch_size: int = 1000,
         disable: Iterable[str] = SimpleFrozenList(),
-        cleanup: bool = False,
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
         n_process: int = 1,
     ):
@@ -1326,8 +1325,6 @@ class Language:
             (doc, context) tuples. Defaults to False.
         batch_size (int): The number of texts to buffer.
         disable (List[str]): Names of the pipeline components to disable.
-        cleanup (bool): If True, unneeded strings are freed to control memory
-            use. Experimental.
         component_cfg (Dict[str, Dict]): An optional dictionary with extra keyword
             arguments for specific components.
         n_process (int): Number of processors to process texts. If -1, set `multiprocessing.cpu_count()`.
@@ -1378,35 +1375,9 @@ class Language:
         for pipe in pipes:
             docs = pipe(docs)
-        # Track weakrefs of "recent" documents, so that we can see when they
-        # expire from memory. When they do, we know we don't need old strings.
-        # This way, we avoid maintaining an unbounded growth in string entries
-        # in the string store.
-        recent_refs = weakref.WeakSet()
-        old_refs = weakref.WeakSet()
-        # Keep track of the original string data, so that if we flush old strings,
-        # we can recover the original ones. However, we only want to do this if we're
-        # really adding strings, to save up-front costs.
-        original_strings_data = None
         nr_seen = 0
         for doc in docs:
             yield doc
-            if cleanup:
-                recent_refs.add(doc)
-                if nr_seen < 10000:
-                    old_refs.add(doc)
-                    nr_seen += 1
-                elif len(old_refs) == 0:
-                    old_refs, recent_refs = recent_refs, old_refs
-                    if original_strings_data is None:
-                        original_strings_data = list(self.vocab.strings)
-                    else:
-                        keys, strings = self.vocab.strings._cleanup_stale_strings(
-                            original_strings_data
-                        )
-                        self.vocab._reset_cache(keys, strings)
-                        self.tokenizer._reset_cache(keys)
-                    nr_seen = 0
 
     def _multiprocessing_pipe(
         self,
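For context on the "extra book-keeping" mentioned in the commit message: the block removed from `Language.pipe` tracked documents in two generations of weak references and only flushed strings once the older generation had been garbage-collected. A rough standalone sketch of that pattern, with a made-up stand-in for `Doc` and no actual string flushing, looks like this:

import weakref

class FakeDoc:
    """Stand-in for a Doc; it only needs to be weak-referenceable."""

recent_refs = weakref.WeakSet()   # docs yielded since the last swap
old_refs = weakref.WeakSet()      # the previous "generation" of docs
nr_seen = 0

for doc in (FakeDoc() for _ in range(30000)):
    recent_refs.add(doc)
    if nr_seen < 10000:
        old_refs.add(doc)
        nr_seen += 1
    elif len(old_refs) == 0:
        # Every doc in the old generation has been garbage-collected, so no
        # caller can still rely on strings used only by those docs; this is
        # the point where the removed code called _cleanup_stale_strings().
        old_refs, recent_refs = recent_refs, old_refs
        nr_seen = 0

One way this could misbehave, visible in the sketch: if the caller keeps every Doc alive, old_refs never empties and the cleanup never triggers at all.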
spacy/strings.pxd

@@ -23,7 +23,6 @@ cdef class StringStore:
     cdef Pool mem
 
     cdef vector[hash_t] keys
-    cdef set[hash_t] hits
     cdef public PreshMap _map
 
     cdef const Utf8Str* intern_unicode(self, unicode py_string)

spacy/strings.pyx

@@ -127,7 +127,6 @@ cdef class StringStore:
             return SYMBOLS_BY_INT[string_or_id]
         else:
             key = string_or_id
-            self.hits.insert(key)
             utf8str = <Utf8Str*>self._map.get(key)
             if utf8str is NULL:
                 raise KeyError(Errors.E018.format(hash_value=string_or_id))
@@ -198,7 +197,6 @@ cdef class StringStore:
         if key < len(SYMBOLS_BY_INT):
             return True
         else:
-            self.hits.insert(key)
             return self._map.get(key) is not NULL
 
     def __iter__(self):
@@ -210,7 +208,6 @@ cdef class StringStore:
         cdef hash_t key
         for i in range(self.keys.size()):
             key = self.keys[i]
-            self.hits.insert(key)
             utf8str = <Utf8Str*>self._map.get(key)
             yield decode_Utf8Str(utf8str)
         # TODO: Iterate OOV here?
@@ -269,41 +266,9 @@ cdef class StringStore:
         self.mem = Pool()
         self._map = PreshMap()
         self.keys.clear()
-        self.hits.clear()
         for string in strings:
             self.add(string)
 
-    def _cleanup_stale_strings(self, excepted):
-        """
-        excepted (list): Strings that should not be removed.
-        RETURNS (keys, strings): Dropped strings and keys that can be dropped from other places
-        """
-        if self.hits.size() == 0:
-            # If we don't have any hits, just skip cleanup
-            return
-
-        cdef vector[hash_t] tmp
-        dropped_strings = []
-        dropped_keys = []
-        for i in range(self.keys.size()):
-            key = self.keys[i]
-            # Here we cannot use __getitem__ because it also set hit.
-            utf8str = <Utf8Str*>self._map.get(key)
-            value = decode_Utf8Str(utf8str)
-            if self.hits.count(key) != 0 or value in excepted:
-                tmp.push_back(key)
-            else:
-                dropped_keys.append(key)
-                dropped_strings.append(value)
-
-        self.keys.swap(tmp)
-        strings = list(self)
-        self._reset_and_load(strings)
-        # Here we have strings but hits to it should be reseted
-        self.hits.clear()
-
-        return dropped_keys, dropped_strings
-
     cdef const Utf8Str* intern_unicode(self, unicode py_string):
         # 0 means missing, but we don't bother offsetting the index.
         cdef bytes byte_string = py_string.encode("utf8")
@@ -319,6 +284,5 @@ cdef class StringStore:
             return value
         value = _allocate(self.mem, <unsigned char*>utf8_string, length)
         self._map.set(key, value)
-        self.hits.insert(key)
         self.keys.push_back(key)
         return value
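The "stable string encoding" the commit message relies on can be seen directly in the public StringStore API. A small usage sketch, not part of the diff:

from spacy.strings import StringStore

store_a = StringStore()
store_b = StringStore()

# Identical strings hash to identical 64-bit keys in any store, which is
# what makes "reload the nlp object when the store gets big" safe: a fresh
# store reproduces the same IDs on demand.
key_a = store_a.add("coffee")
key_b = store_b.add("coffee")
assert key_a == key_b
assert store_a[key_a] == "coffee"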