Clean all caches

This commit is contained in:
Roman Domrachev 2017-11-14 21:15:04 +03:00
parent 4e378dc4a4
commit 91e2fa6561
4 changed files with 27 additions and 5 deletions

View File

@ -559,13 +559,14 @@ class Language(object):
nr_seen += 1 nr_seen += 1
elif len(old_refs) == 0: elif len(old_refs) == 0:
old_refs, recent_refs = recent_refs, old_refs old_refs, recent_refs = recent_refs, old_refs
self.vocab.strings._cleanup_stale_strings() keys, strings = self.vocab.strings._cleanup_stale_strings()
nr_seen = 0 self.vocab._reset_cache(keys, strings)
# We can't know which strings from the last batch have really expired. self.tokenizer._reset_cache(keys)
# So we don't erase the strings — we just extend with the original
# content.
for string in original_strings_data: for string in original_strings_data:
self.vocab.strings.add(string) self.vocab.strings.add(string)
nr_seen = 0
# We can't know which strings from the last batch have really expired.
# So we don't erase the strings.
def to_disk(self, path, disable=tuple()): def to_disk(self, path, disable=tuple()):
"""Save the current state to a directory. If a model is loaded, this """Save the current state to a directory. If a model is loaded, this

View File

@ -250,15 +250,23 @@ cdef class StringStore:
self.add(string) self.add(string)
def _cleanup_stale_strings(self): def _cleanup_stale_strings(self):
"""
RETURNS (keys, strings): Dropped strings and keys that can be dropped from other places
"""
if self.hits.size() == 0: if self.hits.size() == 0:
# If we don't have any hits, just skip cleanup # If we don't have any hits, just skip cleanup
return return
cdef vector[hash_t] tmp cdef vector[hash_t] tmp
dropped_strings = []
dropped_keys = []
for i in range(self.keys.size()): for i in range(self.keys.size()):
key = self.keys[i] key = self.keys[i]
if self.hits.count(key) != 0: if self.hits.count(key) != 0:
tmp.push_back(key) tmp.push_back(key)
else:
dropped_keys.append(key)
dropped_strings.append(self[key])
self.keys.swap(tmp) self.keys.swap(tmp)
strings = list(self) strings = list(self)
@ -266,6 +274,8 @@ cdef class StringStore:
# Here we still have the strings, but their hits should be reset # Here we still have the strings, but their hits should be reset
self.hits.clear() self.hits.clear()
return dropped_keys, dropped_strings
cdef const Utf8Str* intern_unicode(self, unicode py_string): cdef const Utf8Str* intern_unicode(self, unicode py_string):
# 0 means missing, but we don't bother offsetting the index. # 0 means missing, but we don't bother offsetting the index.
cdef bytes byte_string = py_string.encode('utf8') cdef bytes byte_string = py_string.encode('utf8')

View File

@ -132,6 +132,11 @@ cdef class Tokenizer:
for text in texts: for text in texts:
yield self(text) yield self(text)
def _reset_cache(self, keys):
for k in keys:
del self._cache[k]
del self._specials[k]
cdef int _try_cache(self, hash_t key, Doc tokens) except -1: cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
cached = <_Cached*>self._cache.get(key) cached = <_Cached*>self._cache.get(key)
if cached == NULL: if cached == NULL:

View File

@ -465,6 +465,12 @@ cdef class Vocab:
self._by_orth.set(lexeme.orth, lexeme) self._by_orth.set(lexeme.orth, lexeme)
self.length += 1 self.length += 1
def _reset_cache(self, keys, strings):
# Drop cached entries after strings have been removed from the string store.
# keys: iterable of keys; each one is deleted from self._by_hash.
# strings: accepted but not used by the code visible here.
for k in keys:
del self._by_hash[k]
# Replace the whole _by_orth map with a newly constructed PreshMap.
# NOTE(review): indentation is lost in this listing, so it is unclear whether
# this assignment runs once after the loop or once per key; confirm against
# the original file.
self._by_orth = PreshMap()
def pickle_vocab(vocab): def pickle_vocab(vocab):
sstore = vocab.strings sstore = vocab.strings