Try to hold on to the original data instead of restoring it

This commit is contained in:
Roman Domrachev 2017-11-14 22:40:03 +03:00
parent 91e2fa6561
commit a33d5a068d
2 changed files with 5 additions and 8 deletions

View File

@ -559,14 +559,10 @@ class Language(object):
nr_seen += 1 nr_seen += 1
elif len(old_refs) == 0: elif len(old_refs) == 0:
old_refs, recent_refs = recent_refs, old_refs old_refs, recent_refs = recent_refs, old_refs
keys, strings = self.vocab.strings._cleanup_stale_strings() keys, strings = self.vocab.strings._cleanup_stale_strings(original_strings_data)
self.vocab._reset_cache(keys, strings) self.vocab._reset_cache(keys, strings)
self.tokenizer._reset_cache(keys) self.tokenizer._reset_cache(keys)
for string in original_strings_data:
self.vocab.strings.add(string)
nr_seen = 0 nr_seen = 0
# We can't know which strings from the last batch have really expired.
# So we don't erase the strings.
def to_disk(self, path, disable=tuple()): def to_disk(self, path, disable=tuple()):
"""Save the current state to a directory. If a model is loaded, this """Save the current state to a directory. If a model is loaded, this

View File

@ -249,7 +249,7 @@ cdef class StringStore:
for string in strings: for string in strings:
self.add(string) self.add(string)
def _cleanup_stale_strings(self): def _cleanup_stale_strings(self, excepted):
""" """
RETURNS (keys, strings): Dropped strings and keys that can be dropped from other places RETURNS (keys, strings): Dropped strings and keys that can be dropped from other places
""" """
@ -262,11 +262,12 @@ cdef class StringStore:
dropped_keys = [] dropped_keys = []
for i in range(self.keys.size()): for i in range(self.keys.size()):
key = self.keys[i] key = self.keys[i]
if self.hits.count(key) != 0: value = self[key]
if self.hits.count(key) != 0 or value in excepted:
tmp.push_back(key) tmp.push_back(key)
else: else:
dropped_keys.append(key) dropped_keys.append(key)
dropped_strings.append(self[key]) dropped_strings.append(value)
self.keys.swap(tmp) self.keys.swap(tmp)
strings = list(self) strings = list(self)