Move string cleanup under a setting flag

This commit is contained in:
Matthew Honnibal 2017-11-23 12:19:18 +00:00
parent 5949777b12
commit 8fec7268eb

View File

@@ -547,22 +547,27 @@ class Language(object):
# in the string store. # in the string store.
recent_refs = weakref.WeakSet() recent_refs = weakref.WeakSet()
old_refs = weakref.WeakSet() old_refs = weakref.WeakSet()
# If there is anything that we have inside — after iterations we should # Keep track of the original string data, so that if we flush old strings,
# carefully get it back. # we can recover the original ones. However, we only want to do this if we're
original_strings_data = list(self.vocab.strings) # really adding strings, to save up-front costs.
original_strings_data = None
nr_seen = 0 nr_seen = 0
for doc in docs: for doc in docs:
yield doc yield doc
recent_refs.add(doc) if cleanup:
if nr_seen < 10000: recent_refs.add(doc)
old_refs.add(doc) if nr_seen < 10000:
nr_seen += 1 old_refs.add(doc)
elif len(old_refs) == 0: nr_seen += 1
old_refs, recent_refs = recent_refs, old_refs elif len(old_refs) == 0:
keys, strings = self.vocab.strings._cleanup_stale_strings(original_strings_data) old_refs, recent_refs = recent_refs, old_refs
self.vocab._reset_cache(keys, strings) if original_strings_data is None:
self.tokenizer._reset_cache(keys) original_strings_data = list(self.vocab.strings)
nr_seen = 0 else:
keys, strings = self.vocab.strings._cleanup_stale_strings(original_strings_data)
self.vocab._reset_cache(keys, strings)
self.tokenizer._reset_cache(keys)
nr_seen = 0
def to_disk(self, path, disable=tuple()): def to_disk(self, path, disable=tuple()):
"""Save the current state to a directory. If a model is loaded, this """Save the current state to a directory. If a model is loaded, this