mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-13 05:07:03 +03:00
Move string cleanup under a setting flag
This commit is contained in:
parent
5949777b12
commit
8fec7268eb
|
@ -547,18 +547,23 @@ class Language(object):
|
||||||
# in the string store.
|
# in the string store.
|
||||||
recent_refs = weakref.WeakSet()
|
recent_refs = weakref.WeakSet()
|
||||||
old_refs = weakref.WeakSet()
|
old_refs = weakref.WeakSet()
|
||||||
# If there is anything that we have inside — after iterations we should
|
# Keep track of the original string data, so that if we flush old strings,
|
||||||
# carefully get it back.
|
# we can recover the original ones. However, we only want to do this if we're
|
||||||
original_strings_data = list(self.vocab.strings)
|
# really adding strings, to save up-front costs.
|
||||||
|
original_strings_data = None
|
||||||
nr_seen = 0
|
nr_seen = 0
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
yield doc
|
yield doc
|
||||||
|
if cleanup:
|
||||||
recent_refs.add(doc)
|
recent_refs.add(doc)
|
||||||
if nr_seen < 10000:
|
if nr_seen < 10000:
|
||||||
old_refs.add(doc)
|
old_refs.add(doc)
|
||||||
nr_seen += 1
|
nr_seen += 1
|
||||||
elif len(old_refs) == 0:
|
elif len(old_refs) == 0:
|
||||||
old_refs, recent_refs = recent_refs, old_refs
|
old_refs, recent_refs = recent_refs, old_refs
|
||||||
|
if original_strings_data is None:
|
||||||
|
original_strings_data = list(self.vocab.strings)
|
||||||
|
else:
|
||||||
keys, strings = self.vocab.strings._cleanup_stale_strings(original_strings_data)
|
keys, strings = self.vocab.strings._cleanup_stale_strings(original_strings_data)
|
||||||
self.vocab._reset_cache(keys, strings)
|
self.vocab._reset_cache(keys, strings)
|
||||||
self.tokenizer._reset_cache(keys)
|
self.tokenizer._reset_cache(keys)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user