mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-10 08:12:24 +03:00
Create a rolling buffer for the StringStore in Language.pipe()
This commit is contained in:
parent
59c216196c
commit
5c14f3f033
|
@ -8,6 +8,7 @@ import random
|
||||||
import ujson
|
import ujson
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
import itertools
|
import itertools
|
||||||
|
import weakref
|
||||||
|
|
||||||
from .tokenizer import Tokenizer
|
from .tokenizer import Tokenizer
|
||||||
from .vocab import Vocab
|
from .vocab import Vocab
|
||||||
|
@ -510,8 +511,33 @@ class Language(object):
|
||||||
else:
|
else:
|
||||||
# Apply the function, but yield the doc
|
# Apply the function, but yield the doc
|
||||||
docs = _pipe(proc, docs)
|
docs = _pipe(proc, docs)
|
||||||
|
# Track weakrefs of "recent" documents, so that we can see when they
|
||||||
|
# expire from memory. When they do, we know we don't need old strings.
|
||||||
|
# This way, we avoid maintaining an unbounded growth in string entries
|
||||||
|
# in the string store.
|
||||||
|
recent_refs = weakref.WeakSet()
|
||||||
|
old_refs = weakref.WeakSet()
|
||||||
|
original_strings_data = self.vocab.strings.to_bytes()
|
||||||
|
StringStore = self.vocab.strings.__class__
|
||||||
|
recent_strings = StringStore().from_bytes(original_strings_data)
|
||||||
|
nr_seen = 0
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
yield doc
|
yield doc
|
||||||
|
for word in doc:
|
||||||
|
recent_strings.add(word.text)
|
||||||
|
recent_refs.add(doc)
|
||||||
|
if nr_seen < 1000:
|
||||||
|
old_refs.add(doc)
|
||||||
|
nr_seen += 1
|
||||||
|
elif len(old_refs) == 0:
|
||||||
|
# All the docs in the 'old' set have expired, so the only
|
||||||
|
# difference between the backup strings and the current
|
||||||
|
# string-store should be obsolete. We therefore swap out the
|
||||||
|
# old strings data.
|
||||||
|
old_refs, recent_refs = recent_refs, old_refs
|
||||||
|
self.vocab.strings._reset_and_load(recent_strings)
|
||||||
|
recent_strings = StringStore().from_bytes(original_strings_data)
|
||||||
|
nr_seen = 0
|
||||||
|
|
||||||
def to_disk(self, path, disable=tuple()):
|
def to_disk(self, path, disable=tuple()):
|
||||||
"""Save the current state to a directory. If a model is loaded, this
|
"""Save the current state to a directory. If a model is loaded, this
|
||||||
|
|
Loading…
Reference in New Issue
Block a user