mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
Merge pull request #1424 from explosion/feature/streaming-data-memory-growth
💫 Fix streaming data memory growth (!!)
This commit is contained in:
commit
fc797a58de
|
@ -8,6 +8,7 @@ import random
|
|||
import ujson
|
||||
from collections import OrderedDict
|
||||
import itertools
|
||||
import weakref
|
||||
|
||||
from .tokenizer import Tokenizer
|
||||
from .vocab import Vocab
|
||||
|
@ -510,8 +511,33 @@ class Language(object):
|
|||
else:
|
||||
# Apply the function, but yield the doc
|
||||
docs = _pipe(proc, docs)
|
||||
# Track weakrefs of "recent" documents, so that we can see when they
|
||||
# expire from memory. When they do, we know we don't need old strings.
|
||||
# This way, we avoid unbounded growth in the number of string entries
|
||||
# in the string store.
|
||||
recent_refs = weakref.WeakSet()
|
||||
old_refs = weakref.WeakSet()
|
||||
original_strings_data = self.vocab.strings.to_bytes()
|
||||
StringStore = self.vocab.strings.__class__
|
||||
recent_strings = StringStore().from_bytes(original_strings_data)
|
||||
nr_seen = 0
|
||||
for doc in docs:
|
||||
yield doc
|
||||
for word in doc:
|
||||
recent_strings.add(word.text)
|
||||
recent_refs.add(doc)
|
||||
if nr_seen < 10000:
|
||||
old_refs.add(doc)
|
||||
nr_seen += 1
|
||||
elif len(old_refs) == 0:
|
||||
# All the docs in the 'old' set have expired, so the only
|
||||
# difference between the backup strings and the current
|
||||
# string-store should be obsolete. We therefore swap out the
|
||||
# old strings data.
|
||||
old_refs, recent_refs = recent_refs, old_refs
|
||||
self.vocab.strings._reset_and_load(recent_strings)
|
||||
recent_strings = StringStore().from_bytes(original_strings_data)
|
||||
nr_seen = 0
|
||||
|
||||
def to_disk(self, path, disable=tuple()):
|
||||
"""Save the current state to a directory. If a model is loaded, this
|
||||
|
|
|
@ -21,11 +21,9 @@ ctypedef union Utf8Str:
|
|||
|
||||
cdef class StringStore:
|
||||
cdef Pool mem
|
||||
cdef bint is_frozen
|
||||
|
||||
cdef vector[hash_t] keys
|
||||
cdef public PreshMap _map
|
||||
cdef public PreshMap _oov
|
||||
|
||||
cdef const Utf8Str* intern_unicode(self, unicode py_string)
|
||||
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)
|
||||
|
|
|
@ -86,8 +86,6 @@ cdef class StringStore:
|
|||
"""
|
||||
self.mem = Pool()
|
||||
self._map = PreshMap()
|
||||
self._oov = PreshMap()
|
||||
self.is_frozen = freeze
|
||||
if strings is not None:
|
||||
for string in strings:
|
||||
self.add(string)
|
||||
|
@ -243,21 +241,12 @@ cdef class StringStore:
|
|||
self.add(word)
|
||||
return self
|
||||
|
||||
def set_frozen(self, bint is_frozen):
|
||||
# TODO
|
||||
self.is_frozen = is_frozen
|
||||
|
||||
def flush_oov(self):
|
||||
self._oov = PreshMap()
|
||||
|
||||
def _reset_and_load(self, strings, freeze=False):
|
||||
def _reset_and_load(self, strings):
|
||||
self.mem = Pool()
|
||||
self._map = PreshMap()
|
||||
self._oov = PreshMap()
|
||||
self.keys.clear()
|
||||
for string in strings:
|
||||
self.add(string)
|
||||
self.is_frozen = freeze
|
||||
|
||||
cdef const Utf8Str* intern_unicode(self, unicode py_string):
|
||||
# 0 means missing, but we don't bother offsetting the index.
|
||||
|
@ -272,18 +261,6 @@ cdef class StringStore:
|
|||
cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
|
||||
if value is not NULL:
|
||||
return value
|
||||
value = <Utf8Str*>self._oov.get(key)
|
||||
if value is not NULL:
|
||||
return value
|
||||
if self.is_frozen:
|
||||
# OOV store uses 32 bit hashes. Pretty ugly :(
|
||||
key32 = hash32_utf8(utf8_string, length)
|
||||
# Important: Make the OOV store own the memory. That way it's trivial
|
||||
# to flush them all.
|
||||
value = _allocate(self._oov.mem, <unsigned char*>utf8_string, length)
|
||||
self._oov.set(key32, value)
|
||||
return NULL
|
||||
|
||||
value = _allocate(self.mem, <unsigned char*>utf8_string, length)
|
||||
self._map.set(key, value)
|
||||
self.keys.push_back(key)
|
||||
|
|
|
@ -57,9 +57,9 @@ def test_parser_parse_navigate_consistency(en_tokenizer, text, heads):
|
|||
doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
|
||||
for head in doc:
|
||||
for child in head.lefts:
|
||||
assert child.head is head
|
||||
assert child.head == head
|
||||
for child in head.rights:
|
||||
assert child.head is head
|
||||
assert child.head == head
|
||||
|
||||
|
||||
def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads):
|
||||
|
|
|
@ -54,6 +54,8 @@ cdef class Doc:
|
|||
|
||||
cdef public object noun_chunks_iterator
|
||||
|
||||
cdef object __weakref__
|
||||
|
||||
cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1
|
||||
|
||||
cpdef np.ndarray to_array(self, object features)
|
||||
|
|
|
@ -140,7 +140,6 @@ cdef class Doc:
|
|||
self.user_span_hooks = {}
|
||||
self.tensor = numpy.zeros((0,), dtype='float32')
|
||||
self.user_data = {}
|
||||
self._py_tokens = []
|
||||
self._vector = None
|
||||
self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
|
||||
cdef unicode orth
|
||||
|
@ -209,10 +208,7 @@ cdef class Doc:
|
|||
if i < 0:
|
||||
i = self.length + i
|
||||
bounds_check(i, self.length, PADDING)
|
||||
if self._py_tokens[i] is not None:
|
||||
return self._py_tokens[i]
|
||||
else:
|
||||
return Token.cinit(self.vocab, &self.c[i], i, self)
|
||||
return Token.cinit(self.vocab, &self.c[i], i, self)
|
||||
|
||||
def __iter__(self):
|
||||
"""Iterate over `Token` objects, from which the annotations can be
|
||||
|
@ -226,10 +222,7 @@ cdef class Doc:
|
|||
"""
|
||||
cdef int i
|
||||
for i in range(self.length):
|
||||
if self._py_tokens[i] is not None:
|
||||
yield self._py_tokens[i]
|
||||
else:
|
||||
yield Token.cinit(self.vocab, &self.c[i], i, self)
|
||||
yield Token.cinit(self.vocab, &self.c[i], i, self)
|
||||
|
||||
def __len__(self):
|
||||
"""The number of tokens in the document.
|
||||
|
@ -535,7 +528,6 @@ cdef class Doc:
|
|||
self.length += 1
|
||||
# Set morphological attributes, e.g. by lemma, if possible
|
||||
self.vocab.morphology.assign_untagged(t)
|
||||
self._py_tokens.append(None)
|
||||
return t.idx + t.lex.length + t.spacy
|
||||
|
||||
@cython.boundscheck(False)
|
||||
|
@ -841,7 +833,6 @@ cdef class Doc:
|
|||
# Set the left/right children, left/right edges
|
||||
set_children_from_heads(self.c, self.length)
|
||||
# Clear the cached Python objects
|
||||
self._py_tokens = [None] * self.length
|
||||
# Return the merged Python object
|
||||
return self[start]
|
||||
|
||||
|
|
|
@ -19,10 +19,7 @@ cdef class Token:
|
|||
if offset < 0 or offset >= doc.length:
|
||||
msg = "Attempt to access token at %d, max length %d"
|
||||
raise IndexError(msg % (offset, doc.length))
|
||||
if doc._py_tokens[offset] != None:
|
||||
return doc._py_tokens[offset]
|
||||
cdef Token self = Token.__new__(Token, vocab, doc, offset)
|
||||
doc._py_tokens[offset] = self
|
||||
return self
|
||||
|
||||
#cdef inline TokenC struct_from_attrs(Vocab vocab, attrs):
|
||||
|
|
Loading…
Reference in New Issue
Block a user