mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
Merge pull request #1424 from explosion/feature/streaming-data-memory-growth
💫 Fix streaming data memory growth (!!)
This commit is contained in:
commit
fc797a58de
|
@ -8,6 +8,7 @@ import random
|
||||||
import ujson
|
import ujson
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
import itertools
|
import itertools
|
||||||
|
import weakref
|
||||||
|
|
||||||
from .tokenizer import Tokenizer
|
from .tokenizer import Tokenizer
|
||||||
from .vocab import Vocab
|
from .vocab import Vocab
|
||||||
|
@ -510,8 +511,33 @@ class Language(object):
|
||||||
else:
|
else:
|
||||||
# Apply the function, but yield the doc
|
# Apply the function, but yield the doc
|
||||||
docs = _pipe(proc, docs)
|
docs = _pipe(proc, docs)
|
||||||
|
# Track weakrefs of "recent" documents, so that we can see when they
|
||||||
|
# expire from memory. When they do, we know we don't need old strings.
|
||||||
|
# This way, we avoid maintaining an unbounded growth in string entries
|
||||||
|
# in the string store.
|
||||||
|
recent_refs = weakref.WeakSet()
|
||||||
|
old_refs = weakref.WeakSet()
|
||||||
|
original_strings_data = self.vocab.strings.to_bytes()
|
||||||
|
StringStore = self.vocab.strings.__class__
|
||||||
|
recent_strings = StringStore().from_bytes(original_strings_data)
|
||||||
|
nr_seen = 0
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
yield doc
|
yield doc
|
||||||
|
for word in doc:
|
||||||
|
recent_strings.add(word.text)
|
||||||
|
recent_refs.add(doc)
|
||||||
|
if nr_seen < 10000:
|
||||||
|
old_refs.add(doc)
|
||||||
|
nr_seen += 1
|
||||||
|
elif len(old_refs) == 0:
|
||||||
|
# All the docs in the 'old' set have expired, so the only
|
||||||
|
# difference between the backup strings and the current
|
||||||
|
# string-store should be obsolete. We therefore swap out the
|
||||||
|
# old strings data.
|
||||||
|
old_refs, recent_refs = recent_refs, old_refs
|
||||||
|
self.vocab.strings._reset_and_load(recent_strings)
|
||||||
|
recent_strings = StringStore().from_bytes(original_strings_data)
|
||||||
|
nr_seen = 0
|
||||||
|
|
||||||
def to_disk(self, path, disable=tuple()):
|
def to_disk(self, path, disable=tuple()):
|
||||||
"""Save the current state to a directory. If a model is loaded, this
|
"""Save the current state to a directory. If a model is loaded, this
|
||||||
|
|
|
@ -21,11 +21,9 @@ ctypedef union Utf8Str:
|
||||||
|
|
||||||
cdef class StringStore:
|
cdef class StringStore:
|
||||||
cdef Pool mem
|
cdef Pool mem
|
||||||
cdef bint is_frozen
|
|
||||||
|
|
||||||
cdef vector[hash_t] keys
|
cdef vector[hash_t] keys
|
||||||
cdef public PreshMap _map
|
cdef public PreshMap _map
|
||||||
cdef public PreshMap _oov
|
|
||||||
|
|
||||||
cdef const Utf8Str* intern_unicode(self, unicode py_string)
|
cdef const Utf8Str* intern_unicode(self, unicode py_string)
|
||||||
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)
|
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)
|
||||||
|
|
|
@ -86,8 +86,6 @@ cdef class StringStore:
|
||||||
"""
|
"""
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self._map = PreshMap()
|
self._map = PreshMap()
|
||||||
self._oov = PreshMap()
|
|
||||||
self.is_frozen = freeze
|
|
||||||
if strings is not None:
|
if strings is not None:
|
||||||
for string in strings:
|
for string in strings:
|
||||||
self.add(string)
|
self.add(string)
|
||||||
|
@ -243,21 +241,12 @@ cdef class StringStore:
|
||||||
self.add(word)
|
self.add(word)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def set_frozen(self, bint is_frozen):
|
def _reset_and_load(self, strings):
|
||||||
# TODO
|
|
||||||
self.is_frozen = is_frozen
|
|
||||||
|
|
||||||
def flush_oov(self):
|
|
||||||
self._oov = PreshMap()
|
|
||||||
|
|
||||||
def _reset_and_load(self, strings, freeze=False):
|
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self._map = PreshMap()
|
self._map = PreshMap()
|
||||||
self._oov = PreshMap()
|
|
||||||
self.keys.clear()
|
self.keys.clear()
|
||||||
for string in strings:
|
for string in strings:
|
||||||
self.add(string)
|
self.add(string)
|
||||||
self.is_frozen = freeze
|
|
||||||
|
|
||||||
cdef const Utf8Str* intern_unicode(self, unicode py_string):
|
cdef const Utf8Str* intern_unicode(self, unicode py_string):
|
||||||
# 0 means missing, but we don't bother offsetting the index.
|
# 0 means missing, but we don't bother offsetting the index.
|
||||||
|
@ -272,18 +261,6 @@ cdef class StringStore:
|
||||||
cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
|
cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
|
||||||
if value is not NULL:
|
if value is not NULL:
|
||||||
return value
|
return value
|
||||||
value = <Utf8Str*>self._oov.get(key)
|
|
||||||
if value is not NULL:
|
|
||||||
return value
|
|
||||||
if self.is_frozen:
|
|
||||||
# OOV store uses 32 bit hashes. Pretty ugly :(
|
|
||||||
key32 = hash32_utf8(utf8_string, length)
|
|
||||||
# Important: Make the OOV store own the memory. That way it's trivial
|
|
||||||
# to flush them all.
|
|
||||||
value = _allocate(self._oov.mem, <unsigned char*>utf8_string, length)
|
|
||||||
self._oov.set(key32, value)
|
|
||||||
return NULL
|
|
||||||
|
|
||||||
value = _allocate(self.mem, <unsigned char*>utf8_string, length)
|
value = _allocate(self.mem, <unsigned char*>utf8_string, length)
|
||||||
self._map.set(key, value)
|
self._map.set(key, value)
|
||||||
self.keys.push_back(key)
|
self.keys.push_back(key)
|
||||||
|
|
|
@ -57,9 +57,9 @@ def test_parser_parse_navigate_consistency(en_tokenizer, text, heads):
|
||||||
doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
|
doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
|
||||||
for head in doc:
|
for head in doc:
|
||||||
for child in head.lefts:
|
for child in head.lefts:
|
||||||
assert child.head is head
|
assert child.head == head
|
||||||
for child in head.rights:
|
for child in head.rights:
|
||||||
assert child.head is head
|
assert child.head == head
|
||||||
|
|
||||||
|
|
||||||
def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads):
|
def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads):
|
||||||
|
|
|
@ -54,6 +54,8 @@ cdef class Doc:
|
||||||
|
|
||||||
cdef public object noun_chunks_iterator
|
cdef public object noun_chunks_iterator
|
||||||
|
|
||||||
|
cdef object __weakref__
|
||||||
|
|
||||||
cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1
|
cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1
|
||||||
|
|
||||||
cpdef np.ndarray to_array(self, object features)
|
cpdef np.ndarray to_array(self, object features)
|
||||||
|
|
|
@ -140,7 +140,6 @@ cdef class Doc:
|
||||||
self.user_span_hooks = {}
|
self.user_span_hooks = {}
|
||||||
self.tensor = numpy.zeros((0,), dtype='float32')
|
self.tensor = numpy.zeros((0,), dtype='float32')
|
||||||
self.user_data = {}
|
self.user_data = {}
|
||||||
self._py_tokens = []
|
|
||||||
self._vector = None
|
self._vector = None
|
||||||
self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
|
self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
|
||||||
cdef unicode orth
|
cdef unicode orth
|
||||||
|
@ -209,10 +208,7 @@ cdef class Doc:
|
||||||
if i < 0:
|
if i < 0:
|
||||||
i = self.length + i
|
i = self.length + i
|
||||||
bounds_check(i, self.length, PADDING)
|
bounds_check(i, self.length, PADDING)
|
||||||
if self._py_tokens[i] is not None:
|
return Token.cinit(self.vocab, &self.c[i], i, self)
|
||||||
return self._py_tokens[i]
|
|
||||||
else:
|
|
||||||
return Token.cinit(self.vocab, &self.c[i], i, self)
|
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
"""Iterate over `Token` objects, from which the annotations can be
|
"""Iterate over `Token` objects, from which the annotations can be
|
||||||
|
@ -226,10 +222,7 @@ cdef class Doc:
|
||||||
"""
|
"""
|
||||||
cdef int i
|
cdef int i
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
if self._py_tokens[i] is not None:
|
yield Token.cinit(self.vocab, &self.c[i], i, self)
|
||||||
yield self._py_tokens[i]
|
|
||||||
else:
|
|
||||||
yield Token.cinit(self.vocab, &self.c[i], i, self)
|
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
"""The number of tokens in the document.
|
"""The number of tokens in the document.
|
||||||
|
@ -535,7 +528,6 @@ cdef class Doc:
|
||||||
self.length += 1
|
self.length += 1
|
||||||
# Set morphological attributes, e.g. by lemma, if possible
|
# Set morphological attributes, e.g. by lemma, if possible
|
||||||
self.vocab.morphology.assign_untagged(t)
|
self.vocab.morphology.assign_untagged(t)
|
||||||
self._py_tokens.append(None)
|
|
||||||
return t.idx + t.lex.length + t.spacy
|
return t.idx + t.lex.length + t.spacy
|
||||||
|
|
||||||
@cython.boundscheck(False)
|
@cython.boundscheck(False)
|
||||||
|
@ -841,7 +833,6 @@ cdef class Doc:
|
||||||
# Set the left/right children, left/right edges
|
# Set the left/right children, left/right edges
|
||||||
set_children_from_heads(self.c, self.length)
|
set_children_from_heads(self.c, self.length)
|
||||||
# Clear the cached Python objects
|
# Clear the cached Python objects
|
||||||
self._py_tokens = [None] * self.length
|
|
||||||
# Return the merged Python object
|
# Return the merged Python object
|
||||||
return self[start]
|
return self[start]
|
||||||
|
|
||||||
|
|
|
@ -19,10 +19,7 @@ cdef class Token:
|
||||||
if offset < 0 or offset >= doc.length:
|
if offset < 0 or offset >= doc.length:
|
||||||
msg = "Attempt to access token at %d, max length %d"
|
msg = "Attempt to access token at %d, max length %d"
|
||||||
raise IndexError(msg % (offset, doc.length))
|
raise IndexError(msg % (offset, doc.length))
|
||||||
if doc._py_tokens[offset] != None:
|
|
||||||
return doc._py_tokens[offset]
|
|
||||||
cdef Token self = Token.__new__(Token, vocab, doc, offset)
|
cdef Token self = Token.__new__(Token, vocab, doc, offset)
|
||||||
doc._py_tokens[offset] = self
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
#cdef inline TokenC struct_from_attrs(Vocab vocab, attrs):
|
#cdef inline TokenC struct_from_attrs(Vocab vocab, attrs):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user