mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 18:56:36 +03:00
Merge pull request #1576 from ligser/master
Actually reset caches in pipe [wip]
This commit is contained in:
commit
30ba81f881
|
@ -558,13 +558,11 @@ class Language(object):
|
||||||
old_refs.add(doc)
|
old_refs.add(doc)
|
||||||
nr_seen += 1
|
nr_seen += 1
|
||||||
elif len(old_refs) == 0:
|
elif len(old_refs) == 0:
|
||||||
self.vocab.strings._cleanup_stale_strings()
|
old_refs, recent_refs = recent_refs, old_refs
|
||||||
|
keys, strings = self.vocab.strings._cleanup_stale_strings(original_strings_data)
|
||||||
|
self.vocab._reset_cache(keys, strings)
|
||||||
|
self.tokenizer._reset_cache(keys)
|
||||||
nr_seen = 0
|
nr_seen = 0
|
||||||
# We can't know which strings from the last batch have really expired.
|
|
||||||
# So we don't erase the strings — we just extend with the original
|
|
||||||
# content.
|
|
||||||
for string in original_strings_data:
|
|
||||||
self.vocab.strings.add(string)
|
|
||||||
|
|
||||||
def to_disk(self, path, disable=tuple()):
|
def to_disk(self, path, disable=tuple()):
|
||||||
"""Save the current state to a directory. If a model is loaded, this
|
"""Save the current state to a directory. If a model is loaded, this
|
||||||
|
|
|
@ -249,20 +249,37 @@ cdef class StringStore:
|
||||||
for string in strings:
|
for string in strings:
|
||||||
self.add(string)
|
self.add(string)
|
||||||
|
|
||||||
def _cleanup_stale_strings(self):
|
def _cleanup_stale_strings(self, excepted):
|
||||||
|
"""
|
||||||
|
excepted (list): Strings that should not be removed.
|
||||||
|
RETURNS (keys, strings): The dropped keys and strings, which can then be dropped from other places.
|
||||||
|
"""
|
||||||
if self.hits.size() == 0:
|
if self.hits.size() == 0:
|
||||||
# If we don't have any hits, just skip cleanup
|
# If we don't have any hits, just skip cleanup
|
||||||
return
|
return
|
||||||
|
|
||||||
cdef vector[hash_t] tmp
|
cdef vector[hash_t] tmp
|
||||||
|
dropped_strings = []
|
||||||
|
dropped_keys = []
|
||||||
for i in range(self.keys.size()):
|
for i in range(self.keys.size()):
|
||||||
key = self.keys[i]
|
key = self.keys[i]
|
||||||
if self.hits.count(key) != 0:
|
# We cannot use __getitem__ here because it also sets a hit.
|
||||||
|
utf8str = <Utf8Str*>self._map.get(key)
|
||||||
|
value = decode_Utf8Str(utf8str)
|
||||||
|
if self.hits.count(key) != 0 or value in excepted:
|
||||||
tmp.push_back(key)
|
tmp.push_back(key)
|
||||||
|
else:
|
||||||
|
dropped_keys.append(key)
|
||||||
|
dropped_strings.append(value)
|
||||||
|
|
||||||
self.keys.swap(tmp)
|
self.keys.swap(tmp)
|
||||||
|
strings = list(self)
|
||||||
|
self._reset_and_load(strings)
|
||||||
|
# The strings are kept, but their hits should be reset.
|
||||||
self.hits.clear()
|
self.hits.clear()
|
||||||
|
|
||||||
|
return dropped_keys, dropped_strings
|
||||||
|
|
||||||
cdef const Utf8Str* intern_unicode(self, unicode py_string):
|
cdef const Utf8Str* intern_unicode(self, unicode py_string):
|
||||||
# 0 means missing, but we don't bother offsetting the index.
|
# 0 means missing, but we don't bother offsetting the index.
|
||||||
cdef bytes byte_string = py_string.encode('utf8')
|
cdef bytes byte_string = py_string.encode('utf8')
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import gc
|
||||||
|
|
||||||
from ...lang.en import English
|
from ...lang.en import English
|
||||||
|
|
||||||
|
|
||||||
|
@ -9,14 +11,25 @@ def test_issue1506():
|
||||||
|
|
||||||
def string_generator():
|
def string_generator():
|
||||||
for _ in range(10001):
|
for _ in range(10001):
|
||||||
yield "It's sentence produced by that bug."
|
yield u"It's sentence produced by that bug."
|
||||||
|
|
||||||
for _ in range(10001):
|
for _ in range(10001):
|
||||||
yield "I erase lemmas."
|
yield u"I erase some hbdsaj lemmas."
|
||||||
|
|
||||||
for _ in range(10001):
|
for _ in range(10001):
|
||||||
yield "It's sentence produced by that bug."
|
yield u"I erase lemmas."
|
||||||
|
|
||||||
|
for _ in range(10001):
|
||||||
|
yield u"It's sentence produced by that bug."
|
||||||
|
|
||||||
|
for _ in range(10001):
|
||||||
|
yield u"It's sentence produced by that bug."
|
||||||
|
|
||||||
|
for i, d in enumerate(nlp.pipe(string_generator())):
|
||||||
|
# Cleanup has to run more than once to actually clean up the data.
|
||||||
|
# The first cleanup run only marks strings as "not hit".
|
||||||
|
if i == 10000 or i == 20000 or i == 30000:
|
||||||
|
gc.collect()
|
||||||
|
|
||||||
for d in nlp.pipe(string_generator()):
|
|
||||||
for t in d:
|
for t in d:
|
||||||
str(t.lemma_)
|
str(t.lemma_)
|
||||||
|
|
|
@ -133,6 +133,11 @@ cdef class Tokenizer:
|
||||||
for text in texts:
|
for text in texts:
|
||||||
yield self(text)
|
yield self(text)
|
||||||
|
|
||||||
|
def _reset_cache(self, keys):
|
||||||
|
for k in keys:
|
||||||
|
del self._cache[k]
|
||||||
|
del self._specials[k]
|
||||||
|
|
||||||
cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
|
cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
|
||||||
cached = <_Cached*>self._cache.get(key)
|
cached = <_Cached*>self._cache.get(key)
|
||||||
if cached == NULL:
|
if cached == NULL:
|
||||||
|
|
|
@ -467,6 +467,13 @@ cdef class Vocab:
|
||||||
self._by_orth.set(lexeme.orth, lexeme)
|
self._by_orth.set(lexeme.orth, lexeme)
|
||||||
self.length += 1
|
self.length += 1
|
||||||
|
|
||||||
|
def _reset_cache(self, keys, strings):
|
||||||
|
for k in keys:
|
||||||
|
del self._by_hash[k]
|
||||||
|
|
||||||
|
if len(strings) != 0:
|
||||||
|
self._by_orth = PreshMap()
|
||||||
|
|
||||||
|
|
||||||
def pickle_vocab(vocab):
|
def pickle_vocab(vocab):
|
||||||
sstore = vocab.strings
|
sstore = vocab.strings
|
||||||
|
|
Loading…
Reference in New Issue
Block a user