diff --git a/spacy/language.py b/spacy/language.py
index cacce85c7..c43f4e4c5 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -558,13 +558,11 @@ class Language(object):
                 old_refs.add(doc)
                 nr_seen += 1
             elif len(old_refs) == 0:
-                self.vocab.strings._cleanup_stale_strings()
+                old_refs, recent_refs = recent_refs, old_refs
+                keys, strings = self.vocab.strings._cleanup_stale_strings(original_strings_data)
+                self.vocab._reset_cache(keys, strings)
+                self.tokenizer._reset_cache(keys)
                 nr_seen = 0
-                # We can't know which strings from the last batch have really expired.
-                # So we don't erase the strings — we just extend with the original
-                # content.
-                for string in original_strings_data:
-                    self.vocab.strings.add(string)
 
     def to_disk(self, path, disable=tuple()):
         """Save the current state to a directory. If a model is loaded, this
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index f4e047118..649bd43a4 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -249,20 +249,37 @@ cdef class StringStore:
         for string in strings:
             self.add(string)
 
-    def _cleanup_stale_strings(self):
+    def _cleanup_stale_strings(self, excepted):
+        """
+        excepted (list): Strings that should not be removed.
+        RETURNS (keys, strings): The dropped keys and strings, so they can also be removed from other caches.
+        """
         if self.hits.size() == 0:
             # If we don't have any hits, just skip cleanup
             return
 
         cdef vector[hash_t] tmp
+        dropped_strings = []
+        dropped_keys = []
         for i in range(self.keys.size()):
             key = self.keys[i]
-            if self.hits.count(key) != 0:
+            # We can't use __getitem__ here, because it would also register a hit.
+            utf8str = self._map.get(key)
+            value = decode_Utf8Str(utf8str)
+            if self.hits.count(key) != 0 or value in excepted:
                 tmp.push_back(key)
+            else:
+                dropped_keys.append(key)
+                dropped_strings.append(value)
 
         self.keys.swap(tmp)
+        strings = list(self)
+        self._reset_and_load(strings)
+        # The surviving strings are kept, but their hits have to be reset.
         self.hits.clear()
 
+        return dropped_keys, dropped_strings
+
     cdef const Utf8Str* intern_unicode(self, unicode py_string):
         # 0 means missing, but we don't bother offsetting the index.
         cdef bytes byte_string = py_string.encode('utf8')
diff --git a/spacy/tests/regression/test_issue1506.py b/spacy/tests/regression/test_issue1506.py
index d9ba1ac97..71702a6d4 100644
--- a/spacy/tests/regression/test_issue1506.py
+++ b/spacy/tests/regression/test_issue1506.py
@@ -1,6 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals
 
+import gc
+
 from ...lang.en import English
 
 
@@ -9,14 +11,25 @@ def test_issue1506():
 
     def string_generator():
         for _ in range(10001):
-            yield "It's sentence produced by that bug."
+            yield u"It's sentence produced by that bug."
 
         for _ in range(10001):
-            yield "I erase lemmas."
+            yield u"I erase some hbdsaj lemmas."
 
         for _ in range(10001):
-            yield "It's sentence produced by that bug."
+            yield u"I erase lemmas."
+
+        for _ in range(10001):
+            yield u"It's sentence produced by that bug."
+
+        for _ in range(10001):
+            yield u"It's sentence produced by that bug."
+
+    for i, d in enumerate(nlp.pipe(string_generator())):
+        # Cleanup has to run more than once to actually free data:
+        # the first pass only marks strings as not hit.
+        if i == 10000 or i == 20000 or i == 30000:
+            gc.collect()
-    for d in nlp.pipe(string_generator()):
         for t in d:
             str(t.lemma_)
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 095fbf4ad..64c00d950 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -133,6 +133,11 @@ cdef class Tokenizer:
         for text in texts:
             yield self(text)
 
+    def _reset_cache(self, keys):
+        for k in keys:
+            del self._cache[k]
+            del self._specials[k]
+
     cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
         cached = <_Cached*>self._cache.get(key)
         if cached == NULL:
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 59df8cc1b..37220c17b 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -467,6 +467,13 @@ cdef class Vocab:
         self._by_orth.set(lexeme.orth, lexeme)
         self.length += 1
 
+    def _reset_cache(self, keys, strings):
+        for k in keys:
+            del self._by_hash[k]
+
+        if len(strings) != 0:
+            self._by_orth = PreshMap()
+
 
 def pickle_vocab(vocab):
     sstore = vocab.strings
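
For readers following the change, the sketch below is a plain-Python toy model of the hit-tracking scheme that the StringStore cleanup relies on: lookups mark keys as "hit", a cleanup pass drops unhit keys that are not in the excepted set, and the dropped keys and strings are returned so callers can clear their own caches. This is not spaCy's Cython implementation; the ToyStringStore class and its method names are invented purely for illustration.

# Illustrative sketch only: a toy model of the hit-based cleanup idea above.
# Not spaCy's real StringStore; all names here are hypothetical.
class ToyStringStore(object):
    def __init__(self):
        self._map = {}      # key -> string
        self._hits = set()  # keys looked up since the last cleanup

    def add(self, string):
        key = hash(string)
        self._map[key] = string
        return key

    def __getitem__(self, key):
        # A lookup marks the key as "hit", so it survives the next cleanup.
        self._hits.add(key)
        return self._map[key]

    def cleanup_stale_strings(self, excepted):
        """Drop entries that were never hit and are not in `excepted`;
        return the dropped keys and strings so callers can clear their
        own caches (as Vocab and Tokenizer do in the diff above)."""
        dropped_keys, dropped_strings = [], []
        for key, value in list(self._map.items()):
            if key not in self._hits and value not in excepted:
                del self._map[key]
                dropped_keys.append(key)
                dropped_strings.append(value)
        # Keep the surviving strings, but start a fresh hit-tracking cycle.
        self._hits.clear()
        return dropped_keys, dropped_strings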