Merge pull request #1576 from ligser/master

Actually reset caches in pipe [wip]
Matthew Honnibal 2017-11-23 12:54:48 +01:00 committed by GitHub
commit 30ba81f881
5 changed files with 52 additions and 12 deletions

spacy/language.py

@@ -558,13 +558,11 @@ class Language(object):
                 old_refs.add(doc)
                 nr_seen += 1
             elif len(old_refs) == 0:
-                self.vocab.strings._cleanup_stale_strings()
+                old_refs, recent_refs = recent_refs, old_refs
+                keys, strings = self.vocab.strings._cleanup_stale_strings(original_strings_data)
+                self.vocab._reset_cache(keys, strings)
+                self.tokenizer._reset_cache(keys)
                 nr_seen = 0
-        # We can't know which strings from the last batch have really expired.
-        # So we don't erase the strings; we just extend with the original
-        # content.
-        for string in original_strings_data:
-            self.vocab.strings.add(string)
 
     def to_disk(self, path, disable=tuple()):
         """Save the current state to a directory. If a model is loaded, this

spacy/strings.pyx

@@ -249,20 +249,37 @@ cdef class StringStore:
         for string in strings:
             self.add(string)
 
-    def _cleanup_stale_strings(self):
+    def _cleanup_stale_strings(self, excepted):
+        """
+        excepted (list): Strings that should not be removed.
+        RETURNS (keys, strings): The dropped keys and strings, so they can be removed from other caches as well.
+        """
         if self.hits.size() == 0:
             # If we don't have any hits, just skip cleanup
             return
         cdef vector[hash_t] tmp
+        dropped_strings = []
+        dropped_keys = []
         for i in range(self.keys.size()):
             key = self.keys[i]
-            if self.hits.count(key) != 0:
+            # We can't use __getitem__ here, because it would also register a hit.
+            utf8str = <Utf8Str*>self._map.get(key)
+            value = decode_Utf8Str(utf8str)
+            if self.hits.count(key) != 0 or value in excepted:
                 tmp.push_back(key)
+            else:
+                dropped_keys.append(key)
+                dropped_strings.append(value)
         self.keys.swap(tmp)
+        strings = list(self)
+        self._reset_and_load(strings)
+        # The surviving strings are kept, but their hit counters must be reset.
         self.hits.clear()
+        return dropped_keys, dropped_strings
 
     cdef const Utf8Str* intern_unicode(self, unicode py_string):
         # 0 means missing, but we don't bother offsetting the index.
         cdef bytes byte_string = py_string.encode('utf8')
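Conceptually, the new `_cleanup_stale_strings` is a mark-and-sweep pass over the store: keys that were hit since the last pass, or whose string appears in the `excepted` whitelist, survive; everything else is dropped and reported back so callers can evict the same entries from their own caches. A rough dict-backed equivalent, with hypothetical names rather than the Cython implementation above:

```python
def cleanup_stale_strings(store, hits, excepted):
    """Sweep `store` (a key -> string dict), keeping hit or whitelisted entries.

    Returns (dropped_keys, dropped_strings) so dependent caches can be purged.
    """
    dropped_keys, dropped_strings = [], []
    for key, value in list(store.items()):
        if key in hits or value in excepted:
            continue                      # marked: keep the entry
        del store[key]                    # sweep: drop the stale entry
        dropped_keys.append(key)
        dropped_strings.append(value)
    hits.clear()                          # hits only count until the next pass
    return dropped_keys, dropped_strings


# Example usage:
store = {1: u"hit recently", 2: u"stale", 3: u"whitelisted"}
keys, strings = cleanup_stale_strings(store, hits={1}, excepted={u"whitelisted"})
assert keys == [2] and strings == [u"stale"]
assert store == {1: u"hit recently", 3: u"whitelisted"}
```

One design note: the docstring documents `excepted` as a list, so `value in excepted` is a linear scan per key; passing a set (or hashing the whitelist up front) would keep the whole sweep linear in the size of the store.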

spacy/tests/regression/test_issue1506.py

@@ -1,6 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals
 
+import gc
+
 from ...lang.en import English
@@ -9,14 +11,25 @@ def test_issue1506():
     def string_generator():
         for _ in range(10001):
-            yield "It's sentence produced by that bug."
+            yield u"It's sentence produced by that bug."
 
         for _ in range(10001):
-            yield "I erase lemmas."
+            yield u"I erase some hbdsaj lemmas."
 
         for _ in range(10001):
-            yield "It's sentence produced by that bug."
+            yield u"I erase lemmas."
+
+        for _ in range(10001):
+            yield u"It's sentence produced by that bug."
+
+        for _ in range(10001):
+            yield u"It's sentence produced by that bug."
 
-    for d in nlp.pipe(string_generator()):
+    for i, d in enumerate(nlp.pipe(string_generator())):
+        # Cleanup has to run more than once to actually drop data:
+        # the first pass only marks strings as "not hit".
+        if i == 10000 or i == 20000 or i == 30000:
+            gc.collect()
+
         for t in d:
             str(t.lemma_)

spacy/tokenizer.pyx

@@ -133,6 +133,11 @@ cdef class Tokenizer:
         for text in texts:
             yield self(text)
 
+    def _reset_cache(self, keys):
+        for k in keys:
+            del self._cache[k]
+            del self._specials[k]
+
     cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
         cached = <_Cached*>self._cache.get(key)
         if cached == NULL:

spacy/vocab.pyx

@@ -467,6 +467,13 @@ cdef class Vocab:
         self._by_orth.set(lexeme.orth, lexeme)
         self.length += 1
 
+    def _reset_cache(self, keys, strings):
+        for k in keys:
+            del self._by_hash[k]
+
+        if len(strings) != 0:
+            self._by_orth = PreshMap()
+
 def pickle_vocab(vocab):
     sstore = vocab.strings
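Putting the pieces together, `pipe` now threads the dropped keys and strings through every cache indexed by the same hashes. A small sketch of that coordination, with hypothetical dict-backed caches standing in for `tokenizer._cache`, `tokenizer._specials`, `vocab._by_hash` and `vocab._by_orth`:

```python
class Caches(object):
    """Toy stand-ins for the hash-keyed caches touched by this commit."""

    def __init__(self):
        self.tokenizer_cache = {}
        self.tokenizer_specials = {}
        self.vocab_by_hash = {}
        self.vocab_by_orth = {}

    def reset(self, keys, strings):
        # Evict the same keys from every table that shares the key space.
        for k in keys:
            self.tokenizer_cache.pop(k, None)
            self.tokenizer_specials.pop(k, None)
            self.vocab_by_hash.pop(k, None)
        # Mirrors Vocab._reset_cache above: if any strings were dropped, the
        # orth-keyed table is rebuilt wholesale rather than filtered key by key.
        if strings:
            self.vocab_by_orth = {}
```

The `pop(k, None)` calls just keep this toy tolerant of keys that only live in one of the tables; the property mirrored from the commit is that the keys returned by `_cleanup_stale_strings` drive eviction everywhere, so no cache keeps pointing at strings the store has already forgotten.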