From a2745b0e84f15867758fca2867500fba9784623c Mon Sep 17 00:00:00 2001
From: Roman Domrachev
Date: Tue, 14 Nov 2017 17:45:50 +0300
Subject: [PATCH 01/11] StringStore now actually cleaned

Do not lose docs in ref tracking
---
 spacy/language.py                        |  1 +
 spacy/strings.pyx                        |  3 +++
 spacy/tests/regression/test_issue1506.py | 22 +++++++++++++++++++---
 3 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 739e7665d..d42c75fa9 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -558,6 +558,7 @@ class Language(object):
                 old_refs.add(doc)
                 nr_seen += 1
             elif len(old_refs) == 0:
+                old_refs, recent_refs = recent_refs, old_refs
                 self.vocab.strings._cleanup_stale_strings()
                 nr_seen = 0
         # Last batch can be not garbage collected and we cannot know it — last
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index b8628cef1..f088b955c 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -260,6 +260,9 @@ cdef class StringStore:
             if self.hits.count(key) != 0:
                 tmp.push_back(key)
 
+        strings = list(self)
+        self._reset_and_load(strings)
+
         self.keys.swap(tmp)
         self.hits.clear()
diff --git a/spacy/tests/regression/test_issue1506.py b/spacy/tests/regression/test_issue1506.py
index d9ba1ac97..338126d3a 100644
--- a/spacy/tests/regression/test_issue1506.py
+++ b/spacy/tests/regression/test_issue1506.py
@@ -1,6 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals
 
+import gc
+
 from ...lang.en import English
 
 
@@ -11,12 +13,26 @@ def test_issue1506():
         for _ in range(10001):
             yield "It's sentence produced by that bug."
 
+        yield "Oh snap."
+
         for _ in range(10001):
             yield "I erase lemmas."
 
         for _ in range(10001):
             yield "It's sentence produced by that bug."
 
-    for d in nlp.pipe(string_generator()):
-        for t in d:
-            str(t.lemma_)
+        for _ in range(10001):
+            yield "It's sentence produced by that bug."
+
+    anchor = None
+    remember = None
+    for i, d in enumerate(nlp.pipe(string_generator())):
+        if i == 9999:
+            anchor = d
+        elif i == 10001:
+            remember = d
+        elif i == 10002:
+            del anchor
+            gc.collect()
+
+    assert remember.text == 'Oh snap.'

From 870defa815208f68aaa6632a2a62edf189013fa3 Mon Sep 17 00:00:00 2001
From: Roman Domrachev
Date: Tue, 14 Nov 2017 17:56:30 +0300
Subject: [PATCH 02/11] Swap keys in the proper place

Remove unnecessary clearing of the hits
---
 spacy/strings.pyx | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 3cfbb67dc..abd4fa69e 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -260,12 +260,10 @@ cdef class StringStore:
             if self.hits.count(key) != 0:
                 tmp.push_back(key)
 
+        self.keys.swap(tmp)
         strings = list(self)
         self._reset_and_load(strings)
 
-        self.keys.swap(tmp)
-        self.hits.clear()
-
     cdef const Utf8Str* intern_unicode(self, unicode py_string):
         # 0 means missing, but we don't bother offsetting the index.
         cdef bytes byte_string = py_string.encode('utf8')

From 3d247d2bb8f7545cacf1419d954de1172eb33763 Mon Sep 17 00:00:00 2001
From: Roman Domrachev
Date: Tue, 14 Nov 2017 18:01:37 +0300
Subject: [PATCH 03/11] Restore previous test case
---
 spacy/tests/regression/test_issue1506.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/spacy/tests/regression/test_issue1506.py b/spacy/tests/regression/test_issue1506.py
index 338126d3a..f3ebbe04b 100644
--- a/spacy/tests/regression/test_issue1506.py
+++ b/spacy/tests/regression/test_issue1506.py
@@ -35,4 +35,7 @@ def test_issue1506():
             del anchor
             gc.collect()
 
+        for t in d:
+            str(t.lemma_)
+
     assert remember.text == 'Oh snap.'
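Note on patches 01-03: the one-line swap of old_refs and recent_refs is the heart of the fix. Language.pipe appears to track the docs it yields in two weak-reference generations, and strings may only be dropped once every doc in the old generation has been garbage collected, since no live doc can still reference them. Without the swap the generations never rotated, so tracking effectively stopped after the first cycle and live docs could lose their strings. A minimal sketch of the scheme follows; the names RefTracker, track and cleanup are illustrative, not spaCy API:

```python
import weakref


class RefTracker(object):
    """Sketch of the two-generation doc tracking in Language.pipe."""
    def __init__(self, cleanup, generation_size=10000):
        self.cleanup = cleanup  # called once a whole generation has died
        self.generation_size = generation_size
        self.recent_refs = weakref.WeakSet()
        self.old_refs = weakref.WeakSet()
        self.nr_seen = 0

    def track(self, doc):
        self.recent_refs.add(doc)
        if self.nr_seen < self.generation_size:
            self.old_refs.add(doc)
            self.nr_seen += 1
        elif len(self.old_refs) == 0:
            # The whole old generation has been garbage collected, so its
            # strings cannot be referenced by any live doc. Rotate the
            # generations (the line patch 01 adds) and run the cleanup.
            self.old_refs, self.recent_refs = self.recent_refs, self.old_refs
            self.cleanup()
            self.nr_seen = 0
```

The regression test exercises exactly this boundary: it pins one doc (anchor) across the generation switch, drops it to let the old generation die, and asserts that another pinned doc (remember) can still render its text afterwards.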
From caae77f72de4e1bf5bb4ac493672939feb0bfd63 Mon Sep 17 00:00:00 2001
From: Roman
Date: Tue, 14 Nov 2017 19:44:40 +0300
Subject: [PATCH 04/11] Update strings.pyx
---
 spacy/strings.pyx | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index abd4fa69e..d08d96bb2 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -263,6 +263,8 @@ cdef class StringStore:
         self.keys.swap(tmp)
         strings = list(self)
         self._reset_and_load(strings)
+        # Here we still have the strings, but the hits for them should be reset
+        self.hits.clear()
 
     cdef const Utf8Str* intern_unicode(self, unicode py_string):
         # 0 means missing, but we don't bother offsetting the index.

From 47ce2347b05bb13af6c6430c24341e5892b22ffa Mon Sep 17 00:00:00 2001
From: Roman
Date: Tue, 14 Nov 2017 20:28:13 +0300
Subject: [PATCH 05/11] Create a test that fails when actual cleanup happens
---
 spacy/tests/regression/test_issue1506.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/spacy/tests/regression/test_issue1506.py b/spacy/tests/regression/test_issue1506.py
index f3ebbe04b..2abde25bf 100644
--- a/spacy/tests/regression/test_issue1506.py
+++ b/spacy/tests/regression/test_issue1506.py
@@ -35,6 +35,11 @@ def test_issue1506():
             del anchor
             gc.collect()
 
+        # We should run the cleanup more than once to actually clean up the data.
+        # The first run only marks the strings as "not hit".
+        if i == 20000 or i == 30000:
+            gc.collect()
+
         for t in d:
             str(t.lemma_)

From 4e378dc4a44781e59bc63abb64c0fffb114a5adc Mon Sep 17 00:00:00 2001
From: Roman Domrachev
Date: Tue, 14 Nov 2017 20:45:04 +0300
Subject: [PATCH 06/11] Remove all obsolete code and test only the initial problem
---
 spacy/tests/regression/test_issue1506.py | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/spacy/tests/regression/test_issue1506.py b/spacy/tests/regression/test_issue1506.py
index 2abde25bf..3e2b31d49 100644
--- a/spacy/tests/regression/test_issue1506.py
+++ b/spacy/tests/regression/test_issue1506.py
@@ -13,8 +13,6 @@ def test_issue1506():
         for _ in range(10001):
             yield "It's sentence produced by that bug."
 
-        yield "Oh snap."
-
         for _ in range(10001):
             yield "I erase lemmas."
 
@@ -24,23 +22,11 @@ def test_issue1506():
         for _ in range(10001):
             yield "It's sentence produced by that bug."
 
-    anchor = None
-    remember = None
     for i, d in enumerate(nlp.pipe(string_generator())):
-        if i == 9999:
-            anchor = d
-        elif i == 10001:
-            remember = d
-        elif i == 10002:
-            del anchor
-            gc.collect()
-
         # We should run the cleanup more than once to actually clean up the data.
         # The first run only marks the strings as "not hit".
-        if i == 20000 or i == 30000:
+        if i == 10000 or i == 20000 or i == 30000:
             gc.collect()
 
         for t in d:
             str(t.lemma_)
-
-    assert remember.text == 'Oh snap.'
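Note on patches 04-06: the cleanup is effectively a mark-and-sweep over the string table. Lookups record hits, and a cleanup pass keeps only the hit strings and then clears the hit set, so a string has to go unread across two consecutive cleanup passes before it actually disappears. That is why the test has to trigger collection at several checkpoints. A pure-Python model of this contract (the real StringStore is a Cython cdef class over C++ containers; these names are illustrative):

```python
class ModelStringStore(object):
    """Pure-Python model of the hit-tracking cleanup contract."""
    def __init__(self):
        self.strings = {}  # key -> string
        self.hits = set()  # keys looked up since the last cleanup

    def add(self, string):
        key = hash(string)  # the real store hashes the UTF-8 text to 64 bits
        self.strings[key] = string
        return key

    def __getitem__(self, key):
        self.hits.add(key)  # every lookup marks the string as alive
        return self.strings[key]

    def _cleanup_stale_strings(self):
        if not self.hits:
            return  # nothing was ever looked up; skip cleanup
        self.strings = {k: s for k, s in self.strings.items()
                        if k in self.hits}
        # Survivors start the next cycle unmarked: this is the hits.clear()
        # that patch 04 restores, and the reason one pass is never enough.
        self.hits.clear()
```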
From 91e2fa65614354f2843f757ac84ed86f99d2a4de Mon Sep 17 00:00:00 2001
From: Roman Domrachev
Date: Tue, 14 Nov 2017 21:15:04 +0300
Subject: [PATCH 07/11] Clean all caches
---
 spacy/language.py   | 11 ++++++-----
 spacy/strings.pyx   | 10 ++++++++++
 spacy/tokenizer.pyx |  5 +++++
 spacy/vocab.pyx     |  6 ++++++
 4 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index b328cef36..d1c2cf1b2 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -559,13 +559,14 @@ class Language(object):
                 nr_seen += 1
             elif len(old_refs) == 0:
                 old_refs, recent_refs = recent_refs, old_refs
-                self.vocab.strings._cleanup_stale_strings()
+                keys, strings = self.vocab.strings._cleanup_stale_strings()
+                self.vocab._reset_cache(keys, strings)
+                self.tokenizer._reset_cache(keys)
+                for string in original_strings_data:
+                    self.vocab.strings.add(string)
                 nr_seen = 0
         # We can't know which strings from the last batch have really expired.
-        # So we don't erase the strings — we just extend with the original
-        # content.
+        # So we don't erase the strings.
 
     def to_disk(self, path, disable=tuple()):
         """Save the current state to a directory. If a model is loaded, this
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index d08d96bb2..48c9a0cc8 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -250,15 +250,23 @@ cdef class StringStore:
             self.add(string)
 
     def _cleanup_stale_strings(self):
+        """
+        RETURNS (keys, strings): The dropped keys and strings, which can also be dropped from other caches.
+        """
         if self.hits.size() == 0:
             # If we don't have any hits, just skip cleanup
             return
 
         cdef vector[hash_t] tmp
+        dropped_strings = []
+        dropped_keys = []
        for i in range(self.keys.size()):
             key = self.keys[i]
             if self.hits.count(key) != 0:
                 tmp.push_back(key)
+            else:
+                dropped_keys.append(key)
+                dropped_strings.append(self[key])
 
         self.keys.swap(tmp)
         strings = list(self)
         self._reset_and_load(strings)
         # Here we still have the strings, but the hits for them should be reset
         self.hits.clear()
 
+        return dropped_keys, dropped_strings
+
     cdef const Utf8Str* intern_unicode(self, unicode py_string):
         # 0 means missing, but we don't bother offsetting the index.
         cdef bytes byte_string = py_string.encode('utf8')
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 67ff47743..543a04256 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -132,6 +132,11 @@ cdef class Tokenizer:
         for text in texts:
             yield self(text)
 
+    def _reset_cache(self, keys):
+        for k in keys:
+            del self._cache[k]
+            del self._specials[k]
+
     cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
         cached = <_Cached*>self._cache.get(key)
         if cached == NULL:
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 675e4a805..122aa80dc 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -465,6 +465,12 @@ cdef class Vocab:
         self._by_orth.set(lexeme.orth, lexeme)
         self.length += 1
 
+    def _reset_cache(self, keys, strings):
+        for k in keys:
+            del self._by_hash[k]
+
+        self._by_orth = PreshMap()
+
 def pickle_vocab(vocab):
     sstore = vocab.strings

From a33d5a068d073dc1a283edcacab692ea83a5d5b0 Mon Sep 17 00:00:00 2001
From: Roman Domrachev
Date: Tue, 14 Nov 2017 22:40:03 +0300
Subject: [PATCH 08/11] Try to hold original data instead of restoring it
---
 spacy/language.py | 6 +-----
 spacy/strings.pyx | 7 ++++---
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index d1c2cf1b2..c43f4e4c5 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -559,14 +559,10 @@ class Language(object):
                 nr_seen += 1
             elif len(old_refs) == 0:
                 old_refs, recent_refs = recent_refs, old_refs
-                keys, strings = self.vocab.strings._cleanup_stale_strings()
+                keys, strings = self.vocab.strings._cleanup_stale_strings(original_strings_data)
                 self.vocab._reset_cache(keys, strings)
                 self.tokenizer._reset_cache(keys)
-                for string in original_strings_data:
-                    self.vocab.strings.add(string)
                 nr_seen = 0
-        # We can't know which strings from the last batch have really expired.
-        # So we don't erase the strings.
 
     def to_disk(self, path, disable=tuple()):
         """Save the current state to a directory. If a model is loaded, this
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 48c9a0cc8..4243c8193 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -249,7 +249,7 @@ cdef class StringStore:
         for string in strings:
             self.add(string)
 
-    def _cleanup_stale_strings(self):
+    def _cleanup_stale_strings(self, excepted):
         """
         RETURNS (keys, strings): The dropped keys and strings, which can also be dropped from other caches.
         """
@@ -262,11 +262,12 @@ cdef class StringStore:
         dropped_keys = []
         for i in range(self.keys.size()):
             key = self.keys[i]
-            if self.hits.count(key) != 0:
+            value = self[key]
+            if self.hits.count(key) != 0 or value in excepted:
                 tmp.push_back(key)
             else:
                 dropped_keys.append(key)
-                dropped_strings.append(self[key])
+                dropped_strings.append(value)
 
         self.keys.swap(tmp)
         strings = list(self)
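Note on patches 07-08: once the StringStore really drops entries, every cache keyed by the same hashes (the vocab's lexeme table, the tokenizer's cache) has to be invalidated as well, which is why _cleanup_stale_strings now reports what it dropped. Patch 08 then replaces the "clean up, then re-add the original strings" dance with an excepted collection whose strings simply survive the sweep. Continuing the model store from the earlier note, that variant might look like this (an illustrative sketch, not the Cython internals):

```python
def _cleanup_stale_strings(self, excepted):
    """Drop-in replacement for the model's cleanup, mirroring patch 08:
    strings in `excepted` survive instead of being re-added afterwards."""
    dropped_keys, dropped_strings = [], []
    kept = {}
    for key, string in self.strings.items():
        if key in self.hits or string in excepted:
            kept[key] = string
        else:
            dropped_keys.append(key)
            dropped_strings.append(string)
    self.strings = kept
    self.hits.clear()
    # The caller uses the return value to evict the same entries from the
    # vocab and tokenizer caches, as language.py does above.
    return dropped_keys, dropped_strings
```

One performance note: `string in excepted` is a linear scan when excepted is a list, as original_strings_data appears to be here; passing a set instead would keep each membership test O(1).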
From 3e216808148855ef4aff0a28c26f22c040a084b0 Mon Sep 17 00:00:00 2001
From: Roman Domrachev
Date: Tue, 14 Nov 2017 22:58:46 +0300
Subject: [PATCH 09/11] Use a safer method to get a string without recording a hit
---
 spacy/strings.pyx                        | 5 ++++-
 spacy/tests/regression/test_issue1506.py | 3 +++
 spacy/vocab.pyx                          | 3 ++-
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 4243c8193..649bd43a4 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -251,6 +251,7 @@ cdef class StringStore:
     def _cleanup_stale_strings(self, excepted):
         """
+        excepted (list): Strings that should not be removed.
         RETURNS (keys, strings): The dropped keys and strings, which can also be dropped from other caches.
         """
         if self.hits.size() == 0:
@@ -262,7 +263,9 @@ cdef class StringStore:
         dropped_keys = []
         for i in range(self.keys.size()):
             key = self.keys[i]
-            value = self[key]
+            # Here we cannot use __getitem__ because it also sets a hit.
+            utf8str = self._map.get(key)
+            value = decode_Utf8Str(utf8str)
             if self.hits.count(key) != 0 or value in excepted:
diff --git a/spacy/tests/regression/test_issue1506.py b/spacy/tests/regression/test_issue1506.py
index 3e2b31d49..da6479ec2 100644
--- a/spacy/tests/regression/test_issue1506.py
+++ b/spacy/tests/regression/test_issue1506.py
@@ -13,6 +13,9 @@ def test_issue1506():
         for _ in range(10001):
             yield "It's sentence produced by that bug."
 
+        for _ in range(10001):
+            yield "I erase some hbdsaj lemmas."
+
         for _ in range(10001):
             yield "I erase lemmas."
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 122aa80dc..f96857c15 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -469,7 +469,8 @@ cdef class Vocab:
         for k in keys:
             del self._by_hash[k]
 
-        self._by_orth = PreshMap()
+        if len(strings) != 0:
+            self._by_orth = PreshMap()
 
 def pickle_vocab(vocab):
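Note on patch 09: the subtle bug was that the sweep itself read strings through self[key], and StringStore.__getitem__ records a hit. A sweep that marks everything it inspects can never drop anything, so the fix reads the underlying map directly (self._map.get(key) plus decode_Utf8Str) without touching the hit set. In the model's terms, where the check could be factored out as a helper:

```python
def is_alive(store, key, excepted):
    """Decide whether `key` survives the sweep without recording a hit.
    Illustrative counterpart of patch 09's _map.get(key) + decode_Utf8Str()."""
    # store[key] would call __getitem__, add `key` to store.hits, and thereby
    # make every inspected string look alive; the raw read does not.
    value = store.strings[key]
    return key in store.hits or value in excepted
```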
From 505c6a2f2f5ef9af7b9ac0c702abc69ed2ab20de Mon Sep 17 00:00:00 2001
From: Roman Domrachev
Date: Wed, 15 Nov 2017 17:55:48 +0300
Subject: [PATCH 10/11] Completely clean up the tokenizer cache

The tokenizer cache can hold different keys than the strings saved in
the StringStore (prefixes, suffixes, etc.).

This modification can slow down the tokenizer and needs to be measured.
---
 spacy/language.py                        |  2 +-
 spacy/tests/regression/test_issue1506.py | 10 +++++-----
 spacy/tokenizer.pyx                      |  9 +++++----
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index c43f4e4c5..8163d2812 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -561,7 +561,7 @@ class Language(object):
                 old_refs, recent_refs = recent_refs, old_refs
                 keys, strings = self.vocab.strings._cleanup_stale_strings(original_strings_data)
                 self.vocab._reset_cache(keys, strings)
-                self.tokenizer._reset_cache(keys)
+                self.tokenizer._reset_cache()
                 nr_seen = 0
 
     def to_disk(self, path, disable=tuple()):
diff --git a/spacy/tests/regression/test_issue1506.py b/spacy/tests/regression/test_issue1506.py
index da6479ec2..71702a6d4 100644
--- a/spacy/tests/regression/test_issue1506.py
+++ b/spacy/tests/regression/test_issue1506.py
@@ -11,19 +11,19 @@ def test_issue1506():
 
     def string_generator():
         for _ in range(10001):
-            yield "It's sentence produced by that bug."
+            yield u"It's sentence produced by that bug."
 
         for _ in range(10001):
-            yield "I erase some hbdsaj lemmas."
+            yield u"I erase some hbdsaj lemmas."
 
         for _ in range(10001):
-            yield "I erase lemmas."
+            yield u"I erase lemmas."
 
         for _ in range(10001):
-            yield "It's sentence produced by that bug."
+            yield u"It's sentence produced by that bug."
 
         for _ in range(10001):
-            yield "It's sentence produced by that bug."
+            yield u"It's sentence produced by that bug."
 
     for i, d in enumerate(nlp.pipe(string_generator())):
         # We should run the cleanup more than once to actually clean up the data.
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 543a04256..ec20dba8d 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -132,10 +132,11 @@ cdef class Tokenizer:
         for text in texts:
             yield self(text)
 
-    def _reset_cache(self, keys):
-        for k in keys:
-            del self._cache[k]
-            del self._specials[k]
+    def _reset_cache(self):
+        # We cannot do selective cache cleanup because the cache can hold different
+        # keys than the words saved in the StringStore (prefixes/suffixes/etc).
+        self._cache = PreshMap()
+        self._specials = PreshMap()
 
     cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
         cached = <_Cached*>self._cache.get(key)

From 61d28d03e46e3301ba9e1aea063e17409d3a8117 Mon Sep 17 00:00:00 2001
From: Roman Domrachev
Date: Wed, 15 Nov 2017 19:11:12 +0300
Subject: [PATCH 11/11] Try selective cache removal again
---
 spacy/language.py   | 2 +-
 spacy/tokenizer.pyx | 9 ++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 8163d2812..c43f4e4c5 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -561,7 +561,7 @@ class Language(object):
                 old_refs, recent_refs = recent_refs, old_refs
                 keys, strings = self.vocab.strings._cleanup_stale_strings(original_strings_data)
                 self.vocab._reset_cache(keys, strings)
-                self.tokenizer._reset_cache()
+                self.tokenizer._reset_cache(keys)
                 nr_seen = 0
 
     def to_disk(self, path, disable=tuple()):
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 10fdc868d..64c00d950 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -133,11 +133,10 @@ cdef class Tokenizer:
         for text in texts:
             yield self(text)
 
-    def _reset_cache(self):
-        # We cannot do selective cache cleanup because the cache can hold different
-        # keys than the words saved in the StringStore (prefixes/suffixes/etc).
-        self._cache = PreshMap()
-        self._specials = PreshMap()
+    def _reset_cache(self, keys):
+        for k in keys:
+            del self._cache[k]
+            del self._specials[k]
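Taken together, the series lets a long-running nlp.pipe stream keep the StringStore bounded while preserving the strings of live docs. A hypothetical driver showing the intended usage pattern (not part of the patch series; the text and checkpoint numbers are illustrative, chosen to match the 10000-doc generation size above):

```python
import gc

from spacy.lang.en import English

nlp = English()


def stream():
    # Enough distinct strings to push the store through several
    # tracking generations of 10000 docs each.
    for i in range(30000):
        yield u"Sentence number %d for the stream." % i


for i, doc in enumerate(nlp.pipe(stream())):
    for token in doc:
        str(token.lemma_)  # lookups mark the strings of live docs as hit
    if i in (10000, 20000):
        # Let a whole generation of docs die so that cleanup can run,
        # just as the regression test does.
        gc.collect()
```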