Support 'memory zones' for user memory management

Add a context manage nlp.memory_zone(), which will begin memory_zone() blocks on the vocab, string store, and potentially other components. Once the memory_zone() block expires, spaCy will free any shared resources that were allocated for the text-processing that occurred within the memory_zone. If you create Doc objects within a memory zone, it's invalid to access them once the memory zone is expired. The purpose of this is that spaCy creates and stores Lexeme objects in the Vocab that can be shared between multiple Doc objects. It also interns strings. Normally, spaCy can't know when all Doc objects using a Lexeme are out-of-scope, so new Lexemes accumulate in the vocab, causing memory pressure. Memory zones solve this problem by telling spaCy "okay none of the documents allocated within this block will be accessed again". This lets spaCy free all new Lexeme objects and other data that were created during the block. The mechanism is general, so memory_zone() context managers can be added to other components that could benefit from them, e.g. pipeline components. I experimented with adding memory zone support to the tokenizer as well, for its cache. However, this seems unnecessarily complicated. It makes more sense to just stick a limit on the cache size. This lets spaCy benefit from the efficiency advantage of the cache better, because we can maintain a (bounded) cache even if only small batches of documents are being processed.
2025-09-16 09:02:35 +03:00 · 2024-09-08 13:06:54 +02:00 · 2024-09-08 13:06:54 +02:00 · 5d0d2de955
commit 5d0d2de955
parent a559cde432
8 changed files with 184 additions and 27 deletions
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@ -15,9 +15,12 @@ ctypedef union Utf8Str:
 cdef class StringStore:
    cdef Pool mem
    cdef vector[hash_t] _keys
+    cdef vector[hash_t] _transient_keys
    cdef PreshMap _map
+    cdef PreshMap _transient_map
+    cdef Pool _non_temp_mem

-    cdef hash_t _intern_str(self, str string)
+    cdef hash_t _intern_str(self, str string, bint transient)
    cdef Utf8Str* _allocate_str_repr(self, const unsigned char* chars, uint32_t length) except *
    cdef str _decode_str_repr(self, const Utf8Str* string)

--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@ -1,6 +1,7 @@
 # cython: infer_types=True
 # cython: profile=False
 from typing import Iterable, Iterator, List, Optional, Tuple, Union
+from contextlib import contextmanager

 from libc.stdint cimport uint32_t
 from libc.string cimport memcpy
@ -27,7 +28,9 @@ cdef class StringStore:
        strings (iterable): A sequence of unicode strings to add to the store.
        """
        self.mem = Pool()
+        self._non_temp_mem = self.mem
        self._map = PreshMap()
+        self._transient_map = None
        if strings is not None:
            for string in strings:
                self.add(string)
@ -65,7 +68,7 @@ cdef class StringStore:
        return iter(self.keys())

    def __reduce__(self):
-        strings = list(self)
+        strings = list(self.non_transient_keys())
        return (StringStore, (strings,), None, None, None)

    def __len__(self) -> int:
@ -73,12 +76,37 @@ cdef class StringStore:

        RETURNS (int): The number of strings in the store.
        """
-        return self._keys.size()
+        return self._keys.size() + self._transient_keys.size()

-    def add(self, string: str) -> int:
+    @contextmanager
+    def memory_zone(self, mem: Optional[Pool]=None) -> Pool:
+        """Begin a block where all resources allocated during the block will
+        be freed at the end of it. If a resources was created within the
+        memory zone block, accessing it outside the block is invalid.
+        Behaviour of this invalid access is undefined. Memory zones should
+        not be nested.
+
+        The memory zone is helpful for services that need to process large
+        volumes of text with a defined memory budget.
+        """
+        if mem is None:
+            mem = Pool()
+        self.mem = mem
+        self._transient_map = PreshMap()
+        yield mem
+        self.mem = self._non_temp_mem
+        self._transient_map = None
+        self._transient_keys.clear()
+
+    def add(self, string: str, allow_transient: bool = False) -> int:
        """Add a string to the StringStore.

        string (str): The string to add.
+        allow_transient (bool): Allow the string to be stored in the 'transient'
+          map, which will be flushed at the end of the memory zone. Strings
+          encountered during arbitrary text processing should be added
+          with allow_transient=True, while labels and other strings used
+          internally should not.
        RETURNS (uint64): The string's hash value.
        """
        if not isinstance(string, str):
@ -87,7 +115,7 @@ cdef class StringStore:
        if string in SYMBOLS_BY_STR:
            return SYMBOLS_BY_STR[string]
        else:
-            return self._intern_str(string)
+            return self._intern_str(string, allow_transient)

    def as_int(self, string_or_hash: Union[str, int]) -> str:
        """If a hash value is passed as the input, return it as-is. If the input
@ -133,6 +161,13 @@ cdef class StringStore:
    def keys(self) -> List[str]:
        """Iterate over the stored strings in insertion order.

+        RETURNS: A list of strings.
+        """
+        return self.non_transient_keys() + self.transient_keys()
+
+    def non_transient_keys(self) -> List[str]:
+        """Iterate over the stored strings in insertion order.
+
        RETURNS: A list of strings.
        """
        cdef int i
@ -142,6 +177,15 @@ cdef class StringStore:
            strings[i] = self._decode_str_repr(utf8str)
        return strings

+    def transient_keys(self) -> List[str]:
+        if self._transient_map is None:
+            return []
+        transient_strings = [None] * self._transient_keys.size()
+        for i in range(self._transient_keys.size()):
+            utf8str = <Utf8Str*>self._transient_map.get(self._transient_keys[i])
+            transient_strings[i] = self._decode_str_repr(utf8str)
+        return transient_strings
+
    def values(self) -> List[int]:
        """Iterate over the stored strings hashes in insertion order.

@ -151,7 +195,13 @@ cdef class StringStore:
        hashes = [None] * self._keys.size()
        for i in range(self._keys.size()):
            hashes[i] = self._keys[i]
-        return hashes
+        if self._transient_map is not None:
+            transient_hashes = [None] * self._transient_keys.size()
+            for i in range(self._transient_keys.size()):
+                transient_hashes[i] = self._transient_keys[i]
+        else:
+            transient_hashes = []
+        return hashes + transient_hashes

    def to_disk(self, path):
        """Save the current state to a directory.
@ -176,7 +226,7 @@ cdef class StringStore:
        prev = list(self)
        self._reset_and_load(strings)
        for word in prev:
-            self.add(word)
+            self.add(word, allow_transient=False)
        return self

    def to_bytes(self, **kwargs):
@ -196,7 +246,7 @@ cdef class StringStore:
        prev = list(self)
        self._reset_and_load(strings)
        for word in prev:
-            self.add(word)
+            self.add(word, allow_transient=False)
        return self

    def _reset_and_load(self, strings):
@ -204,7 +254,7 @@ cdef class StringStore:
        self._map = PreshMap()
        self._keys.clear()
        for string in strings:
-            self.add(string)
+            self.add(string, allow_transient=False)

    def _get_interned_str(self, hash_value: int) -> str:
        cdef hash_t str_hash
@ -220,12 +270,14 @@ cdef class StringStore:
            return symbol

        utf8str = <Utf8Str*>self._map.get(str_hash)
+        if utf8str is NULL and self._transient_map is not None:
+            utf8str = <Utf8Str*>self._transient_map.get(str_hash)
        if utf8str is NULL:
            raise KeyError(Errors.E018.format(hash_value=str_hash))
        else:
            return self._decode_str_repr(utf8str)

-    cdef hash_t _intern_str(self, str string):
+    cdef hash_t _intern_str(self, str string, bint transient):
        # TODO: This function's API/behaviour is an unholy mess...
        # 0 means missing, but we don't bother offsetting the index.
        chars = string.encode('utf-8')
@ -233,10 +285,21 @@ cdef class StringStore:
        cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
        if value is not NULL:
            return key
-
+        if transient and self._transient_map is not None:
+            # If we've already allocated a transient string, and now we
+            # want to intern it permanently, we'll end up with the string
+            # in both places. That seems fine -- I don't see why we need
+            # to remove it from the transient map.
+            value = <Utf8Str*>self._transient_map.get(key)
+            if value is not NULL:
+                return key
        value = self._allocate_str_repr(<unsigned char*>chars, len(chars))
-        self._map.set(key, value)
-        self._keys.push_back(key)
+        if transient and self._transient_map is not None:
+            self._transient_map.set(key, value)
+            self._transient_keys.push_back(key)
+        else:
+            self._map.set(key, value)
+            self._keys.push_back(key)
        return key

    cdef Utf8Str* _allocate_str_repr(self, const unsigned char* chars, uint32_t length) except *:
--- a/spacy/tests/vocab_vectors/test_memory_zone.py
+++ b/spacy/tests/vocab_vectors/test_memory_zone.py
@ -0,0 +1,36 @@
+from spacy.vocab import Vocab
+
+
+def test_memory_zone_no_insertion():
+    vocab = Vocab()
+    with vocab.memory_zone():
+        pass
+    lex = vocab["horse"]
+    assert lex.text == "horse"
+
+
+def test_memory_zone_insertion():
+    vocab = Vocab()
+    _ = vocab["dog"]
+    assert "dog" in vocab
+    assert "horse" not in vocab
+    with vocab.memory_zone():
+        lex = vocab["horse"]
+        assert lex.text == "horse"
+    assert "dog" in vocab
+    assert "horse" not in vocab
+
+
+def test_memory_zone_redundant_insertion():
+    """Test that if we insert an already-existing word while
+    in the memory zone, it stays persistent"""
+    vocab = Vocab()
+    _ = vocab["dog"]
+    assert "dog" in vocab
+    assert "horse" not in vocab
+    with vocab.memory_zone():
+        lex = vocab["horse"]
+        assert lex.text == "horse"
+        _ = vocab["dog"]
+    assert "dog" in vocab
+    assert "horse" not in vocab
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@ -23,6 +23,7 @@ cdef class Tokenizer:
    cdef object _rules
    cdef PhraseMatcher _special_matcher
    cdef bint _faster_heuristics
+    cdef public int max_cache_size

    cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
    cdef int _apply_special_cases(self, Doc doc) except -1
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@ -31,7 +31,7 @@ cdef class Tokenizer:
    """
    def __init__(self, Vocab vocab, rules=None, prefix_search=None,
                 suffix_search=None, infix_finditer=None, token_match=None,
-                 url_match=None, faster_heuristics=True):
+                 url_match=None, faster_heuristics=True, max_cache_size=10000):
        """Create a `Tokenizer`, to create `Doc` objects given unicode text.

        vocab (Vocab): A storage container for lexical types.
@ -51,6 +51,7 @@ cdef class Tokenizer:
        faster_heuristics (bool): Whether to restrict the final
            Matcher-based pass for rules to those containing affixes or space.
            Defaults to True.
+        max_cache_size (int): Maximum number of tokenization chunks to cache.

        EXAMPLE:
            >>> tokenizer = Tokenizer(nlp.vocab)
@ -70,6 +71,7 @@ cdef class Tokenizer:
        self._rules = {}
        self._special_matcher = PhraseMatcher(self.vocab)
        self._load_special_cases(rules)
+        self.max_cache_size = max_cache_size

    @property
    def token_match(self):
@ -398,8 +400,9 @@ cdef class Tokenizer:
                                   has_special, with_special_cases)
        self._attach_tokens(tokens, span, &prefixes, &suffixes, has_special,
                            with_special_cases)
-        self._save_cached(&tokens.c[orig_size], orig_key, has_special,
-                          tokens.length - orig_size)
+        if len(self._cache) < self.max_cache_size:
+            self._save_cached(&tokens.c[orig_size], orig_key, has_special,
+                            tokens.length - orig_size)

    cdef str _split_affixes(
        self,
@ -514,6 +517,9 @@ cdef class Tokenizer:
        if n <= 0:
            # avoid mem alloc of zero length
            return 0
+        # Historically this check was mostly used to avoid caching
+        # chunks that had tokens owned by the Doc. Now that that's
+        # not a thing, I don't think we need this?
        for i in range(n):
            if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
                return 0
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@ -40,6 +40,8 @@ cdef class Vocab:
    cdef const TokenC* make_fused_token(self, substrings) except NULL

    cdef const LexemeC* _new_lexeme(self, str string) except NULL
-    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
+    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1

    cdef PreshMap _by_orth
+    cdef Pool _non_temp_mem
+    cdef vector[attr_t] _transient_orths
--- a/spacy/vocab.pyi
+++ b/spacy/vocab.pyi
@ -1,7 +1,9 @@
 from pathlib import Path
 from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Union
+from contextlib import contextmanager

 from thinc.types import Floats1d, FloatsXd
+from cymem.cymem import Pool

 from . import Language
 from .lexeme import Lexeme
@ -67,6 +69,8 @@ class Vocab:
    def from_bytes(
        self, bytes_data: bytes, *, exclude: Iterable[str] = ...
    ) -> Vocab: ...
+    @contextmanager
+    def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]: ...

 def pickle_vocab(vocab: Vocab) -> Any: ...
 def unpickle_vocab(
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@ -3,6 +3,8 @@ import functools
 import numpy
 import srsly
 from thinc.api import get_array_module, get_current_ops
+from contextlib import contextmanager, ExitStack
+from typing import Iterator, Optional

 from .attrs cimport LANG, ORTH
 from .lexeme cimport EMPTY_LEXEME, OOV_RANK, Lexeme
@ -86,6 +88,12 @@ cdef class Vocab:
        else:
            self.writing_system = writing_system
        self.get_noun_chunks = get_noun_chunks
+        # During a memory_zone we replace our mem object with one
+        # that's passed to us. We keep a reference to our non-temporary
+        # memory here, in case we need to make an allocation we want to
+        # guarantee is not temporary. This is also how we check whether
+        # we're in a memory zone: we check whether self.mem is self._non_temp_mem
+        self._non_temp_mem = self.mem

    @property
    def vectors(self):
@ -113,6 +121,33 @@ cdef class Vocab:
        """
        return self.length

+    @contextmanager
+    def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]:
+        """Begin a block where resources allocated during the block will
+        be freed at the end of it. If a resources was created within the
+        memory zone block, accessing it outside the block is invalid.
+        Behaviour of this invalid access is undefined. Memory zones should
+        not be nested.
+
+        The memory zone is helpful for services that need to process large
+        volumes of text with a defined memory budget.
+        """
+        if mem is None:
+            mem = Pool()
+        # The ExitStack allows programmatic nested context managers.
+        # We don't know how many we need, so it would be awkward to have
+        # them as nested blocks.
+        with ExitStack() as stack:
+            contexts = [stack.enter_context(self.strings.memory_zone(mem))]
+            if hasattr(self.morphology, "memory_zone"):
+                contexts.append(stack.enter_context(self.morphology.memory_zone(mem)))
+            if hasattr(self._vectors, "memory_zone"):
+                contexts.append(stack.enter_context(self._vectors.memory_zone(mem)))
+            self.mem = mem
+            yield mem
+        self._clear_transient_orths()
+        self.mem = self._non_temp_mem
+
    def add_flag(self, flag_getter, int flag_id=-1):
        """Set a new boolean flag to words in the vocabulary.

@ -147,8 +182,7 @@ cdef class Vocab:

    cdef const LexemeC* get(self, str string) except NULL:
        """Get a pointer to a `LexemeC` from the lexicon, creating a new
-        `Lexeme` if necessary using memory acquired from the given pool. If the
-        pool is the lexicon's own memory, the lexeme is saved in the lexicon.
+        `Lexeme` if necessary.
        """
        if string == "":
            return &EMPTY_LEXEME
@ -180,7 +214,7 @@ cdef class Vocab:

    cdef const LexemeC* _new_lexeme(self, str string) except NULL:
        lex = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
-        lex.orth = self.strings.add(string)
+        lex.orth = self.strings.add(string, allow_transient=True)
        lex.length = len(string)
        if self.vectors is not None and hasattr(self.vectors, "key2row"):
            lex.id = self.vectors.key2row.get(lex.orth, OOV_RANK)
@ -190,17 +224,25 @@ cdef class Vocab:
            for attr, func in self.lex_attr_getters.items():
                value = func(string)
                if isinstance(value, str):
-                    value = self.strings.add(value)
+                    value = self.strings.add(value, allow_transient=True)
                if value is not None:
                    Lexeme.set_struct_attr(lex, attr, value)
-        self._add_lex_to_vocab(lex.orth, lex)
+        self._add_lex_to_vocab(lex.orth, lex, self.mem is not self._non_temp_mem)
        if lex == NULL:
            raise ValueError(Errors.E085.format(string=string))
        return lex

-    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
+    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1:
        self._by_orth.set(lex.orth, <void*>lex)
        self.length += 1
+        if is_transient:
+            self._transient_orths.push_back(lex.orth)
+
+    def _clear_transient_orths(self):
+        """Remove transient lexemes from the index (generally at the end of the memory zone)"""
+        for orth in self._transient_orths:
+            self._by_orth.pop(orth)
+        self._transient_orths.clear()

    def __contains__(self, key):
        """Check whether the string or int key has an entry in the vocabulary.
@ -252,7 +294,7 @@ cdef class Vocab:
        """
        cdef attr_t orth
        if isinstance(id_or_string, str):
-            orth = self.strings.add(id_or_string)
+            orth = self.strings.add(id_or_string, allow_transient=True)
        else:
            orth = id_or_string
        return Lexeme(self, orth)
@ -403,7 +445,7 @@ cdef class Vocab:
        DOCS: https://spacy.io/api/vocab#get_vector
        """
        if isinstance(orth, str):
-            orth = self.strings.add(orth)
+            orth = self.strings.add(orth, allow_transient=True)
        cdef Lexeme lex = self[orth]
        key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
        if self.has_vector(key):
@ -422,7 +464,7 @@ cdef class Vocab:
        DOCS: https://spacy.io/api/vocab#set_vector
        """
        if isinstance(orth, str):
-            orth = self.strings.add(orth)
+            orth = self.strings.add(orth, allow_transient=False)
        cdef Lexeme lex = self[orth]
        key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
        if self.vectors.is_full and key not in self.vectors:
@ -446,7 +488,7 @@ cdef class Vocab:
        DOCS: https://spacy.io/api/vocab#has_vector
        """
        if isinstance(orth, str):
-            orth = self.strings.add(orth)
+            orth = self.strings.add(orth, allow_transient=True)
        cdef Lexeme lex = self[orth]
        key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
        return key in self.vectors