Mirror of https://github.com/explosion/spaCy.git

Integrate memory zones into v3.x

commit 3c86cc669a
parent 319e02545c
spacy/strings.pxd
@@ -25,5 +25,10 @@ cdef class StringStore:
     cdef vector[hash_t] keys
     cdef public PreshMap _map
 
-    cdef const Utf8Str* intern_unicode(self, str py_string)
-    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
+    cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient)
+    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient)
+
+    cdef vector[hash_t] _transient_keys
+    cdef PreshMap _transient_map
+    cdef Pool _non_temp_mem
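The declarations above split the store in two: `_map`/`keys` hold permanent entries, while `_transient_map`/`_transient_keys` hold entries that only live as long as a memory zone, with `_non_temp_mem` keeping a handle on the permanent pool. A minimal pure-Python sketch of that two-tier design (illustrative only, not spaCy code; all names are hypothetical):

from contextlib import contextmanager

class TwoTierStore:
    """Toy model: permanent entries in one map, transient entries in another."""

    def __init__(self):
        self._map = {}              # permanent: lives as long as the store
        self._transient_map = None  # only exists inside a memory zone

    def add(self, key, value, allow_transient=False):
        if allow_transient and self._transient_map is not None:
            self._transient_map[key] = value
        else:
            self._map[key] = value

    def get(self, key):
        if key in self._map:        # permanent entries are checked first
            return self._map[key]
        if self._transient_map is not None:
            return self._transient_map.get(key)
        return None

    @contextmanager
    def memory_zone(self):
        self._transient_map = {}
        try:
            yield
        finally:
            # Dropping the map frees every transient entry at once.
            self._transient_map = None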
spacy/strings.pyx
@@ -1,6 +1,9 @@
 # cython: infer_types=True
 # cython: profile=False
 cimport cython
+from typing import Iterable, Iterator, List, Optional, Tuple, Union
+from contextlib import contextmanager
+
 from libc.stdint cimport uint32_t
 from libc.string cimport memcpy
 from murmurhash.mrmr cimport hash32, hash64
@@ -119,7 +122,9 @@ cdef class StringStore:
         strings (iterable): A sequence of unicode strings to add to the store.
         """
         self.mem = Pool()
+        self._non_temp_mem = self.mem
         self._map = PreshMap()
+        self._transient_map = None
         if strings is not None:
             for string in strings:
                 self.add(string)
@@ -152,10 +157,13 @@ cdef class StringStore:
                 return SYMBOLS_BY_INT[str_hash]
             else:
                 utf8str = <Utf8Str*>self._map.get(str_hash)
+                if utf8str is NULL and self._transient_map is not None:
+                    utf8str = <Utf8Str*>self._transient_map.get(str_hash)
         else:
             # TODO: Raise an error instead
             utf8str = <Utf8Str*>self._map.get(string_or_id)
+            if utf8str is NULL and self._transient_map is not None:
+                utf8str = <Utf8Str*>self._transient_map.get(str_hash)
         if utf8str is NULL:
             raise KeyError(Errors.E018.format(hash_value=string_or_id))
         else:
@@ -175,10 +183,46 @@ cdef class StringStore:
         else:
             return self[key]
 
-    def add(self, string):
+    def __reduce__(self):
+        strings = list(self.non_transient_keys())
+        return (StringStore, (strings,), None, None, None)
+
+    def __len__(self) -> int:
+        """The number of strings in the store.
+
+        RETURNS (int): The number of strings in the store.
+        """
+        return self.keys.size() + self._transient_keys.size()
+
+    @contextmanager
+    def memory_zone(self, mem: Optional[Pool]=None) -> Pool:
+        """Begin a block where all resources allocated during the block will
+        be freed at the end of it. If a resource was created within the
+        memory zone block, accessing it outside the block is invalid.
+        Behaviour of this invalid access is undefined. Memory zones should
+        not be nested.
+
+        The memory zone is helpful for services that need to process large
+        volumes of text with a defined memory budget.
+        """
+        if mem is None:
+            mem = Pool()
+        self.mem = mem
+        self._transient_map = PreshMap()
+        yield mem
+        self.mem = self._non_temp_mem
+        self._transient_map = None
+        self._transient_keys.clear()
+
+    def add(self, string: str, allow_transient: bool = False) -> int:
         """Add a string to the StringStore.
 
         string (str): The string to add.
+        allow_transient (bool): Allow the string to be stored in the 'transient'
+            map, which will be flushed at the end of the memory zone. Strings
+            encountered during arbitrary text processing should be added
+            with allow_transient=True, while labels and other strings used
+            internally should not.
         RETURNS (uint64): The string's hash value.
         """
         cdef hash_t str_hash
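A usage sketch for the StringStore zone API added above (hypothetical snippet, not part of the commit; the strings are arbitrary):

from spacy.strings import StringStore

store = StringStore()
store.add("permanent-label", allow_transient=False)   # survives forever
with store.memory_zone():
    h = store.add("rare-token", allow_transient=True)
    assert store[h] == "rare-token"   # resolvable while the zone is open
assert "permanent-label" in store     # unaffected by the zone
assert "rare-token" not in store      # flushed when the zone closed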
@@ -188,22 +232,26 @@ cdef class StringStore:
 
             string = string.encode("utf8")
             str_hash = hash_utf8(string, len(string))
-            self._intern_utf8(string, len(string), &str_hash)
+            self._intern_utf8(string, len(string), &str_hash, allow_transient)
         elif isinstance(string, bytes):
             if string in SYMBOLS_BY_STR:
                 return SYMBOLS_BY_STR[string]
             str_hash = hash_utf8(string, len(string))
-            self._intern_utf8(string, len(string), &str_hash)
+            self._intern_utf8(string, len(string), &str_hash, allow_transient)
         else:
             raise TypeError(Errors.E017.format(value_type=type(string)))
         return str_hash
 
     def __len__(self):
         """The number of strings in the store.
+        if string in SYMBOLS_BY_STR:
+            return SYMBOLS_BY_STR[string]
+        else:
+            return self._intern_str(string, allow_transient)
 
         RETURNS (int): The number of strings in the store.
         """
-        return self.keys.size()
+        return self.keys.size() + self._transient_keys.size()
 
     def __contains__(self, string_or_id not None):
         """Check whether a string or ID is in the store.
@@ -222,30 +270,70 @@ cdef class StringStore:
             pass
         else:
             # TODO: Raise an error instead
-            return self._map.get(string_or_id) is not NULL
+            if self._map.get(string_or_id) is not NULL:
+                return True
+            elif self._transient_map is not None and self._transient_map.get(string_or_id) is not NULL:
+                return True
+            else:
+                return False
         if str_hash < len(SYMBOLS_BY_INT):
             return True
         else:
-            return self._map.get(str_hash) is not NULL
+            if self._map.get(str_hash) is not NULL:
+                return True
+            elif self._transient_map is not None and self._transient_map.get(string_or_id) is not NULL:
+                return True
+            else:
+                return False
 
     def __iter__(self):
         """Iterate over the strings in the store, in order.
 
         YIELDS (str): A string in the store.
         """
+        yield from self.non_transient_keys()
+        yield from self.transient_keys()
+
+    def non_transient_keys(self) -> Iterator[str]:
+        """Iterate over the stored strings in insertion order.
+
+        RETURNS: A list of strings.
+        """
         cdef int i
         cdef hash_t key
         for i in range(self.keys.size()):
             key = self.keys[i]
             utf8str = <Utf8Str*>self._map.get(key)
             yield decode_Utf8Str(utf8str)
-        # TODO: Iterate OOV here?
 
     def __reduce__(self):
         strings = list(self)
         return (StringStore, (strings,), None, None, None)
 
+    def transient_keys(self) -> Iterator[str]:
+        if self._transient_map is None:
+            return []
+        for i in range(self._transient_keys.size()):
+            utf8str = <Utf8Str*>self._transient_map.get(self._transient_keys[i])
+            yield decode_Utf8Str(utf8str)
+
+    def values(self) -> List[int]:
+        """Iterate over the stored string hashes in insertion order.
+
+        RETURNS: A list of string hashes.
+        """
+        cdef int i
+        hashes = [None] * self.keys.size()
+        for i in range(self.keys.size()):
+            hashes[i] = self.keys[i]
+        if self._transient_map is not None:
+            transient_hashes = [None] * self._transient_keys.size()
+            for i in range(self._transient_keys.size()):
+                transient_hashes[i] = self._transient_keys[i]
+        else:
+            transient_hashes = []
+        return hashes + transient_hashes
+
     def to_disk(self, path):
         """Save the current state to a directory.
 
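An illustrative run of the iteration helpers added above (hypothetical snippet, not from the commit):

from spacy.strings import StringStore

store = StringStore()
store.add("permanent-string", allow_transient=False)
with store.memory_zone():
    store.add("temporary-string", allow_transient=True)
    assert set(store) == {"permanent-string", "temporary-string"}
    assert list(store.transient_keys()) == ["temporary-string"]
assert list(store.transient_keys()) == []   # flushed with the zone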
@@ -269,7 +357,7 @@ cdef class StringStore:
         prev = list(self)
         self._reset_and_load(strings)
         for word in prev:
-            self.add(word)
+            self.add(word, allow_transient=False)
         return self
 
     def to_bytes(self, **kwargs):
@@ -289,7 +377,7 @@ cdef class StringStore:
         prev = list(self)
         self._reset_and_load(strings)
         for word in prev:
-            self.add(word)
+            self.add(word, allow_transient=False)
         return self
 
     def _reset_and_load(self, strings):
@@ -297,22 +385,34 @@ cdef class StringStore:
         self._map = PreshMap()
         self.keys.clear()
         for string in strings:
-            self.add(string)
+            self.add(string, allow_transient=False)
 
-    cdef const Utf8Str* intern_unicode(self, str py_string):
+    cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient):
         # 0 means missing, but we don't bother offsetting the index.
         cdef bytes byte_string = py_string.encode("utf8")
-        return self._intern_utf8(byte_string, len(byte_string), NULL)
+        return self._intern_utf8(byte_string, len(byte_string), NULL, allow_transient)
 
     @cython.final
-    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash):
+    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient):
         # TODO: This function's API/behaviour is an unholy mess...
         # 0 means missing, but we don't bother offsetting the index.
         cdef hash_t key = precalculated_hash[0] if precalculated_hash is not NULL else hash_utf8(utf8_string, length)
         cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
         if value is not NULL:
             return value
+        if allow_transient and self._transient_map is not None:
+            # If we've already allocated a transient string, and now we
+            # want to intern it permanently, we'll end up with the string
+            # in both places. That seems fine -- I don't see why we need
+            # to remove it from the transient map.
+            value = <Utf8Str*>self._transient_map.get(key)
+            if value is not NULL:
+                return value
         value = _allocate(self.mem, <unsigned char*>utf8_string, length)
-        self._map.set(key, value)
-        self.keys.push_back(key)
+        if allow_transient and self._transient_map is not None:
+            self._transient_map.set(key, value)
+            self._transient_keys.push_back(key)
+        else:
+            self._map.set(key, value)
+            self.keys.push_back(key)
         return value
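A hypothetical snippet illustrating the double-interning case the comment above allows for (not from the commit):

from spacy.strings import StringStore

store = StringStore()
with store.memory_zone():
    h1 = store.add("shared-string", allow_transient=True)   # transient copy
    h2 = store.add("shared-string", allow_transient=False)  # permanent copy
    assert h1 == h2                  # same hash regardless of which map holds it
assert "shared-string" in store      # still registered in the permanent map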
spacy/tests/vocab_vectors/test_memory_zone.py (new file, 36 lines)
@@ -0,0 +1,36 @@
+from spacy.vocab import Vocab
+
+
+def test_memory_zone_no_insertion():
+    vocab = Vocab()
+    with vocab.memory_zone():
+        pass
+    lex = vocab["horse"]
+    assert lex.text == "horse"
+
+
+def test_memory_zone_insertion():
+    vocab = Vocab()
+    _ = vocab["dog"]
+    assert "dog" in vocab
+    assert "horse" not in vocab
+    with vocab.memory_zone():
+        lex = vocab["horse"]
+        assert lex.text == "horse"
+    assert "dog" in vocab
+    assert "horse" not in vocab
+
+
+def test_memory_zone_redundant_insertion():
+    """Test that if we insert an already-existing word while
+    in the memory zone, it stays persistent"""
+    vocab = Vocab()
+    _ = vocab["dog"]
+    assert "dog" in vocab
+    assert "horse" not in vocab
+    with vocab.memory_zone():
+        lex = vocab["horse"]
+        assert lex.text == "horse"
+        _ = vocab["dog"]
+    assert "dog" in vocab
+    assert "horse" not in vocab
spacy/tokenizer.pxd
@@ -25,9 +25,7 @@ cdef class Tokenizer:
     cdef PhraseMatcher _special_matcher
     # TODO convert to bool in v4
    cdef int _faster_heuristics
-    # TODO next one is unused and should be removed in v4
-    # https://github.com/explosion/spaCy/pull/9150
-    cdef int _unused_int2
+    cdef public int max_cache_size
 
     cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
     cdef int _apply_special_cases(self, Doc doc) except -1
spacy/tokenizer.pyx
@@ -30,7 +30,7 @@ cdef class Tokenizer:
     """
     def __init__(self, Vocab vocab, rules=None, prefix_search=None,
                  suffix_search=None, infix_finditer=None, token_match=None,
-                 url_match=None, faster_heuristics=True):
+                 url_match=None, faster_heuristics=True, max_cache_size=10000):
         """Create a `Tokenizer`, to create `Doc` objects given unicode text.
 
         vocab (Vocab): A storage container for lexical types.
@@ -50,6 +50,7 @@ cdef class Tokenizer:
         faster_heuristics (bool): Whether to restrict the final
             Matcher-based pass for rules to those containing affixes or space.
             Defaults to True.
+        max_cache_size (int): Maximum number of tokenization chunks to cache.
 
         EXAMPLE:
             >>> tokenizer = Tokenizer(nlp.vocab)
@@ -69,6 +70,7 @@ cdef class Tokenizer:
         self._rules = {}
         self._special_matcher = PhraseMatcher(self.vocab)
         self._load_special_cases(rules)
+        self.max_cache_size = max_cache_size
 
     @property
     def token_match(self):
@@ -397,8 +399,9 @@ cdef class Tokenizer:
                                       has_special, with_special_cases)
             self._attach_tokens(tokens, span, &prefixes, &suffixes, has_special,
                                 with_special_cases)
-            self._save_cached(&tokens.c[orig_size], orig_key, has_special,
-                              tokens.length - orig_size)
+            if len(self._cache) < self.max_cache_size:
+                self._save_cached(&tokens.c[orig_size], orig_key, has_special,
+                                  tokens.length - orig_size)
 
     cdef str _split_affixes(
         self,
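A hypothetical use of the new cap added above (the 2**14 value is arbitrary; the default stays at 10000, and the affix rules are omitted for brevity):

import spacy
from spacy.tokenizer import Tokenizer

nlp = spacy.blank("en")
# Replace the default tokenizer with one whose chunk cache stops growing
# once it holds 2**14 entries.
nlp.tokenizer = Tokenizer(nlp.vocab, max_cache_size=2**14)
doc = nlp("This text is tokenized with a bounded cache.")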
@@ -514,6 +517,9 @@ cdef class Tokenizer:
         if n <= 0:
             # avoid mem alloc of zero length
             return 0
+        # Historically this check was mostly used to avoid caching
+        # chunks that had tokens owned by the Doc. Now that that's
+        # not a thing, I don't think we need this?
         for i in range(n):
             if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
                 return 0
spacy/vocab.pxd
@@ -41,7 +41,9 @@ cdef class Vocab:
     cdef const TokenC* make_fused_token(self, substrings) except NULL
 
     cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
-    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
+    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1
     cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
 
     cdef PreshMap _by_orth
+    cdef Pool _non_temp_mem
+    cdef vector[attr_t] _transient_orths
spacy/vocab.pyi
@@ -1,7 +1,9 @@
 from pathlib import Path
 from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Union
+from contextlib import contextmanager
 
 from thinc.types import Floats1d, FloatsXd
+from cymem.cymem import Pool
 
 from . import Language
 from .lexeme import Lexeme
@@ -67,6 +69,8 @@ class Vocab:
     def from_bytes(
         self, bytes_data: bytes, *, exclude: Iterable[str] = ...
     ) -> Vocab: ...
+    @contextmanager
+    def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]: ...
 
 def pickle_vocab(vocab: Vocab) -> Any: ...
 def unpickle_vocab(
spacy/vocab.pyx
@@ -3,6 +3,8 @@ import functools
 import numpy
 import srsly
 from thinc.api import get_array_module, get_current_ops
+from contextlib import contextmanager, ExitStack
+from typing import Iterator, Optional
 
 from .attrs cimport LANG, ORTH
 from .lexeme cimport EMPTY_LEXEME, OOV_RANK, Lexeme
@@ -87,6 +89,12 @@ cdef class Vocab:
         self.lookups = lookups
         self.writing_system = writing_system
         self.get_noun_chunks = get_noun_chunks
+        # During a memory_zone we replace our mem object with one
+        # that's passed to us. We keep a reference to our non-temporary
+        # memory here, in case we need to make an allocation we want to
+        # guarantee is not temporary. This is also how we check whether
+        # we're in a memory zone: we check whether self.mem is self._non_temp_mem
+        self._non_temp_mem = self.mem
 
     @property
     def vectors(self):
@@ -114,6 +122,33 @@ cdef class Vocab:
         """
         return self.length
 
+    @contextmanager
+    def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]:
+        """Begin a block where resources allocated during the block will
+        be freed at the end of it. If a resource was created within the
+        memory zone block, accessing it outside the block is invalid.
+        Behaviour of this invalid access is undefined. Memory zones should
+        not be nested.
+
+        The memory zone is helpful for services that need to process large
+        volumes of text with a defined memory budget.
+        """
+        if mem is None:
+            mem = Pool()
+        # The ExitStack allows programmatic nested context managers.
+        # We don't know how many we need, so it would be awkward to have
+        # them as nested blocks.
+        with ExitStack() as stack:
+            contexts = [stack.enter_context(self.strings.memory_zone(mem))]
+            if hasattr(self.morphology, "memory_zone"):
+                contexts.append(stack.enter_context(self.morphology.memory_zone(mem)))
+            if hasattr(self._vectors, "memory_zone"):
+                contexts.append(stack.enter_context(self._vectors.memory_zone(mem)))
+            self.mem = mem
+            yield mem
+            self._clear_transient_orths()
+            self.mem = self._non_temp_mem
+
     def add_flag(self, flag_getter, int flag_id=-1):
         """Set a new boolean flag to words in the vocabulary.
 
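A usage sketch of the service-style loop the docstring describes (hypothetical snippet, not part of the commit; the texts are stand-ins):

import spacy

nlp = spacy.blank("en")
texts = ["First document.", "Second document."]  # stand-in stream
with nlp.vocab.memory_zone():
    for doc in nlp.pipe(texts):
        print([token.text for token in doc])  # consume docs inside the zone
# Strings and lexemes interned during the zone have been released here.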
@@ -148,8 +183,7 @@ cdef class Vocab:
 
     cdef const LexemeC* get(self, Pool mem, str string) except NULL:
         """Get a pointer to a `LexemeC` from the lexicon, creating a new
-        `Lexeme` if necessary using memory acquired from the given pool. If the
-        pool is the lexicon's own memory, the lexeme is saved in the lexicon.
+        `Lexeme` if necessary.
         """
         if string == "":
             return &EMPTY_LEXEME
@@ -180,17 +214,9 @@ cdef class Vocab:
             return self._new_lexeme(mem, self.strings[orth])
 
     cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
-        # I think this heuristic is bad, and the Vocab should always
-        # own the lexemes. It avoids weird bugs this way, as it's how the thing
-        # was originally supposed to work. The best solution to the growing
-        # memory use is to periodically reset the vocab, which is an action
-        # that should be up to the user to do (so we don't need to keep track
-        # of the doc ownership).
-        # TODO: Change the C API so that the mem isn't passed in here.
+        # The mem argument is deprecated, replaced by memory zones. Same with
+        # this size heuristic.
         mem = self.mem
-        # if len(string) < 3 or self.length < 10000:
-        #     mem = self.mem
-        cdef bint is_oov = mem is not self.mem
         lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
         lex.orth = self.strings.add(string)
         lex.length = len(string)
@@ -202,18 +228,25 @@ cdef class Vocab:
         for attr, func in self.lex_attr_getters.items():
             value = func(string)
             if isinstance(value, str):
-                value = self.strings.add(value)
+                value = self.strings.add(value, allow_transient=True)
             if value is not None:
                 Lexeme.set_struct_attr(lex, attr, value)
-        if not is_oov:
-            self._add_lex_to_vocab(lex.orth, lex)
+        self._add_lex_to_vocab(lex.orth, lex, self.mem is not self._non_temp_mem)
         if lex == NULL:
             raise ValueError(Errors.E085.format(string=string))
         return lex
 
-    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
+    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1:
         self._by_orth.set(lex.orth, <void*>lex)
         self.length += 1
+        if is_transient:
+            self._transient_orths.push_back(lex.orth)
+
+    def _clear_transient_orths(self):
+        """Remove transient lexemes from the index (generally at the end of the memory zone)"""
+        for orth in self._transient_orths:
+            self._by_orth.pop(orth)
+        self._transient_orths.clear()
 
     def __contains__(self, key):
         """Check whether the string or int key has an entry in the vocabulary.
@@ -265,7 +298,7 @@ cdef class Vocab:
         """
         cdef attr_t orth
         if isinstance(id_or_string, str):
-            orth = self.strings.add(id_or_string)
+            orth = self.strings.add(id_or_string, allow_transient=True)
         else:
             orth = id_or_string
         return Lexeme(self, orth)
@@ -417,7 +450,7 @@ cdef class Vocab:
         DOCS: https://spacy.io/api/vocab#get_vector
         """
         if isinstance(orth, str):
-            orth = self.strings.add(orth)
+            orth = self.strings.add(orth, allow_transient=True)
         cdef Lexeme lex = self[orth]
         key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
         if self.has_vector(key):
@@ -436,7 +469,7 @@ cdef class Vocab:
         DOCS: https://spacy.io/api/vocab#set_vector
         """
         if isinstance(orth, str):
-            orth = self.strings.add(orth)
+            orth = self.strings.add(orth, allow_transient=False)
         cdef Lexeme lex = self[orth]
         key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
         if self.vectors.is_full and key not in self.vectors:
@@ -460,7 +493,7 @@ cdef class Vocab:
         DOCS: https://spacy.io/api/vocab#has_vector
         """
         if isinstance(orth, str):
-            orth = self.strings.add(orth)
+            orth = self.strings.add(orth, allow_transient=True)
         cdef Lexeme lex = self[orth]
         key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
         return key in self.vectors
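The asymmetry in the three vector hunks above (reads intern transiently, writes intern permanently) matters because the vectors table keeps the key after the zone closes. A hypothetical illustration, not from the commit:

import numpy
import spacy

nlp = spacy.blank("en")
# Writing pins "word" permanently: the vectors table holds its key past
# any zone, so the string must not be flushed with a zone.
nlp.vocab.set_vector("word", numpy.ones((3,), dtype="float32"))
with nlp.vocab.memory_zone():
    # Reading may intern transiently: "word" is only looked up here.
    assert nlp.vocab.has_vector("word")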