Support 'memory zones' for user memory management (#13621)

Add a context manager, nlp.memory_zone(), which will begin
memory_zone() blocks on the vocab, string store, and potentially
other components.

Example usage:

```
with nlp.memory_zone():
    for doc in nlp.pipe(texts):
        do_something(doc)
# do_something(doc) <-- Invalid
```

Once the memory_zone() block expires, spaCy will free any shared
resources that were allocated for the text-processing that occurred
within the memory_zone. If you create Doc objects within a memory
zone, it's invalid to access them once the memory zone is expired.

The purpose of this is that spaCy creates and stores Lexeme objects
in the Vocab that can be shared between multiple Doc objects. It also
interns strings. Normally, spaCy can't know when all Doc objects using
a Lexeme are out-of-scope, so new Lexemes accumulate in the vocab,
causing memory pressure.

Memory zones solve this problem by telling spaCy "okay none of the
documents allocated within this block will be accessed again". This
lets spaCy free all new Lexeme objects and other data that were
created during the block.

The mechanism is general, so memory_zone() context managers can be
added to other components that could benefit from them, e.g. pipeline
components.

I experimented with adding memory zone support to the tokenizer as well,
for its cache. However, this seems unnecessarily complicated. It makes
more sense to just stick a limit on the cache size. This lets spaCy
benefit from the efficiency advantage of the cache better, because
we can maintain a (bounded) cache even if only small batches of
documents are being processed.
This commit is contained in:
Matthew Honnibal 2024-09-09 11:19:39 +02:00 committed by GitHub
parent 608f65ce40
commit 1b8d560d0e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 232 additions and 49 deletions

View File

@ -25,5 +25,8 @@ cdef class StringStore:
cdef vector[hash_t] keys cdef vector[hash_t] keys
cdef public PreshMap _map cdef public PreshMap _map
cdef const Utf8Str* intern_unicode(self, str py_string) cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient)
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash) cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient)
cdef vector[hash_t] _transient_keys
cdef PreshMap _transient_map
cdef Pool _non_temp_mem

View File

@ -1,6 +1,10 @@
# cython: infer_types=True # cython: infer_types=True
# cython: profile=False # cython: profile=False
cimport cython cimport cython
from contextlib import contextmanager
from typing import Iterator, List, Optional
from libc.stdint cimport uint32_t from libc.stdint cimport uint32_t
from libc.string cimport memcpy from libc.string cimport memcpy
from murmurhash.mrmr cimport hash32, hash64 from murmurhash.mrmr cimport hash32, hash64
@ -31,7 +35,7 @@ def get_string_id(key):
This function optimises for convenience over performance, so shouldn't be This function optimises for convenience over performance, so shouldn't be
used in tight loops. used in tight loops.
""" """
cdef hash_t str_hash cdef hash_t str_hash
if isinstance(key, str): if isinstance(key, str):
if len(key) == 0: if len(key) == 0:
return 0 return 0
@ -45,8 +49,8 @@ def get_string_id(key):
elif _try_coerce_to_hash(key, &str_hash): elif _try_coerce_to_hash(key, &str_hash):
# Coerce the integral key to the expected primitive hash type. # Coerce the integral key to the expected primitive hash type.
# This ensures that custom/overloaded "primitive" data types # This ensures that custom/overloaded "primitive" data types
# such as those implemented by numpy are not inadvertently used # such as those implemented by numpy are not inadvertently used
# downstream (as these are internally implemented as custom PyObjects # downstream (as these are internally implemented as custom PyObjects
# whose comparison operators can incur a significant overhead). # whose comparison operators can incur a significant overhead).
return str_hash return str_hash
else: else:
@ -119,7 +123,9 @@ cdef class StringStore:
strings (iterable): A sequence of unicode strings to add to the store. strings (iterable): A sequence of unicode strings to add to the store.
""" """
self.mem = Pool() self.mem = Pool()
self._non_temp_mem = self.mem
self._map = PreshMap() self._map = PreshMap()
self._transient_map = None
if strings is not None: if strings is not None:
for string in strings: for string in strings:
self.add(string) self.add(string)
@ -152,10 +158,13 @@ cdef class StringStore:
return SYMBOLS_BY_INT[str_hash] return SYMBOLS_BY_INT[str_hash]
else: else:
utf8str = <Utf8Str*>self._map.get(str_hash) utf8str = <Utf8Str*>self._map.get(str_hash)
if utf8str is NULL and self._transient_map is not None:
utf8str = <Utf8Str*>self._transient_map.get(str_hash)
else: else:
# TODO: Raise an error instead # TODO: Raise an error instead
utf8str = <Utf8Str*>self._map.get(string_or_id) utf8str = <Utf8Str*>self._map.get(string_or_id)
if utf8str is NULL and self._transient_map is not None:
utf8str = <Utf8Str*>self._transient_map.get(str_hash)
if utf8str is NULL: if utf8str is NULL:
raise KeyError(Errors.E018.format(hash_value=string_or_id)) raise KeyError(Errors.E018.format(hash_value=string_or_id))
else: else:
@ -175,10 +184,46 @@ cdef class StringStore:
else: else:
return self[key] return self[key]
def add(self, string): def __reduce__(self):
strings = list(self.non_transient_keys())
return (StringStore, (strings,), None, None, None)
def __len__(self) -> int:
"""The number of strings in the store.
RETURNS (int): The number of strings in the store.
"""
return self._keys.size() + self._transient_keys.size()
@contextmanager
def memory_zone(self, mem: Optional[Pool] = None) -> Pool:
"""Begin a block where all resources allocated during the block will
be freed at the end of it. If a resource was created within the
memory zone block, accessing it outside the block is invalid.
Behaviour of this invalid access is undefined. Memory zones should
not be nested.
The memory zone is helpful for services that need to process large
volumes of text with a defined memory budget.
"""
if mem is None:
mem = Pool()
self.mem = mem
self._transient_map = PreshMap()
yield mem
self.mem = self._non_temp_mem
self._transient_map = None
self._transient_keys.clear()
def add(self, string: str, allow_transient: bool = False) -> int:
"""Add a string to the StringStore. """Add a string to the StringStore.
string (str): The string to add. string (str): The string to add.
allow_transient (bool): Allow the string to be stored in the 'transient'
map, which will be flushed at the end of the memory zone. Strings
encountered during arbitrary text processing should be added
with allow_transient=True, while labels and other strings used
internally should not.
RETURNS (uint64): The string's hash value. RETURNS (uint64): The string's hash value.
""" """
cdef hash_t str_hash cdef hash_t str_hash
@ -188,22 +233,26 @@ cdef class StringStore:
string = string.encode("utf8") string = string.encode("utf8")
str_hash = hash_utf8(string, len(string)) str_hash = hash_utf8(string, len(string))
self._intern_utf8(string, len(string), &str_hash) self._intern_utf8(string, len(string), &str_hash, allow_transient)
elif isinstance(string, bytes): elif isinstance(string, bytes):
if string in SYMBOLS_BY_STR: if string in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string] return SYMBOLS_BY_STR[string]
str_hash = hash_utf8(string, len(string)) str_hash = hash_utf8(string, len(string))
self._intern_utf8(string, len(string), &str_hash) self._intern_utf8(string, len(string), &str_hash, allow_transient)
else: else:
raise TypeError(Errors.E017.format(value_type=type(string))) raise TypeError(Errors.E017.format(value_type=type(string)))
return str_hash return str_hash
def __len__(self): def __len__(self):
"""The number of strings in the store. """The number of strings in the store.
if string in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string]
else:
return self._intern_str(string, allow_transient)
RETURNS (int): The number of strings in the store. RETURNS (int): The number of strings in the store.
""" """
return self.keys.size() return self.keys.size() + self._transient_keys.size()
def __contains__(self, string_or_id not None): def __contains__(self, string_or_id not None):
"""Check whether a string or ID is in the store. """Check whether a string or ID is in the store.
@ -222,30 +271,70 @@ cdef class StringStore:
pass pass
else: else:
# TODO: Raise an error instead # TODO: Raise an error instead
return self._map.get(string_or_id) is not NULL if self._map.get(string_or_id) is not NULL:
return True
elif self._transient_map is not None and self._transient_map.get(string_or_id) is not NULL:
return True
else:
return False
if str_hash < len(SYMBOLS_BY_INT): if str_hash < len(SYMBOLS_BY_INT):
return True return True
else: else:
return self._map.get(str_hash) is not NULL if self._map.get(str_hash) is not NULL:
return True
elif self._transient_map is not None and self._transient_map.get(string_or_id) is not NULL:
return True
else:
return False
def __iter__(self): def __iter__(self):
"""Iterate over the strings in the store, in order. """Iterate over the strings in the store, in order.
YIELDS (str): A string in the store. YIELDS (str): A string in the store.
""" """
yield from self.non_transient_keys()
yield from self.transient_keys()
def non_transient_keys(self) -> Iterator[str]:
"""Iterate over the stored strings in insertion order.
RETURNS: A list of strings.
"""
cdef int i cdef int i
cdef hash_t key cdef hash_t key
for i in range(self.keys.size()): for i in range(self.keys.size()):
key = self.keys[i] key = self.keys[i]
utf8str = <Utf8Str*>self._map.get(key) utf8str = <Utf8Str*>self._map.get(key)
yield decode_Utf8Str(utf8str) yield decode_Utf8Str(utf8str)
# TODO: Iterate OOV here?
def __reduce__(self): def __reduce__(self):
strings = list(self) strings = list(self)
return (StringStore, (strings,), None, None, None) return (StringStore, (strings,), None, None, None)
def transient_keys(self) -> Iterator[str]:
if self._transient_map is None:
return []
for i in range(self._transient_keys.size()):
utf8str = <Utf8Str*>self._transient_map.get(self._transient_keys[i])
yield decode_Utf8Str(utf8str)
def values(self) -> List[int]:
"""Iterate over the stored strings hashes in insertion order.
RETURNS: A list of string hashes.
"""
cdef int i
hashes = [None] * self._keys.size()
for i in range(self._keys.size()):
hashes[i] = self._keys[i]
if self._transient_map is not None:
transient_hashes = [None] * self._transient_keys.size()
for i in range(self._transient_keys.size()):
transient_hashes[i] = self._transient_keys[i]
else:
transient_hashes = []
return hashes + transient_hashes
def to_disk(self, path): def to_disk(self, path):
"""Save the current state to a directory. """Save the current state to a directory.
@ -269,7 +358,7 @@ cdef class StringStore:
prev = list(self) prev = list(self)
self._reset_and_load(strings) self._reset_and_load(strings)
for word in prev: for word in prev:
self.add(word) self.add(word, allow_transient=False)
return self return self
def to_bytes(self, **kwargs): def to_bytes(self, **kwargs):
@ -289,7 +378,7 @@ cdef class StringStore:
prev = list(self) prev = list(self)
self._reset_and_load(strings) self._reset_and_load(strings)
for word in prev: for word in prev:
self.add(word) self.add(word, allow_transient=False)
return self return self
def _reset_and_load(self, strings): def _reset_and_load(self, strings):
@ -297,22 +386,34 @@ cdef class StringStore:
self._map = PreshMap() self._map = PreshMap()
self.keys.clear() self.keys.clear()
for string in strings: for string in strings:
self.add(string) self.add(string, allow_transient=False)
cdef const Utf8Str* intern_unicode(self, str py_string): cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient):
# 0 means missing, but we don't bother offsetting the index. # 0 means missing, but we don't bother offsetting the index.
cdef bytes byte_string = py_string.encode("utf8") cdef bytes byte_string = py_string.encode("utf8")
return self._intern_utf8(byte_string, len(byte_string), NULL) return self._intern_utf8(byte_string, len(byte_string), NULL, allow_transient)
@cython.final @cython.final
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash): cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient):
# TODO: This function's API/behaviour is an unholy mess... # TODO: This function's API/behaviour is an unholy mess...
# 0 means missing, but we don't bother offsetting the index. # 0 means missing, but we don't bother offsetting the index.
cdef hash_t key = precalculated_hash[0] if precalculated_hash is not NULL else hash_utf8(utf8_string, length) cdef hash_t key = precalculated_hash[0] if precalculated_hash is not NULL else hash_utf8(utf8_string, length)
cdef Utf8Str* value = <Utf8Str*>self._map.get(key) cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
if value is not NULL: if value is not NULL:
return value return value
if allow_transient and self._transient_map is not None:
# If we've already allocated a transient string, and now we
# want to intern it permanently, we'll end up with the string
# in both places. That seems fine -- I don't see why we need
# to remove it from the transient map.
value = <Utf8Str*>self._transient_map.get(key)
if value is not NULL:
return value
value = _allocate(self.mem, <unsigned char*>utf8_string, length) value = _allocate(self.mem, <unsigned char*>utf8_string, length)
self._map.set(key, value) if allow_transient and self._transient_map is not None:
self.keys.push_back(key) self._transient_map.set(key, value)
self._transient_keys.push_back(key)
else:
self._map.set(key, value)
self.keys.push_back(key)
return value return value

View File

@ -0,0 +1,36 @@
from spacy.vocab import Vocab
def test_memory_zone_no_insertion():
vocab = Vocab()
with vocab.memory_zone():
pass
lex = vocab["horse"]
assert lex.text == "horse"
def test_memory_zone_insertion():
vocab = Vocab()
_ = vocab["dog"]
assert "dog" in vocab
assert "horse" not in vocab
with vocab.memory_zone():
lex = vocab["horse"]
assert lex.text == "horse"
assert "dog" in vocab
assert "horse" not in vocab
def test_memory_zone_redundant_insertion():
"""Test that if we insert an already-existing word while
in the memory zone, it stays persistent"""
vocab = Vocab()
_ = vocab["dog"]
assert "dog" in vocab
assert "horse" not in vocab
with vocab.memory_zone():
lex = vocab["horse"]
assert lex.text == "horse"
_ = vocab["dog"]
assert "dog" in vocab
assert "horse" not in vocab

View File

@ -25,9 +25,7 @@ cdef class Tokenizer:
cdef PhraseMatcher _special_matcher cdef PhraseMatcher _special_matcher
# TODO convert to bool in v4 # TODO convert to bool in v4
cdef int _faster_heuristics cdef int _faster_heuristics
# TODO next one is unused and should be removed in v4 cdef public int max_cache_size
# https://github.com/explosion/spaCy/pull/9150
cdef int _unused_int2
cdef Doc _tokenize_affixes(self, str string, bint with_special_cases) cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
cdef int _apply_special_cases(self, Doc doc) except -1 cdef int _apply_special_cases(self, Doc doc) except -1

View File

@ -30,7 +30,7 @@ cdef class Tokenizer:
""" """
def __init__(self, Vocab vocab, rules=None, prefix_search=None, def __init__(self, Vocab vocab, rules=None, prefix_search=None,
suffix_search=None, infix_finditer=None, token_match=None, suffix_search=None, infix_finditer=None, token_match=None,
url_match=None, faster_heuristics=True): url_match=None, faster_heuristics=True, max_cache_size=10000):
"""Create a `Tokenizer`, to create `Doc` objects given unicode text. """Create a `Tokenizer`, to create `Doc` objects given unicode text.
vocab (Vocab): A storage container for lexical types. vocab (Vocab): A storage container for lexical types.
@ -50,6 +50,7 @@ cdef class Tokenizer:
faster_heuristics (bool): Whether to restrict the final faster_heuristics (bool): Whether to restrict the final
Matcher-based pass for rules to those containing affixes or space. Matcher-based pass for rules to those containing affixes or space.
Defaults to True. Defaults to True.
max_cache_size (int): Maximum number of tokenization chunks to cache.
EXAMPLE: EXAMPLE:
>>> tokenizer = Tokenizer(nlp.vocab) >>> tokenizer = Tokenizer(nlp.vocab)
@ -69,6 +70,7 @@ cdef class Tokenizer:
self._rules = {} self._rules = {}
self._special_matcher = PhraseMatcher(self.vocab) self._special_matcher = PhraseMatcher(self.vocab)
self._load_special_cases(rules) self._load_special_cases(rules)
self.max_cache_size = max_cache_size
@property @property
def token_match(self): def token_match(self):
@ -397,8 +399,9 @@ cdef class Tokenizer:
has_special, with_special_cases) has_special, with_special_cases)
self._attach_tokens(tokens, span, &prefixes, &suffixes, has_special, self._attach_tokens(tokens, span, &prefixes, &suffixes, has_special,
with_special_cases) with_special_cases)
self._save_cached(&tokens.c[orig_size], orig_key, has_special, if len(self._cache) < self.max_cache_size:
tokens.length - orig_size) self._save_cached(&tokens.c[orig_size], orig_key, has_special,
tokens.length - orig_size)
cdef str _split_affixes( cdef str _split_affixes(
self, self,
@ -514,6 +517,9 @@ cdef class Tokenizer:
if n <= 0: if n <= 0:
# avoid mem alloc of zero length # avoid mem alloc of zero length
return 0 return 0
# Historically this check was mostly used to avoid caching
# chunks that had tokens owned by the Doc. Now that that's
# not a thing, I don't think we need this?
for i in range(n): for i in range(n):
if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL: if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
return 0 return 0

View File

@ -41,7 +41,9 @@ cdef class Vocab:
cdef const TokenC* make_fused_token(self, substrings) except NULL cdef const TokenC* make_fused_token(self, substrings) except NULL
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1 cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
cdef PreshMap _by_orth cdef PreshMap _by_orth
cdef Pool _non_temp_mem
cdef vector[attr_t] _transient_orths

View File

@ -1,6 +1,8 @@
from contextlib import contextmanager
from pathlib import Path from pathlib import Path
from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Union from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Union
from cymem.cymem import Pool
from thinc.types import Floats1d, FloatsXd from thinc.types import Floats1d, FloatsXd
from . import Language from . import Language
@ -67,6 +69,8 @@ class Vocab:
def from_bytes( def from_bytes(
self, bytes_data: bytes, *, exclude: Iterable[str] = ... self, bytes_data: bytes, *, exclude: Iterable[str] = ...
) -> Vocab: ... ) -> Vocab: ...
@contextmanager
def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]: ...
def pickle_vocab(vocab: Vocab) -> Any: ... def pickle_vocab(vocab: Vocab) -> Any: ...
def unpickle_vocab( def unpickle_vocab(

View File

@ -1,4 +1,6 @@
import functools import functools
from contextlib import ExitStack, contextmanager
from typing import Iterator, Optional
import numpy import numpy
import srsly import srsly
@ -87,6 +89,12 @@ cdef class Vocab:
self.lookups = lookups self.lookups = lookups
self.writing_system = writing_system self.writing_system = writing_system
self.get_noun_chunks = get_noun_chunks self.get_noun_chunks = get_noun_chunks
# During a memory_zone we replace our mem object with one
# that's passed to us. We keep a reference to our non-temporary
# memory here, in case we need to make an allocation we want to
# guarantee is not temporary. This is also how we check whether
# we're in a memory zone: we check whether self.mem is self._non_temp_mem
self._non_temp_mem = self.mem
@property @property
def vectors(self): def vectors(self):
@ -114,6 +122,33 @@ cdef class Vocab:
""" """
return self.length return self.length
@contextmanager
def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]:
"""Begin a block where resources allocated during the block will
be freed at the end of it. If a resource was created within the
memory zone block, accessing it outside the block is invalid.
Behaviour of this invalid access is undefined. Memory zones should
not be nested.
The memory zone is helpful for services that need to process large
volumes of text with a defined memory budget.
"""
if mem is None:
mem = Pool()
# The ExitStack allows programmatic nested context managers.
# We don't know how many we need, so it would be awkward to have
# them as nested blocks.
with ExitStack() as stack:
contexts = [stack.enter_context(self.strings.memory_zone(mem))]
if hasattr(self.morphology, "memory_zone"):
contexts.append(stack.enter_context(self.morphology.memory_zone(mem)))
if hasattr(self._vectors, "memory_zone"):
contexts.append(stack.enter_context(self._vectors.memory_zone(mem)))
self.mem = mem
yield mem
self._clear_transient_orths()
self.mem = self._non_temp_mem
def add_flag(self, flag_getter, int flag_id=-1): def add_flag(self, flag_getter, int flag_id=-1):
"""Set a new boolean flag to words in the vocabulary. """Set a new boolean flag to words in the vocabulary.
@ -148,8 +183,7 @@ cdef class Vocab:
cdef const LexemeC* get(self, Pool mem, str string) except NULL: cdef const LexemeC* get(self, Pool mem, str string) except NULL:
"""Get a pointer to a `LexemeC` from the lexicon, creating a new """Get a pointer to a `LexemeC` from the lexicon, creating a new
`Lexeme` if necessary using memory acquired from the given pool. If the `Lexeme` if necessary.
pool is the lexicon's own memory, the lexeme is saved in the lexicon.
""" """
if string == "": if string == "":
return &EMPTY_LEXEME return &EMPTY_LEXEME
@ -180,17 +214,9 @@ cdef class Vocab:
return self._new_lexeme(mem, self.strings[orth]) return self._new_lexeme(mem, self.strings[orth])
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL: cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
# I think this heuristic is bad, and the Vocab should always # The mem argument is deprecated, replaced by memory zones. Same with
# own the lexemes. It avoids weird bugs this way, as it's how the thing # this size heuristic.
# was originally supposed to work. The best solution to the growing
# memory use is to periodically reset the vocab, which is an action
# that should be up to the user to do (so we don't need to keep track
# of the doc ownership).
# TODO: Change the C API so that the mem isn't passed in here.
mem = self.mem mem = self.mem
# if len(string) < 3 or self.length < 10000:
# mem = self.mem
cdef bint is_oov = mem is not self.mem
lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC)) lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
lex.orth = self.strings.add(string) lex.orth = self.strings.add(string)
lex.length = len(string) lex.length = len(string)
@ -202,18 +228,25 @@ cdef class Vocab:
for attr, func in self.lex_attr_getters.items(): for attr, func in self.lex_attr_getters.items():
value = func(string) value = func(string)
if isinstance(value, str): if isinstance(value, str):
value = self.strings.add(value) value = self.strings.add(value, allow_transient=True)
if value is not None: if value is not None:
Lexeme.set_struct_attr(lex, attr, value) Lexeme.set_struct_attr(lex, attr, value)
if not is_oov: self._add_lex_to_vocab(lex.orth, lex, self.mem is not self._non_temp_mem)
self._add_lex_to_vocab(lex.orth, lex)
if lex == NULL: if lex == NULL:
raise ValueError(Errors.E085.format(string=string)) raise ValueError(Errors.E085.format(string=string))
return lex return lex
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1: cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1:
self._by_orth.set(lex.orth, <void*>lex) self._by_orth.set(lex.orth, <void*>lex)
self.length += 1 self.length += 1
if is_transient:
self._transient_orths.push_back(lex.orth)
def _clear_transient_orths(self):
"""Remove transient lexemes from the index (generally at the end of the memory zone)"""
for orth in self._transient_orths:
self._by_orth.pop(orth)
self._transient_orths.clear()
def __contains__(self, key): def __contains__(self, key):
"""Check whether the string or int key has an entry in the vocabulary. """Check whether the string or int key has an entry in the vocabulary.
@ -265,7 +298,7 @@ cdef class Vocab:
""" """
cdef attr_t orth cdef attr_t orth
if isinstance(id_or_string, str): if isinstance(id_or_string, str):
orth = self.strings.add(id_or_string) orth = self.strings.add(id_or_string, allow_transient=True)
else: else:
orth = id_or_string orth = id_or_string
return Lexeme(self, orth) return Lexeme(self, orth)
@ -417,7 +450,7 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab#get_vector DOCS: https://spacy.io/api/vocab#get_vector
""" """
if isinstance(orth, str): if isinstance(orth, str):
orth = self.strings.add(orth) orth = self.strings.add(orth, allow_transient=True)
cdef Lexeme lex = self[orth] cdef Lexeme lex = self[orth]
key = Lexeme.get_struct_attr(lex.c, self.vectors.attr) key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
if self.has_vector(key): if self.has_vector(key):
@ -436,7 +469,7 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab#set_vector DOCS: https://spacy.io/api/vocab#set_vector
""" """
if isinstance(orth, str): if isinstance(orth, str):
orth = self.strings.add(orth) orth = self.strings.add(orth, allow_transient=False)
cdef Lexeme lex = self[orth] cdef Lexeme lex = self[orth]
key = Lexeme.get_struct_attr(lex.c, self.vectors.attr) key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
if self.vectors.is_full and key not in self.vectors: if self.vectors.is_full and key not in self.vectors:
@ -460,7 +493,7 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab#has_vector DOCS: https://spacy.io/api/vocab#has_vector
""" """
if isinstance(orth, str): if isinstance(orth, str):
orth = self.strings.add(orth) orth = self.strings.add(orth, allow_transient=True)
cdef Lexeme lex = self[orth] cdef Lexeme lex = self[orth]
key = Lexeme.get_struct_attr(lex.c, self.vectors.attr) key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
return key in self.vectors return key in self.vectors