This commit is contained in:
Matthew Honnibal 2024-09-08 14:39:58 +02:00
parent 6a90330ff4
commit 80b1005ab1
3 changed files with 7 additions and 9 deletions

View File

@ -26,9 +26,7 @@ cdef class StringStore:
cdef public PreshMap _map cdef public PreshMap _map
cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient) cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient)
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient) cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient)
cdef vector[hash_t] _transient_keys cdef vector[hash_t] _transient_keys
cdef PreshMap _transient_map cdef PreshMap _transient_map
cdef Pool _non_temp_mem cdef Pool _non_temp_mem

View File

@ -3,7 +3,7 @@
cimport cython cimport cython
from contextlib import contextmanager from contextlib import contextmanager
from typing import Iterable, Iterator, List, Optional, Tuple, Union from typing import Iterator, List, Optional
from libc.stdint cimport uint32_t from libc.stdint cimport uint32_t
from libc.string cimport memcpy from libc.string cimport memcpy
@ -35,7 +35,7 @@ def get_string_id(key):
This function optimises for convenience over performance, so shouldn't be This function optimises for convenience over performance, so shouldn't be
used in tight loops. used in tight loops.
""" """
cdef hash_t str_hash cdef hash_t str_hash
if isinstance(key, str): if isinstance(key, str):
if len(key) == 0: if len(key) == 0:
return 0 return 0
@ -49,8 +49,8 @@ def get_string_id(key):
elif _try_coerce_to_hash(key, &str_hash): elif _try_coerce_to_hash(key, &str_hash):
# Coerce the integral key to the expected primitive hash type. # Coerce the integral key to the expected primitive hash type.
# This ensures that custom/overloaded "primitive" data types # This ensures that custom/overloaded "primitive" data types
# such as those implemented by numpy are not inadvertently used # such as those implemented by numpy are not inadvertently used
# downstream (as these are internally implemented as custom PyObjects # downstream (as these are internally implemented as custom PyObjects
# whose comparison operators can incur a significant overhead). # whose comparison operators can incur a significant overhead).
return str_hash return str_hash
else: else:
@ -196,7 +196,7 @@ cdef class StringStore:
return self._keys.size() + self._transient_keys.size() return self._keys.size() + self._transient_keys.size()
@contextmanager @contextmanager
def memory_zone(self, mem: Optional[Pool]=None) -> Pool: def memory_zone(self, mem: Optional[Pool] = None) -> Pool:
"""Begin a block where all resources allocated during the block will """Begin a block where all resources allocated during the block will
be freed at the end of it. If a resource was created within the be freed at the end of it. If a resource was created within the
memory zone block, accessing it outside the block is invalid. memory zone block, accessing it outside the block is invalid.

View File

@ -401,7 +401,7 @@ cdef class Tokenizer:
with_special_cases) with_special_cases)
if len(self._cache) < self.max_cache_size: if len(self._cache) < self.max_cache_size:
self._save_cached(&tokens.c[orig_size], orig_key, has_special, self._save_cached(&tokens.c[orig_size], orig_key, has_special,
tokens.length - orig_size) tokens.length - orig_size)
cdef str _split_affixes( cdef str _split_affixes(
self, self,