This commit is contained in:
Matthew Honnibal 2024-09-08 14:39:58 +02:00
parent 6a90330ff4
commit 80b1005ab1
3 changed files with 7 additions and 9 deletions

View File

@ -26,9 +26,7 @@ cdef class StringStore:
cdef public PreshMap _map
cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient)
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient)
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient)
cdef vector[hash_t] _transient_keys
cdef PreshMap _transient_map
cdef Pool _non_temp_mem

View File

@ -3,7 +3,7 @@
cimport cython
from contextlib import contextmanager
from typing import Iterable, Iterator, List, Optional, Tuple, Union
from typing import Iterator, List, Optional
from libc.stdint cimport uint32_t
from libc.string cimport memcpy
@ -35,7 +35,7 @@ def get_string_id(key):
This function optimises for convenience over performance, so shouldn't be
used in tight loops.
"""
cdef hash_t str_hash
cdef hash_t str_hash
if isinstance(key, str):
if len(key) == 0:
return 0
@ -49,8 +49,8 @@ def get_string_id(key):
elif _try_coerce_to_hash(key, &str_hash):
# Coerce the integral key to the expected primitive hash type.
# This ensures that custom/overloaded "primitive" data types
# such as those implemented by numpy are not inadvertently used
# downstream (as these are internally implemented as custom PyObjects
# such as those implemented by numpy are not inadvertently used
# downstream (as these are internally implemented as custom PyObjects
# whose comparison operators can incur a significant overhead).
return str_hash
else:
@ -196,7 +196,7 @@ cdef class StringStore:
return self._keys.size() + self._transient_keys.size()
@contextmanager
def memory_zone(self, mem: Optional[Pool]=None) -> Pool:
def memory_zone(self, mem: Optional[Pool] = None) -> Pool:
"""Begin a block where all resources allocated during the block will
be freed at the end of it. If a resource was created within the
memory zone block, accessing it outside the block is invalid.

View File

@ -401,7 +401,7 @@ cdef class Tokenizer:
with_special_cases)
if len(self._cache) < self.max_cache_size:
self._save_cached(&tokens.c[orig_size], orig_key, has_special,
tokens.length - orig_size)
tokens.length - orig_size)
cdef str _split_affixes(
self,