2016-10-24 14:49:03 +03:00
|
|
|
# cython: infer_types=True
|
2023-06-26 12:41:03 +03:00
|
|
|
from typing import Any, Callable, Iterable, Iterator, List, Optional, Tuple, Union
|
|
|
|
|
2016-03-24 17:09:55 +03:00
|
|
|
cimport cython
|
2023-06-26 12:41:03 +03:00
|
|
|
from libc.stdint cimport uint32_t
|
2014-12-19 22:42:01 +03:00
|
|
|
from libc.string cimport memcpy
|
2017-11-11 03:11:27 +03:00
|
|
|
from libcpp.set cimport set
|
2022-10-06 11:51:06 +03:00
|
|
|
from murmurhash.mrmr cimport hash64
|
2020-03-02 13:48:10 +03:00
|
|
|
|
💫 Replace ujson, msgpack and dill/pickle/cloudpickle with srsly (#3003)
Remove hacks and wrappers, keep code in sync across our libraries and move spaCy a few steps closer to only depending on packages with binary wheels 🎉
See here: https://github.com/explosion/srsly
Serialization is hard, especially across Python versions and multiple platforms. After dealing with many subtle bugs over the years (encodings, locales, large files) our libraries like spaCy and Prodigy have steadily grown a number of utility functions to wrap the multiple serialization formats we need to support (especially json, msgpack and pickle). These wrapping functions ended up duplicated across our codebases, so we wanted to put them in one place.
At the same time, we noticed that having a lot of small dependencies was making maintainence harder, and making installation slower. To solve this, we've made srsly standalone, by including the component packages directly within it. This way we can provide all the serialization utilities we need in a single binary wheel.
srsly currently includes forks of the following packages:
ujson
msgpack
msgpack-numpy
cloudpickle
* WIP: replace json/ujson with srsly
* Replace ujson in examples
Use regular json instead of srsly to make code easier to read and follow
* Update requirements
* Fix imports
* Fix typos
* Replace msgpack with srsly
* Fix warning
2018-12-03 03:28:22 +03:00
|
|
|
import srsly
|
2015-11-05 14:28:26 +03:00
|
|
|
|
2020-03-02 13:48:10 +03:00
|
|
|
from .typedefs cimport hash_t
|
|
|
|
|
2023-06-26 12:41:03 +03:00
|
|
|
from . import util
|
|
|
|
from .errors import Errors
|
2017-05-28 14:03:16 +03:00
|
|
|
from .symbols import IDS as SYMBOLS_BY_STR
|
2019-03-07 14:52:15 +03:00
|
|
|
from .symbols import NAMES as SYMBOLS_BY_INT
|
2014-12-19 22:42:01 +03:00
|
|
|
|
2017-05-28 19:19:11 +03:00
|
|
|
|
2014-12-19 22:42:01 +03:00
|
|
|
cdef class StringStore:
    """Look up strings by 64-bit hashes. Implicitly handles reserved symbols.

    Strings are stored UTF-8 encoded in memory owned by `self.mem`, indexed
    by hash in `self._map`, with the hashes additionally recorded in
    `self._keys` in the order they were first interned. Reserved symbols
    (entries of `SYMBOLS_BY_STR` / `SYMBOLS_BY_INT`) are resolved from those
    tables and are never interned, so they do not show up in `len()` or when
    iterating over the store.

    DOCS: https://spacy.io/api/stringstore
    """
    def __init__(self, strings: Optional[Iterable[str]] = None):
        """Create the StringStore.

        strings (iterable): A sequence of unicode strings to add to the store.
        """
        self.mem = Pool()
        self._map = PreshMap()
        if strings is not None:
            for string in strings:
                self.add(string)

    def __getitem__(self, string_or_hash: Union[str, int]) -> Union[str, int]:
        """Retrieve a string from a given hash. If a string
        is passed as the input, add it to the store and return
        its hash.

        string_or_hash (int / str): The hash value to lookup or the string to store.
        RETURNS (str / int): The stored string or the hash of the newly added string.
        """
        if isinstance(string_or_hash, str):
            return self.add(string_or_hash)
        else:
            return self._get_interned_str(string_or_hash)

    def __contains__(self, string_or_hash: Union[str, int]) -> bool:
        """Check whether a string or a hash is in the store.

        Reserved symbols are always reported as present.

        string_or_hash (str / int): The string/hash to check.
        RETURNS (bool): Whether the store contains the string.
        """
        cdef hash_t str_hash = get_string_id(string_or_hash)
        if str_hash in SYMBOLS_BY_INT:
            return True
        else:
            return self._map.get(str_hash) is not NULL

    def __iter__(self) -> Iterator[str]:
        """Iterate over the strings in the store in insertion order.

        RETURNS: An iterable collection of strings.
        """
        return iter(self.keys())

    def __reduce__(self):
        """Support pickling by rebuilding the store from its list of strings."""
        strings = list(self)
        return (StringStore, (strings,), None, None, None)

    def __len__(self) -> int:
        """The number of strings in the store.

        RETURNS (int): The number of strings in the store.
        """
        return self._keys.size()

    def add(self, string: str) -> int:
        """Add a string to the StringStore.

        Reserved symbols are not interned; their predefined ID is returned
        instead.

        string (str): The string to add.
        RETURNS (uint64): The string's hash value.
        RAISES (TypeError): If `string` is not a `str`.
        """
        if not isinstance(string, str):
            raise TypeError(Errors.E017.format(value_type=type(string)))

        if string in SYMBOLS_BY_STR:
            return SYMBOLS_BY_STR[string]
        else:
            return self._intern_str(string)

    def as_int(self, string_or_hash: Union[str, int]) -> int:
        """If a hash value is passed as the input, return it as-is. If the input
        is a string, return its corresponding hash.

        The string is only hashed, it is not added to the store.

        string_or_hash (str / int): The string to hash or a hash value.
        RETURNS (int): The hash of the string or the input hash value.
        """
        if isinstance(string_or_hash, int):
            return string_or_hash
        else:
            return get_string_id(string_or_hash)

    def as_string(self, string_or_hash: Union[str, int]) -> str:
        """If a string is passed as the input, return it as-is. If the input
        is a hash value, return its corresponding string.

        string_or_hash (str / int): The hash value to lookup or a string.
        RETURNS (str): The stored string or the input string.
        """
        if isinstance(string_or_hash, str):
            return string_or_hash
        else:
            return self._get_interned_str(string_or_hash)

    def items(self) -> List[Tuple[str, int]]:
        """Iterate over the stored strings and their hashes in insertion order.

        RETURNS: A list of string-hash pairs.
        """
        # Even though we internally store the hashes as keys and the strings as
        # values, we invert the order in the public API to keep it consistent with
        # the implementation of the `__iter__` method (where we wish to iterate over
        # the strings in the store).
        cdef size_t i
        cdef size_t n_keys = self._keys.size()
        pairs = [None] * n_keys
        for i in range(n_keys):
            str_hash = self._keys[i]
            utf8str = <Utf8Str*>self._map.get(str_hash)
            pairs[i] = (self._decode_str_repr(utf8str), str_hash)
        return pairs

    def keys(self) -> List[str]:
        """Iterate over the stored strings in insertion order.

        RETURNS: A list of strings.
        """
        cdef size_t i
        cdef size_t n_keys = self._keys.size()
        strings = [None] * n_keys
        for i in range(n_keys):
            utf8str = <Utf8Str*>self._map.get(self._keys[i])
            strings[i] = self._decode_str_repr(utf8str)
        return strings

    def values(self) -> List[int]:
        """Iterate over the stored strings hashes in insertion order.

        RETURNS: A list of string hashes.
        """
        cdef size_t i
        cdef size_t n_keys = self._keys.size()
        hashes = [None] * n_keys
        for i in range(n_keys):
            hashes[i] = self._keys[i]
        return hashes

    def to_disk(self, path):
        """Save the current state to disk as JSON.

        The stored strings are sorted and written to `path` with
        `srsly.write_json`.

        path (str / Path): The location to write to. Paths may be either
            strings or Path-like objects.
        """
        path = util.ensure_path(path)
        strings = sorted(self)
        srsly.write_json(path, strings)

    def from_disk(self, path):
        """Loads state from JSON on disk. Modifies the object in place and
        returns it.

        The store is rebuilt from the loaded strings first; strings that were
        already in the store are then added back, so no existing entry is
        dropped.

        path (str / Path): The location to read from. Paths may be either
            strings or `Path`-like objects.
        RETURNS (StringStore): The modified `StringStore` object.
        """
        path = util.ensure_path(path)
        strings = srsly.read_json(path)
        prev = list(self)
        self._reset_and_load(strings)
        for word in prev:
            self.add(word)
        return self

    def to_bytes(self, **kwargs):
        """Serialize the current state to a JSON document.

        RETURNS: The result of `srsly.json_dumps` on the sorted list of stored
            strings. NOTE(review): srsly documents `json_dumps` as returning
            `str` rather than `bytes`; confirm before relying on the type.
        """
        return srsly.json_dumps(sorted(self))

    def from_bytes(self, bytes_data, **kwargs):
        """Load state from a serialized JSON document.

        The store is rebuilt from the loaded strings first; strings that were
        already in the store are then added back, so no existing entry is
        dropped.

        bytes_data (bytes): The data to load from.
        RETURNS (StringStore): The `StringStore` object.
        """
        strings = srsly.json_loads(bytes_data)
        prev = list(self)
        self._reset_and_load(strings)
        for word in prev:
            self.add(word)
        return self

    def _reset_and_load(self, strings):
        """Discard all interned strings and re-populate the store.

        strings (iterable): The strings to add after the reset.
        """
        self.mem = Pool()
        self._map = PreshMap()
        self._keys.clear()
        for string in strings:
            self.add(string)

    def _get_interned_str(self, hash_value: int) -> str:
        """Resolve a hash value to its string.

        hash_value (int): The hash to look up.
        RETURNS (str): The empty string for 0, the symbol name for reserved
            symbols, otherwise the interned string.
        RAISES (TypeError): If `hash_value` cannot be coerced to a hash.
        RAISES (KeyError): If the hash is neither reserved nor interned.
        """
        cdef hash_t str_hash
        if not _try_coerce_to_hash(hash_value, &str_hash):
            raise TypeError(Errors.E4001.format(expected_types="'int'", received_type=type(hash_value)))

        # Handle reserved symbols and empty strings correctly.
        if str_hash == 0:
            return ""

        symbol = SYMBOLS_BY_INT.get(str_hash)
        if symbol is not None:
            return symbol

        utf8str = <Utf8Str*>self._map.get(str_hash)
        if utf8str is NULL:
            raise KeyError(Errors.E018.format(hash_value=str_hash))
        else:
            return self._decode_str_repr(utf8str)

    cdef hash_t _intern_str(self, str string):
        # Hash the UTF-8 encoding of `string` and store it if the hash is not
        # in the map yet. Returns the hash in either case. Reserved symbols are
        # not checked here; `add()` filters them out before calling this.
        # TODO: This function's API/behaviour is an unholy mess...
        # 0 means missing, but we don't bother offsetting the index.
        chars = string.encode('utf-8')
        cdef hash_t key = hash64(<unsigned char*>chars, len(chars), 1)
        cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
        if value is not NULL:
            return key

        value = self._allocate_str_repr(<unsigned char*>chars, len(chars))
        self._map.set(key, value)
        # Record the key so iteration can follow insertion order.
        self._keys.push_back(key)
        return key

    cdef Utf8Str* _allocate_str_repr(self, const unsigned char* chars, uint32_t length) except *:
        # Copy `length` bytes from `chars` into a new Utf8Str allocated from
        # `self.mem`, using one of three length-prefixed layouts:
        #   * short  (length < sizeof(string.s)): stored inline in `string.s`
        #     as one length byte followed by the data;
        #   * medium (length < 255): `string.p` points to a buffer holding one
        #     length byte followed by the data;
        #   * long: `string.p` points to a buffer that starts with
        #     `length // 255` bytes of value 255, then one byte holding
        #     `length % 255`, then the data.
        # `_decode_str_repr` reads these layouts back.
        cdef int n_length_bytes
        cdef int i
        cdef Utf8Str* string = <Utf8Str*>self.mem.alloc(1, sizeof(Utf8Str))
        if length < sizeof(string.s):
            string.s[0] = <unsigned char>length
            memcpy(&string.s[1], chars, length)
            return string
        elif length < 255:
            string.p = <unsigned char*>self.mem.alloc(length + 1, sizeof(unsigned char))
            string.p[0] = length
            memcpy(&string.p[1], chars, length)
            return string
        else:
            n_length_bytes = (length // 255) + 1
            string.p = <unsigned char*>self.mem.alloc(length + n_length_bytes, sizeof(unsigned char))
            for i in range(n_length_bytes-1):
                string.p[i] = 255
            string.p[n_length_bytes-1] = length % 255
            memcpy(&string.p[n_length_bytes], chars, length)
            return string

    cdef str _decode_str_repr(self, const Utf8Str* string):
        # Decode a Utf8Str written by `_allocate_str_repr` back into a `str`.
        # NOTE(review): `Utf8Str` is declared outside this file; the checks
        # below presumably rely on `s` and `p` sharing storage, so that a
        # non-zero `s[0]` smaller than sizeof(s) identifies the inline layout.
        # Confirm against the declaration.
        cdef int i, length
        if string.s[0] < sizeof(string.s) and string.s[0] != 0:
            # Inline layout: s[0] is the length, the data follows.
            return string.s[1:string.s[0]+1].decode('utf-8')
        elif string.p[0] < 255:
            # Single length byte, then the data.
            return string.p[1:string.p[0]+1].decode('utf-8')
        else:
            # Sum the run of 255-valued length bytes plus the final remainder
            # byte; the data starts right after that remainder byte.
            i = 0
            length = 0
            while string.p[i] == 255:
                i += 1
                length += 255
            length += string.p[i]
            i += 1
            return string.p[i:length + i].decode('utf-8')
|
|
|
|
|
|
|
|
|
|
|
|
cpdef hash_t hash_string(object string) except -1:
    """Compute the hash of a string without touching any store.

    string (str): The string to hash.
    RETURNS (uint64): 0 for the empty string, the predefined ID for reserved
        symbols, otherwise the 64-bit hash of the UTF-8 encoded string.
    RAISES (TypeError): If `string` is not a `str`.
    """
    if not isinstance(string, str):
        raise TypeError(Errors.E4001.format(expected_types="'str'", received_type=type(string)))

    # The empty string and reserved symbols have fixed IDs and are not hashed.
    if len(string) == 0:
        return 0

    reserved_id = SYMBOLS_BY_STR.get(string)
    if reserved_id is not None:
        return reserved_id

    # Keep the encoded bytes referenced by a local so that the buffer stays
    # alive while `hash64` reads from the raw pointer.
    utf8_bytes = string.encode('utf-8')
    return hash64(<unsigned char*>utf8_bytes, len(utf8_bytes), 1)
|
|
|
|
|
|
|
|
|
|
|
|
cpdef hash_t get_string_id(object string_or_hash) except -1:
    """Return the hash for a string, or pass an integer hash through.

    The input is first treated as a string and hashed with `hash_string`. If
    that fails, it is coerced to the primitive hash type instead.

    string_or_hash (str / int): The string to hash or an integral hash value.
    RETURNS (uint64): The hash value.
    RAISES (TypeError): If the input is neither hashable as a string nor
        coercible to a hash.
    """
    cdef hash_t str_hash

    try:
        return hash_string(string_or_hash)
    except Exception:
        # Catch `Exception` rather than using a bare `except` so that
        # KeyboardInterrupt and SystemExit are not swallowed by the fallback.
        if _try_coerce_to_hash(string_or_hash, &str_hash):
            # Coerce the integral key to the expected primitive hash type.
            # This ensures that custom/overloaded "primitive" data types
            # such as those implemented by numpy are not inadvertently used
            # downstream (as these are internally implemented as custom PyObjects
            # whose comparison operators can incur a significant overhead).
            return str_hash
        else:
            raise TypeError(Errors.E4001.format(expected_types="'str','int'", received_type=type(string_or_hash)))
|
|
|
|
|
|
|
|
|
|
|
|
# Not particularly elegant, but this is faster than `isinstance(key, numbers.Integral)`
cdef inline bint _try_coerce_to_hash(object key, hash_t* out_hash):
    # Try to convert `key` to `hash_t` by assigning it to `out_hash[0]`.
    # Returns True on success and False if the conversion raised. On failure
    # the exception is discarded and the caller decides how to report it.
    try:
        out_hash[0] = key
        return True
    except Exception:
        # Catch `Exception` rather than using a bare `except` so that
        # KeyboardInterrupt and SystemExit still propagate.
        return False
|
|
|
|
|