2016-10-24 14:49:03 +03:00
|
|
|
# cython: infer_types=True
|
2016-03-24 17:09:55 +03:00
|
|
|
cimport cython
|
2014-12-19 22:42:01 +03:00
|
|
|
from libc.string cimport memcpy
|
2017-11-11 03:11:27 +03:00
|
|
|
from libcpp.set cimport set
|
2017-05-22 13:38:00 +03:00
|
|
|
from libc.stdint cimport uint32_t
|
2017-10-27 22:07:59 +03:00
|
|
|
from murmurhash.mrmr cimport hash64, hash32
|
2020-03-02 13:48:10 +03:00
|
|
|
|
💫 Replace ujson, msgpack and dill/pickle/cloudpickle with srsly (#3003)
Remove hacks and wrappers, keep code in sync across our libraries and move spaCy a few steps closer to only depending on packages with binary wheels 🎉
See here: https://github.com/explosion/srsly
Serialization is hard, especially across Python versions and multiple platforms. After dealing with many subtle bugs over the years (encodings, locales, large files) our libraries like spaCy and Prodigy have steadily grown a number of utility functions to wrap the multiple serialization formats we need to support (especially json, msgpack and pickle). These wrapping functions ended up duplicated across our codebases, so we wanted to put them in one place.
At the same time, we noticed that having a lot of small dependencies was making maintenance harder, and making installation slower. To solve this, we've made srsly standalone, by including the component packages directly within it. This way we can provide all the serialization utilities we need in a single binary wheel.
srsly currently includes forks of the following packages:
ujson
msgpack
msgpack-numpy
cloudpickle
* WIP: replace json/ujson with srsly
* Replace ujson in examples
Use regular json instead of srsly to make code easier to read and follow
* Update requirements
* Fix imports
* Fix typos
* Replace msgpack with srsly
* Fix warning
2018-12-03 03:28:22 +03:00
|
|
|
import srsly
|
2015-11-05 14:28:26 +03:00
|
|
|
|
2020-03-02 13:48:10 +03:00
|
|
|
from .typedefs cimport hash_t
|
|
|
|
|
2017-05-28 14:03:16 +03:00
|
|
|
from .symbols import IDS as SYMBOLS_BY_STR
|
2019-03-07 14:52:15 +03:00
|
|
|
from .symbols import NAMES as SYMBOLS_BY_INT
|
2018-04-03 16:50:31 +03:00
|
|
|
from .errors import Errors
|
2017-10-27 22:07:59 +03:00
|
|
|
from . import util
|
2014-12-19 22:42:01 +03:00
|
|
|
|
|
|
|
|
2018-12-10 18:09:26 +03:00
|
|
|
def get_string_id(key):
    """Resolve a key to its integer string ID.

    Non-string keys are assumed to already be IDs and are returned untouched.
    String keys are looked up in the reserved symbols table first; the empty
    string maps to 0; anything else is hashed from its UTF-8 encoding.

    This helper favours convenience over speed, so avoid calling it in tight
    loops.

    key (str / int): The string to resolve, or an existing ID.
    RETURNS (int): The ID for the key.
    """
    # Anything that is not text is treated as an ID already.
    if not isinstance(key, str):
        return key
    # Reserved symbols keep their fixed IDs (checked before the empty-string
    # case, matching the lookup order callers rely on).
    if key in SYMBOLS_BY_STR:
        return SYMBOLS_BY_STR[key]
    if not key:
        return 0
    utf8_bytes = key.encode("utf8")
    return hash_utf8(utf8_bytes, len(utf8_bytes))
|
|
|
|
|
|
|
|
|
2015-01-12 02:26:22 +03:00
|
|
|
cpdef hash_t hash_string(unicode string) except 0:
    """Hash a unicode string via its UTF-8 encoding.

    string (str): The text to hash.
    RETURNS (hash_t): The value produced by hash_utf8 for the encoded bytes.
    """
    # Encode once, then hash the raw bytes; the signature's `except 0`
    # reserves a return value of 0 for signalling an exception.
    cdef bytes utf8_bytes = string.encode("utf8")
    return hash_utf8(utf8_bytes, len(utf8_bytes))
|
2016-03-24 17:09:55 +03:00
|
|
|
|
|
|
|
|
2017-03-07 19:15:18 +03:00
|
|
|
cdef hash_t hash_utf8(char* utf8_string, int length) nogil:
    # Hash the first `length` bytes of `utf8_string` with murmurhash's hash64.
    # The trailing 1 is passed through unchanged on every call (presumably the
    # hash seed -- confirm against murmurhash.mrmr).
    cdef hash_t hashed = hash64(utf8_string, length, 1)
    return hashed
|
2015-01-12 02:26:22 +03:00
|
|
|
|
|
|
|
|
2017-03-07 19:15:18 +03:00
|
|
|
cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil:
    # 32-bit counterpart of hash_utf8: hash the first `length` bytes of
    # `utf8_string` with murmurhash's hash32, passing the same constant 1
    # (presumably the hash seed -- confirm against murmurhash.mrmr).
    cdef uint32_t hashed = hash32(utf8_string, length, 1)
    return hashed
|
|
|
|
|
|
|
|
|
2017-05-28 13:36:27 +03:00
|
|
|
cdef unicode decode_Utf8Str(const Utf8Str* string):
    # Decode a Utf8Str written by _allocate back into a unicode object.
    # Three layouts are recognised, mirroring the three branches of _allocate:
    #   1. short string stored in the `s` array: s[0] is the byte length and
    #      the payload follows immediately;
    #   2. string shorter than 255 bytes behind the `p` pointer: p[0] is the
    #      byte length and the payload follows;
    #   3. longer string behind `p`: a run of 255-valued bytes, then one
    #      terminating byte below 255; their sum is the byte length and the
    #      payload starts right after the terminating byte.
    cdef int offset, n_bytes
    # Layout 1: a non-zero length byte that fits within the `s` array.
    if string.s[0] != 0 and string.s[0] < sizeof(string.s):
        return string.s[1:string.s[0]+1].decode("utf8")
    # Layout 2: single length byte in the `p` buffer.
    if string.p[0] < 255:
        return string.p[1:string.p[0]+1].decode("utf8")
    # Layout 3: accumulate the multi-byte length prefix.
    offset = 0
    n_bytes = 0
    while string.p[offset] == 255:
        offset += 1
        n_bytes += 255
    n_bytes += string.p[offset]
    # Step past the terminating length byte to reach the payload.
    offset += 1
    return string.p[offset:n_bytes + offset].decode("utf8")
|
2015-07-20 12:26:46 +03:00
|
|
|
|
|
|
|
|
2017-05-28 13:36:27 +03:00
|
|
|
cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *:
    # Allocate a Utf8Str from `mem` and copy `length` bytes of `chars` into it,
    # prefixed with a length encoding that decode_Utf8Str understands:
    #   * length < sizeof(string.s): stored in the `s` array, s[0] = length;
    #   * length < 255: stored in a separate buffer `p`, p[0] = length;
    #   * otherwise: `p` starts with (length // 255) bytes of value 255
    #     followed by one byte holding length % 255, then the payload.
    # Returns the newly allocated Utf8Str; all memory comes from `mem`.
    cdef int n_length_bytes
    cdef int i
    cdef Utf8Str* string = <Utf8Str*>mem.alloc(1, sizeof(Utf8Str))
    if length < sizeof(string.s):
        string.s[0] = <unsigned char>length
        memcpy(&string.s[1], chars, length)
        return string
    elif length < 255:
        # One extra byte for the length prefix.
        string.p = <unsigned char*>mem.alloc(length + 1, sizeof(unsigned char))
        string.p[0] = length
        memcpy(&string.p[1], chars, length)
        return string
    else:
        # The prefix is a run of 255s plus a final remainder byte (< 255),
        # which is what terminates the scan in decode_Utf8Str.
        n_length_bytes = (length // 255) + 1
        string.p = <unsigned char*>mem.alloc(length + n_length_bytes, sizeof(unsigned char))
        for i in range(n_length_bytes-1):
            string.p[i] = 255
        string.p[n_length_bytes-1] = length % 255
        memcpy(&string.p[n_length_bytes], chars, length)
        return string
|
2014-12-21 23:25:43 +03:00
|
|
|
|
2017-05-28 19:19:11 +03:00
|
|
|
|
2014-12-19 22:42:01 +03:00
|
|
|
cdef class StringStore:
    """Look up strings by 64-bit hashes.

    Strings are interned into a memory pool and indexed by the hash of their
    UTF-8 encoding. Reserved symbols are resolved through the SYMBOLS_BY_STR /
    SYMBOLS_BY_INT tables rather than the hash map. Lookups record the key in
    `self.hits`, which `_cleanup_stale_strings` uses to decide what to keep.

    DOCS: https://spacy.io/api/stringstore
    """
    def __init__(self, strings=None, freeze=False):
        """Create the StringStore.

        strings (iterable): A sequence of unicode strings to add to the store.
        freeze (bool): Currently unused by the constructor.
        RETURNS (StringStore): The newly constructed object.
        """
        self.mem = Pool()
        self._map = PreshMap()
        if strings is not None:
            for string in strings:
                self.add(string)

    def __getitem__(self, object string_or_id):
        """Retrieve a string from a given hash, or vice versa.

        string_or_id (bytes, str or uint64): The value to encode.
        RETURNS (str / uint64): The value to be retrieved.
        RAISES (KeyError): If an integer key is not present in the store.
        """
        cdef hash_t key
        # The empty string and 0 are each other's counterpart.
        if isinstance(string_or_id, str) and len(string_or_id) == 0:
            return 0
        elif string_or_id == 0:
            return ""
        elif string_or_id in SYMBOLS_BY_STR:
            return SYMBOLS_BY_STR[string_or_id]
        if isinstance(string_or_id, unicode):
            # Text in, hash out: no lookup in the map is needed.
            key = hash_string(string_or_id)
            return key
        elif isinstance(string_or_id, bytes):
            key = hash_utf8(string_or_id, len(string_or_id))
            return key
        elif string_or_id < len(SYMBOLS_BY_INT):
            # Small integers index the reserved symbol names.
            return SYMBOLS_BY_INT[string_or_id]
        else:
            key = string_or_id
            # Record the access so the string survives stale-string cleanup.
            self.hits.insert(key)
            utf8str = <Utf8Str*>self._map.get(key)
            if utf8str is NULL:
                raise KeyError(Errors.E018.format(hash_value=string_or_id))
            else:
                return decode_Utf8Str(utf8str)

    def as_int(self, key):
        """If key is an int, return it; otherwise, get the int value."""
        if not isinstance(key, str):
            return key
        else:
            return self[key]

    def as_string(self, key):
        """If key is a string, return it; otherwise, get the string value."""
        if isinstance(key, str):
            return key
        else:
            return self[key]

    def add(self, string):
        """Add a string to the StringStore.

        string (str / bytes): The string to add.
        RETURNS (uint64): The string's hash value.
        RAISES (TypeError): If `string` is neither str nor bytes.
        """
        if isinstance(string, unicode):
            if string in SYMBOLS_BY_STR:
                return SYMBOLS_BY_STR[string]
            key = hash_string(string)
            self.intern_unicode(string)
        elif isinstance(string, bytes):
            if string in SYMBOLS_BY_STR:
                return SYMBOLS_BY_STR[string]
            key = hash_utf8(string, len(string))
            self._intern_utf8(string, len(string))
        else:
            raise TypeError(Errors.E017.format(value_type=type(string)))
        return key

    def __len__(self):
        """The number of strings in the store.

        RETURNS (int): The number of strings in the store.
        """
        return self.keys.size()

    def __contains__(self, string not None):
        """Check whether a string is in the store.

        string (str / bytes / int): The string, or hash value, to check.
        RETURNS (bool): Whether the store contains the string.
        """
        cdef hash_t key
        if isinstance(string, int):
            if string == 0:
                return True
            key = string
        elif len(string) == 0:
            return True
        elif string in SYMBOLS_BY_STR:
            return True
        elif isinstance(string, unicode):
            key = hash_string(string)
        elif isinstance(string, bytes):
            # bytes objects have no .encode() on Python 3; hash the raw bytes
            # directly, the same way add() and __getitem__ do.
            key = hash_utf8(string, len(string))
        else:
            string = string.encode("utf8")
            key = hash_utf8(string, len(string))
        if key < len(SYMBOLS_BY_INT):
            return True
        else:
            # Membership tests count as hits for stale-string cleanup.
            self.hits.insert(key)
            return self._map.get(key) is not NULL

    def __iter__(self):
        """Iterate over the strings in the store, in order.

        YIELDS (str): A string in the store.
        """
        cdef int i
        cdef hash_t key
        for i in range(self.keys.size()):
            key = self.keys[i]
            self.hits.insert(key)
            utf8str = <Utf8Str*>self._map.get(key)
            yield decode_Utf8Str(utf8str)
        # TODO: Iterate OOV here?

    def __reduce__(self):
        """Support pickling by rebuilding the store from its list of strings."""
        strings = list(self)
        return (StringStore, (strings,), None, None, None)

    def to_disk(self, path):
        """Save the current state to a directory.

        path (str / Path): A path to a directory, which will be created if
            it doesn't exist. Paths may be either strings or Path-like objects.
        """
        path = util.ensure_path(path)
        strings = list(self)
        srsly.write_json(path, strings)

    def from_disk(self, path):
        """Loads state from a directory. Modifies the object in place and
        returns it.

        path (str / Path): A path to a directory. Paths may be either
            strings or `Path`-like objects.
        RETURNS (StringStore): The modified `StringStore` object.
        """
        path = util.ensure_path(path)
        strings = srsly.read_json(path)
        # Remember what was already stored so it can be re-added after the
        # reset: loading merges with, rather than replaces, existing strings.
        prev = list(self)
        self._reset_and_load(strings)
        for word in prev:
            self.add(word)
        return self

    def to_bytes(self, **kwargs):
        """Serialize the current state to a binary string.

        RETURNS (bytes): The serialized form of the `StringStore` object.
        """
        # NOTE(review): the value is whatever srsly.json_dumps returns, which
        # is believed to be text rather than bytes -- confirm against callers
        # before changing the return type.
        return srsly.json_dumps(list(self))

    def from_bytes(self, bytes_data, **kwargs):
        """Load state from a binary string.

        bytes_data (bytes): The data to load from.
        RETURNS (StringStore): The `StringStore` object.
        """
        strings = srsly.json_loads(bytes_data)
        # Same merge behaviour as from_disk: keep previously stored strings.
        prev = list(self)
        self._reset_and_load(strings)
        for word in prev:
            self.add(word)
        return self

    def _reset_and_load(self, strings):
        """Discard all stored strings and hits, then add `strings`.

        strings (iterable): The strings to populate the emptied store with.
        """
        self.mem = Pool()
        self._map = PreshMap()
        self.keys.clear()
        self.hits.clear()
        for string in strings:
            self.add(string)

    def _cleanup_stale_strings(self, excepted):
        """Drop strings that were neither accessed nor explicitly excepted.

        excepted (list): Strings that should not be removed.
        RETURNS (keys, strings): Dropped strings and keys that can be dropped
            from other places. Returns None (not a pair) when no hits have
            been recorded and cleanup is skipped.
        """
        cdef vector[hash_t] tmp
        if self.hits.size() == 0:
            # If we don't have any hits, just skip cleanup
            return

        dropped_strings = []
        dropped_keys = []
        for i in range(self.keys.size()):
            key = self.keys[i]
            # Here we cannot use __getitem__ because it also sets a hit.
            utf8str = <Utf8Str*>self._map.get(key)
            value = decode_Utf8Str(utf8str)
            if self.hits.count(key) != 0 or value in excepted:
                tmp.push_back(key)
            else:
                dropped_keys.append(key)
                dropped_strings.append(value)

        # Keep only the surviving keys, then rebuild the pool and map from
        # the strings they refer to.
        self.keys.swap(tmp)
        strings = list(self)
        self._reset_and_load(strings)
        # Here we have the strings, but the hits on them should be reset
        self.hits.clear()

        return dropped_keys, dropped_strings

    cdef const Utf8Str* intern_unicode(self, unicode py_string):
        # Encode to UTF-8 and intern the bytes; returns the stored Utf8Str.
        # 0 means missing, but we don't bother offsetting the index.
        cdef bytes byte_string = py_string.encode("utf8")
        return self._intern_utf8(byte_string, len(byte_string))

    @cython.final
    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length):
        # Return the stored Utf8Str for these bytes, allocating and
        # registering a new one (map entry, hit and key) on first sight.
        # TODO: This function's API/behaviour is an unholy mess...
        # 0 means missing, but we don't bother offsetting the index.
        cdef hash_t key = hash_utf8(utf8_string, length)
        cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
        if value is not NULL:
            return value
        value = _allocate(self.mem, <unsigned char*>utf8_string, length)
        self._map.set(key, value)
        self.hits.insert(key)
        self.keys.push_back(key)
        return value
|