from libc.string cimport memcpy

from murmurhash.mrmr cimport hash64

import codecs


# Delimiter written between strings by StringStore.dump() and used to split
# the file again in StringStore.load().
SEPARATOR = '\n|-SEP-|\n'


cdef class StringStore:
    """Intern byte strings and map them to small integer ids, and back.

    Each distinct string is stored once in the ``strings`` array; its index
    in that array is its id. Slot 0 is never assigned to a string, so valid
    ids start at 1. Lookup by content goes through ``table``, keyed by the
    64-bit hash of the bytes.
    """

    def __init__(self):
        """Create an empty store with room for 10000 entries."""
        self.mem = Pool()
        self.table = PreshMap()
        self._resize_at = 10000
        self.strings = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
        # Next free slot. Starts at 1 because slot 0 is reserved (see intern).
        self.size = 1

    property size:
        def __get__(self):
            """Number of interned strings (the reserved slot 0 is not counted)."""
            return self.size - 1

    def __getitem__(self, string_or_id):
        """Translate between a string and its id.

        Given an ``int``/``long`` id, return the stored bytes. Given a
        ``bytes`` string, intern it if necessary and return its id.

        Raises:
            IndexError: the id is below 1 or not yet assigned.
            TypeError: the argument is neither an integer nor ``bytes``.
        """
        cdef Utf8Str* utf8str
        # Exact type matches: subclasses (e.g. bool) fall through to TypeError.
        if type(string_or_id) == int or type(string_or_id) == long:
            if string_or_id < 1 or string_or_id >= self.size:
                raise IndexError(string_or_id)
            utf8str = &self.strings[string_or_id]
            return utf8str.chars[:utf8str.length]
        elif type(string_or_id) == bytes:
            utf8str = self.intern(<char*>string_or_id, len(string_or_id))
            return utf8str.i
        else:
            raise TypeError(type(string_or_id))

    cdef Utf8Str* intern(self, char* chars, int length) except NULL:
        """Return the entry for ``chars[:length]``, creating it if it is new.

        The bytes are copied into memory owned by ``self.mem``, so the caller's
        buffer does not need to outlive the call. ``length`` must be non-zero.

        Strings are identified solely by their hash: if the hash is already in
        the table the existing entry is returned without comparing the bytes.

        NOTE(review): the returned pointer refers into ``self.strings``, which
        is reallocated when the array grows -- presumably it should not be
        held across later calls to intern(); confirm against Pool.realloc.
        """
        # 0 means missing, but we don't bother offsetting the index. We waste
        # slot 0 to simplify the code, because it doesn't matter.
        assert length != 0
        cdef hash_t key = hash64(chars, length * sizeof(char), 0)
        cdef void* value = self.table.get(key)
        cdef size_t i
        if value == NULL:
            # Unseen string: grow the array if it is full, then fill the next slot.
            if self.size == self._resize_at:
                self._resize_at *= 2
                self.strings = <Utf8Str*>self.mem.realloc(
                    self.strings, self._resize_at * sizeof(Utf8Str))
            i = self.size
            self.strings[i].i = self.size
            self.strings[i].key = key
            self.strings[i].chars = <char*>self.mem.alloc(length, sizeof(char))
            memcpy(self.strings[i].chars, chars, length)
            self.strings[i].length = length
            # The table stores the id itself in the pointer-sized value slot;
            # since ids start at 1, a NULL value always means "not present".
            self.table.set(key, <void*>self.size)
            self.size += 1
        else:
            i = <size_t>value
        return &self.strings[i]

    def dump(self, loc):
        """Write every slot, decoded as UTF-8, to the file at ``loc``.

        Entries are joined with SEPARATOR. The reserved slot 0 is written as
        the first entry; load() skips it again when reading.
        """
        cdef Utf8Str* string
        cdef bytes py_string
        strings = []
        for i in range(self.size):
            string = &self.strings[i]
            py_string = string.chars[:string.length]
            strings.append(py_string.decode('utf8'))
        with codecs.open(loc, 'w', 'utf8') as file_:
            file_.write(SEPARATOR.join(strings))

    def load(self, loc):
        """Intern every string from a file previously written by dump().

        The first entry of the file corresponds to the reserved slot 0 and is
        skipped; the remaining entries are interned in file order.
        """
        cdef unicode string
        cdef bytes byte_string
        with codecs.open(loc, 'r', 'utf8') as file_:
            strings = file_.read().split(SEPARATOR)
        for string in strings[1:]:
            byte_string = string.encode('utf8')
            self.intern(byte_string, len(byte_string))