* Allow StringStore to be pickled, to start addressing Issue #125

This commit is contained in:
Matthew Honnibal 2015-10-12 15:12:32 +11:00
parent 41012907a8
commit 0cee928467
2 changed files with 29 additions and 1 deletions

View File

@ -69,12 +69,15 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, int length) except
cdef class StringStore:
'''Map strings to and from integer IDs.'''
def __init__(self):
def __init__(self, strings=None):
    '''Set up an empty store and optionally load initial strings.

    strings: optional iterable of strings. Each item is looked up with
        self[...] in iteration order. None (the default) loads nothing.
    '''
    self._map = PreshMap()
    self.mem = Pool()
    # Entries start at index 1; index 0 is left unused.
    self.size = 1
    self._resize_at = 10000
    self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
    if strings is None:
        return
    for text in strings:
        # The result is discarded; the lookup presumably interns the
        # string -- confirm against __getitem__.
        _ = self[text]
property size:
def __get__(self):
@ -113,6 +116,14 @@ cdef class StringStore:
for i in range(self.size):
yield self[i]
def __reduce__(self):
    '''Support pickling by rebuilding the store from its strings.

    Returns a (callable, args) pair for pickle: the instance's own
    class, to be called with one argument, the list of stored strings.
    '''
    # Index 0 is never read from self.c; "" stands in for it so that
    # list positions line up with the indices used below.
    strings = [""]
    for i in range(1, self.size):
        strings.append(_decode(&self.c[i]))
    # Use type(self) rather than a hard-coded StringStore so that a
    # subclass instance is restored as that subclass. The optional
    # state/listitems/dictitems slots are unused and therefore omitted.
    return (type(self), (strings,))
cdef const Utf8Str* intern(self, unsigned char* chars, int length) except NULL:
# 0 means missing, but we don't bother offsetting the index.
key = hash64(chars, length * sizeof(char), 0)

View File

@ -1,5 +1,7 @@
# -*- coding: utf8 -*-
from __future__ import unicode_literals
import pickle
import StringIO
from spacy.strings import StringStore
@ -76,3 +78,18 @@ def test_massive_strings(sstore):
s513 = '1' * 513
orth = sstore[s513]
assert sstore[orth] == s513
def test_pickle_string_store(sstore):
    '''A pickled-and-restored StringStore keeps its string/ID mapping.'''
    hello_id = sstore[u'Hi']
    # pickle produces binary data, so round-trip through a bytes payload
    # rather than a text buffer from the Python-2-only StringIO module.
    payload = pickle.dumps(sstore)
    loaded = pickle.loads(payload)
    assert loaded[hello_id] == u'Hi'
    # The reverse lookup must hand back the same ID as well.
    assert loaded[u'Hi'] == hello_id