diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index a4a470158..2208d3bdf 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -69,12 +69,16 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, int length) except
 
 cdef class StringStore:
     '''Map strings to and from integer IDs.'''
-    def __init__(self):
+    def __init__(self, strings=None):
         self.mem = Pool()
         self._map = PreshMap()
         self._resize_at = 10000
         self.c = self.mem.alloc(self._resize_at, sizeof(Utf8Str))
         self.size = 1
+        # Pre-populate from an iterable of strings, e.g. the list built by __reduce__.
+        if strings is not None:
+            for string in strings:
+                _ = self[string]
 
     property size:
         def __get__(self):
@@ -113,6 +117,17 @@ cdef class StringStore:
         for i in range(self.size):
             yield self[i]
 
+    def __reduce__(self):
+        '''Pickle support: rebuild the store by passing its strings to __init__.'''
+        # Index 0 means "missing", so it is represented by "" and the stored
+        # strings are read from index 1 onwards.
+        strings = [""]
+        for i in range(1, self.size):
+            string = &self.c[i]
+            py_string = _decode(string)
+            strings.append(py_string)
+        return (StringStore, (strings,), None, None, None)
+
     cdef const Utf8Str* intern(self, unsigned char* chars, int length) except NULL:
         # 0 means missing, but we don't bother offsetting the index.
         key = hash64(chars, length * sizeof(char), 0)
diff --git a/tests/vocab/test_intern.py b/tests/vocab/test_intern.py
index 6e007c645..256706c6f 100644
--- a/tests/vocab/test_intern.py
+++ b/tests/vocab/test_intern.py
@@ -1,5 +1,7 @@
 # -*- coding: utf8 -*-
 from __future__ import unicode_literals
+import io
+import pickle
 
 from spacy.strings import StringStore
 
@@ -76,3 +78,19 @@ def test_massive_strings(sstore):
     s513 = '1' * 513
     orth = sstore[s513]
     assert sstore[orth] == s513
+
+
+def test_pickle_string_store(sstore):
+    '''A pickled-and-reloaded store maps the same ID to the same string.'''
+    hello_id = sstore[u'Hi']
+    # pickle emits bytes, so use a binary buffer: io.BytesIO exists on both
+    # Python 2 and 3, whereas the StringIO module is Python 2 only.
+    string_file = io.BytesIO()
+    pickle.dump(sstore, string_file)
+
+    string_file.seek(0)
+
+    loaded = pickle.load(string_file)
+
+    assert loaded[hello_id] == u'Hi'
+    assert loaded[u'Hi'] == hello_id