Fix Doc pickling. This also removes need for Binder class

2025-07-15 10:42:34 +03:00 · 2017-10-17 16:11:13 +02:00 · 2017-10-17 16:11:13 +02:00 · 92c1eb2d6f
commit 92c1eb2d6f
parent ed8da9b11f
1 changed files with 19 additions and 3 deletions
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -27,7 +27,7 @@ from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYP
 from ..attrs cimport SENT_START
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
 from ..util import normalize_slice
-from ..compat import is_config
+from ..compat import is_config, copy_reg, pickle
 from .. import about
 from .. import util
 from .underscore import Underscore
@ -104,7 +104,8 @@ cdef class Doc:
    def has_extension(cls, name):
        return name in Underscore.doc_extensions

-    def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None):
+    def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None,
+                 orths_and_spaces=None):
        """Create a Doc object.

        vocab (Vocab): A vocabulary object, which must match any models you want
@ -114,6 +115,8 @@ cdef class Doc:
        spaces (list or None): A list of boolean values, of the same length as
            words. True means that the word is followed by a space, False means
            it is not. If `None`, defaults to `[True]*len(words)`
+        user_data (dict or None): Optional extra data to attach to the Doc.
+ 
        RETURNS (Doc): The newly constructed object.
        """
        self.vocab = vocab
@ -139,7 +142,7 @@ cdef class Doc:
        self.user_token_hooks = {}
        self.user_span_hooks = {}
        self.tensor = numpy.zeros((0,), dtype='float32')
-        self.user_data = {}
+        self.user_data = {} if user_data is None else user_data
        self._vector = None
        self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
        cdef unicode orth
@ -914,3 +917,16 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
        if tokens[i].head == 0 and tokens[i].dep != 0:
            tokens[tokens[i].l_edge].sent_start = True

+
+def pickle_doc(doc):
+    bytes_data = doc.to_bytes(exclude='vocab')
+    return (unpickle_doc, (doc.vocab, doc.user_data, bytes_data))
+
+
+def unpickle_doc(vocab, user_data, bytes_data):
+    doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data)
+    return doc
+
+
+copy_reg.pickle(Doc, pickle_doc, unpickle_doc)
+