From 1d2e39d97476ed1f01b3a711db69b1ce9a4917d6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2020 15:10:10 +0200 Subject: [PATCH] Support to_dict in Doc --- spacy/tokens/doc.pyx | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index debab6aeb..3aa27e451 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -881,6 +881,32 @@ cdef class Doc: def to_bytes(self, exclude=tuple(), **kwargs): """Serialize, i.e. export the document contents to a binary string. + exclude (list): String names of serialization fields to exclude. + RETURNS (bytes): A losslessly serialized copy of the `Doc`, including + all annotations. + + DOCS: https://spacy.io/api/doc#to_bytes + """ + return srsly.msgpack_dumps(self.to_dict(exclude=exclude, **kwargs)) + + def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): + """Deserialize, i.e. import the document contents from a binary string. + + bytes_data (bytes): The string to load from. + exclude (list): String names of serialization fields to exclude. + RETURNS (Doc): Itself. + + DOCS: https://spacy.io/api/doc#from_bytes + """ + return self.from_dict( + srsly.msgpack_loads(bytes_data), + exclude=exclude, + **kwargs + ) + + def to_dict(self, exclude=tuple(), **kwargs): + """Export the document contents to a dictionary for serialization. + exclude (list): String names of serialization fields to exclude. RETURNS (bytes): A losslessly serialized copy of the `Doc`, including all annotations. 
@@ -917,9 +943,9 @@ cdef class Doc: serializers["user_data_keys"] = lambda: srsly.msgpack_dumps(user_data_keys) if "user_data_values" not in exclude: serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values) - return util.to_bytes(serializers, exclude) + return util.to_dict(serializers, exclude) - def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): + def from_dict(self, msg, exclude=tuple(), **kwargs): """Deserialize, i.e. import the document contents from a binary string. data (bytes): The string to load from. @@ -943,7 +969,6 @@ cdef class Doc: for key in kwargs: if key in deserializers or key in ("user_data",): raise ValueError(Errors.E128.format(arg=key)) - msg = util.from_bytes(bytes_data, deserializers, exclude) # Msgpack doesn't distinguish between lists and tuples, which is # vexing for user data. As a best guess, we *know* that within # keys, we must have tuples. In values we just have to hope @@ -975,6 +1000,7 @@ cdef class Doc: self.from_array(msg["array_head"][2:], attrs[:, 2:]) return self + def extend_tensor(self, tensor): """Concatenate a new tensor onto the doc.tensor object.