Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-26 01:46:28 +03:00)
Fix Doc pickling. This also removes need for Binder class
This commit is contained in:
parent ed8da9b11f
commit 92c1eb2d6f
@@ -27,7 +27,7 @@ from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYP
 from ..attrs cimport SENT_START
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
 from ..util import normalize_slice
-from ..compat import is_config
+from ..compat import is_config, copy_reg, pickle
 from .. import about
 from .. import util
 from .underscore import Underscore
@@ -104,7 +104,8 @@ cdef class Doc:
     def has_extension(cls, name):
         return name in Underscore.doc_extensions
 
-    def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None):
+    def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None,
+                 orths_and_spaces=None):
         """Create a Doc object.
 
         vocab (Vocab): A vocabulary object, which must match any models you want
@@ -114,6 +115,8 @@ cdef class Doc:
         spaces (list or None): A list of boolean values, of the same length as
             words. True means that the word is followed by a space, False means
             it is not. If `None`, defaults to `[True]*len(words)`
+        user_data (dict or None): Optional extra data to attach to the Doc.
+
         RETURNS (Doc): The newly constructed object.
         """
         self.vocab = vocab
@@ -139,7 +142,7 @@ cdef class Doc:
         self.user_token_hooks = {}
         self.user_span_hooks = {}
         self.tensor = numpy.zeros((0,), dtype='float32')
-        self.user_data = {}
+        self.user_data = {} if user_data is None else user_data
         self._vector = None
         self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
         cdef unicode orth
@@ -914,3 +917,16 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
         if tokens[i].head == 0 and tokens[i].dep != 0:
             tokens[tokens[i].l_edge].sent_start = True
 
+
+def pickle_doc(doc):
+    bytes_data = doc.to_bytes(exclude='vocab')
+    return (unpickle_doc, (doc.vocab, doc.user_data, bytes_data))
+
+
+def unpickle_doc(vocab, user_data, bytes_data):
+    doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data)
+    return doc
+
+
+copy_reg.pickle(Doc, pickle_doc, unpickle_doc)
+
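Usage sketch for the new `user_data` keyword (not part of the commit): the constructor now accepts an arbitrary dict to attach to the Doc, falling back to an empty dict when it is omitted. The snippet below is a minimal, hypothetical example that assumes a bare Vocab is sufficient; the key names are made up.

    # Hypothetical illustration of the user_data keyword added in this commit.
    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    vocab = Vocab()
    words = [u'Hello', u'world', u'!']
    spaces = [True, False, False]

    # Passing user_data stores the dict on the Doc; omitting it falls back to {}.
    doc = Doc(vocab, words=words, spaces=spaces, user_data={'source': 'example'})
    assert doc.user_data['source'] == 'example'
    assert Doc(vocab, words=words, spaces=spaces).user_data == {}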
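Round-trip sketch for the copy_reg registration above (again hypothetical, and assuming the Vocab itself is picklable in this version): pickle.dumps dispatches to pickle_doc, which serializes everything except the vocab via to_bytes(exclude='vocab'), and pickle.loads rebuilds the Doc through unpickle_doc, restoring user_data along the way.

    # Hypothetical round-trip check for the registered reducer; not part of the diff.
    import pickle

    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    vocab = Vocab()
    doc = Doc(vocab, words=[u'Pickle', u'me'], spaces=[True, False],
              user_data={'note': 'survives pickling'})

    data = pickle.dumps(doc)    # copy_reg dispatches to pickle_doc
    doc2 = pickle.loads(data)   # unpickle_doc rebuilds the Doc from bytes_data

    assert [t.text for t in doc2] == [u'Pickle', u'me']
    assert doc2.user_data['note'] == 'survives pickling'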