mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
* Move serialization functionality out into a Serializer object
This commit is contained in:
parent
a6d040bd11
commit
e2133d990e
|
@ -6,7 +6,8 @@ import numpy
|
||||||
from ..lexeme cimport EMPTY_LEXEME
|
from ..lexeme cimport EMPTY_LEXEME
|
||||||
from ..serialize import BitArray
|
from ..serialize import BitArray
|
||||||
from ..strings cimport slice_unicode
|
from ..strings cimport slice_unicode
|
||||||
from ..attrs cimport attr_id_t, attr_t, flags_t
|
from ..typedefs cimport attr_t, flags_t
|
||||||
|
from ..attrs cimport attr_id_t
|
||||||
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
||||||
from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
|
from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
|
||||||
from ..parts_of_speech import UNIV_POS_NAMES
|
from ..parts_of_speech import UNIV_POS_NAMES
|
||||||
|
@ -369,41 +370,3 @@ cdef class Doc:
|
||||||
# Return the merged Python object
|
# Return the merged Python object
|
||||||
return self[start]
|
return self[start]
|
||||||
|
|
||||||
def serialize(self, codecs, bits=None):
    """Encode this document's attribute values with the given codecs.

    codecs: sequence of codec objects; each must expose ``attr_id`` and an
        ``encode(values, bits)`` method.
    bits: container the codecs write into. A fresh ``BitArray`` is created
        when this is None.

    Returns the bits container that was written to.
    """
    out = BitArray() if bits is None else bits
    # Export every requested attribute in a single to_array() call.
    attr_ids = [codec.attr_id for codec in codecs]
    table = self.to_array(attr_ids)
    for idx, codec in enumerate(codecs):
        # NOTE(review): table[idx,] selects row idx. If to_array() returns
        # one row per token (not one per attribute), this likely needs to
        # be table[:, idx] -- confirm against to_array().
        codec.encode(table[idx,], out)
    return out
|
|
||||||
|
|
||||||
@staticmethod
def deserialize(Vocab vocab, bits):
    """Build a Doc from ``bits`` using the codecs stored on ``vocab``.

    vocab: supplies ``codecs`` (the decoders, in order) and ``lexemes``
        (looked up by decoded id).
    bits: iterable bit source consumed by the codecs.

    The first codec decodes the lexeme ids; one token is pushed per id,
    together with one value read from the bit iterator. Each remaining
    codec decodes one sequence of values, which is written to the matching
    token field (head, tag, dep, ent_iob or ent_type) chosen by ``codec.id``.
    Codecs with any other id are decoded but their values are discarded.

    Returns the populated Doc.
    """
    biterator = iter(bits)
    # NOTE(review): this call reads from `bits` directly, while every later
    # decode() reads from `biterator`. Whether the two share a read position
    # depends on how `bits` implements iteration -- confirm this is intended.
    ids = vocab.codecs[0].decode(bits)
    cdef Doc doc = Doc(vocab)
    cdef int id_
    for id_ in ids:
        # Builtin next() works on Python 2 and 3; the iterator method
        # .next() exists only on Python 2.
        is_spacy = next(biterator)
        doc.push_back(vocab.lexemes.at(id_), is_spacy)

    cdef int i
    cdef attr_t value
    for codec in vocab.codecs[1:]:
        values = codec.decode(biterator)
        # NOTE(review): serialize() reads `codec.attr_id`, this reads
        # `codec.id` -- confirm both attributes exist on the codec type.
        if codec.id == HEAD:
            for i, value in enumerate(values):
                doc.data[i].head = value
        elif codec.id == TAG:
            for i, value in enumerate(values):
                doc.data[i].tag = value
        elif codec.id == DEP:
            for i, value in enumerate(values):
                doc.data[i].dep = value
        elif codec.id == ENT_IOB:
            for i, value in enumerate(values):
                doc.data[i].ent_iob = value
        elif codec.id == ENT_TYPE:
            for i, value in enumerate(values):
                doc.data[i].ent_type = value
    return doc
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user