* Add working to/from bytes API to Doc

This commit is contained in:
Matthew Honnibal 2015-07-23 01:14:45 +02:00
parent 1f31d96bf9
commit a0e36e8efc

View File

@@ -71,17 +71,6 @@ cdef class Doc:
self.is_tagged = False self.is_tagged = False
self.is_parsed = False self.is_parsed = False
self._py_tokens = [] self._py_tokens = []
cdef const LexemeC* lex
cdef attr_t orth
cdef bint space
if orths_and_spaces is not None:
for orth, space in orths_and_spaces:
lex = <LexemeC*>self.vocab._by_orth.get(orth)
if lex != NULL:
assert lex.orth == orth
self.push_back(lex, space)
else:
raise Exception('Lexeme not found: %d' % orth)
def __getitem__(self, object i): def __getitem__(self, object i):
"""Get a token. """Get a token.
@@ -303,12 +292,11 @@ cdef class Doc:
return self return self
def to_bytes(self): def to_bytes(self):
bits = self.vocab.packer.pack(self) byte_string = self.vocab.serializer.pack(self)
return struct.pack('I', len(bits)) + bits.as_bytes() return struct.pack('I', len(byte_string)) + byte_string
def from_bytes(self, data): def from_bytes(self, bytes data):
bits = BitArray(data) self.vocab.serializer.unpack_into(data[4:], self)
self.vocab.packer.unpack_into(bits, self)
return self return self
@staticmethod @staticmethod
@@ -316,15 +304,14 @@ cdef class Doc:
keep_reading = True keep_reading = True
while keep_reading: while keep_reading:
try: try:
n_bits_str = file_.read(4) n_bytes_str = file_.read(4)
if len(n_bits_str) < 4: if len(n_bytes_str) < 4:
break break
n_bits = struct.unpack('I', n_bits_str)[0] n_bytes = struct.unpack('I', n_bytes_str)[0]
n_bytes = n_bits // 8 + bool(n_bits % 8)
data = file_.read(n_bytes) data = file_.read(n_bytes)
except StopIteration: except StopIteration:
keep_reading = False keep_reading = False
yield data yield n_bytes_str + data
# This function is terrible --- need to fix this. # This function is terrible --- need to fix this.
def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma, def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,