mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
* Add working to/from bytes API to Doc
This commit is contained in:
parent
1f31d96bf9
commit
a0e36e8efc
|
@ -71,17 +71,6 @@ cdef class Doc:
|
||||||
self.is_tagged = False
|
self.is_tagged = False
|
||||||
self.is_parsed = False
|
self.is_parsed = False
|
||||||
self._py_tokens = []
|
self._py_tokens = []
|
||||||
cdef const LexemeC* lex
|
|
||||||
cdef attr_t orth
|
|
||||||
cdef bint space
|
|
||||||
if orths_and_spaces is not None:
|
|
||||||
for orth, space in orths_and_spaces:
|
|
||||||
lex = <LexemeC*>self.vocab._by_orth.get(orth)
|
|
||||||
if lex != NULL:
|
|
||||||
assert lex.orth == orth
|
|
||||||
self.push_back(lex, space)
|
|
||||||
else:
|
|
||||||
raise Exception('Lexeme not found: %d' % orth)
|
|
||||||
|
|
||||||
def __getitem__(self, object i):
|
def __getitem__(self, object i):
|
||||||
"""Get a token.
|
"""Get a token.
|
||||||
|
@ -303,12 +292,11 @@ cdef class Doc:
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_bytes(self):
|
def to_bytes(self):
|
||||||
bits = self.vocab.packer.pack(self)
|
byte_string = self.vocab.serializer.pack(self)
|
||||||
return struct.pack('I', len(bits)) + bits.as_bytes()
|
return struct.pack('I', len(byte_string)) + byte_string
|
||||||
|
|
||||||
def from_bytes(self, data):
|
def from_bytes(self, bytes data):
|
||||||
bits = BitArray(data)
|
self.vocab.serializer.unpack_into(data[4:], self)
|
||||||
self.vocab.packer.unpack_into(bits, self)
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -316,15 +304,14 @@ cdef class Doc:
|
||||||
keep_reading = True
|
keep_reading = True
|
||||||
while keep_reading:
|
while keep_reading:
|
||||||
try:
|
try:
|
||||||
n_bits_str = file_.read(4)
|
n_bytes_str = file_.read(4)
|
||||||
if len(n_bits_str) < 4:
|
if len(n_bytes_str) < 4:
|
||||||
break
|
break
|
||||||
n_bits = struct.unpack('I', n_bits_str)[0]
|
n_bytes = struct.unpack('I', n_bytes_str)[0]
|
||||||
n_bytes = n_bits // 8 + bool(n_bits % 8)
|
|
||||||
data = file_.read(n_bytes)
|
data = file_.read(n_bytes)
|
||||||
except StopIteration:
|
except StopIteration:
|
||||||
keep_reading = False
|
keep_reading = False
|
||||||
yield data
|
yield n_bytes_str + data
|
||||||
|
|
||||||
# This function is terrible --- need to fix this.
|
# This function is terrible --- need to fix this.
|
||||||
def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
|
def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
|
||||||
|
|
Loading…
Reference in New Issue
Block a user