* Optimistically try orth encoding, with char as a back-off

This commit is contained in:
Matthew Honnibal 2015-07-21 20:22:45 +02:00
parent c8b89e37a5
commit 9cfa59ec33

View File

@ -106,15 +106,9 @@ cdef class Packer:
return cls(vocab, util.read_encoding_freqs(data_dir)) return cls(vocab, util.read_encoding_freqs(data_dir))
def pack(self, Doc doc): def pack(self, Doc doc):
orths = doc.to_array([ORTH]) bits = self._orth_encode(doc)
orths = orths[:, 0] if bits is None:
cdef bytes chars = doc.string.encode('utf8') bits = self._char_encode(doc)
# n_bits returns nan for oov words, i.e. can't encode message.
# So, it's important to write the conditional like this.
if self.orth_codec.n_bits(orths) < self.char_codec.n_bits(chars, overhead=1):
bits = self._orth_encode(doc, orths)
else:
bits = self._char_encode(doc, chars)
cdef int i cdef int i
if self.attrs: if self.attrs:
@ -138,11 +132,14 @@ cdef class Packer:
doc.from_array(self.attrs, array) doc.from_array(self.attrs, array)
return doc return doc
def _orth_encode(self, Doc doc, attr_t[:] orths): def _orth_encode(self, Doc doc):
cdef BitArray bits = BitArray() cdef BitArray bits = BitArray()
cdef int32_t length = len(doc) cdef int32_t length = len(doc)
bits.extend(length, 32) bits.extend(length, 32)
self.orth_codec.encode_int32(orths, bits) orths = doc.to_array([ORTH])
n_bits = self.orth_codec.encode_int32(orths[:, 0], bits)
if n_bits == 0:
return None
for token in doc: for token in doc:
bits.append(bool(token.whitespace_)) bits.append(bool(token.whitespace_))
return bits return bits
@ -154,7 +151,8 @@ cdef class Packer:
cdef Doc doc = Doc(self.vocab, orths_and_spaces) cdef Doc doc = Doc(self.vocab, orths_and_spaces)
return doc return doc
def _char_encode(self, Doc doc, bytes utf8_str): def _char_encode(self, Doc doc):
cdef bytes utf8_str = doc.string.encode('utf8')
cdef BitArray bits = BitArray() cdef BitArray bits = BitArray()
cdef int32_t length = len(utf8_str) cdef int32_t length = len(utf8_str)
# Signal chars with negative length # Signal chars with negative length