def n_bits(self, msg, overhead=0):
    """Predict the number of bits needed to encode a message.

    Adds up the code length of every symbol in ``msg`` and charges
    ``overhead`` additional bits per symbol.

    Args:
        msg: Iterable of symbols. Each symbol is looked up in ``self._map``
            to obtain its index into ``self.codes``. Any iterable is
            accepted, including one-shot iterators that have no ``len()``.
        overhead: Extra bits to add for each symbol in ``msg`` (default 0).

    Returns:
        The summed code lengths plus ``overhead`` times the number of
        symbols, or ``numpy.nan`` as soon as a symbol that is not present
        in ``self._map`` is encountered.
    """
    length = 0
    n_symbols = 0
    for word in msg:
        if word not in self._map:
            # No code exists for this symbol, so no size can be predicted.
            return numpy.nan
        index = self._map[word]
        length += self.codes[index].length
        n_symbols += 1
    # Symbols are counted during the single pass rather than via len(msg):
    # len() raises TypeError for generators and other unsized iterables,
    # and would only do so after the whole message had been consumed.
    return length + overhead * n_symbols