mirror of
https://github.com/explosion/spaCy.git
synced 2025-03-03 19:08:06 +03:00
* Nearly finished huffman coder
This commit is contained in:
parent
e1a25fba32
commit
281f1faefb
|
@ -11,19 +11,12 @@ import numpy
|
||||||
|
|
||||||
cimport cython
|
cimport cython
|
||||||
|
|
||||||
|
# Format
|
||||||
#cdef class Serializer:
|
# - Total number of bytes in message (32 bit int)
|
||||||
# def __init__(self, Vocab vocab):
|
# - Words, terminating in an EOL symbol, huffman coded ~12 bits per word
|
||||||
# pass
|
# - Spaces ~1 bit per word
|
||||||
#
|
# - Parse: Huffman coded head offset / dep label / POS tag / entity IOB tag
|
||||||
# def dump(self, Doc tokens, file_):
|
# combo. ? bits per word. 40 * 80 * 40 * 12 = 1.5m symbol vocab
|
||||||
# pass
|
|
||||||
# # Format
|
|
||||||
# # - Total number of bytes in message (32 bit int)
|
|
||||||
# # - Words, terminating in an EOL symbol, huffman coded ~12 bits per word
|
|
||||||
# # - Spaces ~1 bit per word
|
|
||||||
# # - Parse: Huffman coded head offset / dep label / POS tag / entity IOB tag
|
|
||||||
# # combo. ? bits per word. 40 * 80 * 40 * 12 = 1.5m symbol vocab
|
|
||||||
|
|
||||||
|
|
||||||
cdef struct Node:
|
cdef struct Node:
|
||||||
|
@ -53,21 +46,11 @@ cdef Code bit_append(Code code, bint bit) nogil:
|
||||||
cdef class HuffmanCodec:
|
cdef class HuffmanCodec:
|
||||||
cdef vector[Node] nodes
|
cdef vector[Node] nodes
|
||||||
cdef vector[Code] codes
|
cdef vector[Code] codes
|
||||||
cdef float[:] probs
|
cdef readonly float[:] probs
|
||||||
cdef PreshMap table
|
cdef PreshMap table
|
||||||
def __init__(self, symbols, probs):
|
cdef uint32_t eol
|
||||||
self.table = PreshMap()
|
def __init__(self, probs, eol):
|
||||||
cdef bytes symb_str
|
self.eol = eol
|
||||||
cdef uint64_t key
|
|
||||||
cdef uint32_t i
|
|
||||||
for i, symbol in enumerate(symbols):
|
|
||||||
if type(symbol) == unicode or type(symbol) == bytes:
|
|
||||||
symb_str = symbol.encode('utf8')
|
|
||||||
key = hash64(<unsigned char*>symb_str, len(symb_str), 0)
|
|
||||||
else:
|
|
||||||
key = int(symbol)
|
|
||||||
self.table[key] = i+1
|
|
||||||
self.symbols = symbols
|
|
||||||
self.probs = probs
|
self.probs = probs
|
||||||
self.codes.resize(len(probs))
|
self.codes.resize(len(probs))
|
||||||
for i in range(len(self.codes)):
|
for i in range(len(self.codes)):
|
||||||
|
@ -79,47 +62,47 @@ cdef class HuffmanCodec:
|
||||||
path.length = 0
|
path.length = 0
|
||||||
assign_codes(self.nodes, self.codes, len(self.nodes) - 1, path)
|
assign_codes(self.nodes, self.codes, len(self.nodes) - 1, path)
|
||||||
|
|
||||||
def encode(self, sequence):
|
def encode(self, uint32_t[:] sequence):
|
||||||
cdef vector[bint] bits
|
cdef Code code
|
||||||
cdef uint64_t key
|
cdef bytes output = b''
|
||||||
cdef uint64_t i
|
cdef unsigned char byte = 0
|
||||||
for symbol in sequence:
|
cdef uint64_t one = 1
|
||||||
if type(symbol) == unicode or type(symbol) == bytes:
|
cdef unsigned char i_of_byte = 0
|
||||||
symb_str = symbol.encode('utf8')
|
cdef unsigned char i_of_code = 0
|
||||||
key = hash64(<unsigned char*>symb_str, len(symb_str), 0)
|
for index in sequence:
|
||||||
else:
|
code = self.codes[index]
|
||||||
key = int(symbol)
|
for i_of_code in range(code.length):
|
||||||
i = <uint32_t>self.table.get(key)
|
if code.bits & (one << i_of_code):
|
||||||
if i == 0:
|
byte |= one << i_of_byte
|
||||||
raise Exception("Unseen symbol: %s" % symbol)
|
else:
|
||||||
else:
|
byte &= ~(one << i_of_byte)
|
||||||
code = self.codes[i]
|
i_of_byte += 1
|
||||||
bits.extend(code)
|
if i_of_byte == 8:
|
||||||
return bits
|
output += chr(byte)
|
||||||
|
byte = 0
|
||||||
|
i_of_byte = 0
|
||||||
|
if i_of_byte != 0:
|
||||||
|
output += chr(byte)
|
||||||
|
return output
|
||||||
|
|
||||||
def decode(self, unsigned char[:] data):
|
def decode(self, bytes data):
|
||||||
symbols = []
|
|
||||||
node = self.nodes.back()
|
node = self.nodes.back()
|
||||||
bits = []
|
symbols = []
|
||||||
cdef unsigned char byte
|
cdef unsigned char byte
|
||||||
cdef unsigned char one
|
cdef unsigned char i = 0
|
||||||
cdef int i = 0
|
cdef unsigned char one = 1
|
||||||
for byte_ in data:
|
for byte in data:
|
||||||
for i in range(7, -1, -1):
|
for i in range(8):
|
||||||
bits.append(bool(byte & (one << i)))
|
branch = node.right if (byte & (one << i)) else node.left
|
||||||
|
if branch >= 0:
|
||||||
cdef bint bit = 0
|
node = self.nodes.at(branch)
|
||||||
|
else:
|
||||||
for bit in bits:
|
symbol = -(branch + 1)
|
||||||
branch = node.right if bit else node.left
|
if symbol == self.eol:
|
||||||
if branch >= 0:
|
return symbols
|
||||||
node = self.nodes.at(branch)
|
else:
|
||||||
else:
|
symbols.append(symbol)
|
||||||
symbol = self.symbols[-(branch + 1)]
|
node = self.nodes.back()
|
||||||
if symbol == self.eol_symbol:
|
|
||||||
break
|
|
||||||
symbols.append(symbol)
|
|
||||||
node = self.nodes.back()
|
|
||||||
return symbols
|
return symbols
|
||||||
|
|
||||||
property strings:
|
property strings:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user