mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
* Major refactor of serialization. Nearly complete now.
This commit is contained in:
parent
c8282f9934
commit
db9dfd2e23
4
setup.py
4
setup.py
|
@ -94,6 +94,7 @@ def cython_setup(mod_names, language, includes, compile_args, link_args):
|
||||||
"data/vocab/lexemes.bin",
|
"data/vocab/lexemes.bin",
|
||||||
"data/vocab/strings.txt"],
|
"data/vocab/strings.txt"],
|
||||||
"spacy.tokens": ["*.pxd"],
|
"spacy.tokens": ["*.pxd"],
|
||||||
|
"spacy.serialize": ["*.pxd"],
|
||||||
"spacy.syntax": ["*.pxd"]},
|
"spacy.syntax": ["*.pxd"]},
|
||||||
ext_modules=exts,
|
ext_modules=exts,
|
||||||
cmdclass={'build_ext': Cython.Distutils.build_ext},
|
cmdclass={'build_ext': Cython.Distutils.build_ext},
|
||||||
|
@ -158,8 +159,9 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
|
||||||
'spacy.syntax.transition_system',
|
'spacy.syntax.transition_system',
|
||||||
'spacy.syntax.arc_eager',
|
'spacy.syntax.arc_eager',
|
||||||
'spacy.syntax._parse_features',
|
'spacy.syntax._parse_features',
|
||||||
'spacy.gold', 'spacy.orth', 'spacy.serialize',
|
'spacy.gold', 'spacy.orth',
|
||||||
'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
|
'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
|
||||||
|
'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
|
||||||
'spacy.syntax.ner']
|
'spacy.syntax.ner']
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -70,6 +70,7 @@ class English(object):
|
||||||
Tagger=EnPosTagger,
|
Tagger=EnPosTagger,
|
||||||
Parser=ParserFactory(ParserTransitionSystem),
|
Parser=ParserFactory(ParserTransitionSystem),
|
||||||
Entity=ParserFactory(EntityTransitionSystem),
|
Entity=ParserFactory(EntityTransitionSystem),
|
||||||
|
Packer=None,
|
||||||
load_vectors=True
|
load_vectors=True
|
||||||
):
|
):
|
||||||
|
|
||||||
|
@ -101,10 +102,10 @@ class English(object):
|
||||||
self.entity = Entity(self.vocab.strings, path.join(data_dir, 'ner'))
|
self.entity = Entity(self.vocab.strings, path.join(data_dir, 'ner'))
|
||||||
else:
|
else:
|
||||||
self.entity = None
|
self.entity = None
|
||||||
if Serializer:
|
if Packer:
|
||||||
self.bitter = Serializer(self.vocab, data_dir)
|
self.packer = Packer(self.vocab, data_dir)
|
||||||
else:
|
else:
|
||||||
self.bitter = None
|
self.packer = None
|
||||||
self.mwe_merger = RegexMerger([
|
self.mwe_merger = RegexMerger([
|
||||||
('IN', 'O', regexes.MW_PREPOSITIONS_RE),
|
('IN', 'O', regexes.MW_PREPOSITIONS_RE),
|
||||||
('CD', 'TIME', regexes.TIME_RE),
|
('CD', 'TIME', regexes.TIME_RE),
|
||||||
|
|
|
@ -1,334 +0,0 @@
|
||||||
from libcpp.vector cimport vector
|
|
||||||
from libc.stdint cimport uint32_t
|
|
||||||
from libc.stdint cimport int64_t
|
|
||||||
from libc.stdint cimport int32_t
|
|
||||||
from libc.stdint cimport uint64_t
|
|
||||||
from libcpp.queue cimport priority_queue
|
|
||||||
from libcpp.pair cimport pair
|
|
||||||
|
|
||||||
from preshed.maps cimport PreshMap
|
|
||||||
from murmurhash.mrmr cimport hash64
|
|
||||||
from .tokens.doc cimport Doc
|
|
||||||
from .vocab cimport Vocab
|
|
||||||
|
|
||||||
from os import path
|
|
||||||
import numpy
|
|
||||||
|
|
||||||
cimport cython
|
|
||||||
|
|
||||||
ctypedef unsigned char uchar
|
|
||||||
|
|
||||||
# Format
|
|
||||||
# - Total number of bytes in message (32 bit int)
|
|
||||||
# - Words, terminating in an EOL symbol, huffman coded ~12 bits per word
|
|
||||||
# - Spaces ~1 bit per word
|
|
||||||
# - Parse: Huffman coded head offset / dep label / POS tag / entity IOB tag
|
|
||||||
# combo. ? bits per word. 40 * 80 * 40 * 12 = 1.5m symbol vocab
|
|
||||||
|
|
||||||
|
|
||||||
# Note that we're setting the most significant bits here first, when in practice
|
|
||||||
# we're actually wanting the last bit to be most significant (for Huffman coding,
|
|
||||||
# anyway).
|
|
||||||
cdef Code bit_append(Code code, bint bit) nogil:
|
|
||||||
cdef uint64_t one = 1
|
|
||||||
if bit:
|
|
||||||
code.bits |= one << code.length
|
|
||||||
else:
|
|
||||||
code.bits &= ~(one << code.length)
|
|
||||||
code.length += 1
|
|
||||||
return code
|
|
||||||
|
|
||||||
|
|
||||||
cdef class BitArray:
|
|
||||||
cdef bytes data
|
|
||||||
cdef unsigned char byte
|
|
||||||
cdef unsigned char bit_of_byte
|
|
||||||
cdef uint32_t i
|
|
||||||
def __init__(self):
|
|
||||||
self.data = b''
|
|
||||||
self.byte = 0
|
|
||||||
self.bit_of_byte = 0
|
|
||||||
self.i = 0
|
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
cdef uchar byte, i
|
|
||||||
cdef uchar one = 1
|
|
||||||
start_byte = self.i // 8
|
|
||||||
if (self.i % 8) != 0:
|
|
||||||
for i in range(self.i % 8):
|
|
||||||
yield 1 if (self.data[start_byte] & (one << i)) else 0
|
|
||||||
start_byte += 1
|
|
||||||
for byte in self.data[start_byte:]:
|
|
||||||
for i in range(8):
|
|
||||||
yield 1 if byte & (one << i) else 0
|
|
||||||
for i in range(self.bit_of_byte):
|
|
||||||
yield 1 if self.byte & (one << i) else 0
|
|
||||||
|
|
||||||
def as_bytes(self):
|
|
||||||
if self.bit_of_byte != 0:
|
|
||||||
return self.data + chr(self.byte)
|
|
||||||
else:
|
|
||||||
return self.data
|
|
||||||
|
|
||||||
def append(self, bint bit):
|
|
||||||
cdef uint64_t one = 1
|
|
||||||
if bit:
|
|
||||||
self.byte |= one << self.bit_of_byte
|
|
||||||
else:
|
|
||||||
self.byte &= ~(one << self.bit_of_byte)
|
|
||||||
self.bit_of_byte += 1
|
|
||||||
if self.bit_of_byte == 8:
|
|
||||||
self.data += chr(self.byte)
|
|
||||||
self.byte = 0
|
|
||||||
self.bit_of_byte = 0
|
|
||||||
|
|
||||||
cdef int extend(self, uint64_t code, char n_bits) except -1:
|
|
||||||
cdef uint64_t one = 1
|
|
||||||
cdef unsigned char bit_of_code
|
|
||||||
for bit_of_code in range(n_bits):
|
|
||||||
if code & (one << bit_of_code):
|
|
||||||
self.byte |= one << self.bit_of_byte
|
|
||||||
else:
|
|
||||||
self.byte &= ~(one << self.bit_of_byte)
|
|
||||||
self.bit_of_byte += 1
|
|
||||||
if self.bit_of_byte == 8:
|
|
||||||
self.data += chr(self.byte)
|
|
||||||
self.byte = 0
|
|
||||||
self.bit_of_byte = 0
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Serializer:
|
|
||||||
# Manage codecs, maintain consistent format for io
|
|
||||||
def __init__(self, Vocab vocab, data_dir):
|
|
||||||
model_dir = path.join(data_dir, 'bitter')
|
|
||||||
self.vocab = vocab # Vocab owns the word codec, the big one
|
|
||||||
#self.cfg = Config.read(model_dir, 'config')
|
|
||||||
self.codecs = tuple([CodecWrapper(attr) for attr in self.cfg.attrs])
|
|
||||||
|
|
||||||
def __call__(self, doc_or_bits):
|
|
||||||
if isinstance(doc_or_bits, Doc):
|
|
||||||
return self.serialize(doc_or_bits)
|
|
||||||
elif isinstance(doc_or_bits, BitArray):
|
|
||||||
return self.deserialize(doc_or_bits)
|
|
||||||
else:
|
|
||||||
raise ValueError(doc_or_bits)
|
|
||||||
|
|
||||||
def train(self, doc):
|
|
||||||
array = doc.to_array([codec.id for codec in self.codecs])
|
|
||||||
for i, codec in enumerate(self.codecs):
|
|
||||||
codec.count(array[i])
|
|
||||||
|
|
||||||
def serialize(self, doc):
|
|
||||||
bits = BitArray()
|
|
||||||
array = doc.to_array(self.attrs)
|
|
||||||
for i, codec in enumerate(self.codecs):
|
|
||||||
codec.encode(array[i,], bits)
|
|
||||||
return bits
|
|
||||||
|
|
||||||
@cython.boundscheck(False)
|
|
||||||
def deserialize(self, bits):
|
|
||||||
biterator = iter(bits)
|
|
||||||
cdef Doc doc = Doc(self.vocab)
|
|
||||||
ids = self.vocab.codec.decode(biterator)
|
|
||||||
cdef int id_
|
|
||||||
cdef bint is_spacy
|
|
||||||
for id_ in ids:
|
|
||||||
is_spacy = biterator.next()
|
|
||||||
doc.push_back(self.vocab.lexemes.at(id_), is_spacy)
|
|
||||||
|
|
||||||
cdef int length = doc.length
|
|
||||||
array = numpy.zeros(shape=(length, len(self.codecs)), dtype=numpy.int)
|
|
||||||
for i, codec in enumerate(self.codecs):
|
|
||||||
array[i] = codec.decode(biterator)
|
|
||||||
doc.from_array([c.id for c in self.codecs], array)
|
|
||||||
return doc
|
|
||||||
|
|
||||||
|
|
||||||
cdef class CodecWrapper:
|
|
||||||
"""Wrapper around HuffmanCodec"""
|
|
||||||
def __init__(self, freqs, id=0):
|
|
||||||
cdef uint64_t key
|
|
||||||
cdef uint64_t count
|
|
||||||
cdef pair[uint64_t, uint64_t] item
|
|
||||||
cdef priority_queue[pair[uint64_t, uint64_t]] items
|
|
||||||
for key, count in freqs:
|
|
||||||
item.first = count
|
|
||||||
item.second = key
|
|
||||||
items.push(item)
|
|
||||||
|
|
||||||
weights = [] #array('f')
|
|
||||||
keys = [] #array('i')
|
|
||||||
key_to_i = PreshMap()
|
|
||||||
i = 0
|
|
||||||
while not items.empty():
|
|
||||||
item = items.top()
|
|
||||||
weights.append(item.first)
|
|
||||||
keys.append(item.second)
|
|
||||||
key_to_i[item.second] = i
|
|
||||||
i += 1
|
|
||||||
items.pop()
|
|
||||||
|
|
||||||
def encode(self, symbols):
|
|
||||||
indices = [self.table[symbol] for symbol in symbols]
|
|
||||||
return self._codec.encode(indices)
|
|
||||||
|
|
||||||
def decode(self, bits):
|
|
||||||
indices = self._codec.decode(bits)
|
|
||||||
return [self.symbols[i] for i in indices]
|
|
||||||
|
|
||||||
|
|
||||||
cdef class HuffmanCodec:
|
|
||||||
"""Create a Huffman code table, and use it to pack and unpack sequences into
|
|
||||||
byte strings. Emphasis is on efficiency, so API is quite strict:
|
|
||||||
|
|
||||||
Messages will be encoded/decoded as indices that refer to the probability sequence.
|
|
||||||
For instance, the sequence [5, 10, 8] indicates the 5th most frequent item,
|
|
||||||
the 10th most frequent item, the 8th most frequent item. The codec will add
|
|
||||||
the EOL symbol to your message. An exception will be raised if you include
|
|
||||||
the EOL symbol in your message.
|
|
||||||
|
|
||||||
Arguments:
|
|
||||||
weights (float[:]): A descending-sorted sequence of probabilities/weights.
|
|
||||||
Must include a weight for an EOL symbol.
|
|
||||||
|
|
||||||
eol (uint32_t): The index of the weight of the EOL symbol.
|
|
||||||
"""
|
|
||||||
def __init__(self, float[:] weights, uint32_t eol):
|
|
||||||
self.codes.resize(len(weights))
|
|
||||||
for i in range(len(self.codes)):
|
|
||||||
self.codes[i].bits = 0
|
|
||||||
self.codes[i].length = 0
|
|
||||||
populate_nodes(self.nodes, weights)
|
|
||||||
cdef Code path
|
|
||||||
path.bits = 0
|
|
||||||
path.length = 0
|
|
||||||
assign_codes(self.nodes, self.codes, len(self.nodes) - 1, path)
|
|
||||||
|
|
||||||
def encode(self, uint32_t[:] sequence, BitArray bits=None):
|
|
||||||
if bits is None:
|
|
||||||
bits = BitArray()
|
|
||||||
for i in sequence:
|
|
||||||
bits.extend(self.codes[i].bits, self.codes[i].length)
|
|
||||||
bits.extend(self.codes[self.eol].bits, self.codes[self.eol].length)
|
|
||||||
return bits
|
|
||||||
|
|
||||||
def decode(self, bits):
|
|
||||||
node = self.nodes.back()
|
|
||||||
symbols = []
|
|
||||||
for bit in bits:
|
|
||||||
branch = node.right if bit else node.left
|
|
||||||
if branch >= 0:
|
|
||||||
node = self.nodes.at(branch)
|
|
||||||
else:
|
|
||||||
symbol = -(branch + 1)
|
|
||||||
if symbol == self.eol:
|
|
||||||
return symbols
|
|
||||||
else:
|
|
||||||
symbols.append(symbol)
|
|
||||||
node = self.nodes.back()
|
|
||||||
return symbols
|
|
||||||
|
|
||||||
property strings:
|
|
||||||
@cython.boundscheck(False)
|
|
||||||
@cython.wraparound(False)
|
|
||||||
@cython.nonecheck(False)
|
|
||||||
def __get__(self):
|
|
||||||
output = []
|
|
||||||
cdef int i, j
|
|
||||||
cdef bytes string
|
|
||||||
cdef Code code
|
|
||||||
for i in range(self.codes.size()):
|
|
||||||
code = self.codes[i]
|
|
||||||
string = b'{0:b}'.format(code.bits).rjust(code.length, '0')
|
|
||||||
string = string[::-1]
|
|
||||||
output.append(string)
|
|
||||||
return output
|
|
||||||
|
|
||||||
|
|
||||||
@cython.boundscheck(False)
|
|
||||||
@cython.wraparound(False)
|
|
||||||
@cython.nonecheck(False)
|
|
||||||
cdef int populate_nodes(vector[Node]& nodes, float[:] probs) except -1:
|
|
||||||
assert len(probs) >= 3
|
|
||||||
cdef int size = len(probs)
|
|
||||||
cdef int i = size - 1
|
|
||||||
cdef int j = 0
|
|
||||||
|
|
||||||
while i >= 0 or (j+1) < nodes.size():
|
|
||||||
if i < 0:
|
|
||||||
_cover_two_nodes(nodes, j)
|
|
||||||
j += 2
|
|
||||||
elif j >= nodes.size():
|
|
||||||
_cover_two_words(nodes, i, i-1, probs[i] + probs[i-1])
|
|
||||||
i -= 2
|
|
||||||
elif i >= 1 and (j == nodes.size() or probs[i-1] < nodes[j].prob):
|
|
||||||
_cover_two_words(nodes, i, i-1, probs[i] + probs[i-1])
|
|
||||||
i -= 2
|
|
||||||
elif (j+1) < nodes.size() and nodes[j+1].prob < probs[i]:
|
|
||||||
_cover_two_nodes(nodes, j)
|
|
||||||
j += 2
|
|
||||||
else:
|
|
||||||
_cover_one_word_one_node(nodes, j, i, probs[i])
|
|
||||||
i -= 1
|
|
||||||
j += 1
|
|
||||||
return 0
|
|
||||||
|
|
||||||
cdef int _cover_two_nodes(vector[Node]& nodes, int j) nogil:
|
|
||||||
"""Introduce a new non-terminal, over two non-terminals)"""
|
|
||||||
cdef Node node
|
|
||||||
node.left = j
|
|
||||||
node.right = j+1
|
|
||||||
node.prob = nodes[j].prob + nodes[j+1].prob
|
|
||||||
nodes.push_back(node)
|
|
||||||
|
|
||||||
|
|
||||||
cdef int _cover_one_word_one_node(vector[Node]& nodes, int j, int id_, float prob) nogil:
|
|
||||||
"""Introduce a new non-terminal, over one terminal and one non-terminal."""
|
|
||||||
cdef Node node
|
|
||||||
# Encode leaves as negative integers, where the integer is the index of the
|
|
||||||
# word in the vocabulary.
|
|
||||||
cdef int64_t leaf_id = - <int64_t>(id_ + 1)
|
|
||||||
cdef float new_prob = prob + nodes[j].prob
|
|
||||||
if prob < nodes[j].prob:
|
|
||||||
node.left = leaf_id
|
|
||||||
node.right = j
|
|
||||||
node.prob = new_prob
|
|
||||||
else:
|
|
||||||
node.left = j
|
|
||||||
node.right = leaf_id
|
|
||||||
node.prob = new_prob
|
|
||||||
nodes.push_back(node)
|
|
||||||
|
|
||||||
|
|
||||||
cdef int _cover_two_words(vector[Node]& nodes, int id1, int id2, float prob) nogil:
|
|
||||||
"""Introduce a new node, over two non-terminals."""
|
|
||||||
cdef Node node
|
|
||||||
node.left = -(id1+1)
|
|
||||||
node.right = -(id2+1)
|
|
||||||
node.prob = prob
|
|
||||||
nodes.push_back(node)
|
|
||||||
|
|
||||||
|
|
||||||
cdef int assign_codes(vector[Node]& nodes, vector[Code]& codes, int i, Code path) except -1:
|
|
||||||
"""Recursively assign paths, from the top down. At the end, the entry codes[i]
|
|
||||||
knows the bit-address of the node[j] that points to entry i in the vocabulary.
|
|
||||||
So, to encode i, we go to codes[i] and read its bit-string. To decode, we
|
|
||||||
navigate nodes recursively.
|
|
||||||
"""
|
|
||||||
cdef Code left_path = bit_append(path, 0)
|
|
||||||
cdef Code right_path = bit_append(path, 1)
|
|
||||||
|
|
||||||
# Assign down left branch
|
|
||||||
if nodes[i].left >= 0:
|
|
||||||
assign_codes(nodes, codes, nodes[i].left, left_path)
|
|
||||||
else:
|
|
||||||
# Leaf on left
|
|
||||||
id_ = -(nodes[i].left + 1)
|
|
||||||
codes[id_] = left_path
|
|
||||||
# Assign down right branch
|
|
||||||
if nodes[i].right >= 0:
|
|
||||||
assign_codes(nodes, codes, nodes[i].right, right_path)
|
|
||||||
else:
|
|
||||||
# Leaf on right
|
|
||||||
id_ = -(nodes[i].right + 1)
|
|
||||||
codes[id_] = right_path
|
|
21
spacy/serialize/bits.pxd
Normal file
21
spacy/serialize/bits.pxd
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
from libc.stdint cimport uint64_t
|
||||||
|
from libc.stdint cimport uint32_t
|
||||||
|
|
||||||
|
ctypedef unsigned char uchar
|
||||||
|
|
||||||
|
|
||||||
|
# A variable-length bit string, e.g. the Huffman code assigned to one symbol.
cdef struct Code:
|
||||||
|
uint64_t bits  # bit pattern; bit_append() stores the k-th appended bit in bit k
|
||||||
|
char length  # number of bits appended so far
|
||||||
|
|
||||||
|
|
||||||
|
cdef Code bit_append(Code code, bint bit) nogil
|
||||||
|
|
||||||
|
|
||||||
|
# Bit buffer that is filled one bit at a time and flushed to `data` in whole bytes.
cdef class BitArray:
|
||||||
|
cdef bytes data  # bytes that have been completely filled
|
||||||
|
cdef uchar byte  # the byte currently being filled
|
||||||
|
cdef uchar bit_of_byte  # position in `byte` that the next bit will be written to
|
||||||
|
cdef uint32_t i  # bit offset at which iteration starts; set to 0 by __init__
|
||||||
|
|
||||||
|
# Append the low `n_bits` bits of `code`, lowest bit first.
cdef int extend(self, uint64_t code, char n_bits) except -1
|
69
spacy/serialize/bits.pyx
Normal file
69
spacy/serialize/bits.pyx
Normal file
|
@ -0,0 +1,69 @@
|
||||||
|
|
||||||
|
|
||||||
|
# Bits are laid down from the least significant end: the first bit appended
# lands in bit 0 of ``code.bits``, the next in bit 1, and so on, while
# ``code.length`` counts how many bits are in use.
# Note carried over from the original author: this ordering may not be the one
# ultimately wanted for Huffman coding.
cdef Code bit_append(Code code, bint bit) nogil:
    # `code` is passed by value, so the caller's struct is left untouched and
    # the extended copy is returned.
    cdef uint64_t mask = (<uint64_t>1) << code.length
    if bit:
        code.bits |= mask
    else:
        code.bits &= ~mask
    code.length += 1
    return code
|
||||||
|
|
||||||
|
|
||||||
|
cdef class BitArray:
    """Growable sequence of bits, packed least-significant-bit first.

    Completely filled bytes accumulate in ``data``; ``byte`` and
    ``bit_of_byte`` hold the byte that is currently being filled. Iterating
    yields the stored bits as 0/1 integers, starting at bit offset ``i``.
    """
    def __init__(self):
        self.data = b''
        self.byte = 0
        self.bit_of_byte = 0
        self.i = 0

    def __iter__(self):
        """Yield every stored bit from offset ``i`` onwards, oldest first."""
        cdef uchar byte, i
        cdef uchar one = 1
        start_byte = self.i // 8
        start_bit = self.i % 8
        if start_bit != 0:
            # Resume in the middle of a byte: bits below `start_bit` were
            # already consumed, so only the remaining high bits are yielded.
            for i in range(start_bit, 8):
                yield 1 if (self.data[start_byte] & (one << i)) else 0
            start_byte += 1
        for byte in self.data[start_byte:]:
            for i in range(8):
                yield 1 if byte & (one << i) else 0
        # Finally the bits of the byte that has not been flushed yet.
        for i in range(self.bit_of_byte):
            yield 1 if self.byte & (one << i) else 0

    def as_bytes(self):
        """Return the packed bits as a byte string.

        A partially filled final byte is included; its unused high bits are
        zero because ``byte`` is reset to 0 after every flush.
        """
        if self.bit_of_byte != 0:
            # bytes(bytearray(...)) builds a one-byte string on Python 2 and 3
            # alike; chr() would produce a text string on Python 3.
            return self.data + bytes(bytearray((self.byte,)))
        else:
            return self.data

    def append(self, bint bit):
        """Append a single bit."""
        # A one-bit extend() performs exactly the same update.
        self.extend(bit, 1)

    cdef int extend(self, uint64_t code, char n_bits) except -1:
        """Append the low ``n_bits`` bits of ``code``, lowest bit first."""
        cdef uint64_t one = 1
        cdef unsigned char bit_of_code
        for bit_of_code in range(n_bits):
            if code & (one << bit_of_code):
                self.byte |= one << self.bit_of_byte
            else:
                self.byte &= ~(one << self.bit_of_byte)
            self.bit_of_byte += 1
            if self.bit_of_byte == 8:
                # The byte is full: flush it and start a fresh one.
                self.data += bytes(bytearray((self.byte,)))
                self.byte = 0
                self.bit_of_byte = 0
|
||||||
|
|
||||||
|
|
|
@ -4,7 +4,7 @@ from libc.stdint cimport int64_t
|
||||||
from libc.stdint cimport int32_t
|
from libc.stdint cimport int32_t
|
||||||
from libc.stdint cimport uint64_t
|
from libc.stdint cimport uint64_t
|
||||||
|
|
||||||
from .vocab cimport Vocab
|
from .bits cimport Code
|
||||||
|
|
||||||
|
|
||||||
cdef struct Node:
|
cdef struct Node:
|
||||||
|
@ -13,19 +13,6 @@ cdef struct Node:
|
||||||
int32_t right
|
int32_t right
|
||||||
|
|
||||||
|
|
||||||
cdef struct Code:
|
|
||||||
uint64_t bits
|
|
||||||
char length
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Serializer:
|
|
||||||
cdef list codecs
|
|
||||||
cdef Vocab vocab
|
|
||||||
|
|
||||||
|
|
||||||
cdef class HuffmanCodec:
|
cdef class HuffmanCodec:
|
||||||
cdef vector[Node] nodes
|
cdef vector[Node] nodes
|
||||||
cdef vector[Code] codes
|
cdef vector[Code] codes
|
||||||
cdef uint32_t eol
|
|
||||||
cdef int id
|
|
||||||
|
|
157
spacy/serialize/huffman.pyx
Normal file
157
spacy/serialize/huffman.pyx
Normal file
|
@ -0,0 +1,157 @@
|
||||||
|
cimport cython
|
||||||
|
|
||||||
|
from .bits cimport bit_append
|
||||||
|
from .bits cimport BitArray
|
||||||
|
|
||||||
|
|
||||||
|
cdef class HuffmanCodec:
    """Create a Huffman code table, and use it to pack and unpack sequences of
    symbol indices. Emphasis is on efficiency, so the API is quite strict:

    Messages are encoded/decoded as indices that refer to the weight sequence.
    For instance, the message [5, 10, 8] refers to the items whose weights are
    at positions 5, 10 and 8 of ``weights``.

    No end-of-message symbol is written. ``decode`` reads exactly as many
    symbols as its output buffer holds.

    Arguments:
        weights (float[:]): A descending-sorted sequence of
            probabilities/weights. Tree construction asserts that there are
            at least three of them.
    """
    def __init__(self, float[:] weights):
        cdef int i
        cdef Code path
        self.codes.resize(len(weights))
        # .size() avoids converting the whole C++ vector to a Python list
        # just to measure it.
        for i in range(<int>self.codes.size()):
            self.codes[i].bits = 0
            self.codes[i].length = 0
        populate_nodes(self.nodes, weights)
        path.bits = 0
        path.length = 0
        # The root is the last node that was appended.
        assign_codes(self.nodes, self.codes, <int>self.nodes.size() - 1, path)

    def encode(self, uint32_t[:] msg, BitArray into_bits not None):
        """Append the code of every symbol in ``msg`` to ``into_bits``.

        Raises:
            IndexError: if a symbol is not a valid index into the code table.
        """
        cdef uint32_t i
        cdef Code code
        cdef size_t n_codes = self.codes.size()
        for i in range(len(msg)):
            # vector[] does no range checking, so reject bad symbols here.
            if msg[i] >= n_codes:
                raise IndexError(
                    "symbol %d is outside the code table (size %d)"
                    % (msg[i], n_codes))
            code = self.codes[msg[i]]
            into_bits.extend(code.bits, code.length)

    def decode(self, bits, uint32_t[:] into_msg):
        """Fill ``into_msg`` with symbols decoded from the iterable ``bits``.

        Stops as soon as ``into_msg`` is full, leaving any further bits
        unread.

        Raises:
            ValueError: if ``bits`` runs out before ``into_msg`` is full.
        """
        cdef Node node = self.nodes.back()
        cdef int branch
        cdef int i = 0
        cdef int n = len(into_msg)
        if n == 0:
            # Nothing requested: do not consume any bits.
            return
        for bit in bits:
            branch = node.right if bit else node.left
            if branch >= 0:
                # Internal node: keep walking down.
                node = self.nodes.at(branch)
            else:
                # Leaf: negative ids encode the symbol as -(symbol + 1).
                into_msg[i] = -(branch + 1)
                node = self.nodes.back()
                i += 1
                if i == n:
                    break
        else:
            raise ValueError(
                "bit stream ended after %d of %d symbols" % (i, n))

    property strings:
        @cython.boundscheck(False)
        @cython.wraparound(False)
        @cython.nonecheck(False)
        def __get__(self):
            """List the code of every symbol as a string of '0'/'1'
            characters, first branch taken from the root first."""
            cdef int i
            cdef Code code
            output = []
            for i in range(<int>self.codes.size()):
                code = self.codes[i]
                # Format as binary (most significant bit first), pad to the
                # code length, then reverse so that bit 0 comes first.
                string = '{0:b}'.format(code.bits).rjust(code.length, '0')
                output.append(string[::-1])
            return output
|
||||||
|
|
||||||
|
|
||||||
|
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
cdef int populate_nodes(vector[Node]& nodes, float[:] probs) except -1:
    """Build the Huffman tree bottom-up by appending internal nodes to `nodes`.

    `probs` is scanned from its last entry towards the first; each step joins
    two of {next unused word, next unused internal node} under a new node.
    Words are referenced as negative ids -(index + 1), internal nodes by their
    position in `nodes`. The last node appended is the root.
    """
    assert len(probs) >= 3
    cdef int size = len(probs)
    cdef int word = size - 1   # next word that has no parent yet
    cdef int inner = 0         # next internal node that has no parent yet

    while word >= 0 or (inner + 1) < nodes.size():
        if word < 0:
            # Only internal nodes are left to join.
            _cover_two_nodes(nodes, inner)
            inner += 2
            continue
        if inner >= nodes.size():
            # No internal node is available yet.
            _cover_two_words(nodes, word, word - 1, probs[word] + probs[word - 1])
            word -= 2
            continue
        if word >= 1 and (inner == nodes.size() or probs[word - 1] < nodes[inner].prob):
            # The next two words are both lighter than the next internal node.
            _cover_two_words(nodes, word, word - 1, probs[word] + probs[word - 1])
            word -= 2
            continue
        if (inner + 1) < nodes.size() and nodes[inner + 1].prob < probs[word]:
            # The next two internal nodes are both lighter than the next word.
            _cover_two_nodes(nodes, inner)
            inner += 2
            continue
        _cover_one_word_one_node(nodes, inner, word, probs[word])
        word -= 1
        inner += 1
    return 0
|
||||||
|
|
||||||
|
cdef int _cover_two_nodes(vector[Node]& nodes, int j) nogil:
    """Append a new internal node whose children are the internal nodes at
    positions j and j+1."""
    cdef Node parent
    parent.prob = nodes[j].prob + nodes[j+1].prob
    parent.left = j
    parent.right = j+1
    nodes.push_back(parent)
|
||||||
|
|
||||||
|
|
||||||
|
cdef int _cover_one_word_one_node(vector[Node]& nodes, int j, int id_, float prob) nogil:
    """Append a new internal node over one word (leaf) and one internal node.

    The lighter of the two becomes the left child.
    """
    cdef Node parent
    # Leaves are stored as negative integers: word index k becomes -(k + 1).
    cdef int64_t leaf = - <int64_t>(id_ + 1)
    parent.prob = prob + nodes[j].prob
    if prob < nodes[j].prob:
        parent.left = leaf
        parent.right = j
    else:
        parent.left = j
        parent.right = leaf
    nodes.push_back(parent)
|
||||||
|
|
||||||
|
|
||||||
|
cdef int _cover_two_words(vector[Node]& nodes, int id1, int id2, float prob) nogil:
    """Append a new internal node over two words (leaves).

    `prob` is stored as the new node's weight; each word index k is stored as
    the negative leaf id -(k + 1).
    """
    cdef Node parent
    parent.prob = prob
    parent.left = -(id1+1)
    parent.right = -(id2+1)
    nodes.push_back(parent)
|
||||||
|
|
||||||
|
|
||||||
|
cdef int assign_codes(vector[Node]& nodes, vector[Code]& codes, int i, Code path) except -1:
    """Walk the tree below nodes[i] and record the path to every leaf.

    `path` is the bit string that leads from the root to nodes[i]; a left
    branch extends it with 0 and a right branch with 1. When a leaf for word k
    is reached, the extended path is stored in codes[k]. Encoding looks codes
    up directly; decoding walks `nodes` instead.
    """
    cdef int left_child = nodes[i].left
    cdef int right_child = nodes[i].right

    # Left branch: negative ids are leaves, stored as -(word + 1).
    if left_child < 0:
        codes[-(left_child + 1)] = bit_append(path, 0)
    else:
        assign_codes(nodes, codes, left_child, bit_append(path, 0))

    # Right branch.
    if right_child < 0:
        codes[-(right_child + 1)] = bit_append(path, 1)
    else:
        assign_codes(nodes, codes, right_child, bit_append(path, 1))
|
6
spacy/serialize/packer.pxd
Normal file
6
spacy/serialize/packer.pxd
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
from ..vocab cimport Vocab
|
||||||
|
|
||||||
|
|
||||||
|
# Serializes documents to bits and back, using one codec per attribute.
cdef class Packer:
    # One codec per packed attribute.
    cdef tuple _codecs
    cdef Vocab vocab
    # Attribute ids, in the same order as `_codecs`. packer.pyx assigns this
    # in __init__; an extension type only accepts attributes declared here.
    cdef list attrs
|
136
spacy/serialize/packer.pyx
Normal file
136
spacy/serialize/packer.pyx
Normal file
|
@ -0,0 +1,136 @@
|
||||||
|
from libc.stdint cimport uint32_t
|
||||||
|
from libc.stdint cimport uint64_t
|
||||||
|
from libc.math cimport exp as c_exp
|
||||||
|
from libcpp.queue cimport priority_queue
|
||||||
|
from libcpp.pair cimport pair
|
||||||
|
|
||||||
|
from cymem.cymem cimport Address, Pool
|
||||||
|
from preshed.maps cimport PreshMap
|
||||||
|
|
||||||
|
from ..attrs cimport ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
|
||||||
|
from ..tokens.doc cimport Doc
|
||||||
|
from ..vocab cimport Vocab
|
||||||
|
from ..typedefs cimport attr_t
|
||||||
|
from .bits cimport BitArray
|
||||||
|
from .huffman cimport HuffmanCodec
|
||||||
|
|
||||||
|
from os import path
|
||||||
|
import numpy
|
||||||
|
|
||||||
|
cimport cython
|
||||||
|
|
||||||
|
|
||||||
|
# Format
|
||||||
|
# - Total number of bytes in message (32 bit int) --- handled outside this
|
||||||
|
# - Number of words (32 bit int)
|
||||||
|
# - Words, Huffman coded, ~12 bits per word (no EOL symbol; the word count above delimits them)
|
||||||
|
# - Spaces 1 bit per word
|
||||||
|
# - Attributes:
|
||||||
|
# POS tag
|
||||||
|
# Head offset
|
||||||
|
# Dep label
|
||||||
|
# Entity IOB
|
||||||
|
# Entity tag
|
||||||
|
|
||||||
|
|
||||||
|
def make_vocab_codec(Vocab vocab):
    """Build a HuffmanCodec whose weights are exp(prob) of every lexeme in
    `vocab`, in the order the lexemes are stored.

    NOTE(review): HuffmanCodec documents its weights as descending-sorted;
    this presumably relies on the vocabulary being stored in that order.
    Confirm against the code that loads the vocabulary.
    """
    cdef int i
    cdef int length = len(vocab)
    # Scratch buffer for the weights, released when `mem` goes out of scope.
    cdef Address mem = Address(length, sizeof(float))
    cdef float* weights = <float*>mem.ptr
    for i in range(length):
        weights[i] = <float>c_exp(vocab.lexemes[i].prob)
    cdef float[:] weights_view = <float[:length]>weights
    return HuffmanCodec(weights_view)
|
||||||
|
|
||||||
|
|
||||||
|
cdef class _BinaryCodec:
    """Codec for a boolean attribute: one bit per value, no compression."""

    def encode(self, src, bits):
        """Append every value of the sequence `src` to `bits`."""
        cdef int i
        for i in range(len(src)):
            bits.append(src[i])

    def decode(self, dest, bits, n):
        """Store the next `n` bits of the iterator `bits` in dest[0:n]."""
        cdef int i
        for i in range(n):
            # The built-in next() works on Python 2 and 3 iterators alike;
            # the .next() method exists only on Python 2.
            dest[i] = next(bits)
|
||||||
|
|
||||||
|
|
||||||
|
cdef class _AttributeCodec:
    """Huffman codec for the values of one attribute.

    Attribute values are ranked by frequency; the rank is what the underlying
    HuffmanCodec encodes. `_map` translates value -> rank and `_keys`
    translates rank -> value.
    """
    cdef Pool mem
    cdef attr_t* _keys
    cdef PreshMap _map
    cdef HuffmanCodec _codec

    def __init__(self, freqs):
        """Build the codec from `freqs`, an iterable of (value, count) pairs."""
        cdef uint64_t key
        cdef uint64_t count
        cdef pair[uint64_t, uint64_t] item
        cdef priority_queue[pair[uint64_t, uint64_t]] items
        cdef int n_items
        cdef int i = 0

        # The count goes first so the queue orders the pairs by frequency.
        for key, count in freqs:
            item.first = count
            item.second = key
            items.push(item)
        n_items = <int>items.size()

        weights = numpy.zeros(shape=(n_items,), dtype=numpy.float32)
        # The pool owns the `_keys` buffer and must exist before alloc().
        self.mem = Pool()
        self._keys = <attr_t*>self.mem.alloc(n_items, sizeof(attr_t))
        self._map = PreshMap()
        # top() returns the largest pair, so the weights come out in
        # descending order, as HuffmanCodec requires.
        while not items.empty():
            item = items.top()
            weights[i] = item.first
            self._keys[i] = item.second
            self._map[self._keys[i]] = i
            items.pop()
            i += 1
        self._codec = HuffmanCodec(weights)

    def encode(self, attr_t[:] msg, BitArray into_bits):
        """Append the codes for the attribute values in `msg` to `into_bits`.

        Side effect: `msg` is overwritten in place with the frequency ranks
        of its values.
        """
        # NOTE(review): HuffmanCodec.encode declares a uint32_t[:] message;
        # confirm that attr_t has a compatible element type.
        cdef int i
        for i in range(len(msg)):
            msg[i] = self._map[msg[i]]
        self._codec.encode(msg, into_bits)

    def decode(self, bits, attr_t[:] into_msg):
        """Fill `into_msg` with attribute values decoded from `bits`.

        `bits` is only forwarded to HuffmanCodec.decode, which accepts any
        iterable of bits, so a partially consumed bit iterator may be passed
        as well as a BitArray.
        """
        cdef int i
        self._codec.decode(bits, into_msg)
        # Translate frequency ranks back to attribute values.
        for i in range(len(into_msg)):
            into_msg[i] = self._keys[into_msg[i]]
|
||||||
|
|
||||||
|
|
||||||
|
cdef class Packer:
    """Pack the attributes of a document into a BitArray, and unpack them.

    One codec is kept per attribute: a vocabulary Huffman codec for ID, a
    one-bit-per-token codec for SPACY, and a frequency-based Huffman codec for
    every other attribute.
    """
    def __init__(self, Vocab vocab, list_of_attr_freqs):
        """Create one codec per (attr_id, freqs) pair in `list_of_attr_freqs`.

        `freqs` is only used for attributes other than ID and SPACY.
        """
        self.vocab = vocab
        codecs = []
        # NOTE(review): `attrs` must be declared on the class in packer.pxd
        # for this assignment to work -- confirm the declaration exists.
        self.attrs = []

        for attr, freqs in list_of_attr_freqs:
            if attr == ID:
                codecs.append(make_vocab_codec(vocab))
            elif attr == SPACY:
                codecs.append(_BinaryCodec())
            else:
                codecs.append(_AttributeCodec(freqs))
            self.attrs.append(attr)
        self._codecs = tuple(codecs)

    def __call__(self, msg_or_bits):
        """Unpack a BitArray, or pack anything else (expected to be a Doc)."""
        if isinstance(msg_or_bits, BitArray):
            bits = msg_or_bits
            # NOTE(review): tokens/doc.pyx defines from_array as an instance
            # method (self, attrs, array), so this unbound call passes the
            # vocab as `self`. A Doc holding the decoded words presumably has
            # to be built first; left unchanged until that API is settled.
            return Doc.from_array(self.vocab, self.attrs, self.deserialize(bits))
        else:
            msg = msg_or_bits
            return self.serialize(msg.to_array(self.attrs))

    def serialize(self, array):
        """Pack `array` (one row per token, one column per attribute)."""
        cdef BitArray bits = BitArray()
        cdef uint32_t length = len(array)
        # Header: the number of tokens, as 32 bits.
        bits.extend(length, 32)
        for i, codec in enumerate(self._codecs):
            # Codec i handles attribute i, i.e. column i (row i would be the
            # attributes of token i).
            codec.encode(array[:, i], bits)
        return bits

    def deserialize(self, bits):
        """Unpack `bits` into an array with one row per token and one column
        per attribute, the layout Doc.from_array reads."""
        cdef uint32_t length = 0
        cdef uint32_t one = 1
        cdef int k
        # A single iterator is shared by all readers so that each one resumes
        # where the previous one stopped. Iterating the BitArray afresh would
        # restart from its beginning every time.
        biterator = iter(bits)
        # Header: 32 bits, lowest bit first, as written by BitArray.extend.
        for k in range(32):
            if next(biterator):
                length |= one << k
        # NOTE(review): the codecs declare typed buffers (uint32_t[:] and
        # attr_t[:]); confirm the dtype this array needs to satisfy them.
        # numpy.int_ is the same dtype the removed `numpy.int` alias selected.
        array = numpy.zeros(shape=(length, len(self._codecs)), dtype=numpy.int_)
        for i, codec in enumerate(self._codecs):
            if isinstance(codec, _BinaryCodec):
                # _BinaryCodec.decode takes (dest, bits, n).
                codec.decode(array[:, i], biterator, length)
            else:
                # The Huffman-based codecs take (bits, into_msg).
                codec.decode(biterator, array[:, i])
        return array
|
|
@ -4,7 +4,6 @@ from libc.string cimport memcpy, memset
|
||||||
import numpy
|
import numpy
|
||||||
|
|
||||||
from ..lexeme cimport EMPTY_LEXEME
|
from ..lexeme cimport EMPTY_LEXEME
|
||||||
from ..serialize import BitArray
|
|
||||||
from ..strings cimport slice_unicode
|
from ..strings cimport slice_unicode
|
||||||
from ..typedefs cimport attr_t, flags_t
|
from ..typedefs cimport attr_t, flags_t
|
||||||
from ..attrs cimport attr_id_t
|
from ..attrs cimport attr_id_t
|
||||||
|
@ -371,10 +370,12 @@ cdef class Doc:
|
||||||
return self[start]
|
return self[start]
|
||||||
|
|
||||||
def from_array(self, attrs, array):
|
def from_array(self, attrs, array):
|
||||||
cdef int i
|
cdef int i, col
|
||||||
cdef attr_id_t attr_id
|
cdef attr_id_t attr_id
|
||||||
cdef TokenC* tokens = self.data
|
cdef TokenC* tokens = self.data
|
||||||
for attr_id in attrs:
|
cdef int length = len(array)
|
||||||
|
for col, attr_id in enumerate(attrs):
|
||||||
|
values = array[:, col]
|
||||||
if attr_id == HEAD:
|
if attr_id == HEAD:
|
||||||
for i in range(length):
|
for i in range(length):
|
||||||
tokens[i].head = values[i]
|
tokens[i].head = values[i]
|
||||||
|
|
|
@ -35,5 +35,3 @@ cdef class Vocab:
|
||||||
|
|
||||||
cdef PreshMap _map
|
cdef PreshMap _map
|
||||||
cdef readonly int repvec_length
|
cdef readonly int repvec_length
|
||||||
|
|
||||||
cdef public object _codec
|
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
|
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
|
||||||
from libc.string cimport memset
|
from libc.string cimport memset
|
||||||
from libc.stdint cimport int32_t
|
from libc.stdint cimport int32_t
|
||||||
from libc.math cimport exp as c_exp
|
|
||||||
|
|
||||||
import bz2
|
import bz2
|
||||||
from os import path
|
from os import path
|
||||||
|
@ -15,7 +14,6 @@ from .strings cimport slice_unicode
|
||||||
from .strings cimport hash_string
|
from .strings cimport hash_string
|
||||||
from .orth cimport word_shape
|
from .orth cimport word_shape
|
||||||
from .typedefs cimport attr_t
|
from .typedefs cimport attr_t
|
||||||
from .serialize cimport HuffmanCodec
|
|
||||||
|
|
||||||
from cymem.cymem cimport Address
|
from cymem.cymem cimport Address
|
||||||
|
|
||||||
|
@ -227,22 +225,6 @@ cdef class Vocab:
|
||||||
lex.repvec = EMPTY_VEC
|
lex.repvec = EMPTY_VEC
|
||||||
return vec_len
|
return vec_len
|
||||||
|
|
||||||
property codec:
|
|
||||||
def __get__(self):
|
|
||||||
cdef Address mem
|
|
||||||
cdef int i
|
|
||||||
cdef float[:] cv_probs
|
|
||||||
if self._codec is not None:
|
|
||||||
return self._codec
|
|
||||||
else:
|
|
||||||
mem = Address(len(self), sizeof(float))
|
|
||||||
probs = <float*>mem.ptr
|
|
||||||
for i in range(len(self)):
|
|
||||||
probs[i] = <float>c_exp(self.lexemes[i].prob)
|
|
||||||
cv_probs = <float[:len(self)]>probs
|
|
||||||
self._codec = HuffmanCodec(cv_probs, 0)
|
|
||||||
return self._codec
|
|
||||||
|
|
||||||
|
|
||||||
def write_binary_vectors(in_loc, out_loc):
|
def write_binary_vectors(in_loc, out_loc):
|
||||||
cdef _CFile out_file = _CFile(out_loc, 'wb')
|
cdef _CFile out_file = _CFile(out_loc, 'wb')
|
||||||
|
|
|
@ -3,14 +3,15 @@ from __future__ import division
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from spacy.serialize import HuffmanCodec
|
from spacy.serialize.huffman import HuffmanCodec
|
||||||
|
from spacy.serialize.bits import BitArray
|
||||||
import numpy
|
import numpy
|
||||||
|
|
||||||
from heapq import heappush, heappop, heapify
|
from heapq import heappush, heappop, heapify
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
|
|
||||||
class Vocab(object):
|
class MockPacker(object):
|
||||||
def __init__(self, freqs):
|
def __init__(self, freqs):
|
||||||
freqs['-eol-'] = 5
|
freqs['-eol-'] = 5
|
||||||
total = sum(freqs.values())
|
total = sum(freqs.values())
|
||||||
|
@ -19,15 +20,19 @@ class Vocab(object):
|
||||||
self.symbols = [sym for sym, freq in by_freq]
|
self.symbols = [sym for sym, freq in by_freq]
|
||||||
self.probs = numpy.array([item[1] / total for item in by_freq], dtype=numpy.float32)
|
self.probs = numpy.array([item[1] / total for item in by_freq], dtype=numpy.float32)
|
||||||
self.table = {sym: i for i, sym in enumerate(self.symbols)}
|
self.table = {sym: i for i, sym in enumerate(self.symbols)}
|
||||||
self.codec = HuffmanCodec(self.probs, self.table['-eol-'])
|
self.codec = HuffmanCodec(self.probs)
|
||||||
|
|
||||||
def pack(self, message):
|
def pack(self, message):
|
||||||
seq = [self.table[sym] for sym in message]
|
seq = [self.table[sym] for sym in message]
|
||||||
return self.codec.encode(numpy.array(seq, dtype=numpy.uint32))
|
msg = numpy.array(seq, dtype=numpy.uint32)
|
||||||
|
bits = BitArray()
|
||||||
|
self.codec.encode(msg, bits)
|
||||||
|
return bits
|
||||||
|
|
||||||
def unpack(self, packed):
|
def unpack(self, bits, n):
|
||||||
ids = self.codec.decode(packed)
|
msg = numpy.array(range(n), dtype=numpy.uint32)
|
||||||
return [self.symbols[i] for i in ids]
|
self.codec.decode(bits, msg)
|
||||||
|
return [self.symbols[i] for i in msg]
|
||||||
|
|
||||||
|
|
||||||
def py_encode(symb2freq):
|
def py_encode(symb2freq):
|
||||||
|
@ -60,7 +65,7 @@ def test1():
|
||||||
probs[8] = 0.0001
|
probs[8] = 0.0001
|
||||||
probs[9] = 0.000001
|
probs[9] = 0.000001
|
||||||
|
|
||||||
codec = HuffmanCodec(probs, 9)
|
codec = HuffmanCodec(probs)
|
||||||
|
|
||||||
py_codes = py_encode(dict(enumerate(probs)))
|
py_codes = py_encode(dict(enumerate(probs)))
|
||||||
py_codes = py_codes.items()
|
py_codes = py_codes.items()
|
||||||
|
@ -71,19 +76,19 @@ def test1():
|
||||||
def test_round_trip():
|
def test_round_trip():
|
||||||
freqs = {'the': 10, 'quick': 3, 'brown': 4, 'fox': 1, 'jumped': 5, 'over': 8,
|
freqs = {'the': 10, 'quick': 3, 'brown': 4, 'fox': 1, 'jumped': 5, 'over': 8,
|
||||||
'lazy': 1, 'dog': 2, '.': 9}
|
'lazy': 1, 'dog': 2, '.': 9}
|
||||||
vocab = Vocab(freqs)
|
packer = MockPacker(freqs)
|
||||||
|
|
||||||
message = ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the',
|
message = ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the',
|
||||||
'the', 'lazy', 'dog', '.']
|
'the', 'lazy', 'dog', '.']
|
||||||
strings = list(vocab.codec.strings)
|
strings = list(packer.codec.strings)
|
||||||
codes = {vocab.symbols[i]: strings[i] for i in range(len(vocab.symbols))}
|
codes = {packer.symbols[i]: strings[i] for i in range(len(packer.symbols))}
|
||||||
packed = vocab.pack(message)
|
bits = packer.pack(message)
|
||||||
string = b''.join(b'{0:b}'.format(ord(c)).rjust(8, b'0')[::-1] for c in packed.as_bytes())
|
string = b''.join(b'{0:b}'.format(ord(c)).rjust(8, b'0')[::-1] for c in bits.as_bytes())
|
||||||
for word in message:
|
for word in message:
|
||||||
code = codes[word]
|
code = codes[word]
|
||||||
assert string[:len(code)] == code
|
assert string[:len(code)] == code
|
||||||
string = string[len(code):]
|
string = string[len(code):]
|
||||||
unpacked = vocab.unpack(packed)
|
unpacked = packer.unpack(bits, len(message))
|
||||||
assert message == unpacked
|
assert message == unpacked
|
||||||
|
|
||||||
|
|
||||||
|
@ -92,13 +97,12 @@ def test_rosetta():
|
||||||
symb2freq = defaultdict(int)
|
symb2freq = defaultdict(int)
|
||||||
for ch in txt:
|
for ch in txt:
|
||||||
symb2freq[ch] += 1
|
symb2freq[ch] += 1
|
||||||
symb2freq['-eol-'] = 1
|
|
||||||
by_freq = symb2freq.items()
|
by_freq = symb2freq.items()
|
||||||
by_freq.sort(reverse=True, key=lambda item: item[1])
|
by_freq.sort(reverse=True, key=lambda item: item[1])
|
||||||
symbols = [sym for sym, prob in by_freq]
|
symbols = [sym for sym, prob in by_freq]
|
||||||
probs = numpy.array([prob for sym, prob in by_freq], dtype=numpy.float32)
|
probs = numpy.array([prob for sym, prob in by_freq], dtype=numpy.float32)
|
||||||
|
|
||||||
codec = HuffmanCodec(probs, symbols.index('-eol-'))
|
codec = HuffmanCodec(probs)
|
||||||
py_codec = py_encode(symb2freq)
|
py_codec = py_encode(symb2freq)
|
||||||
|
|
||||||
my_lengths = defaultdict(int)
|
my_lengths = defaultdict(int)
|
||||||
|
@ -112,6 +116,7 @@ def test_rosetta():
|
||||||
assert my_exp_len == py_exp_len
|
assert my_exp_len == py_exp_len
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
def test_vocab(EN):
|
def test_vocab(EN):
|
||||||
codec = EN.vocab.codec
|
codec = EN.vocab.codec
|
||||||
expected_length = 0
|
expected_length = 0
|
||||||
|
@ -137,3 +142,4 @@ def test_freqs():
|
||||||
for i, code in enumerate(codec.strings):
|
for i, code in enumerate(codec.strings):
|
||||||
expected_length += len(code) * freqs[i]
|
expected_length += len(code) * freqs[i]
|
||||||
assert 8 < expected_length < 14
|
assert 8 < expected_length < 14
|
||||||
|
"""
|
||||||
|
|
Loading…
Reference in New Issue
Block a user