* Major refactor of serialization. Nearly complete now.

This commit is contained in:
Matthew Honnibal 2015-07-17 01:19:29 +02:00
parent c8282f9934
commit db9dfd2e23
13 changed files with 423 additions and 391 deletions

View File

@ -94,6 +94,7 @@ def cython_setup(mod_names, language, includes, compile_args, link_args):
"data/vocab/lexemes.bin",
"data/vocab/strings.txt"],
"spacy.tokens": ["*.pxd"],
"spacy.serialize": ["*.pxd"],
"spacy.syntax": ["*.pxd"]},
ext_modules=exts,
cmdclass={'build_ext': Cython.Distutils.build_ext},
@ -158,8 +159,9 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
'spacy.syntax.transition_system',
'spacy.syntax.arc_eager',
'spacy.syntax._parse_features',
'spacy.gold', 'spacy.orth', 'spacy.serialize',
'spacy.gold', 'spacy.orth',
'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
'spacy.syntax.ner']

View File

@ -70,6 +70,7 @@ class English(object):
Tagger=EnPosTagger,
Parser=ParserFactory(ParserTransitionSystem),
Entity=ParserFactory(EntityTransitionSystem),
Packer=None,
load_vectors=True
):
@ -101,10 +102,10 @@ class English(object):
self.entity = Entity(self.vocab.strings, path.join(data_dir, 'ner'))
else:
self.entity = None
if Serializer:
self.bitter = Serializer(self.vocab, data_dir)
if Packer:
self.packer = Packer(self.vocab, data_dir)
else:
self.bitter = None
self.packer = None
self.mwe_merger = RegexMerger([
('IN', 'O', regexes.MW_PREPOSITIONS_RE),
('CD', 'TIME', regexes.TIME_RE),

View File

@ -1,334 +0,0 @@
from libcpp.vector cimport vector
from libc.stdint cimport uint32_t
from libc.stdint cimport int64_t
from libc.stdint cimport int32_t
from libc.stdint cimport uint64_t
from libcpp.queue cimport priority_queue
from libcpp.pair cimport pair
from preshed.maps cimport PreshMap
from murmurhash.mrmr cimport hash64
from .tokens.doc cimport Doc
from .vocab cimport Vocab
from os import path
import numpy
cimport cython
ctypedef unsigned char uchar
# Format
# - Total number of bytes in message (32 bit int)
# - Words, terminating in an EOL symbol, huffman coded ~12 bits per word
# - Spaces ~1 bit per word
# - Parse: Huffman coded head offset / dep label / POS tag / entity IOB tag
# combo. ? bits per word. 40 * 80 * 40 * 12 = 1.5m symbol vocab
# Note that we're setting the most significant bits here first, when in practice
# we're actually wanting the last bit to be most significant (for Huffman coding,
# anyway).
cdef Code bit_append(Code code, bint bit) nogil:
cdef uint64_t one = 1
if bit:
code.bits |= one << code.length
else:
code.bits &= ~(one << code.length)
code.length += 1
return code
cdef class BitArray:
cdef bytes data
cdef unsigned char byte
cdef unsigned char bit_of_byte
cdef uint32_t i
def __init__(self):
self.data = b''
self.byte = 0
self.bit_of_byte = 0
self.i = 0
def __iter__(self):
cdef uchar byte, i
cdef uchar one = 1
start_byte = self.i // 8
if (self.i % 8) != 0:
for i in range(self.i % 8):
yield 1 if (self.data[start_byte] & (one << i)) else 0
start_byte += 1
for byte in self.data[start_byte:]:
for i in range(8):
yield 1 if byte & (one << i) else 0
for i in range(self.bit_of_byte):
yield 1 if self.byte & (one << i) else 0
def as_bytes(self):
if self.bit_of_byte != 0:
return self.data + chr(self.byte)
else:
return self.data
def append(self, bint bit):
cdef uint64_t one = 1
if bit:
self.byte |= one << self.bit_of_byte
else:
self.byte &= ~(one << self.bit_of_byte)
self.bit_of_byte += 1
if self.bit_of_byte == 8:
self.data += chr(self.byte)
self.byte = 0
self.bit_of_byte = 0
cdef int extend(self, uint64_t code, char n_bits) except -1:
cdef uint64_t one = 1
cdef unsigned char bit_of_code
for bit_of_code in range(n_bits):
if code & (one << bit_of_code):
self.byte |= one << self.bit_of_byte
else:
self.byte &= ~(one << self.bit_of_byte)
self.bit_of_byte += 1
if self.bit_of_byte == 8:
self.data += chr(self.byte)
self.byte = 0
self.bit_of_byte = 0
cdef class Serializer:
# Manage codecs, maintain consistent format for io
def __init__(self, Vocab vocab, data_dir):
model_dir = path.join(data_dir, 'bitter')
self.vocab = vocab # Vocab owns the word codec, the big one
#self.cfg = Config.read(model_dir, 'config')
self.codecs = tuple([CodecWrapper(attr) for attr in self.cfg.attrs])
def __call__(self, doc_or_bits):
if isinstance(doc_or_bits, Doc):
return self.serialize(doc_or_bits)
elif isinstance(doc_or_bits, BitArray):
return self.deserialize(doc_or_bits)
else:
raise ValueError(doc_or_bits)
def train(self, doc):
array = doc.to_array([codec.id for codec in self.codecs])
for i, codec in enumerate(self.codecs):
codec.count(array[i])
def serialize(self, doc):
bits = BitArray()
array = doc.to_array(self.attrs)
for i, codec in enumerate(self.codecs):
codec.encode(array[i,], bits)
return bits
@cython.boundscheck(False)
def deserialize(self, bits):
biterator = iter(bits)
cdef Doc doc = Doc(self.vocab)
ids = self.vocab.codec.decode(biterator)
cdef int id_
cdef bint is_spacy
for id_ in ids:
is_spacy = biterator.next()
doc.push_back(self.vocab.lexemes.at(id_), is_spacy)
cdef int length = doc.length
array = numpy.zeros(shape=(length, len(self.codecs)), dtype=numpy.int)
for i, codec in enumerate(self.codecs):
array[i] = codec.decode(biterator)
doc.from_array([c.id for c in self.codecs], array)
return doc
cdef class CodecWrapper:
"""Wrapper around HuffmanCodec"""
def __init__(self, freqs, id=0):
cdef uint64_t key
cdef uint64_t count
cdef pair[uint64_t, uint64_t] item
cdef priority_queue[pair[uint64_t, uint64_t]] items
for key, count in freqs:
item.first = count
item.second = key
items.push(item)
weights = [] #array('f')
keys = [] #array('i')
key_to_i = PreshMap()
i = 0
while not items.empty():
item = items.top()
weights.append(item.first)
keys.append(item.second)
key_to_i[item.second] = i
i += 1
items.pop()
def encode(self, symbols):
indices = [self.table[symbol] for symbol in symbols]
return self._codec.encode(indices)
def decode(self, bits):
indices = self._codec.decode(bits)
return [self.symbols[i] for i in indices]
cdef class HuffmanCodec:
"""Create a Huffman code table, and use it to pack and unpack sequences into
byte strings. Emphasis is on efficiency, so API is quite strict:
Messages will be encoded/decoded as indices that refer to the probability sequence.
For instance, the sequence [5, 10, 8] indicates the 5th most frequent item,
the 10th most frequent item, the 8th most frequent item. The codec will add
the EOL symbol to your message. An exception will be raised if you include
the EOL symbol in your message.
Arguments:
weights (float[:]): A descending-sorted sequence of probabilities/weights.
Must include a weight for an EOL symbol.
eol (uint32_t): The index of the weight of the EOL symbol.
"""
def __init__(self, float[:] weights, uint32_t eol):
self.codes.resize(len(weights))
for i in range(len(self.codes)):
self.codes[i].bits = 0
self.codes[i].length = 0
populate_nodes(self.nodes, weights)
cdef Code path
path.bits = 0
path.length = 0
assign_codes(self.nodes, self.codes, len(self.nodes) - 1, path)
def encode(self, uint32_t[:] sequence, BitArray bits=None):
if bits is None:
bits = BitArray()
for i in sequence:
bits.extend(self.codes[i].bits, self.codes[i].length)
bits.extend(self.codes[self.eol].bits, self.codes[self.eol].length)
return bits
def decode(self, bits):
node = self.nodes.back()
symbols = []
for bit in bits:
branch = node.right if bit else node.left
if branch >= 0:
node = self.nodes.at(branch)
else:
symbol = -(branch + 1)
if symbol == self.eol:
return symbols
else:
symbols.append(symbol)
node = self.nodes.back()
return symbols
property strings:
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
def __get__(self):
output = []
cdef int i, j
cdef bytes string
cdef Code code
for i in range(self.codes.size()):
code = self.codes[i]
string = b'{0:b}'.format(code.bits).rjust(code.length, '0')
string = string[::-1]
output.append(string)
return output
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
cdef int populate_nodes(vector[Node]& nodes, float[:] probs) except -1:
assert len(probs) >= 3
cdef int size = len(probs)
cdef int i = size - 1
cdef int j = 0
while i >= 0 or (j+1) < nodes.size():
if i < 0:
_cover_two_nodes(nodes, j)
j += 2
elif j >= nodes.size():
_cover_two_words(nodes, i, i-1, probs[i] + probs[i-1])
i -= 2
elif i >= 1 and (j == nodes.size() or probs[i-1] < nodes[j].prob):
_cover_two_words(nodes, i, i-1, probs[i] + probs[i-1])
i -= 2
elif (j+1) < nodes.size() and nodes[j+1].prob < probs[i]:
_cover_two_nodes(nodes, j)
j += 2
else:
_cover_one_word_one_node(nodes, j, i, probs[i])
i -= 1
j += 1
return 0
cdef int _cover_two_nodes(vector[Node]& nodes, int j) nogil:
"""Introduce a new non-terminal, over two non-terminals)"""
cdef Node node
node.left = j
node.right = j+1
node.prob = nodes[j].prob + nodes[j+1].prob
nodes.push_back(node)
cdef int _cover_one_word_one_node(vector[Node]& nodes, int j, int id_, float prob) nogil:
"""Introduce a new non-terminal, over one terminal and one non-terminal."""
cdef Node node
# Encode leaves as negative integers, where the integer is the index of the
# word in the vocabulary.
cdef int64_t leaf_id = - <int64_t>(id_ + 1)
cdef float new_prob = prob + nodes[j].prob
if prob < nodes[j].prob:
node.left = leaf_id
node.right = j
node.prob = new_prob
else:
node.left = j
node.right = leaf_id
node.prob = new_prob
nodes.push_back(node)
cdef int _cover_two_words(vector[Node]& nodes, int id1, int id2, float prob) nogil:
"""Introduce a new node, over two non-terminals."""
cdef Node node
node.left = -(id1+1)
node.right = -(id2+1)
node.prob = prob
nodes.push_back(node)
cdef int assign_codes(vector[Node]& nodes, vector[Code]& codes, int i, Code path) except -1:
"""Recursively assign paths, from the top down. At the end, the entry codes[i]
knows the bit-address of the node[j] that points to entry i in the vocabulary.
So, to encode i, we go to codes[i] and read its bit-string. To decode, we
navigate nodes recursively.
"""
cdef Code left_path = bit_append(path, 0)
cdef Code right_path = bit_append(path, 1)
# Assign down left branch
if nodes[i].left >= 0:
assign_codes(nodes, codes, nodes[i].left, left_path)
else:
# Leaf on left
id_ = -(nodes[i].left + 1)
codes[id_] = left_path
# Assign down right branch
if nodes[i].right >= 0:
assign_codes(nodes, codes, nodes[i].right, right_path)
else:
# Leaf on right
id_ = -(nodes[i].right + 1)
codes[id_] = right_path

21
spacy/serialize/bits.pxd Normal file
View File

@ -0,0 +1,21 @@
from libc.stdint cimport uint64_t
from libc.stdint cimport uint32_t
ctypedef unsigned char uchar
# A variable-length bit string: `length` bits stored in the low end of `bits`.
cdef struct Code:
    # Bit pattern; bit_append writes each new bit at position `length`.
    uint64_t bits
    # Number of bits of `bits` currently in use.
    char length
cdef Code bit_append(Code code, bint bit) nogil
# C-level declaration of the BitArray extension type (implemented in bits.pyx).
cdef class BitArray:
    # Bytes that have been completely filled.
    cdef bytes data
    # Byte currently being filled, one bit at a time.
    cdef uchar byte
    # Position within `byte` that the next written bit will occupy (0-7).
    cdef uchar bit_of_byte
    # Bit offset used by __iter__ as its starting position.
    cdef uint32_t i

    # Append the low `n_bits` bits of `code`, lowest bit first.
    cdef int extend(self, uint64_t code, char n_bits) except -1

69
spacy/serialize/bits.pyx Normal file
View File

@ -0,0 +1,69 @@
# Bits are written starting from the least significant position: the first
# appended bit lands in bit 0 of `code.bits`, the next in bit 1, and so on.
# Consumers that want the last-appended bit to be most significant (as a
# conventional Huffman code string would be printed) need to reverse the order.
cdef Code bit_append(Code code, bint bit) nogil:
    """Return a copy of `code` with `bit` written at position `code.length`.

    The struct is passed and returned by value, so the caller's `code` is not
    modified.
    """
    cdef uint64_t mask = 1
    mask = mask << code.length
    if not bit:
        code.bits &= ~mask
    else:
        code.bits |= mask
    code.length += 1
    return code
cdef class BitArray:
    """Growable sequence of bits, packed least-significant-bit first.

    Completed bytes accumulate in `data`. `byte` is the byte currently being
    filled and `bit_of_byte` is the number of bits already written into it.
    `i` is the bit offset at which iteration starts.
    """
    def __init__(self):
        self.data = b''
        self.byte = 0
        self.bit_of_byte = 0
        self.i = 0

    def __iter__(self):
        """Yield the stored bits as 0/1 ints, starting at bit offset `i`.

        Iteration covers the completed bytes in `data` and then the bits
        written so far into the pending byte.
        """
        cdef uchar byte, i
        cdef uchar one = 1
        cdef uint32_t n_bytes = len(self.data)
        cdef uint32_t start_byte = self.i // 8
        cdef uchar start_bit = self.i % 8
        if start_byte > n_bytes:
            # Offset lies beyond everything written so far: nothing to yield.
            return
        if start_byte < n_bytes:
            if start_bit != 0:
                # Resume part-way through a byte: yield the *remaining* bits
                # (start_bit..7), not the ones before the offset.
                byte = <uchar>self.data[start_byte]
                for i in range(start_bit, 8):
                    yield 1 if byte & (one << i) else 0
                start_byte += 1
            for byte in self.data[start_byte:]:
                for i in range(8):
                    yield 1 if byte & (one << i) else 0
            start_bit = 0
        # Bits of the pending, not-yet-complete byte. If the offset points into
        # this byte, start_bit is still non-zero here and is honoured.
        for i in range(start_bit, self.bit_of_byte):
            yield 1 if self.byte & (one << i) else 0

    def as_bytes(self):
        """Return the contents as bytes; a partial last byte is zero-padded."""
        if self.bit_of_byte != 0:
            # bytes(bytearray(...)) builds a one-byte string on Python 2 and 3;
            # chr() would give a unicode str on Python 3.
            return self.data + bytes(bytearray((self.byte,)))
        else:
            return self.data

    def append(self, bint bit):
        """Append a single bit."""
        self.extend(1 if bit else 0, 1)

    cdef int extend(self, uint64_t code, char n_bits) except -1:
        """Append the low `n_bits` bits of `code`, lowest bit first.

        Raises ValueError if `n_bits` is outside 0..64, since `code` only
        carries 64 bits. Returns 0 on success.
        """
        cdef uint64_t one = 1
        cdef unsigned char bit_of_code
        if n_bits < 0 or n_bits > 64:
            raise ValueError("n_bits must be between 0 and 64, got %d" % n_bits)
        for bit_of_code in range(n_bits):
            if code & (one << bit_of_code):
                self.byte |= one << self.bit_of_byte
            else:
                self.byte &= ~(one << self.bit_of_byte)
            self.bit_of_byte += 1
            if self.bit_of_byte == 8:
                # Byte is full: move it to `data` and start a fresh one.
                self.data += bytes(bytearray((self.byte,)))
                self.byte = 0
                self.bit_of_byte = 0
        return 0

View File

@ -4,7 +4,7 @@ from libc.stdint cimport int64_t
from libc.stdint cimport int32_t
from libc.stdint cimport uint64_t
from .vocab cimport Vocab
from .bits cimport Code
cdef struct Node:
@ -13,19 +13,6 @@ cdef struct Node:
int32_t right
cdef struct Code:
uint64_t bits
char length
cdef class Serializer:
cdef list codecs
cdef Vocab vocab
cdef class HuffmanCodec:
cdef vector[Node] nodes
cdef vector[Code] codes
cdef uint32_t eol
cdef int id

157
spacy/serialize/huffman.pyx Normal file
View File

@ -0,0 +1,157 @@
cimport cython
from .bits cimport bit_append
from .bits cimport BitArray
cdef class HuffmanCodec:
    """Create a Huffman code table, and use it to pack and unpack sequences
    of symbols to and from bits. Emphasis is on efficiency, so the API is
    quite strict:

    Messages are encoded/decoded as indices that refer to the weight sequence.
    For instance, the sequence [5, 10, 8] indicates the 5th most frequent item,
    the 10th most frequent item, the 8th most frequent item.

    No end-of-message symbol is written: the caller must know how many symbols
    to decode and pass a buffer of that length to `decode`.

    Arguments:
        weights (float[:]): A descending-sorted sequence of probabilities/weights,
            one per symbol. At least three are required.
    """
    def __init__(self, float[:] weights):
        cdef Code path
        cdef size_t i
        self.codes.resize(len(weights))
        for i in range(self.codes.size()):
            self.codes[i].bits = 0
            self.codes[i].length = 0
        populate_nodes(self.nodes, weights)
        path.bits = 0
        path.length = 0
        # The root is the last node created; assign codes from there down.
        assign_codes(self.nodes, self.codes, self.nodes.size() - 1, path)

    def encode(self, uint32_t[:] msg, BitArray into_bits):
        """Append the code of every symbol index in `msg` to `into_bits`.

        Raises IndexError if a symbol index has no entry in the code table.
        """
        cdef uint32_t i
        cdef uint32_t symbol
        cdef uint32_t n = msg.shape[0]
        cdef size_t n_codes = self.codes.size()
        for i in range(n):
            symbol = msg[i]
            # codes is a C++ vector: an unchecked out-of-range index would
            # read arbitrary memory rather than raise.
            if symbol >= n_codes:
                raise IndexError(
                    "Symbol index %d out of range for %d codes" % (symbol, n_codes))
            into_bits.extend(self.codes[symbol].bits, self.codes[symbol].length)

    def decode(self, bits, uint32_t[:] into_msg):
        """Read bits from the iterable `bits` until `into_msg` is full.

        Decoded symbol indices are written into `into_msg` in order. Nothing
        is consumed when `into_msg` is empty. Raises Exception if `bits` runs
        out before `len(into_msg)` symbols have been decoded.
        """
        cdef int i = 0
        cdef int n = len(into_msg)
        if n == 0:
            # Nothing requested; without this guard the first decoded symbol
            # would be written past the end of the buffer.
            return
        node = self.nodes.back()
        for bit in bits:
            branch = node.right if bit else node.left
            if branch >= 0:
                # Internal node: keep descending.
                node = self.nodes.at(branch)
            else:
                # Leaf: negative values encode -(symbol + 1).
                into_msg[i] = -(branch + 1)
                node = self.nodes.back()
                i += 1
                if i == n:
                    break
        else:
            raise Exception(
                "Bit stream exhausted after decoding %d of %d symbols" % (i, n))

    property strings:
        @cython.boundscheck(False)
        @cython.wraparound(False)
        @cython.nonecheck(False)
        def __get__(self):
            """List the code of each symbol as a string of '0'/'1' characters,
            in the order the bits are written."""
            output = []
            cdef int i
            cdef bytes string
            cdef Code code
            for i in range(self.codes.size()):
                code = self.codes[i]
                string = b'{0:b}'.format(code.bits).rjust(code.length, '0')
                # Bit 0 is written first, so reverse the binary literal.
                string = string[::-1]
                output.append(string)
            return output
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
cdef int populate_nodes(vector[Node]& nodes, float[:] probs) except -1:
    """Build the Huffman tree for `probs` by appending internal nodes to `nodes`.

    `i` indexes the next weight not yet placed under a node, walking from the
    end of `probs` towards the front. `j` indexes the next node in `nodes` not
    yet placed under a parent. Every pass through the loop appends exactly one
    node, covering two weights, two nodes, or one of each.

    Requires at least three weights (asserted). Returns 0.
    """
    assert len(probs) >= 3
    cdef int size = len(probs)
    cdef int i = size - 1
    cdef int j = 0
    # Continue while a weight is unplaced, or at least two nodes lack a parent.
    while i >= 0 or (j+1) < nodes.size():
        if i < 0:
            # All weights placed: join the next two parentless nodes.
            _cover_two_nodes(nodes, j)
            j += 2
        elif j >= nodes.size():
            # No parentless node available: join the next two weights.
            # NOTE(review): reads probs[i-1] without checking i >= 1; presumably
            # only reachable on the first pass, when i >= 2 -- confirm.
            _cover_two_words(nodes, i, i-1, probs[i] + probs[i-1])
            i -= 2
        elif i >= 1 and (j == nodes.size() or probs[i-1] < nodes[j].prob):
            # The weight after the current one is below the next node's
            # weight: join the two weights.
            _cover_two_words(nodes, i, i-1, probs[i] + probs[i-1])
            i -= 2
        elif (j+1) < nodes.size() and nodes[j+1].prob < probs[i]:
            # The node after the next one weighs less than the current weight:
            # join the two nodes.
            _cover_two_nodes(nodes, j)
            j += 2
        else:
            # Otherwise join the current weight with the next node.
            _cover_one_word_one_node(nodes, j, i, probs[i])
            i -= 1
            j += 1
    return 0
cdef int _cover_two_nodes(vector[Node]& nodes, int j) nogil:
    """Append a new internal node whose children are nodes j and j+1."""
    cdef Node parent
    parent.prob = nodes[j].prob + nodes[j+1].prob
    parent.left = j
    parent.right = j+1
    nodes.push_back(parent)
cdef int _cover_one_word_one_node(vector[Node]& nodes, int j, int id_, float prob) nogil:
    """Append a new internal node over leaf `id_` and existing node `j`.

    The child with the smaller weight goes on the left; on a tie the node is
    placed on the left and the leaf on the right.
    """
    cdef Node parent
    # A leaf is stored as a negative child value: entry k becomes -(k + 1),
    # which keeps it distinct from non-negative node indices.
    cdef int64_t leaf = - <int64_t>(id_ + 1)
    cdef float node_prob = nodes[j].prob
    parent.prob = prob + node_prob
    if prob < node_prob:
        parent.left = leaf
        parent.right = j
    else:
        parent.left = j
        parent.right = leaf
    nodes.push_back(parent)
cdef int _cover_two_words(vector[Node]& nodes, int id1, int id2, float prob) nogil:
    """Append a new internal node over two leaves, `id1` (left) and `id2` (right).

    `prob` is the combined weight of the two leaves. Leaves are stored as the
    negative values -(id + 1).
    """
    cdef Node parent
    parent.prob = prob
    parent.left = -(id1+1)
    parent.right = -(id2+1)
    nodes.push_back(parent)
cdef int assign_codes(vector[Node]& nodes, vector[Code]& codes, int i, Code path) except -1:
    """Walk the tree below node `i`, recording the bit path to every leaf.

    `path` is the code accumulated on the way down to node `i`. Taking the left
    child appends a 0 bit and taking the right child appends a 1 bit. When a
    child is a leaf (a negative value -(k + 1)), the extended path is stored in
    codes[k]; otherwise the walk recurses into that child.
    """
    cdef Code via_left = bit_append(path, 0)
    cdef Code via_right = bit_append(path, 1)
    # Left subtree first, then right, as the original traversal does.
    if nodes[i].left < 0:
        leaf = -(nodes[i].left + 1)
        codes[leaf] = via_left
    else:
        assign_codes(nodes, codes, nodes[i].left, via_left)
    if nodes[i].right < 0:
        leaf = -(nodes[i].right + 1)
        codes[leaf] = via_right
    else:
        assign_codes(nodes, codes, nodes[i].right, via_right)

View File

@ -0,0 +1,6 @@
from ..vocab cimport Vocab
# C-level attribute declarations for the Packer extension type (packer.pyx).
cdef class Packer:
    # One codec per packed attribute, in the same order as `attrs`.
    cdef tuple _codecs
    # Vocabulary used to build the word codec and to construct Docs.
    cdef Vocab vocab
    # Attribute ids handled by this packer. Packer.__init__ assigns this, and
    # a cdef class cannot take attributes that are not declared here.
    cdef list attrs

136
spacy/serialize/packer.pyx Normal file
View File

@ -0,0 +1,136 @@
from libc.stdint cimport uint32_t
from libc.stdint cimport uint64_t
from libc.math cimport exp as c_exp
from libcpp.queue cimport priority_queue
from libcpp.pair cimport pair
from cymem.cymem cimport Address, Pool
from preshed.maps cimport PreshMap
from ..attrs cimport ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
from ..tokens.doc cimport Doc
from ..vocab cimport Vocab
from ..typedefs cimport attr_t
from .bits cimport BitArray
from .huffman cimport HuffmanCodec
from os import path
import numpy
cimport cython
# Format
# - Total number of bytes in message (32 bit int) --- handled outside this
# - Number of words (32 bit int)
# - Words, huffman coded ~12 bits per word (no EOL symbol; the word count above delimits them)
# - Spaces 1 bit per word
# - Attributes:
# POS tag
# Head offset
# Dep label
# Entity IOB
# Entity tag
def make_vocab_codec(Vocab vocab):
    """Build a HuffmanCodec with one weight per vocabulary entry.

    The weight of entry k is exp(vocab.lexemes[k].prob).
    NOTE(review): HuffmanCodec documents its weights as descending-sorted;
    presumably the vocabulary is already ordered that way -- confirm.
    """
    cdef int n_entries = len(vocab)
    # Scratch buffer for the weights; it only has to live until the codec has
    # been constructed from it.
    cdef Address buf = Address(n_entries, sizeof(float))
    weights = <float*>buf.ptr
    cdef int k
    for k in range(n_entries):
        weights[k] = <float>c_exp(vocab.lexemes[k].prob)
    cdef float[:] weights_view = <float[:len(vocab)]>weights
    return HuffmanCodec(weights_view)
cdef class _BinaryCodec:
    """Codec for boolean attributes: one bit per value, no compression."""
    def encode(self, src, bits):
        """Append the truth value of each item of `src` to `bits`, one bit each."""
        cdef int i
        for i in range(len(src)):
            bits.append(src[i])

    def decode(self, dest, bits, n):
        """Read `n` bits from the iterator `bits` into dest[0:n].

        `bits` must be an iterator (for example iter(bit_array)), so that
        successive reads continue where the previous one stopped.
        """
        for i in range(n):
            # Builtin next() works on Python 2 and 3; the .next() method only
            # exists on Python 2 iterators.
            dest[i] = next(bits)
cdef class _AttributeCodec:
    """Huffman codec for one attribute, built from (value, count) pairs.

    Attribute values are ranked by descending count. The Huffman code is built
    over the ranks; `_keys` maps rank -> value and `_map` maps value -> rank + 1.
    """
    cdef Pool mem
    cdef attr_t* _keys
    cdef PreshMap _map
    cdef HuffmanCodec _codec

    def __init__(self, freqs):
        """`freqs` is a sized sequence of (attribute value, count) pairs."""
        cdef uint64_t key
        cdef uint64_t count
        cdef pair[uint64_t, uint64_t] item
        cdef priority_queue[pair[uint64_t, uint64_t]] items
        cdef int i = 0

        # Order by count first so the queue pops the most frequent value first.
        for key, count in freqs:
            item.first = count
            item.second = key
            items.push(item)
        weights = numpy.ndarray(shape=(len(freqs),), dtype=numpy.float32)
        # The pool must exist before it can allocate the key table.
        self.mem = Pool()
        self._keys = <attr_t*>self.mem.alloc(len(freqs), sizeof(attr_t))
        self._map = PreshMap()
        while not items.empty():
            item = items.top()
            weights[i] = item.first
            self._keys[i] = item.second
            # Store rank + 1: a stored value of 0 is believed to read back
            # from PreshMap as "missing", which would lose rank 0.
            # NOTE(review): confirm PreshMap accepts a key of 0 as well.
            self._map[self._keys[i]] = i + 1
            items.pop()
            i += 1
        self._codec = HuffmanCodec(weights)

    def encode(self, attr_t[:] msg, BitArray into_bits):
        """Append the codes for the attribute values in `msg` to `into_bits`.

        `msg` is left untouched. Raises KeyError for a value that was not
        present in the frequencies the codec was built from.
        """
        cdef int i
        cdef int n = msg.shape[0]
        # Ranks go into a separate uint32 buffer: that is the element type
        # HuffmanCodec.encode accepts, and it avoids overwriting the caller's
        # attribute values.
        ranks = numpy.ndarray(shape=(n,), dtype=numpy.uint32)
        for i in range(n):
            slot = self._map[msg[i]]
            if slot is None:
                raise KeyError(msg[i])
            ranks[i] = slot - 1
        self._codec.encode(ranks, into_bits)

    def decode(self, bits, attr_t[:] into_msg):
        """Decode len(into_msg) attribute values from `bits` into `into_msg`.

        `bits` may be any iterable of bits. Pass an iterator to continue from
        the position an earlier read stopped at.
        """
        cdef int i
        cdef int n = into_msg.shape[0]
        ranks = numpy.zeros(shape=(n,), dtype=numpy.uint32)
        self._codec.decode(bits, ranks)
        for i in range(n):
            into_msg[i] = self._keys[ranks[i]]
cdef class Packer:
    """Pack a Doc's attribute array into a BitArray, and unpack it again.

    Bit layout: a 32-bit word count (lowest bit first), followed by one
    encoded column per attribute, in the order of `attrs`.

    NOTE(review): `attrs` is assigned here but is not among the attributes the
    visible packer.pxd declares for this class -- confirm it is declared.
    """
    def __init__(self, Vocab vocab, list_of_attr_freqs):
        """`list_of_attr_freqs` is a sequence of (attribute id, freqs) pairs.

        ID gets the vocabulary Huffman codec, SPACY gets one bit per token,
        and every other attribute gets a Huffman codec built from its `freqs`.
        """
        self.vocab = vocab
        codecs = []
        attrs = []
        for attr, freqs in list_of_attr_freqs:
            if attr == ID:
                codecs.append(make_vocab_codec(vocab))
            elif attr == SPACY:
                codecs.append(_BinaryCodec())
            else:
                codecs.append(_AttributeCodec(freqs))
            # Every attribute is recorded so attrs[k] lines up with codecs[k].
            attrs.append(attr)
        self.attrs = attrs
        self._codecs = tuple(codecs)

    def __call__(self, msg_or_bits):
        """Unpack a BitArray into a Doc, or pack anything else via to_array()."""
        if isinstance(msg_or_bits, BitArray):
            bits = msg_or_bits
            # from_array is an instance method, so it needs a Doc to act on.
            # NOTE(review): from_array writes into existing tokens; presumably
            # the Doc must first be populated with `length` tokens -- confirm.
            doc = Doc(self.vocab)
            doc.from_array(self.attrs, self.deserialize(bits))
            return doc
        else:
            msg = msg_or_bits
            return self.serialize(msg.to_array(self.attrs))

    def serialize(self, array):
        """Encode `array` (one row per token, one column per attribute)."""
        cdef BitArray bits = BitArray()
        cdef uint32_t length = len(array)
        bits.extend(length, 32)
        for col, codec in enumerate(self._codecs):
            # Each codec handles one attribute, i.e. one column (not one row).
            codec.encode(array[:, col], bits)
        return bits

    def deserialize(self, bits):
        """Decode `bits` into an array of shape (n_tokens, n_attrs).

        Raises ValueError if `bits` is too short to hold the 32-bit word count.
        """
        cdef uint32_t length = 0
        cdef uint32_t one = 1
        cdef int k
        # A single iterator is shared by all reads so each codec continues
        # where the previous one stopped.
        biterator = iter(bits)
        try:
            for k in range(32):
                # serialize() writes the count lowest bit first.
                if next(biterator):
                    length |= one << k
        except StopIteration:
            raise ValueError("Bit stream too short to contain a length header")
        array = numpy.ndarray(shape=(length, len(self._codecs)), dtype=int)
        for col, codec in enumerate(self._codecs):
            # NOTE(review): the Huffman codecs take typed buffers; the column
            # dtype must match what each decode() declares -- confirm.
            if isinstance(codec, _BinaryCodec):
                codec.decode(array[:, col], biterator, length)
            else:
                # NOTE(review): requires decode() to accept a plain bit
                # iterator as its first argument -- confirm for every codec.
                codec.decode(biterator, array[:, col])
        return array

View File

@ -4,7 +4,6 @@ from libc.string cimport memcpy, memset
import numpy
from ..lexeme cimport EMPTY_LEXEME
from ..serialize import BitArray
from ..strings cimport slice_unicode
from ..typedefs cimport attr_t, flags_t
from ..attrs cimport attr_id_t
@ -371,10 +370,12 @@ cdef class Doc:
return self[start]
def from_array(self, attrs, array):
cdef int i
cdef int i, col
cdef attr_id_t attr_id
cdef TokenC* tokens = self.data
for attr_id in attrs:
cdef int length = len(array)
for col, attr_id in enumerate(attrs):
values = array[:, col]
if attr_id == HEAD:
for i in range(length):
tokens[i].head = values[i]

View File

@ -35,5 +35,3 @@ cdef class Vocab:
cdef PreshMap _map
cdef readonly int repvec_length
cdef public object _codec

View File

@ -1,7 +1,6 @@
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from libc.string cimport memset
from libc.stdint cimport int32_t
from libc.math cimport exp as c_exp
import bz2
from os import path
@ -15,7 +14,6 @@ from .strings cimport slice_unicode
from .strings cimport hash_string
from .orth cimport word_shape
from .typedefs cimport attr_t
from .serialize cimport HuffmanCodec
from cymem.cymem cimport Address
@ -227,22 +225,6 @@ cdef class Vocab:
lex.repvec = EMPTY_VEC
return vec_len
property codec:
def __get__(self):
cdef Address mem
cdef int i
cdef float[:] cv_probs
if self._codec is not None:
return self._codec
else:
mem = Address(len(self), sizeof(float))
probs = <float*>mem.ptr
for i in range(len(self)):
probs[i] = <float>c_exp(self.lexemes[i].prob)
cv_probs = <float[:len(self)]>probs
self._codec = HuffmanCodec(cv_probs, 0)
return self._codec
def write_binary_vectors(in_loc, out_loc):
cdef _CFile out_file = _CFile(out_loc, 'wb')

View File

@ -3,14 +3,15 @@ from __future__ import division
import pytest
from spacy.serialize import HuffmanCodec
from spacy.serialize.huffman import HuffmanCodec
from spacy.serialize.bits import BitArray
import numpy
from heapq import heappush, heappop, heapify
from collections import defaultdict
class Vocab(object):
class MockPacker(object):
def __init__(self, freqs):
freqs['-eol-'] = 5
total = sum(freqs.values())
@ -19,15 +20,19 @@ class Vocab(object):
self.symbols = [sym for sym, freq in by_freq]
self.probs = numpy.array([item[1] / total for item in by_freq], dtype=numpy.float32)
self.table = {sym: i for i, sym in enumerate(self.symbols)}
self.codec = HuffmanCodec(self.probs, self.table['-eol-'])
self.codec = HuffmanCodec(self.probs)
def pack(self, message):
seq = [self.table[sym] for sym in message]
return self.codec.encode(numpy.array(seq, dtype=numpy.uint32))
msg = numpy.array(seq, dtype=numpy.uint32)
bits = BitArray()
self.codec.encode(msg, bits)
return bits
def unpack(self, packed):
ids = self.codec.decode(packed)
return [self.symbols[i] for i in ids]
def unpack(self, bits, n):
msg = numpy.array(range(n), dtype=numpy.uint32)
self.codec.decode(bits, msg)
return [self.symbols[i] for i in msg]
def py_encode(symb2freq):
@ -60,7 +65,7 @@ def test1():
probs[8] = 0.0001
probs[9] = 0.000001
codec = HuffmanCodec(probs, 9)
codec = HuffmanCodec(probs)
py_codes = py_encode(dict(enumerate(probs)))
py_codes = py_codes.items()
@ -71,19 +76,19 @@ def test1():
def test_round_trip():
freqs = {'the': 10, 'quick': 3, 'brown': 4, 'fox': 1, 'jumped': 5, 'over': 8,
'lazy': 1, 'dog': 2, '.': 9}
vocab = Vocab(freqs)
packer = MockPacker(freqs)
message = ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the',
'the', 'lazy', 'dog', '.']
strings = list(vocab.codec.strings)
codes = {vocab.symbols[i]: strings[i] for i in range(len(vocab.symbols))}
packed = vocab.pack(message)
string = b''.join(b'{0:b}'.format(ord(c)).rjust(8, b'0')[::-1] for c in packed.as_bytes())
strings = list(packer.codec.strings)
codes = {packer.symbols[i]: strings[i] for i in range(len(packer.symbols))}
bits = packer.pack(message)
string = b''.join(b'{0:b}'.format(ord(c)).rjust(8, b'0')[::-1] for c in bits.as_bytes())
for word in message:
code = codes[word]
assert string[:len(code)] == code
string = string[len(code):]
unpacked = vocab.unpack(packed)
unpacked = packer.unpack(bits, len(message))
assert message == unpacked
@ -92,13 +97,12 @@ def test_rosetta():
symb2freq = defaultdict(int)
for ch in txt:
symb2freq[ch] += 1
symb2freq['-eol-'] = 1
by_freq = symb2freq.items()
by_freq.sort(reverse=True, key=lambda item: item[1])
symbols = [sym for sym, prob in by_freq]
probs = numpy.array([prob for sym, prob in by_freq], dtype=numpy.float32)
codec = HuffmanCodec(probs, symbols.index('-eol-'))
codec = HuffmanCodec(probs)
py_codec = py_encode(symb2freq)
my_lengths = defaultdict(int)
@ -112,6 +116,7 @@ def test_rosetta():
assert my_exp_len == py_exp_len
"""
def test_vocab(EN):
codec = EN.vocab.codec
expected_length = 0
@ -137,3 +142,4 @@ def test_freqs():
for i, code in enumerate(codec.strings):
expected_length += len(code) * freqs[i]
assert 8 < expected_length < 14
"""