* Major refactor of serialization. Nearly complete now.

2025-11-04 01:48:04 +03:00 · 2015-07-17 01:19:29 +02:00 · 2015-07-17 01:19:29 +02:00 · db9dfd2e23
commit db9dfd2e23
parent c8282f9934
13 changed files with 423 additions and 391 deletions
--- a/setup.py
+++ b/setup.py
@ -94,6 +94,7 @@ def cython_setup(mod_names, language, includes, compile_args, link_args):
                                   "data/vocab/lexemes.bin",
                                   "data/vocab/strings.txt"],
                      "spacy.tokens": ["*.pxd"],
                      "spacy.serialize": ["*.pxd"],
                      "spacy.syntax": ["*.pxd"]},
        ext_modules=exts,
        cmdclass={'build_ext': Cython.Distutils.build_ext},
@ -158,8 +159,9 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
             'spacy.syntax.transition_system',
             'spacy.syntax.arc_eager',
             'spacy.syntax._parse_features',
-             'spacy.gold', 'spacy.orth', 'spacy.serialize',
+             'spacy.gold', 'spacy.orth',
             'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
             'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
             'spacy.syntax.ner']
--- a/spacy/en/init.py
+++ b/spacy/en/init.py
@ -70,6 +70,7 @@ class English(object):
      Tagger=EnPosTagger,
      Parser=ParserFactory(ParserTransitionSystem),
      Entity=ParserFactory(EntityTransitionSystem),
      Packer=None,
      load_vectors=True
    ):
@ -101,10 +102,10 @@ class English(object):
            self.entity = Entity(self.vocab.strings, path.join(data_dir, 'ner'))
        else:
            self.entity = None
-        if Serializer:
+        if Packer:
-            self.bitter = Serializer(self.vocab, data_dir)
+            self.packer = Packer(self.vocab, data_dir)
        else:
-            self.bitter = None
+            self.packer = None
        self.mwe_merger = RegexMerger([
            ('IN', 'O', regexes.MW_PREPOSITIONS_RE),
            ('CD', 'TIME', regexes.TIME_RE),
--- a/spacy/serialize.pyx
+++ b/spacy/serialize.pyx
@ -1,334 +0,0 @@
 from libcpp.vector cimport vector
 from libc.stdint cimport uint32_t
 from libc.stdint cimport int64_t
 from libc.stdint cimport int32_t
 from libc.stdint cimport uint64_t
 from libcpp.queue cimport priority_queue
 from libcpp.pair cimport pair
 from preshed.maps cimport PreshMap
 from murmurhash.mrmr cimport hash64
 from .tokens.doc cimport Doc
 from .vocab cimport Vocab
 from os import path
 import numpy
 cimport cython
 ctypedef unsigned char uchar
 # Format
 # - Total number of bytes in message (32 bit int)
 # - Words, terminating in an EOL symbol, huffman coded ~12 bits per word
 # - Spaces ~1 bit per word
 # - Parse: Huffman coded head offset / dep label / POS tag / entity IOB tag
 #          combo. ? bits per word. 40 * 80 * 40 * 12 = 1.5m symbol vocab
 # Note that we're setting the most significant bits here first, when in practice
 # we're actually wanting the last bit to be most significant (for Huffman coding,
 # anyway).
 cdef Code bit_append(Code code, bint bit) nogil:
    """Return `code` extended by one bit.

    The new bit is written at position `code.length` of `code.bits` and the
    length grows by one. `code` is a struct received by value, so the caller's
    copy is not modified; use the returned value.
    """
    cdef uint64_t mask = 1
    mask <<= code.length
    if bit:
        code.bits |= mask
    else:
        code.bits &= ~mask
    code.length += 1
    return code
 cdef class BitArray:
    """Growable sequence of bits, packed eight to a byte, lowest bit first.

    Completed bytes accumulate in `data`; the byte currently being filled is
    kept in `byte`, with `bit_of_byte` counting how many of its bits are set.
    """
    cdef bytes data
    cdef unsigned char byte
    cdef unsigned char bit_of_byte
    cdef uint32_t i
    def __init__(self):
        self.data = b''
        self.byte = 0
        self.bit_of_byte = 0
        self.i = 0
    def __iter__(self):
        """Yield the stored bits as 0/1 integers, completed bytes first and
        then the bits of the partially filled byte."""
        cdef uchar current, pos
        cdef uchar one = 1
        cdef uint32_t partial = self.i % 8
        first_whole = self.i // 8
        if partial != 0:
            for pos in range(partial):
                yield 1 if (self.data[first_whole] & (one << pos)) else 0
            first_whole += 1
        for current in self.data[first_whole:]:
            for pos in range(8):
                yield 1 if current & (one << pos) else 0
        for pos in range(self.bit_of_byte):
            yield 1 if self.byte & (one << pos) else 0
    def as_bytes(self):
        """Return the packed bits, including the partially filled byte if it
        holds any bits."""
        if self.bit_of_byte == 0:
            return self.data
        return self.data + chr(self.byte)
    def append(self, bint bit):
        """Add a single bit to the end of the array."""
        # A one-bit code goes through exactly the same steps as extend().
        self.extend(bit, 1)
    cdef int extend(self, uint64_t code, char n_bits) except -1:
        """Add the lowest `n_bits` bits of `code`, least significant first."""
        cdef uint64_t one = 1
        cdef unsigned char src_bit
        for src_bit in range(n_bits):
            if (code >> src_bit) & one:
                self.byte |= one << self.bit_of_byte
            else:
                self.byte &= ~(one << self.bit_of_byte)
            self.bit_of_byte += 1
            if self.bit_of_byte == 8:
                # The working byte is full: flush it and start a new one.
                self.data += chr(self.byte)
                self.byte = 0
                self.bit_of_byte = 0
 cdef class Serializer:
    """Convert between Doc objects and BitArray messages using one codec per
    attribute.

    Calling the instance dispatches on the argument type: a Doc is serialized,
    a BitArray is deserialized, anything else raises ValueError.
    """
    # Manage codecs, maintain consistent format for io
    def __init__(self, Vocab vocab, data_dir):
        """Store the vocabulary and build one CodecWrapper per configured
        attribute."""
        # NOTE(review): model_dir is computed but not used below.
        model_dir = path.join(data_dir, 'bitter')
        self.vocab = vocab # Vocab owns the word codec, the big one
        # NOTE(review): the assignment of self.cfg is commented out, yet the
        # next statement reads self.cfg.attrs -- confirm where cfg comes from.
        #self.cfg = Config.read(model_dir, 'config')
        # NOTE(review): CodecWrapper.__init__ names its first parameter
        # `freqs`; here it receives an attribute -- confirm.
        self.codecs = tuple([CodecWrapper(attr) for attr in self.cfg.attrs])
    def __call__(self, doc_or_bits):
        """Serialize a Doc or deserialize a BitArray; raise ValueError for any
        other argument."""
        if isinstance(doc_or_bits, Doc):
            return self.serialize(doc_or_bits)
        elif isinstance(doc_or_bits, BitArray):
            return self.deserialize(doc_or_bits)
        else:
            raise ValueError(doc_or_bits)
    def train(self, doc):
        """Feed the attribute values of `doc` to each codec's `count`."""
        # NOTE(review): relies on codecs exposing `id` and `count`; the
        # CodecWrapper defined in this file shows neither -- confirm.
        array = doc.to_array([codec.id for codec in self.codecs])
        for i, codec in enumerate(self.codecs):
            codec.count(array[i]) 
    def serialize(self, doc):
        """Encode `doc` with every codec in turn into a new BitArray and
        return it."""
        bits = BitArray()
        # NOTE(review): self.attrs is not assigned anywhere in this class.
        array = doc.to_array(self.attrs)
        for i, codec in enumerate(self.codecs):
            # NOTE(review): CodecWrapper.encode in this file takes a single
            # argument; two are passed here -- confirm.
            codec.encode(array[i,], bits)
        return bits
    @cython.boundscheck(False)
    def deserialize(self, bits):
        """Rebuild a Doc from `bits`.

        Word ids are decoded first with the vocabulary's codec; one bit is
        then read per word for the trailing-space flag; finally each attribute
        codec decodes its values and they are applied with Doc.from_array.
        """
        biterator = iter(bits)
        cdef Doc doc = Doc(self.vocab)
        ids = self.vocab.codec.decode(biterator)
        cdef int id_
        cdef bint is_spacy
        for id_ in ids:
            is_spacy = biterator.next()
            doc.push_back(self.vocab.lexemes.at(id_), is_spacy)
        cdef int length = doc.length
        array = numpy.zeros(shape=(length, len(self.codecs)), dtype=numpy.int)
        for i, codec in enumerate(self.codecs):
            # NOTE(review): array has one row per token, but it is indexed here
            # by codec number -- looks like a column was intended; confirm.
            array[i] = codec.decode(biterator)
        doc.from_array([c.id for c in self.codecs], array)
        return doc
 cdef class CodecWrapper:
    """Wrapper around HuffmanCodec"""
    def __init__(self, freqs, id=0):
        """Order the (key, count) pairs of `freqs` by descending count.

        NOTE(review): the ordered `weights`, `keys` and `key_to_i` are local
        variables and nothing is stored on self, while encode()/decode() read
        self.table, self._codec and self.symbols -- this looks unfinished.
        The `id` argument is not used.
        """
        cdef uint64_t key
        cdef uint64_t count
        cdef pair[uint64_t, uint64_t] item
        cdef priority_queue[pair[uint64_t, uint64_t]] items
        # Pairs are pushed as (count, key) so the queue orders them by count.
        for key, count in freqs:
            item.first = count
            item.second = key
            items.push(item)
        weights = [] #array('f')
        keys = [] #array('i')
        key_to_i = PreshMap()
        i = 0
        # Pop from the top of the queue, recording each key's rank in key_to_i.
        while not items.empty():
            item = items.top()
            weights.append(item.first)
            keys.append(item.second)
            key_to_i[item.second] = i
            i += 1
            items.pop()
    def encode(self, symbols):
        """Map `symbols` to indices through self.table and encode them with
        the wrapped codec."""
        indices = [self.table[symbol] for symbol in symbols]
        return self._codec.encode(indices)
    def decode(self, bits):
        """Decode `bits` with the wrapped codec and map the resulting indices
        back through self.symbols."""
        indices = self._codec.decode(bits)
        return [self.symbols[i] for i in indices]
 cdef class HuffmanCodec:
    """Create a Huffman code table, and use it to pack and unpack sequences into
    byte strings. Emphasis is on efficiency, so API is quite strict:
    Messages will be encoded/decoded as indices that refer to the probability sequence.
    For instance, the sequence [5, 10, 8] indicates the 5th most frequent item,
    the 10th most frequent item, the 8th most frequent item.  The codec will add
    the EOL symbol to your message, and decoding stops at the first EOL symbol.
    NOTE(review): encode() does not reject an EOL symbol inside the message;
    such a message would be truncated when decoded.
    Arguments:
        weights (float[:]): A descending-sorted sequence of probabilities/weights.
          Must include a weight for an EOL symbol.
        eol (uint32_t): The index of the weight of the EOL symbol.
    """
    def __init__(self, float[:] weights, uint32_t eol):
        cdef Code path
        # Remember the terminator: encode() appends it and decode() stops on
        # it. Without this assignment the attribute keeps its zero default,
        # regardless of the `eol` argument.
        self.eol = eol
        self.codes.resize(len(weights))
        for i in range(self.codes.size()):
            self.codes[i].bits = 0
            self.codes[i].length = 0
        populate_nodes(self.nodes, weights)
        # The node pushed last is the root; start from it with an empty path.
        path.bits = 0
        path.length = 0
        assign_codes(self.nodes, self.codes, <int>self.nodes.size() - 1, path)
    def encode(self, uint32_t[:] sequence, BitArray bits=None):
        """Append the codes for `sequence`, followed by the EOL code, to
        `bits` (a new BitArray when none is given) and return it."""
        cdef Py_ssize_t pos
        cdef uint32_t symbol
        if bits is None:
            bits = BitArray()
        for pos in range(sequence.shape[0]):
            symbol = sequence[pos]
            bits.extend(self.codes[symbol].bits, self.codes[symbol].length)
        bits.extend(self.codes[self.eol].bits, self.codes[self.eol].length)
        return bits
    def decode(self, bits):
        """Walk the tree bit by bit and return the decoded symbol indices.

        Returns as soon as the EOL symbol is decoded; if `bits` runs out
        first, whatever has been decoded so far is returned.
        """
        node = self.nodes.back()
        symbols = []
        for bit in bits:
            branch = node.right if bit else node.left
            if branch >= 0:
                node = self.nodes.at(branch)
            else:
                # Negative branches are leaves: -(index + 1).
                symbol = -(branch + 1)
                if symbol == self.eol:
                    return symbols
                else:
                    symbols.append(symbol)
                node = self.nodes.back()
        return symbols
    property strings:
        @cython.boundscheck(False)
        @cython.wraparound(False)
        @cython.nonecheck(False)
        def __get__(self):
            """List the code of every symbol as a string of '0'/'1', in the
            order the bits are emitted."""
            output = []
            cdef int i
            cdef bytes string
            cdef Code code
            for i in range(self.codes.size()):
                code = self.codes[i]
                string = b'{0:b}'.format(code.bits).rjust(code.length, '0')
                # Bit 0 is emitted first, so reverse the binary literal.
                string = string[::-1]
                output.append(string)
            return output
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
 cdef int populate_nodes(vector[Node]& nodes, float[:] probs) except -1:
    """Append the internal nodes of the code tree for `probs` to `nodes`.

    `i` walks the weights from the last index down to 0 and `j` walks the
    internal nodes already pushed onto `nodes`. Each pass through the loop
    pushes exactly one new node covering two not-yet-covered items: two
    weights, two existing nodes, or one of each. The node pushed last is the
    one HuffmanCodec uses as the root.

    At least three weights are required (asserted). Returns 0.
    NOTE(review): presumably relies on `probs` being sorted in descending
    order, as the HuffmanCodec docstring states; nothing here checks it.
    """
    assert len(probs) >= 3
    cdef int size = len(probs)
    cdef int i = size - 1
    cdef int j = 0
    while i >= 0 or (j+1) < nodes.size():
        if i < 0:
            # Every weight is covered: pair up the remaining nodes.
            _cover_two_nodes(nodes, j)
            j += 2
        elif j >= nodes.size():
            # No uncovered node is available: pair the last two weights.
            _cover_two_words(nodes, i, i-1, probs[i] + probs[i-1])
            i -= 2
        elif i >= 1 and (j == nodes.size() or probs[i-1] < nodes[j].prob):
            # The next weight is lighter than node j: pair two weights.
            _cover_two_words(nodes, i, i-1, probs[i] + probs[i-1])
            i -= 2
        elif (j+1) < nodes.size() and nodes[j+1].prob < probs[i]:
            # Node j+1 is lighter than weight i: pair two nodes.
            _cover_two_nodes(nodes, j)
            j += 2
        else:
            # Otherwise pair weight i with node j.
            _cover_one_word_one_node(nodes, j, i, probs[i])
            i -= 1
            j += 1
    return 0
 cdef int _cover_two_nodes(vector[Node]& nodes, int j) nogil:
    """Push a new internal node whose children are the nodes at j and j+1."""
    cdef Node parent
    parent.prob = nodes[j].prob + nodes[j+1].prob
    parent.left = j
    parent.right = j+1
    nodes.push_back(parent)
 cdef int _cover_one_word_one_node(vector[Node]& nodes, int j, int id_, float prob) nogil:
    """Push a new internal node over the leaf `id_` and the existing node j.

    The lighter of the two goes on the left.
    """
    cdef Node parent
    # A leaf is stored as a negative number: -(index + 1), so that it can be
    # told apart from a node index.
    cdef int64_t leaf_ref = - <int64_t>(id_ + 1)
    cdef float combined = prob + nodes[j].prob
    parent.prob = combined
    if prob < nodes[j].prob:
        parent.left = leaf_ref
        parent.right = j
    else:
        parent.left = j
        parent.right = leaf_ref
    nodes.push_back(parent)
 cdef int _cover_two_words(vector[Node]& nodes, int id1, int id2, float prob) nogil:
    """Push a new internal node over the two leaves `id1` and `id2`.

    Leaves are stored as -(index + 1); `prob` is the weight of the new node.
    """
    cdef Node parent
    parent.prob = prob
    parent.left = -(id1+1)
    parent.right = -(id2+1)
    nodes.push_back(parent)
 cdef int assign_codes(vector[Node]& nodes, vector[Code]& codes, int i, Code path) except -1:
    """Walk the tree rooted at nodes[i] and record the bit path of every leaf.

    `path` is the code accumulated on the way down to nodes[i]. Going to the
    left child appends a 0 bit, going to the right child appends a 1 bit. A
    non-negative child is the index of another node and is visited
    recursively; a negative child is a leaf standing for entry -(child + 1),
    whose slot in `codes` receives the path that reaches it.
    """
    cdef int left_child = nodes[i].left
    cdef int right_child = nodes[i].right
    cdef Code path_to_left = bit_append(path, 0)
    cdef Code path_to_right = bit_append(path, 1)
    # Left subtree first, then right.
    if left_child < 0:
        codes[-(left_child + 1)] = path_to_left
    else:
        assign_codes(nodes, codes, left_child, path_to_left)
    if right_child < 0:
        codes[-(right_child + 1)] = path_to_right
    else:
        assign_codes(nodes, codes, right_child, path_to_right)
--- a/spacy/serialize/bits.pxd
+++ b/spacy/serialize/bits.pxd
@ -0,0 +1,21 @@
 from libc.stdint cimport uint64_t
 from libc.stdint cimport uint32_t
 ctypedef unsigned char uchar
 # A variable-length bit string: `length` bits are in use, stored in `bits`
 # starting from bit 0.
 cdef struct Code:
    uint64_t bits
    char length
 # Returns a copy of `code` with `bit` written at position `code.length`.
 cdef Code bit_append(Code code, bint bit) nogil
 # Growable bit buffer.
 cdef class BitArray:
    # Completed bytes.
    cdef bytes data
    # Byte currently being filled.
    cdef uchar byte
    # Number of bits already written into `byte` (0-7).
    cdef uchar bit_of_byte
    # Bit offset consulted by __iter__; set to 0 on construction.
    cdef uint32_t i
    # Appends the lowest `n_bits` bits of `code`, least significant first.
    cdef int extend(self, uint64_t code, char n_bits) except -1
--- a/spacy/serialize/bits.pyx
+++ b/spacy/serialize/bits.pyx
@ -0,0 +1,69 @@
 # Note that we're setting the most significant bits here first, when in practice
 # we're actually wanting the last bit to be most significant (for Huffman coding,
 # anyway).
 cdef Code bit_append(Code code, bint bit) nogil:
    """Return `code` extended by one bit.

    The new bit is written at position `code.length` of `code.bits` and the
    length grows by one. `code` is a struct received by value, so the caller's
    copy is not modified; use the returned value.
    """
    cdef uint64_t mask = 1
    mask <<= code.length
    if bit:
        code.bits |= mask
    else:
        code.bits &= ~mask
    code.length += 1
    return code
 cdef class BitArray:
    """Growable sequence of bits, packed eight to a byte, lowest bit first.

    Completed bytes accumulate in `data`; the byte currently being filled is
    kept in `byte`, with `bit_of_byte` counting how many of its bits are set.
    """
    def __init__(self):
        self.data = b''
        self.byte = 0
        self.bit_of_byte = 0
        self.i = 0
    def __iter__(self):
        """Yield the stored bits as 0/1 integers, completed bytes first and
        then the bits of the partially filled byte."""
        cdef uchar current, pos
        cdef uchar one = 1
        cdef uint32_t partial = self.i % 8
        first_whole = self.i // 8
        if partial != 0:
            for pos in range(partial):
                yield 1 if (self.data[first_whole] & (one << pos)) else 0
            first_whole += 1
        for current in self.data[first_whole:]:
            for pos in range(8):
                yield 1 if current & (one << pos) else 0
        for pos in range(self.bit_of_byte):
            yield 1 if self.byte & (one << pos) else 0
    def as_bytes(self):
        """Return the packed bits, including the partially filled byte if it
        holds any bits."""
        if self.bit_of_byte == 0:
            return self.data
        return self.data + chr(self.byte)
    def append(self, bint bit):
        """Add a single bit to the end of the array."""
        # A one-bit code goes through exactly the same steps as extend().
        self.extend(bit, 1)
    cdef int extend(self, uint64_t code, char n_bits) except -1:
        """Add the lowest `n_bits` bits of `code`, least significant first."""
        cdef uint64_t one = 1
        cdef unsigned char src_bit
        for src_bit in range(n_bits):
            if (code >> src_bit) & one:
                self.byte |= one << self.bit_of_byte
            else:
                self.byte &= ~(one << self.bit_of_byte)
            self.bit_of_byte += 1
            if self.bit_of_byte == 8:
                # The working byte is full: flush it and start a new one.
                self.data += chr(self.byte)
                self.byte = 0
                self.bit_of_byte = 0
--- a/spacy/serialize/huffman.pxd
+++ b/spacy/serialize/huffman.pxd
@ -4,7 +4,7 @@ from libc.stdint cimport int64_t
 from libc.stdint cimport int32_t
 from libc.stdint cimport uint64_t
-from .vocab cimport Vocab
+from .bits cimport Code
 cdef struct Node:
@ -13,19 +13,6 @@ cdef struct Node:
    int32_t right
 cdef struct Code:
    uint64_t bits
    char length
 cdef class Serializer:
    cdef list codecs
    cdef Vocab vocab
 cdef class HuffmanCodec:
    cdef vector[Node] nodes
    cdef vector[Code] codes
    cdef uint32_t eol
    cdef int id
--- a/spacy/serialize/huffman.pyx
+++ b/spacy/serialize/huffman.pyx
@ -0,0 +1,157 @@
 cimport cython
 from .bits cimport bit_append
 from .bits cimport BitArray
 cdef class HuffmanCodec:
    """Create a Huffman code table, and use it to pack and unpack sequences into
    byte strings. Emphasis is on efficiency, so API is quite strict:
    Messages will be encoded/decoded as indices that refer to the probability sequence.
    For instance, the sequence [5, 10, 8] indicates the 5th most frequent item,
    the 10th most frequent item, the 8th most frequent item.
    No terminator is written: the caller tells decode() how many symbols to
    read through the length of the output buffer.
    Arguments:
        weights (float[:]): A descending-sorted sequence of probabilities/weights.
    """
    def __init__(self, float[:] weights):
        cdef Code path
        self.codes.resize(len(weights))
        for i in range(self.codes.size()):
            self.codes[i].bits = 0
            self.codes[i].length = 0
        populate_nodes(self.nodes, weights)
        # The node pushed last is the root; start from it with an empty path.
        path.bits = 0
        path.length = 0
        assign_codes(self.nodes, self.codes, <int>self.nodes.size() - 1, path)
    def encode(self, uint32_t[:] msg, BitArray into_bits):
        """Append the code of every index in `msg` to `into_bits`.

        Raises IndexError if an index has no entry in the code table.
        """
        cdef uint32_t i
        cdef uint32_t n_codes = self.codes.size()
        for i in range(len(msg)):
            # Indexing the C++ vector is unchecked, so validate first.
            if msg[i] >= n_codes:
                raise IndexError(
                    "Symbol %d at position %d is outside the code table (size %d)"
                    % (msg[i], i, n_codes))
            into_bits.extend(self.codes[msg[i]].bits, self.codes[msg[i]].length)
    def decode(self, bits, uint32_t[:] into_msg):
        """Read bits from `bits` until `into_msg` is completely filled.

        `bits` is any iterable of truthy/falsy bit values. Raises Exception if
        it is exhausted before len(into_msg) symbols have been decoded.
        """
        cdef Node node
        cdef int branch
        cdef int i = 0
        cdef int n = len(into_msg)
        if n == 0:
            # Nothing requested: do not consume bits or write out of bounds.
            return
        node = self.nodes.back()
        for bit in bits:
            branch = node.right if bit else node.left
            if branch >= 0:
                node = self.nodes.at(branch)
            else:
                # Negative branches are leaves: -(index + 1).
                into_msg[i] = -(branch + 1)
                node = self.nodes.back()
                i += 1
                if i == n:
                    break
        else:
            raise Exception(
                "Ran out of bits after decoding %d of %d symbols" % (i, n))
    property strings:
        @cython.boundscheck(False)
        @cython.wraparound(False)
        @cython.nonecheck(False)
        def __get__(self):
            """List the code of every symbol as a string of '0'/'1', in the
            order the bits are emitted."""
            output = []
            cdef int i
            cdef bytes string
            cdef Code code
            for i in range(self.codes.size()):
                code = self.codes[i]
                string = b'{0:b}'.format(code.bits).rjust(code.length, '0')
                # Bit 0 is emitted first, so reverse the binary literal.
                string = string[::-1]
                output.append(string)
            return output
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
 cdef int populate_nodes(vector[Node]& nodes, float[:] probs) except -1:
    """Append the internal nodes of the code tree for `probs` to `nodes`.

    `i` walks the weights from the last index down to 0 and `j` walks the
    internal nodes already pushed onto `nodes`. Each pass through the loop
    pushes exactly one new node covering two not-yet-covered items: two
    weights, two existing nodes, or one of each. The node pushed last is the
    one HuffmanCodec uses as the root.

    At least three weights are required (asserted). Returns 0.
    NOTE(review): presumably relies on `probs` being sorted in descending
    order, as the HuffmanCodec docstring states; nothing here checks it.
    """
    assert len(probs) >= 3
    cdef int size = len(probs)
    cdef int i = size - 1
    cdef int j = 0
    while i >= 0 or (j+1) < nodes.size():
        if i < 0:
            # Every weight is covered: pair up the remaining nodes.
            _cover_two_nodes(nodes, j)
            j += 2
        elif j >= nodes.size():
            # No uncovered node is available: pair the last two weights.
            _cover_two_words(nodes, i, i-1, probs[i] + probs[i-1])
            i -= 2
        elif i >= 1 and (j == nodes.size() or probs[i-1] < nodes[j].prob):
            # The next weight is lighter than node j: pair two weights.
            _cover_two_words(nodes, i, i-1, probs[i] + probs[i-1])
            i -= 2
        elif (j+1) < nodes.size() and nodes[j+1].prob < probs[i]:
            # Node j+1 is lighter than weight i: pair two nodes.
            _cover_two_nodes(nodes, j)
            j += 2
        else:
            # Otherwise pair weight i with node j.
            _cover_one_word_one_node(nodes, j, i, probs[i])
            i -= 1
            j += 1
    return 0
 cdef int _cover_two_nodes(vector[Node]& nodes, int j) nogil:
    """Push a new internal node whose children are the nodes at j and j+1."""
    cdef Node parent
    parent.prob = nodes[j].prob + nodes[j+1].prob
    parent.left = j
    parent.right = j+1
    nodes.push_back(parent)
 cdef int _cover_one_word_one_node(vector[Node]& nodes, int j, int id_, float prob) nogil:
    """Push a new internal node over the leaf `id_` and the existing node j.

    The lighter of the two goes on the left.
    """
    cdef Node parent
    # A leaf is stored as a negative number: -(index + 1), so that it can be
    # told apart from a node index.
    cdef int64_t leaf_ref = - <int64_t>(id_ + 1)
    cdef float combined = prob + nodes[j].prob
    parent.prob = combined
    if prob < nodes[j].prob:
        parent.left = leaf_ref
        parent.right = j
    else:
        parent.left = j
        parent.right = leaf_ref
    nodes.push_back(parent)
 cdef int _cover_two_words(vector[Node]& nodes, int id1, int id2, float prob) nogil:
    """Push a new internal node over the two leaves `id1` and `id2`.

    Leaves are stored as -(index + 1); `prob` is the weight of the new node.
    """
    cdef Node parent
    parent.prob = prob
    parent.left = -(id1+1)
    parent.right = -(id2+1)
    nodes.push_back(parent)
 cdef int assign_codes(vector[Node]& nodes, vector[Code]& codes, int i, Code path) except -1:
    """Walk the tree rooted at nodes[i] and record the bit path of every leaf.

    `path` is the code accumulated on the way down to nodes[i]. Going to the
    left child appends a 0 bit, going to the right child appends a 1 bit. A
    non-negative child is the index of another node and is visited
    recursively; a negative child is a leaf standing for entry -(child + 1),
    whose slot in `codes` receives the path that reaches it.
    """
    cdef int left_child = nodes[i].left
    cdef int right_child = nodes[i].right
    cdef Code path_to_left = bit_append(path, 0)
    cdef Code path_to_right = bit_append(path, 1)
    # Left subtree first, then right.
    if left_child < 0:
        codes[-(left_child + 1)] = path_to_left
    else:
        assign_codes(nodes, codes, left_child, path_to_left)
    if right_child < 0:
        codes[-(right_child + 1)] = path_to_right
    else:
        assign_codes(nodes, codes, right_child, path_to_right)
--- a/spacy/serialize/packer.pxd
+++ b/spacy/serialize/packer.pxd
@ -0,0 +1,6 @@
 from ..vocab cimport Vocab
 # Declaration of the document packer. Every attribute that Packer assigns on
 # `self` has to be listed here, because a cdef class has no instance dict.
 cdef class Packer:
    # One codec per attribute, in the same order as `attrs`.
    cdef tuple _codecs
    # Attribute ids handled by the packer; assigned in Packer.__init__.
    cdef list attrs
    cdef Vocab vocab
--- a/spacy/serialize/packer.pyx
+++ b/spacy/serialize/packer.pyx
@ -0,0 +1,136 @@
 from libc.stdint cimport uint32_t
 from libc.stdint cimport uint64_t
 from libc.math cimport exp as c_exp
 from libcpp.queue cimport priority_queue
 from libcpp.pair cimport pair
 from cymem.cymem cimport Address, Pool
 from preshed.maps cimport PreshMap
 from ..attrs cimport ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
 from ..tokens.doc cimport Doc
 from ..vocab cimport Vocab
 from ..typedefs cimport attr_t
 from .bits cimport BitArray
 from .huffman cimport HuffmanCodec
 from os import path
 import numpy
 cimport cython
 # Format
 # - Total number of bytes in message (32 bit int) --- handled outside this
 # - Number of words (32 bit int)
 # - Words, terminating in an EOL symbol, huffman coded ~12 bits per word
 # - Spaces 1 bit per word
 # - Attributes:
 #       POS tag
 #       Head offset
 #       Dep label
 #       Entity IOB
 #       Entity tag
 def make_vocab_codec(Vocab vocab):
    """Build a HuffmanCodec with one weight per lexeme in `vocab`.

    The weight of lexeme i is exp(vocab.lexemes[i].prob).
    NOTE(review): presumably `prob` holds a log-probability -- confirm.
    """
    cdef int n_lexemes = len(vocab)
    # Scratch buffer for the weights; it is only needed while the codec is
    # being constructed.
    cdef Address scratch = Address(n_lexemes, sizeof(float))
    cdef float* weights = <float*>scratch.ptr
    cdef int idx
    for idx in range(n_lexemes):
        weights[idx] = <float>c_exp(vocab.lexemes[idx].prob)
    cdef float[:] weights_view = <float[:n_lexemes]>weights
    return HuffmanCodec(weights_view)
 cdef class _BinaryCodec:
    """Codec for boolean attributes: every value takes exactly one bit."""
    def encode(self, src, bits):
        """Append each element of `src` to `bits` as a single bit."""
        cdef int idx = 0
        cdef int n_values = len(src)
        while idx < n_values:
            bits.append(src[idx])
            idx += 1
    def decode(self, dest, bits, n):
        """Fill dest[0:n] with the next `n` values obtained from bits.next()."""
        for position in range(n):
            dest[position] = bits.next()
 cdef class _AttributeCodec:
    """Huffman codec for one attribute whose values are arbitrary integers.

    The values seen in `freqs` are ranked by descending count. `_map` takes a
    value to its rank, `_keys` takes a rank back to the value, and `_codec`
    encodes the ranks.
    """
    cdef Pool mem
    cdef attr_t* _keys
    cdef PreshMap _map
    cdef HuffmanCodec _codec
    def __init__(self, freqs):
        """Build the tables from `freqs`, an iterable of (value, count)
        pairs."""
        cdef uint64_t key
        cdef uint64_t count
        cdef pair[uint64_t, uint64_t] item
        cdef priority_queue[pair[uint64_t, uint64_t]] items
        cdef int n_keys
        cdef int i = 0
        # Pairs are pushed as (count, value) so the queue orders them by count.
        for key, count in freqs:
            item.first = count
            item.second = key
            items.push(item)
        n_keys = <int>items.size()
        # numpy.array() has no `shape` argument; allocate with ndarray.
        weights = numpy.ndarray(shape=(n_keys,), dtype=numpy.float32)
        # The pool has to exist before memory can be requested from it.
        self.mem = Pool()
        self._keys = <attr_t*>self.mem.alloc(n_keys, sizeof(attr_t))
        self._map = PreshMap()
        # Popping from the top of the queue gives descending counts, which is
        # the order HuffmanCodec expects.
        while not items.empty():
            item = items.top()
            weights[i] = item.first
            self._keys[i] = item.second
            # NOTE(review): confirm that PreshMap can hold 0 as a key or as a
            # stored value; rank 0 is stored for the most frequent value.
            self._map[self._keys[i]] = i
            i += 1
            items.pop()
        self._codec = HuffmanCodec(weights)
    def encode(self, attr_t[:] msg, BitArray into_bits):
        """Replace every value of `msg` by its rank and append the encoded
        ranks to `into_bits`.

        Note that `msg` is overwritten in place.
        """
        for i in range(len(msg)):
            msg[i] = self._map[msg[i]]
        self._codec.encode(msg, into_bits)
    def decode(self, BitArray bits, attr_t[:] into_msg):
        """Decode len(into_msg) ranks from `bits` and convert them back to
        attribute values in place."""
        cdef int i
        self._codec.decode(bits, into_msg)
        for i in range(len(into_msg)):
            into_msg[i] = self._keys[into_msg[i]]
 cdef class Packer:
    """Pack the attribute array of a Doc into a BitArray and unpack it again.

    One codec is kept per attribute: the vocabulary codec for ID, a one-bit
    codec for SPACY and a frequency-based Huffman codec for everything else.
    Attribute arrays hold one row per token and one column per attribute,
    which is the layout Doc.from_array reads.
    """
    def __init__(self, Vocab vocab, list_of_attr_freqs):
        """Create the codecs.

        `list_of_attr_freqs` is an iterable of (attribute id, frequencies)
        pairs; the frequencies are only used for attributes other than ID and
        SPACY.
        """
        self.vocab = vocab
        codecs = []
        # NOTE(review): `attrs` has to be declared on the class in packer.pxd
        # for this assignment to work on a cdef class.
        self.attrs = []
        for attr, freqs in list_of_attr_freqs:
            if attr == ID:
                codecs.append(make_vocab_codec(vocab))
            elif attr == SPACY:
                codecs.append(_BinaryCodec())
            else:
                codecs.append(_AttributeCodec(freqs))
            self.attrs.append(attr)
        self._codecs = tuple(codecs)
    def __call__(self, msg_or_bits):
        """Unpack a BitArray into a Doc, or pack anything else through its
        to_array() method."""
        if isinstance(msg_or_bits, BitArray):
            bits = msg_or_bits
            # NOTE(review): Doc.from_array is defined as an instance method
            # taking (attrs, array); this call passes the vocab first --
            # confirm the intended construction of the Doc.
            return Doc.from_array(self.vocab, self.attrs, self.deserialize(bits))
        else:
            msg = msg_or_bits
            return self.serialize(msg.to_array(self.attrs))
    def serialize(self, array):
        """Write the number of rows as a 32-bit integer, then each attribute
        column encoded with its codec, and return the BitArray."""
        cdef BitArray bits = BitArray()
        cdef uint32_t length = len(array)
        bits.extend(length, 32)
        for i, codec in enumerate(self._codecs):
            # Column i holds the values of attribute i for every token.
            codec.encode(array[:, i], bits)
        return bits
    def deserialize(self, bits):
        """Read the token count and decode one column per codec.

        NOTE(review): BitArray shows no read() method, and the decode()
        signatures of the codecs differ from the single-argument call made
        here -- this path still needs to be completed.
        """
        cdef uint32_t length = bits.read(32)
        array = numpy.ndarray(shape=(length, len(self._codecs)), dtype=numpy.int_)
        for i, codec in enumerate(self._codecs):
            array[:, i] = codec.decode(bits)
        return array
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -4,7 +4,6 @@ from libc.string cimport memcpy, memset
 import numpy
 from ..lexeme cimport EMPTY_LEXEME
 from ..serialize import BitArray
 from ..strings cimport slice_unicode
 from ..typedefs cimport attr_t, flags_t
 from ..attrs cimport attr_id_t
@ -371,10 +370,12 @@ cdef class Doc:
        return self[start]
    def from_array(self, attrs, array):
-        cdef int i
+        cdef int i, col
        cdef attr_id_t attr_id
        cdef TokenC* tokens = self.data
-        for attr_id in attrs:
+        cdef int length = len(array)
        for col, attr_id in enumerate(attrs): 
            values = array[:, col]
            if attr_id == HEAD:
                for i in range(length):
                    tokens[i].head = values[i]
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@ -35,5 +35,3 @@ cdef class Vocab:
    cdef PreshMap _map
    cdef readonly int repvec_length
    cdef public object _codec
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@ -1,7 +1,6 @@
 from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
 from libc.string cimport memset
 from libc.stdint cimport int32_t
 from libc.math cimport exp as c_exp
 import bz2
 from os import path
@ -15,7 +14,6 @@ from .strings cimport slice_unicode
 from .strings cimport hash_string
 from .orth cimport word_shape
 from .typedefs cimport attr_t
 from .serialize cimport HuffmanCodec
 from cymem.cymem cimport Address
@ -227,22 +225,6 @@ cdef class Vocab:
                lex.repvec = EMPTY_VEC
        return vec_len
    property codec:
        def __get__(self):
            cdef Address mem
            cdef int i
            cdef float[:] cv_probs
            if self._codec is not None:
                return self._codec
            else:
                mem = Address(len(self), sizeof(float))
                probs = <float*>mem.ptr
                for i in range(len(self)):
                    probs[i] = <float>c_exp(self.lexemes[i].prob)
                cv_probs = <float[:len(self)]>probs
                self._codec = HuffmanCodec(cv_probs, 0)
                return self._codec
 def write_binary_vectors(in_loc, out_loc):
    cdef _CFile out_file = _CFile(out_loc, 'wb')
--- a/tests/vocab/test_huffman.py
+++ b/tests/vocab/test_huffman.py
@ -3,14 +3,15 @@ from __future__ import division
 import pytest
-from spacy.serialize import HuffmanCodec
+from spacy.serialize.huffman import HuffmanCodec
 from spacy.serialize.bits import BitArray
 import numpy
 from heapq import heappush, heappop, heapify
 from collections import defaultdict
-class Vocab(object):
+class MockPacker(object):
    def __init__(self, freqs):
        freqs['-eol-'] = 5
        total = sum(freqs.values())
@ -19,15 +20,19 @@ class Vocab(object):
        self.symbols = [sym for sym, freq in by_freq]
        self.probs = numpy.array([item[1] / total for item in by_freq], dtype=numpy.float32)
        self.table = {sym: i for i, sym in enumerate(self.symbols)}
-        self.codec = HuffmanCodec(self.probs, self.table['-eol-'])
+        self.codec = HuffmanCodec(self.probs)
    def pack(self, message):
        seq = [self.table[sym] for sym in message]
-        return self.codec.encode(numpy.array(seq, dtype=numpy.uint32))
+        msg = numpy.array(seq, dtype=numpy.uint32)
        bits = BitArray()
        self.codec.encode(msg, bits)
        return bits
-    def unpack(self, packed):
+    def unpack(self, bits, n):
-        ids = self.codec.decode(packed)
+        msg = numpy.array(range(n), dtype=numpy.uint32)
-        return [self.symbols[i] for i in ids]
+        self.codec.decode(bits, msg)
        return [self.symbols[i] for i in msg]
 def py_encode(symb2freq):
@ -60,7 +65,7 @@ def test1():
    probs[8] = 0.0001
    probs[9] = 0.000001
-    codec = HuffmanCodec(probs, 9)
+    codec = HuffmanCodec(probs)
    py_codes = py_encode(dict(enumerate(probs)))
    py_codes = py_codes.items()
@ -71,19 +76,19 @@ def test1():
 def test_round_trip():
    freqs = {'the': 10, 'quick': 3, 'brown': 4, 'fox': 1, 'jumped': 5, 'over': 8,
            'lazy': 1, 'dog': 2, '.': 9}
-    vocab = Vocab(freqs)
+    packer = MockPacker(freqs)
    message = ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the',
                'the', 'lazy', 'dog', '.']
-    strings = list(vocab.codec.strings)
+    strings = list(packer.codec.strings)
-    codes = {vocab.symbols[i]: strings[i] for i in range(len(vocab.symbols))}
+    codes = {packer.symbols[i]: strings[i] for i in range(len(packer.symbols))}
-    packed = vocab.pack(message)
+    bits = packer.pack(message)
-    string = b''.join(b'{0:b}'.format(ord(c)).rjust(8, b'0')[::-1] for c in packed.as_bytes())
+    string = b''.join(b'{0:b}'.format(ord(c)).rjust(8, b'0')[::-1] for c in bits.as_bytes())
    for word in message:
        code = codes[word]
        assert string[:len(code)] == code
        string = string[len(code):]
-    unpacked = vocab.unpack(packed)
+    unpacked = packer.unpack(bits, len(message))
    assert message == unpacked
@ -92,13 +97,12 @@ def test_rosetta():
    symb2freq = defaultdict(int)
    for ch in txt:
        symb2freq[ch] += 1
    symb2freq['-eol-'] = 1
    by_freq = symb2freq.items()
    by_freq.sort(reverse=True, key=lambda item: item[1])
    symbols = [sym for sym, prob in by_freq]
    probs = numpy.array([prob for sym, prob in by_freq], dtype=numpy.float32)
-    codec = HuffmanCodec(probs, symbols.index('-eol-'))
+    codec = HuffmanCodec(probs)
    py_codec = py_encode(symb2freq)
    my_lengths = defaultdict(int)
@ -112,6 +116,7 @@ def test_rosetta():
    assert my_exp_len == py_exp_len
 """
 def test_vocab(EN):
    codec = EN.vocab.codec
    expected_length = 0
@ -137,3 +142,4 @@ def test_freqs():
    for i, code in enumerate(codec.strings):
        expected_length += len(code) * freqs[i]
    assert 8 < expected_length < 14
 """