diff --git a/bin/parser/train.py b/bin/parser/train.py
index a3903848b..d706f7747 100755
--- a/bin/parser/train.py
+++ b/bin/parser/train.py
@@ -141,11 +141,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
         print '%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
                                                    scorer.tags_acc,
                                                    scorer.token_acc)
-    nlp.parser.model.end_training()
-    nlp.entity.model.end_training()
-    nlp.tagger.model.end_training()
-    nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))
-
+    nlp.end_training()
 
 def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
              beam_width=None):
diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py
index 2ee5e4d84..240efd54e 100644
--- a/spacy/en/__init__.py
+++ b/spacy/en/__init__.py
@@ -1,6 +1,8 @@
 from __future__ import unicode_literals
 from os import path
 import re
+import struct
+import json
 
 from .. import orth
 from ..vocab import Vocab
@@ -8,6 +10,7 @@ from ..tokenizer import Tokenizer
 from ..syntax.arc_eager import ArcEager
 from ..syntax.ner import BiluoPushDown
 from ..syntax.parser import ParserFactory
+from ..serialize.bits import BitArray
 
 from ..tokens import Doc
 from ..multi_words import RegexMerger
@@ -19,6 +22,8 @@ from . import regexes
 
 from ..util import read_lang_data
 
+from ..attrs import TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
+
 
 def get_lex_props(string):
     return {
@@ -74,7 +79,7 @@ class English(object):
                  load_vectors=True
                  ):
 
-        self._data_dir = data_dir
+        self.data_dir = data_dir
 
         self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,
                            get_lex_props=get_lex_props, load_vectors=load_vectors,
@@ -140,6 +145,29 @@ class English(object):
             self.mwe_merger(tokens)
         return tokens
 
+    def end_training(self, data_dir=None):
+        if data_dir is None:
+            data_dir = self.data_dir
+        self.parser.model.end_training()
+        self.entity.model.end_training()
+        self.tagger.model.end_training()
+        self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt'))
+
+        with open(path.join(data_dir, 'pos', 'tag_freqs.json'), 'w') as file_:
+            json.dump(list(self.tagger.freqs[TAG].items()), file_)
+
+        with open(path.join(data_dir, 'deps', 'head_freqs.json'), 'w') as file_:
+            json.dump(list(self.parser.moves.freqs[HEAD].items()), file_)
+
+        with open(path.join(data_dir, 'deps', 'dep_freqs.json'), 'w') as file_:
+            json.dump(list(self.parser.moves.freqs[DEP].items()), file_)
+
+        with open(path.join(data_dir, 'ner', 'iob_freqs.json'), 'w') as file_:
+            json.dump(list(self.entity.moves.freqs[ENT_IOB].items()), file_)
+
+        with open(path.join(data_dir, 'ner', 'ne_freqs.json'), 'w') as file_:
+            json.dump(list(self.entity.moves.freqs[ENT_TYPE].items()), file_)
+
     @property
     def tags(self):
         """List of part-of-speech tag names."""
diff --git a/spacy/en/pos.pxd b/spacy/en/pos.pxd
index b59481020..2fc7b4ac7 100644
--- a/spacy/en/pos.pxd
+++ b/spacy/en/pos.pxd
@@ -1,4 +1,5 @@
 from preshed.maps cimport PreshMapArray
+from preshed.counter cimport PreshCounter
 from cymem.cymem cimport Pool
 
 from .._ml cimport Model
@@ -14,6 +15,7 @@ cdef class EnPosTagger:
     cdef readonly Model model
     cdef public object lemmatizer
     cdef PreshMapArray _morph_cache
+    cdef public dict freqs
 
     cdef PosTag* tags
     cdef readonly object tag_names
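
The frequency files written by end_training() are plain JSON: each one is a list of (value, count) pairs, where the value is a StringStore id (or a relative head offset, in the case of head_freqs.json). read_encoding_freqs() in spacy/util.py, further down in this diff, reads them back when the Packer is built. A minimal sketch of consuming one of these files (the path and counts here are illustrative, not taken from a real model):

    import json

    # e.g. [[5835, 124032], [5836, 40920], ...] -- (tag string-id, count) pairs
    with open('data/en/pos/tag_freqs.json') as file_:
        tag_freqs = json.load(file_)
    for string_id, count in tag_freqs:
        print string_id, count
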
diff --git a/spacy/en/pos.pyx b/spacy/en/pos.pyx
index db1679c28..3dab084a8 100644
--- a/spacy/en/pos.pyx
+++ b/spacy/en/pos.pyx
@@ -7,6 +7,7 @@ from libc.string cimport memset
 
 from cymem.cymem cimport Address
 from thinc.typedefs cimport atom_t, weight_t
+from collections import defaultdict
 
 from ..parts_of_speech cimport univ_pos_t
 from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
@@ -17,7 +18,7 @@ from ..tokens.doc cimport Doc
 from ..morphology cimport set_morph_from_dict
 from .._ml cimport arg_max
 
-from .attrs cimport IS_ALPHA, IS_PUNCT, LIKE_NUM, LIKE_URL
+from .attrs cimport TAG, IS_ALPHA, IS_PUNCT, LIKE_NUM, LIKE_URL
 from ..typedefs cimport attr_t
 
 from .lemmatizer import Lemmatizer
@@ -260,6 +261,7 @@ cdef class EnPosTagger:
             self.load_morph_exceptions(json.load(open(path.join(data_dir, 'tokenizer',
                                                  'morphs.json'))))
         self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)
+        self.freqs = {TAG: defaultdict(int)}
 
     def __call__(self, Doc tokens):
         """Apply the tagger, setting the POS tags onto the Doc object.
@@ -309,6 +311,7 @@ cdef class EnPosTagger:
             tokens.data[i].tag = self.strings[self.tag_names[guess]]
             self.set_morph(i, &self.tags[guess], tokens.data)
             correct += loss == 0
+            self.freqs[TAG][tokens.data[i].tag] += 1
         return correct
 
     cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1:
diff --git a/spacy/serialize/bits.pyx b/spacy/serialize/bits.pyx
index 340b18cd5..3b879b2ee 100644
--- a/spacy/serialize/bits.pyx
+++ b/spacy/serialize/bits.pyx
@@ -14,8 +14,8 @@ cdef Code bit_append(Code code, bint bit) nogil:
 
 
 cdef class BitArray:
-    def __init__(self):
-        self.data = b''
+    def __init__(self, data=b''):
+        self.data = data
         self.byte = 0
         self.bit_of_byte = 0
         self.i = 0
diff --git a/spacy/serialize/packer.pxd b/spacy/serialize/packer.pxd
index 02bcdf56e..d8bc96553 100644
--- a/spacy/serialize/packer.pxd
+++ b/spacy/serialize/packer.pxd
@@ -4,4 +4,5 @@ from ..vocab cimport Vocab
 cdef class Packer:
     cdef readonly tuple attrs
     cdef readonly tuple _codecs
+    cdef readonly object lex_codec
     cdef readonly Vocab vocab
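
The BitArray change is small but load-bearing: unpacking needs to wrap bytes that were read back from disk, so the constructor now accepts an initial buffer. A sketch of how the two directions use it elsewhere in this diff:

    from spacy.serialize.bits import BitArray

    bits = BitArray()             # packing starts from an empty array
    # ...Huffman codecs append codes to it during Packer.pack()...
    payload = bits.as_bytes()     # what Doc.to_bytes() writes after its header

    restored = BitArray(payload)  # Doc.from_bytes() wraps the stored bytes
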
diff --git a/spacy/serialize/packer.pyx b/spacy/serialize/packer.pyx
index 09f6de57a..8c7adb1d8 100644
--- a/spacy/serialize/packer.pyx
+++ b/spacy/serialize/packer.pyx
@@ -8,15 +8,17 @@ from libcpp.pair cimport pair
 from cymem.cymem cimport Address, Pool
 from preshed.maps cimport PreshMap
 
-from ..attrs cimport ORTH, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
+from ..attrs cimport ORTH, ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
 from ..tokens.doc cimport Doc
 from ..vocab cimport Vocab
+from ..structs cimport LexemeC
 from ..typedefs cimport attr_t
 from .bits cimport BitArray
 from .huffman cimport HuffmanCodec
 
 from os import path
 import numpy
+from .. import util
 
 cimport cython
 
@@ -67,8 +69,8 @@ cdef class _AttributeCodec:
             item.first = count
             item.second = key
             items.push(item)
-        weights = numpy.ndarray(shape=(len(freqs),), dtype=numpy.float32)
-        self._keys = <attr_t*>self.mem.alloc(len(freqs), sizeof(attr_t))
+        weights = numpy.ndarray(shape=(items.size(),), dtype=numpy.float32)
+        self._keys = <attr_t*>self.mem.alloc(items.size(), sizeof(attr_t))
         self._map = {}
         cdef int i = 0
         while not items.empty():
@@ -94,21 +96,33 @@ cdef class _AttributeCodec:
         dest[i] = self._keys[dest[i]]
 
 
-cdef class Packer:
-    def __init__(self, Vocab vocab, list_of_attr_freqs):
-        self.vocab = vocab
-        codecs = []
-        attrs = []
+def _gen_orths(Vocab vocab):
+    cdef attr_t orth
+    cdef size_t addr
+    for orth, addr in vocab._by_orth.items():
+        lex = <LexemeC*>addr
+        yield orth, c_exp(lex.prob)
 
-        for attr, freqs in list_of_attr_freqs:
-            if attr == SPACY:
-                codecs.append(_BinaryCodec())
-            else:
-                codecs.append(_AttributeCodec(freqs))
+
+cdef class Packer:
+    def __init__(self, Vocab vocab, attr_freqs):
+        self.vocab = vocab
+        self.lex_codec = _AttributeCodec(_gen_orths(vocab))
+
+        codecs = [_AttributeCodec(_gen_orths(vocab)), _BinaryCodec()]
+        attrs = [ORTH, SPACY]
+        for attr, freqs in sorted(attr_freqs):
+            if attr in (ORTH, ID, SPACY):
+                continue
+            codecs.append(_AttributeCodec(freqs))
             attrs.append(attr)
         self._codecs = tuple(codecs)
         self.attrs = tuple(attrs)
 
+    @classmethod
+    def from_dir(cls, Vocab vocab, data_dir):
+        return cls(vocab, util.read_encoding_freqs(data_dir))
+
     def pack(self, Doc doc):
         array = doc.to_array(self.attrs)
         cdef BitArray bits = BitArray()
@@ -124,6 +138,4 @@
         array = numpy.zeros(shape=(length, len(self._codecs)), dtype=numpy.int32)
         for i, codec in enumerate(self._codecs):
             codec.decode(bits, array[:, i])
-        doc = Doc.from_ids(self.vocab, array[:, 0], array[:, 1])
-        doc.from_array(self.attrs, array)
-        return doc
+        return array
diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index ebd8a0cac..0808fabf8 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -10,6 +10,7 @@ from .transition_system cimport do_func_t, get_cost_func_t
 from .transition_system cimport move_cost_func_t, label_cost_func_t
 from ..gold cimport GoldParse
 from ..gold cimport GoldParseC
+from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
 
 from libc.stdint cimport uint32_t
 from libc.string cimport memcpy
@@ -309,6 +310,9 @@ cdef class ArcEager(TransitionSystem):
                     label = 'ROOT'
                 gold.c.heads[i] = gold.heads[i]
                 gold.c.labels[i] = self.strings[label]
+                # Count frequencies, for use in encoder
+                self.freqs[HEAD][gold.c.heads[i] - i] += 1
+                self.freqs[DEP][gold.c.labels[i]] += 1
         for end, brackets in gold.brackets.items():
             for start, label_strs in brackets.items():
                 gold.c.brackets[start][end] = 1
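
Counting head attachments as offsets (gold.c.heads[i] - i) rather than absolute positions is what makes the codec effective: short arcs like -1 and +1 dominate real parses, so the Huffman code can assign them very short bit strings. A toy pure-Python illustration of that effect (this is not spaCy's codec, which is the Cython HuffmanCodec in serialize/huffman.pyx, and the offset counts are invented):

    import heapq
    from collections import Counter

    def huffman_code_lengths(freqs):
        # Standard Huffman construction: repeatedly merge the two rarest trees.
        heap = [(count, i, {sym: 0}) for i, (sym, count) in enumerate(freqs.items())]
        heapq.heapify(heap)
        tie = len(heap)
        while len(heap) > 1:
            count1, _, left = heapq.heappop(heap)
            count2, _, right = heapq.heappop(heap)
            merged = dict((sym, depth + 1) for sym, depth in left.items())
            merged.update((sym, depth + 1) for sym, depth in right.items())
            heapq.heappush(heap, (count1 + count2, tie, merged))
            tie += 1
        return heap[0][2]

    offsets = Counter({-1: 5000, 1: 4000, -2: 1500, 2: 900, 3: 300, -5: 50})
    print huffman_code_lengths(offsets)  # the common short arcs get 1-2 bit codes
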
diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx
index b145df7ac..8f6a662e8 100644
--- a/spacy/syntax/ner.pyx
+++ b/spacy/syntax/ner.pyx
@@ -8,6 +8,7 @@ from ..structs cimport TokenC, Entity
 from thinc.typedefs cimport weight_t
 from ..gold cimport GoldParseC
 from ..gold cimport GoldParse
+from ..attrs cimport ENT_TYPE, ENT_IOB
 
 from .stateclass cimport StateClass
 
@@ -74,6 +75,16 @@ cdef class BiluoPushDown(TransitionSystem):
     cdef int preprocess_gold(self, GoldParse gold) except -1:
         for i in range(gold.length):
             gold.c.ner[i] = self.lookup_transition(gold.ner[i])
+            # Count frequencies, for use in encoder
+            if gold.c.ner[i].move in (BEGIN, UNIT):
+                self.freqs[ENT_IOB][3] += 1
+                self.freqs[ENT_TYPE][gold.c.ner[i].label] += 1
+            elif gold.c.ner[i].move in (IN, LAST):
+                self.freqs[ENT_IOB][2] += 1
+                self.freqs[ENT_TYPE][0] += 1
+            elif gold.c.ner[i].move == OUT:
+                self.freqs[ENT_IOB][1] += 1
+                self.freqs[ENT_TYPE][0] += 1
 
     cdef Transition lookup_transition(self, object name) except *:
         if name == '-':
diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd
index 2114df410..387cd0fc9 100644
--- a/spacy/syntax/transition_system.pxd
+++ b/spacy/syntax/transition_system.pxd
@@ -35,6 +35,7 @@ cdef class TransitionSystem:
     cdef bint* _is_valid
     cdef readonly int n_moves
    cdef public int root_label
+    cdef public freqs
 
     cdef int initialize_state(self, StateClass state) except -1
     cdef int finalize_state(self, StateClass state) nogil
diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx
index 083a4990b..4d32a4e54 100644
--- a/spacy/syntax/transition_system.pyx
+++ b/spacy/syntax/transition_system.pyx
@@ -1,8 +1,10 @@
 from cymem.cymem cimport Pool
-from ..structs cimport TokenC
 from thinc.typedefs cimport weight_t
+from collections import defaultdict
 
+from ..structs cimport TokenC
 from .stateclass cimport StateClass
+from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
 
 cdef weight_t MIN_SCORE = -90000
 
@@ -28,6 +30,9 @@ cdef class TransitionSystem:
             i += 1
         self.c = moves
         self.root_label = self.strings['ROOT']
+        self.freqs = {}
+        for attr in (TAG, HEAD, DEP, ENT_TYPE, ENT_IOB):
+            self.freqs[attr] = defaultdict(int)
 
     cdef int initialize_state(self, StateClass state) except -1:
         pass
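
Note the bucketing convention in the NER counts: OUT moves increment ENT_IOB key 1, IN/LAST key 2, and BEGIN/UNIT key 3, and any token that does not open an entity counts an ENT_TYPE of 0, so the type codec gets an explicit "no new entity" symbol. A self-contained sketch of the counting scheme the two transition systems now share (the attribute constants and gold annotations below are stand-ins, not spaCy's real ids):

    from collections import defaultdict

    TAG, HEAD, DEP, ENT_TYPE, ENT_IOB = range(5)   # stand-ins for spacy.attrs

    freqs = dict((attr, defaultdict(int))
                 for attr in (TAG, HEAD, DEP, ENT_TYPE, ENT_IOB))

    # Parser side: head positions are recorded relative to the token.
    gold_heads = [1, 1, 1, 2]           # token i attaches to gold_heads[i]
    for i, head in enumerate(gold_heads):
        freqs[HEAD][head - i] += 1

    # NER side: one (iob_key, ent_type) pair per token, bucketed as above;
    # 391 stands in for an entity label's string-id.
    for iob_key, ent_type in [(3, 391), (2, 0), (1, 0), (1, 0)]:
        freqs[ENT_IOB][iob_key] += 1
        freqs[ENT_TYPE][ent_type] += 1
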
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 2bf6cf519..3f84b9561 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -2,6 +2,7 @@ cimport cython
 from libc.string cimport memcpy, memset
 
 import numpy
+import struct
 
 from ..lexeme cimport EMPTY_LEXEME
 from ..strings cimport slice_unicode
@@ -16,6 +17,7 @@ from ..lexeme cimport get_attr as get_lex_attr
 from .spans import Span
 from ..structs cimport UniStr
 from .token cimport Token
+from ..serialize.bits cimport BitArray
 
 
 DEF PADDING = 5
@@ -54,7 +56,7 @@ cdef class Doc:
     Container class for annotated text.  Constructed via English.__call__ or
     Tokenizer.__call__.
     """
-    def __init__(self, Vocab vocab):
+    def __init__(self, Vocab vocab, orths_and_spaces=None):
         self.vocab = vocab
         size = 20
         self.mem = Pool()
@@ -71,24 +73,17 @@ cdef class Doc:
         self.is_tagged = False
         self.is_parsed = False
         self._py_tokens = []
-
-    @classmethod
-    def from_ids(cls, Vocab vocab, orths, spaces):
-        cdef int i
         cdef const LexemeC* lex
-        cdef Doc self = cls(vocab)
-        cdef bint space = 0
         cdef attr_t orth
-        for i in range(len(orths)):
-            orth = orths[i]
-            lex = self.vocab._by_orth.get(orth)
-            if lex != NULL:
-                assert lex.orth == orth
-                space = spaces[i]
-                self.push_back(lex, space)
-            else:
-                raise Exception('Lexeme not found: %d' % orth)
-        return self
+        cdef bint space
+        if orths_and_spaces is not None:
+            for orth, space in orths_and_spaces:
+                lex = self.vocab._by_orth.get(orth)
+                if lex != NULL:
+                    assert lex.orth == orth
+                    self.push_back(lex, space)
+                else:
+                    raise Exception('Lexeme not found: %d' % orth)
 
     def __getitem__(self, object i):
         """Get a token.
@@ -389,3 +384,26 @@ cdef class Doc:
         elif attr_id == ENT_TYPE:
             for i in range(length):
                 tokens[i].ent_type = values[i]
+
+    def to_bytes(self):
+        bits = self.vocab.packer.pack(self)
+        return struct.pack('I', len(bits)) + bits.as_bytes()
+
+    @staticmethod
+    def from_bytes(Vocab vocab, file_):
+        keep_reading = True
+        while keep_reading:
+            try:
+                n_bits_str = file_.read(4)
+                if len(n_bits_str) < 4:
+                    break
+                n_bits = struct.unpack('I', n_bits_str)[0]
+                n_bytes = n_bits // 8 + bool(n_bits % 8)
+                data = file_.read(n_bytes)
+            except StopIteration:
+                keep_reading = False
+            bits = BitArray(data)
+            array = vocab.packer.unpack(bits)
+            doc = Doc(vocab, array[:, :2])
+            doc.from_array(vocab.packer.attrs, array)
+            yield doc
diff --git a/spacy/util.py b/spacy/util.py
index 1d48ab7e9..543479f20 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -2,6 +2,7 @@ from os import path
 import codecs
 import json
 import re
+from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
 
 DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
 
@@ -64,7 +65,17 @@ def read_tokenization(lang):
     return entries
 
 
-def read_detoken_rules(lang):
+def read_encoding_freqs(data_dir):
+    tags = json.load(open(path.join(data_dir, '..', 'pos', 'tag_freqs.json')))
+    heads = json.load(open(path.join(data_dir, '..', 'deps', 'head_freqs.json')))
+    deps = json.load(open(path.join(data_dir, '..', 'deps', 'dep_freqs.json')))
+    iob = json.load(open(path.join(data_dir, '..', 'ner', 'iob_freqs.json')))
+    ne_types = json.load(open(path.join(data_dir, '..', 'ner', 'ne_freqs.json')))
+    return [(TAG, tags), (HEAD, heads), (DEP, deps), (ENT_IOB, iob),
+            (ENT_TYPE, ne_types)]
+
+
+def read_detoken_rules(lang): # Deprecated?
     loc = path.join(DATA_DIR, lang, 'detokenize')
     entries = []
     with utf8open(loc) as file_:
@@ -73,7 +84,7 @@ def read_detoken_rules(lang):
     return entries
 
 
-def align_tokens(ref, indices):
+def align_tokens(ref, indices): # Deprecated, surely?
     start = 0
     queue = list(indices)
     for token in ref:
@@ -86,7 +97,7 @@ def align_tokens(ref, indices):
     assert not queue
 
 
-def detokenize(token_rules, words):
+def detokenize(token_rules, words): # Deprecated?
     """To align with treebanks, return a list of "chunks", where a chunk is
     a sequence of tokens that are separated by whitespace in actual strings. Each
     chunk should be a tuple of token indices, e.g.
diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index 9bf9c32b0..f36e415ad 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -29,6 +29,7 @@ cdef class Vocab:
     cpdef readonly StringStore strings
     cdef readonly object pos_tags
     cdef readonly int length
+    cdef public object packer
 
     cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
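
The framing in to_bytes()/from_bytes() stores a 4-byte count of bits, not bytes, because a Huffman-coded payload need not end on a byte boundary; from_bytes() recovers the byte length as n_bits // 8 + bool(n_bits % 8). And since from_bytes() is a generator that keeps reading until a header comes up short, any number of documents can be concatenated into one file. A hypothetical round trip (the file name, docs, and vocab are assumed, not part of this diff):

    with open('parses.bin', 'wb') as file_:
        for doc in docs:                  # previously parsed Doc objects
            file_.write(doc.to_bytes())   # 4-byte bit count, then the payload

    with open('parses.bin', 'rb') as file_:
        for doc in Doc.from_bytes(vocab, file_):
            print len(doc)
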
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index b8b4b84a8..4fb1d1645 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -16,6 +16,8 @@ from .orth cimport word_shape
 from .typedefs cimport attr_t
 
 from cymem.cymem cimport Address
+from . import util
+from .serialize.packer cimport Packer
 
 DEF MAX_VEC_SIZE = 100000
 
@@ -53,6 +55,8 @@ cdef class Vocab:
             if load_vectors and path.exists(path.join(data_dir, 'vec.bin')):
                 self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
 
+            self.packer = Packer(self, util.read_encoding_freqs(data_dir))
+
     def __len__(self):
         """The current number of lexemes stored."""
         return self.length
diff --git a/tests/serialize/test_codecs.py b/tests/serialize/test_codecs.py
index b1d1f99f3..40d56669f 100644
--- a/tests/serialize/test_codecs.py
+++ b/tests/serialize/test_codecs.py
@@ -5,7 +5,6 @@ import numpy
 
 from spacy.vocab import Vocab
 from spacy.serialize.packer import _BinaryCodec
-from spacy.serialize.packer import make_vocab_codec
 from spacy.serialize.packer import _AttributeCodec
 from spacy.serialize.bits import BitArray
 
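
Taken together, the pieces wire up like this: Vocab builds a Packer from the dumped frequency files at load time, Doc.to_bytes() runs that Packer, and Doc.from_bytes() reverses it. An end-to-end sketch, assuming a trained English data directory that already contains the new *_freqs.json files:

    import io

    from spacy.en import English
    from spacy.tokens import Doc

    nlp = English()
    doc = nlp(u'This is a sentence.')

    buf = io.BytesIO(doc.to_bytes())
    restored = list(Doc.from_bytes(nlp.vocab, buf))
    assert len(restored) == 1
    assert len(restored[0]) == len(doc)
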