diff --git a/spacy/lang.pyx b/spacy/lang.pyx index d106f172a..5b5892fdc 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -13,16 +13,16 @@ import random from os import path import re -from cymem.cymem cimport Pool from cython.operator cimport preincrement as preinc from cython.operator cimport dereference as deref +from libc.stdio cimport fopen, fclose, fread, fwrite, FILE +from cymem.cymem cimport Pool from murmurhash.mrmr cimport hash64 from preshed.maps cimport PreshMap from .lexeme cimport Lexeme -from .lexeme cimport from_dict as lexeme_from_dict -from .lexeme cimport from_string as lexeme_from_string +from .lexeme cimport init as lexeme_init from . import orth from . import util @@ -232,26 +232,27 @@ cdef class Lexicon: self.mem = Pool() self._dict = PreshMap(2 ** 20) self.strings = StringStore() - self.size = 0 + self.size = 1 cdef String string cdef Lexeme* lexeme - #for py_string, lexeme_dict in lexemes.iteritems(): - # string_from_unicode(&string, py_string) - # lexeme = self.mem.alloc(1, sizeof(Lexeme)) - # lexeme_from_dict(lexeme, lexeme_dict, self.strings) - # self._dict.set(string.key, lexeme) - # self.lexemes.push_back(lexeme) - # self.size += 1 + for py_string, lexeme_dict in lexemes.iteritems(): + string_from_unicode(&string, py_string) + lexeme = self.mem.alloc(1, sizeof(Lexeme)) + lexeme[0] = lexeme_init(string.chars[:string.n], string.key, self.size, + self.strings, lexeme_dict) + self._dict.set(lexeme.hash, lexeme) + self.lexemes.push_back(lexeme) + self.size += 1 cdef Lexeme* get(self, String* string) except NULL: cdef Lexeme* lex lex = self._dict.get(string.key) if lex != NULL: return lex - - lex = self.mem.alloc(1, sizeof(Lexeme)) - lexeme_from_string(lex, string.chars[:string.n], self.strings) - self._dict.set(string.key, lex) + lex = self.mem.alloc(sizeof(Lexeme), 1) + lex[0] = lexeme_init(string.chars[:string.n], string.key, self.size, + self.strings, {}) + self._dict.set(lex.hash, lex) self.lexemes.push_back(lex) self.size += 1 return lex @@ -270,6 +271,34 @@ cdef class Lexicon: cdef Lexeme* lexeme = self.get(&string) return lexeme[0] + def dump(self, loc): + if path.exists(loc): + assert not path.isdir(loc) + cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc + cdef FILE* fp = fopen(bytes_loc, 'wb') + assert fp != NULL + cdef size_t st + for i in range(self.size): + st = fwrite(self.lexemes[i], sizeof(Lexeme), 1, fp) + assert st == 1 + st = fclose(fp) + assert st == 0 + + def load(self, loc): + assert path.exists(loc) + cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc + cdef FILE* fp = fopen(bytes_loc, 'rb') + assert fp != NULL + cdef size_t st + cdef Lexeme* lexeme + while True: + lexeme = self.mem.alloc(sizeof(Lexeme), 1) + st = fread(lexeme, sizeof(lexeme), 1, fp) + if st == 0: + break + self.lexemes.push_back(lexeme) + self._dict.set(lexeme.hash, lexeme) + cdef void string_from_unicode(String* s, unicode uni): cdef Py_UNICODE* c_uni = uni diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 3cd65c995..235883e2a 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -23,9 +23,11 @@ cpdef enum: cdef struct Lexeme: - atom_t id + hash_t hash + atom_t i atom_t length - + + atom_t sic atom_t norm atom_t shape atom_t vocab10k @@ -44,12 +46,9 @@ cdef struct Lexeme: cdef Lexeme EMPTY_LEXEME - -cdef int from_string(Lexeme* lex, unicode string, StringStore store) except -1 - - -cdef int from_dict(Lexeme* lex, dict props, StringStore store) except -1 - +cpdef Lexeme init(unicode string, hash_t hashed, atom_t i, + 
StringStore store, dict props) except * + cdef inline bint check_flag(Lexeme* lexeme, size_t flag_id) nogil: return lexeme.flags & (1 << flag_id) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index d442a262e..03c6e2270 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -1,5 +1,6 @@ from cpython.ref cimport Py_INCREF from cymem.cymem cimport Pool +from murmurhash.mrmr cimport hash64 from libc.string cimport memset @@ -12,7 +13,7 @@ OOV_DIST_FLAGS = 0 memset(&EMPTY_LEXEME, 0, sizeof(Lexeme)) -def get_flags(unicode string): +def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc): cdef flag_t flags = 0 flags |= orth.is_alpha(string) << IS_ALPHA flags |= orth.is_ascii(string) << IS_ASCII @@ -25,20 +26,36 @@ def get_flags(unicode string): return flags -cdef int from_string(Lexeme* lex, unicode string, StringStore store) except -1: +cpdef Lexeme init(unicode string, hash_t hashed, atom_t i, + StringStore store, dict props) except *: + cdef Lexeme lex + lex.hash = hashed + lex.i = i + print string, i + lex.length = len(string) + lex.sic = get_string_id(string, store) + + lex.cluster = props.get('cluster', 0) + lex.pos = props.get('pos', 0) + lex.supersense = props.get('supersense', 0) + lex.prob = props.get('prob', 0) + + cdef float upper_pc = props.get('upper_pc', 0.0) + cdef float lower_pc = props.get('lower_pc', 0.0) + cdef float title_pc = props.get('title_pc', 0.0) + + lex.prefix = get_string_id(string[0], store) + lex.suffix = get_string_id(string[-3:], store) + canon_cased = orth.canon_case(string, upper_pc, title_pc, lower_pc) + lex.norm = get_string_id(canon_cased, store) + lex.shape = get_string_id(orth.word_shape(string), store) + lex.asciied = get_string_id(orth.asciied(string), store) + non_sparse = orth.non_sparse(string, lex.prob, lex.cluster, upper_pc, title_pc, lower_pc) + lex.vocab10k = get_string_id(non_sparse, store) + lex.flags = get_flags(string, upper_pc, title_pc, lower_pc) + return lex + +cdef atom_t get_string_id(unicode string, StringStore store) except 0: cdef bytes byte_string = string.encode('utf8') cdef Utf8Str* orig_str = store.intern(byte_string, len(byte_string)) - lex.id = orig_str.i - lex.cluster = 0 - lex.length = len(string) - lex.flags = get_flags(string) - # TODO: Hook this up - #lex.norm = norm_str.i - #lex.shape = norm_str.i - #lex.asciied = asciied_str.i - #lex.prefix = prefix_str.i - #lex.suffix = suffix_str.i - - -cdef int from_dict(Lexeme* lex, dict props, StringStore stroe) except -1: - pass + return orig_str.i diff --git a/spacy/orth.py b/spacy/orth.py index 4bec8d665..13ae2c0c4 100644 --- a/spacy/orth.py +++ b/spacy/orth.py @@ -64,11 +64,7 @@ def can_tag(name, thresh=0.5): # String features -def canon_case(string, prob, cluster, case_stats, tag_stats): - upper_pc = case_stats.get('upper', 0.0) - title_pc = case_stats.get('title', 0.0) - lower_pc = case_stats.get('lower', 0.0) - +def canon_case(string, upper_pc=0.0, title_pc=0.0, lower_pc=0.0): if upper_pc >= lower_pc and upper_pc >= title_pc: return string.upper() elif title_pc >= lower_pc: @@ -77,7 +73,7 @@ def canon_case(string, prob, cluster, case_stats, tag_stats): return string.lower() -def word_shape(string, *args): +def word_shape(string): length = len(string) shape = [] last = "" @@ -103,15 +99,15 @@ def word_shape(string, *args): return ''.join(shape) -def non_sparse(string, prob, cluster, case_stats, tag_stats): +def non_sparse(string, prob, cluster, upper_pc, title_pc, lower_pc): if is_alpha(string): - return canon_case(string, prob, cluster, case_stats, 
tag_stats) + return canon_case(string, upper_pc, title_pc, lower_pc) elif prob >= math.log(0.0001): return string else: - return word_shape(string, prob, cluster, case_stats, tag_stats) + return word_shape(string) -def asciied(string, prob=0, cluster=0, case_stats=None, tag_stats=None): +def asciied(string): ascii_string = unidecode(string) return ascii_string.decode('ascii') diff --git a/spacy/pos.pyx b/spacy/pos.pyx index 0e79cddd7..263f88edb 100644 --- a/spacy/pos.pyx +++ b/spacy/pos.pyx @@ -31,10 +31,15 @@ cdef class Tagger: self._values = self.mem.alloc(self.extractor.n+1, sizeof(weight_t)) self._scores = self.mem.alloc(len(self.tags), sizeof(weight_t)) self._guess = NULL_TAG - if path.exists(path.join(model_dir, 'model.gz')): - with gzip.open(path.join(model_dir, 'model.gz'), 'r') as file_: - self.model.load(file_) - + if path.exists(path.join(model_dir, 'model')): + self.model.load(path.join(model_dir, 'model')) + tags_loc = path.join(model_dir, 'postags.json') + if path.exists(tags_loc): + with open(tags_loc) as file_: + Tagger.tags.update(ujson.load(file_)) + if path.exists(path.join(model_dir, 'strings')): + EN.lexicon.strings.load(path.join(model_dir, 'strings')) + cpdef class_t predict(self, int i, Tokens tokens, class_t prev, class_t prev_prev) except 0: assert i >= 0 get_atoms(self._atoms, tokens.lex[i-2], tokens.lex[i-1], tokens.lex[i], @@ -125,7 +130,7 @@ cdef int get_atoms(atom_t* atoms, Lexeme* p2, Lexeme* p1, Lexeme* n0, Lexeme* n1 cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil: - atoms[0] = lex.id + atoms[0] = lex.i atoms[1] = lex.cluster atoms[2] = lex.norm atoms[3] = lex.shape diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index 9847cdc3c..d6b655074 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -37,6 +37,7 @@ cdef class Token: cdef public atom_t lex_pos cdef public atom_t lex_supersense + cdef public atom_t sic cdef public atom_t norm cdef public atom_t shape cdef public atom_t vocab10k diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 56ffc343f..6abfd5b6a 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -101,16 +101,18 @@ cdef class Tokens: @cython.freelist(64) cdef class Token: def __init__(self, StringStore string_store, int i, int idx, int pos, dict lex): + assert i < 1000000 self._string_store = string_store - self.i = i + self.id = i self.idx = idx self.pos = pos - self.id = lex['id'] + self.id = lex['i'] self.cluster = lex['cluster'] self.length = lex['length'] self.lex_pos = lex['pos'] self.lex_supersense = lex['supersense'] + self.sic = lex['sic'] self.norm = lex['norm'] self.shape = lex['shape'] self.vocab10k = lex['vocab10k'] @@ -122,6 +124,6 @@ cdef class Token: property string: def __get__(self): - cdef bytes utf8string = self._string_store[self.id] + cdef bytes utf8string = self._string_store[self.sic] return utf8string.decode('utf8') diff --git a/spacy/utf8string.pyx b/spacy/utf8string.pyx index d3bc3a4fe..8cb2bebd2 100644 --- a/spacy/utf8string.pyx +++ b/spacy/utf8string.pyx @@ -2,6 +2,8 @@ from libc.string cimport memcpy from murmurhash.mrmr cimport hash64 +import ujson + cdef class StringStore: def __init__(self): @@ -51,3 +53,20 @@ cdef class StringStore: else: i = value return &self.strings[i] + + def dump(self, loc): + strings = [] + cdef Utf8Str* string + cdef bytes py_string + for i in range(self.size): + string = &self.strings[i] + py_string = string.chars[:string.length] + strings.append(py_string) + with open(loc, 'w') as file_: + ujson.dump(strings, file_, ensure_ascii=False) + + def load(self, loc): + 
with open(loc) as file_: + strings = ujson.load(file_) + for string in strings[1:]: + self.intern(string, len(string)) diff --git a/tests/test_canon_case.py b/tests/test_canon_case.py index 4b0fd21b3..2c8dd255b 100644 --- a/tests/test_canon_case.py +++ b/tests/test_canon_case.py @@ -5,16 +5,16 @@ import py.test from spacy.orth import canon_case as cc def test_nasa(): - assert cc('Nasa', 0.0, 0, {'upper': 0.6, 'title': 0.3, 'lower': 0.1}, {}) == 'NASA' + assert cc('Nasa', 0.6, 0.3, 0.1) == 'NASA' def test_john(): - assert cc('john', 0.0, 0, {'title': 0.6, 'upper': 0.3, 'lower': 0.1}, {}) == 'John' + assert cc('john', 0.3, 0.6, 0.1) == 'John' def test_apple(): - assert cc('apple', 0.0, 0, {'lower': 0.6, 'title': 0.3, 'upper': 0.1}, {}) == 'apple' + assert cc('apple', 0.1, 0.3, 0.6) == 'apple' def test_tie(): - assert cc('I', 0.0, 0, {'lower': 0.0, 'title': 0.0, 'upper': 0.0}, {}) == 'I' + assert cc('I', 0.0, 0.0, 0.0) == 'I' diff --git a/tests/test_contractions.py b/tests/test_contractions.py index 5a2eaf3a9..b7347a617 100644 --- a/tests/test_contractions.py +++ b/tests/test_contractions.py @@ -5,8 +5,8 @@ from spacy.en import EN def test_possess(): tokens = EN.tokenize("Mike's") - assert EN.lexicon.strings[tokens[0].id] == "Mike" - assert EN.lexicon.strings[tokens[1].id] == "'s" + assert EN.lexicon.strings[tokens[0].sic] == "Mike" + assert EN.lexicon.strings[tokens[1].sic] == "'s" assert len(tokens) == 2 diff --git a/tests/test_non_sparse.py b/tests/test_non_sparse.py index a7a05adf1..c7b75de85 100644 --- a/tests/test_non_sparse.py +++ b/tests/test_non_sparse.py @@ -5,21 +5,21 @@ import math def test_common_case_upper(): - cases = {'upper': 0.7, 'lower': 0.2, 'title': 0.1} + cases = {'u': 0.7, 'l': 0.2, 't': 0.1} prob = math.log(0.1) - assert non_sparse('usa', prob, 0, cases, {}) == 'USA' + assert non_sparse('usa', prob, 0, cases['u'], cases['t'], cases['l']) == 'USA' def test_same(): - cases = {'upper': 0.01, 'title': 0.09, 'lower': 0.9} + cases = {'u': 0.01, 't': 0.09, 'l': 0.9} prob = math.log(0.5) - assert non_sparse('the', prob, 0, cases, {}) == 'the' + assert non_sparse('the', prob, 0, cases['u'], cases['t'], cases['l']) == 'the' def test_common_case_lower(): prob = math.log(0.5) - cases = {'upper': 0.01, 'title': 0.09, 'lower': 0.9} - assert non_sparse('The', prob, 0, cases, {}) == 'the' + cases = {'u': 0.01, 't': 0.09, 'l': 0.9} + assert non_sparse('The', prob, 0, cases['u'], cases['t'], cases['l']) == 'the' def test_shape(): prob = math.log(0.00001) - cases = {'upper': 0.0, 'title': 0.0, 'lower': 0.0} - assert non_sparse('1999', prob, 0, cases, {}) == 'dddd' + cases = {'u': 0.0, 't': 0.0, 'l': 0.0} + assert non_sparse('1999', prob, 0, cases['u'], cases['t'], cases['l']) == 'dddd' diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 73ac91261..4624e2828 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -27,17 +27,17 @@ def test_punct(): def test_digits(): tokens = EN.tokenize('The year: 1984.') assert len(tokens) == 5 - assert tokens[0].id == EN.lexicon.lookup('The')['id'] - assert tokens[3].id == EN.lexicon.lookup('1984')['id'] + assert tokens[0].sic == EN.lexicon.lookup('The')['sic'] + assert tokens[3].sic == EN.lexicon.lookup('1984')['sic'] def test_contraction(): tokens = EN.tokenize("don't giggle") assert len(tokens) == 3 - assert tokens[1].id == EN.lexicon.lookup("not")['id'] + assert tokens[1].sic == EN.lexicon.lookup("not")['sic'] tokens = EN.tokenize("i said don't!") assert len(tokens) == 5 - assert tokens[4].id == 
EN.lexicon.lookup('!')['id'] + assert tokens[4].sic == EN.lexicon.lookup('!')['sic'] def test_contraction_punct(): diff --git a/tests/test_vocab.py b/tests/test_vocab.py index 640fa5041..036e5981c 100644 --- a/tests/test_vocab.py +++ b/tests/test_vocab.py @@ -5,19 +5,19 @@ from spacy.en import EN def test_neq(): addr = EN.lexicon.lookup('Hello') - assert EN.lexicon.lookup('bye')['id'] != addr['id'] + assert EN.lexicon.lookup('bye')['sic'] != addr['sic'] def test_eq(): addr = EN.lexicon.lookup('Hello') - assert EN.lexicon.lookup('Hello')['id'] == addr['id'] + assert EN.lexicon.lookup('Hello')['sic'] == addr['sic'] def test_case_neq(): addr = EN.lexicon.lookup('Hello') - assert EN.lexicon.lookup('hello')['id'] != addr['id'] + assert EN.lexicon.lookup('hello')['sic'] != addr['sic'] def test_punct_neq(): addr = EN.lexicon.lookup('Hello') - assert EN.lexicon.lookup('Hello,')['id'] != addr['id'] + assert EN.lexicon.lookup('Hello,')['sic'] != addr['sic']
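
Note on the new Lexicon.dump()/load() pair above: both treat the lexicon as a flat array of fixed-size Lexeme structs, fwrite'ing each struct verbatim and fread'ing records back until a short read signals end of file. A minimal pure-Python analogue of that fixed-size-record pattern is sketched below; the three-field record layout and the dump_records/load_records names are illustrative only and are not part of the patch.

    import struct

    RECORD = struct.Struct('<QII')  # e.g. hash (uint64), i, length -- a toy layout

    def dump_records(loc, records):
        # Mirror of Lexicon.dump(): write each fixed-size record back to back.
        with open(loc, 'wb') as fp:
            for rec in records:
                fp.write(RECORD.pack(*rec))

    def load_records(loc):
        # Mirror of Lexicon.load(): keep reading whole records until EOF.
        records = []
        with open(loc, 'rb') as fp:
            while True:
                buf = fp.read(RECORD.size)
                if len(buf) < RECORD.size:  # short read == end of file
                    break
                records.append(RECORD.unpack(buf))
        return records

    recs = [(0xdeadbeef, 1, 5), (0xfeedface, 2, 3)]
    dump_records('lexemes.bin', recs)
    assert load_records('lexemes.bin') == recs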
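
The reworked orth helpers now take the three case frequencies as separate floats instead of the old case_stats/tag_stats dicts. A small usage sketch of the new signatures, using only values that already appear in the updated tests:

    import math
    from spacy.orth import canon_case, non_sparse

    # Tokens that are mostly upper-cased in the corpus canonicalise to upper case.
    assert canon_case('Nasa', upper_pc=0.6, title_pc=0.3, lower_pc=0.1) == 'NASA'

    # Frequent alphabetic tokens keep their canonical casing...
    assert non_sparse('The', math.log(0.5), 0, 0.01, 0.09, 0.9) == 'the'
    # ...while rare tokens back off to the word shape.
    assert non_sparse('1999', math.log(0.00001), 0, 0.0, 0.0, 0.0) == 'dddd'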
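
Taken together, the new persistence hooks are presumably meant to be used as a pair: StringStore.dump()/load() serialise the interned strings as a JSON list (the loader skips entry 0, which the store reserves), while Lexicon.dump()/load() write and re-read the raw Lexeme structs. A rough round-trip sketch, with illustrative file names and assuming a built EN lexicon:

    from spacy.en import EN

    EN.lexicon.lookup('Hello')                 # ensure at least one entry exists
    EN.lexicon.strings.dump('strings.json')    # JSON list of interned strings
    EN.lexicon.dump('lexemes.bin')             # raw Lexeme structs, in insertion order

    # In a fresh process (cf. Tagger.__init__ above, which loads 'strings' from the
    # model directory), the strings would be re-interned first, so that the string
    # ids stored inside each Lexeme resolve to the same entries, before the lexemes
    # themselves are re-read.
    EN.lexicon.strings.load('strings.json')
    EN.lexicon.load('lexemes.bin')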