diff --git a/spacy/en.pxd b/spacy/en.pxd
index 6887dbc08..cee754d9c 100644
--- a/spacy/en.pxd
+++ b/spacy/en.pxd
@@ -10,6 +10,7 @@ cpdef enum en_person_t:
     FIRST
     SECOND
     THIRD
+    NON_THIRD
 
 
 cpdef enum en_number_t:
@@ -17,14 +18,22 @@ cpdef enum en_number_t:
     SINGULAR
     PLURAL
     MASS
-    CARDINAL
-    ORDINAL
 
 
 cpdef enum en_gender_t:
     NO_GENDER
     MASCULINE
     FEMININE
+    NEUTER
+
+
+cpdef enum en_case_t:
+    NO_CASE
+    NOMINATIVE
+    GENITIVE
+    ACCUSATIVE
+    REFLEXIVE
+    DEMONYM
 
 
 cpdef enum en_tenspect_t:
@@ -37,23 +46,12 @@ cpdef enum en_tenspect_t:
     MODAL
 
 
-cpdef enum en_case_t:
-    NO_CASE
-    NOMINATIVE
-    ACCUSATIVE
-    GENITIVE
-    DEMONYM
-
-
 cpdef enum misc_t:
     NO_MISC
     COMPARATIVE
     SUPERLATIVE
     RELATIVE
     NAME
-    URL
-    EMAIL
-    EMOTICON
 
 
 # Flags
diff --git a/spacy/en.pyx b/spacy/en.pyx
index fa59ef933..0136818f2 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -38,6 +38,8 @@ import orth
 from .tagger cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
 from .tagger cimport X, PUNCT, EOL
 
+from .tokens cimport Morphology
+
 
 POS_TAGS = {
     'NULL': (NO_TAG, {}),
@@ -152,7 +154,8 @@ cdef class English(Language):
         for i in range(tokens.length):
             fill_pos_context(context, i, t)
             t[i].pos = self.pos_tagger.predict(context)
-            #self.morphalyser.set_token(&t[i])
+            _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph)
+            t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex)
 
     def train_pos(self, Tokens tokens, golds):
         cdef int i
@@ -162,11 +165,27 @@ cdef class English(Language):
         for i in range(tokens.length):
             fill_pos_context(context, i, t)
             t[i].pos = self.pos_tagger.predict(context, [golds[i]])
-            t[i].morph = self.pos_tagger.tags[t[i].pos].morph
-            #self.analyse_morph(&t[i].lemma, &t[i].morph, t[i].pos, t[i].lex)
+            _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph)
+            t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex)
             c += t[i].pos == golds[i]
         return c
 
 
+cdef int _merge_morph(Morphology* tok_morph, const Morphology* pos_morph) except -1:
+    if tok_morph.number == 0:
+        tok_morph.number = pos_morph.number
+    if tok_morph.tenspect == 0:
+        tok_morph.tenspect = pos_morph.tenspect
+    if tok_morph.mood == 0:
+        tok_morph.mood = pos_morph.mood
+    if tok_morph.gender == 0:
+        tok_morph.gender = pos_morph.gender
+    if tok_morph.person == 0:
+        tok_morph.person = pos_morph.person
+    if tok_morph.case == 0:
+        tok_morph.case = pos_morph.case
+    if tok_morph.misc == 0:
+        tok_morph.misc = pos_morph.misc
+
 
 EN = English('en')
diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index 124281a6b..0307e12fe 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -9,7 +9,7 @@ from .typedefs cimport hash_t
 from .tokens cimport Tokens, TokenC
 from .lexeme cimport Lexeme
 from .tagger cimport Tagger
-from .tagger cimport PosTag
+from .tagger cimport univ_tag_t
 from .utf8string cimport StringStore, UniStr
 
 
@@ -38,11 +38,12 @@ cdef class Language:
     cdef object _suffix_re
     cdef object _infix_re
 
-    cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1
+    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
 
     cpdef Tokens tokens_from_list(self, list strings)
     cpdef Tokens tokenize(self, unicode text)
 
+    cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1
     cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
     cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
                                 vector[Lexeme*] *suffixes) except NULL
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index fdeb7df66..cdae8644a 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -28,6 +28,7 @@ from .util import read_lang_data
 from .tokens import Tokens
 
 from .tagger cimport NOUN, VERB, ADJ, N_UNIV_TAGS
+from .tokens cimport Morphology
 
 
 cdef class Language:
@@ -53,27 +54,27 @@ cdef class Language:
         if path.exists(path.join(util.DATA_DIR, self.name, 'pos')):
             self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos'))
 
-    cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1:
+    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
         if self.lemmatizer is None:
             return lex.sic
-        if pos.pos != NOUN and pos.pos != VERB and pos.pos != ADJ:
+        if pos != NOUN and pos != VERB and pos != ADJ:
             return lex.sic
-        cdef int lemma = self._lemmas.get(pos.pos, lex.sic)
+        cdef int lemma = self._lemmas.get(pos, lex.sic)
         if lemma != 0:
             return lemma
         cdef bytes py_string = self.lexicon.strings[lex.sic]
         cdef set lemma_strings
         cdef bytes lemma_string
-        if pos.pos == NOUN:
+        if pos == NOUN:
             lemma_strings = self.lemmatizer.noun(py_string)
-        elif pos.pos == VERB:
+        elif pos == VERB:
             lemma_strings = self.lemmatizer.verb(py_string)
         else:
-            assert pos.pos == ADJ
+            assert pos == ADJ
             lemma_strings = self.lemmatizer.adj(py_string)
         lemma_string = sorted(lemma_strings)[0]
         lemma = self.lexicon.strings.intern(lemma_string, len(lemma_string)).i
-        self._lemmas.set(pos.pos, lex.sic, lemma)
+        self._lemmas.set(pos, lex.sic, lemma)
         return lemma
 
     cpdef Tokens tokens_from_list(self, list strings):
@@ -111,6 +112,7 @@ cdef class Language:
             return tokens
         cdef int i = 0
         cdef int start = 0
+        cdef bint cache_hit
         cdef Py_UNICODE* chars = string
         cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
         cdef UniStr span
@@ -118,10 +120,8 @@ cdef class Language:
             if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
                 if start < i:
                     slice_unicode(&span, chars, start, i)
-                    lexemes = self._cache.get(span.key)
-                    if lexemes != NULL:
-                        tokens.extend(start, lexemes, 0)
-                    else:
+                    cache_hit = self._try_cache(start, span.key, tokens)
+                    if not cache_hit:
                         self._tokenize(tokens, &span, start, i)
                 in_ws = not in_ws
                 start = i
@@ -130,13 +130,32 @@ cdef class Language:
            i += 1
         if start < i:
             slice_unicode(&span, chars, start, i)
-            lexemes = self._cache.get(span.key)
-            if lexemes != NULL:
-                tokens.extend(start, lexemes, 0)
-            else:
+            cache_hit = self._try_cache(start, span.key, tokens)
+            if not cache_hit:
                 self._tokenize(tokens, &span, start, i)
         return tokens
 
+    cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
+        cdef int i
+        specials = self._specials.get(key)
+        if specials != NULL:
+            i = 0
+            while specials[i].lex != NULL:
+                tokens.push_back(idx, specials[i].lex)
+                tokens.data[tokens.length - 1].pos = specials[i].pos
+                tokens.data[tokens.length - 1].morph = specials[i].morph
+                tokens.data[tokens.length - 1].lemma = specials[i].lemma
+                tokens.data[tokens.length - 1].sense = specials[i].sense
+                i += 1
+            return True
+        else:
+            cached = self._cache.get(key)
+            if cached != NULL:
+                tokens.extend(i, cached, 0)
+                return True
+            else:
+                return False
+
     cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
         cdef vector[Lexeme*] prefixes
         cdef vector[Lexeme*] suffixes
@@ -190,10 +209,10 @@ cdef class Language:
                     break
         return string
 
-    cdef int _attach_tokens(self, Tokens tokens,
-                            int idx, UniStr* string,
+    cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
                             vector[const Lexeme*] *prefixes,
                             vector[const Lexeme*] *suffixes) except -1:
+        cdef bint cache_hit
         cdef int split
         cdef const Lexeme* const* lexemes
         cdef Lexeme* lexeme
@@ -201,10 +220,9 @@ cdef class Language:
         if prefixes.size():
             idx = tokens.extend(idx, prefixes.data(), prefixes.size())
         if string.n != 0:
-
-            lexemes = self._cache.get(string.key)
-            if lexemes != NULL:
-                idx = tokens.extend(idx, lexemes, 0)
+            cache_hit = self._try_cache(idx, string.key, tokens)
+            if cache_hit:
+                idx = tokens.data[tokens.length - 1].idx + 1
             else:
                 split = self._find_infix(string.chars, string.n)
                 if split == 0 or split == -1:
@@ -247,30 +265,42 @@ cdef class Language:
         match = self._suffix_re.search(string)
         return (match.end() - match.start()) if match is not None else 0
 
-    def _load_special_tokenization(self, token_rules):
-        '''Load special-case tokenization rules.
-
-        Loads special-case tokenization rules into the Language._cache cache,
-        read from data/<lang>/tokenization . The special cases are loaded before
-        any language data is tokenized, giving these priority. For instance,
-        the English tokenization rules map "ain't" to ["are", "not"].
-
-        Args:
-            token_rules (list): A list of (chunk, tokens) pairs, where chunk is
-                a string and tokens is a list of strings.
+    def _load_special_tokenization(self, object rules):
+        '''Add a special-case tokenization rule.
         '''
+        cdef int i
+        cdef unicode chunk
+        cdef list substrings
+        cdef unicode form
+        cdef unicode lemma
+        cdef dict props
         cdef Lexeme** lexemes
         cdef hash_t hashed
         cdef UniStr string
-        for uni_string, substrings in token_rules:
-            lexemes = self.mem.alloc(len(substrings) + 1, sizeof(Lexeme*))
-            for i, substring in enumerate(substrings):
-                slice_unicode(&string, substring, 0, len(substring))
-                lexemes[i] = self.lexicon.get(self.lexicon.mem, &string)
-            lexemes[i + 1] = NULL
-            slice_unicode(&string, uni_string, 0, len(uni_string))
-            self._specials.set(string.key, lexemes)
-            self._cache.set(string.key, lexemes)
+        for chunk, substrings in sorted(rules.items()):
+            tokens = self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
+            for i, props in enumerate(substrings):
+                form = props['F']
+                lemma = props.get("L", None)
+                slice_unicode(&string, form, 0, len(form))
+                tokens[i].lex = self.lexicon.get(self.lexicon.mem, &string)
+                if lemma:
+                    tokens[i].lemma = self.lexicon.strings[lemma]
+                set_morph_from_dict(&tokens[i].morph, props)
+            # Null-terminated array
+            tokens[i+1].lex = NULL
+            slice_unicode(&string, chunk, 0, len(chunk))
+            self._specials.set(string.key, tokens)
+
+
+cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
+    morph.number = props.get('number', 0)
+    morph.tenspect = props.get('tenspect', 0)
+    morph.mood = props.get('mood', 0)
+    morph.gender = props.get('gender', 0)
+    morph.person = props.get('person', 0)
+    morph.case = props.get('case', 0)
+    morph.misc = props.get('misc', 0)
 
 
 cdef class Lexicon:
diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd
index 6f4691716..f3d6011ec 100644
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@@ -21,7 +21,6 @@ cdef struct Morphology:
     uint8_t misc
 
 
-
 cdef struct TokenC:
     const Lexeme* lex
     Morphology morph
diff --git a/spacy/util.py b/spacy/util.py
index 5062ca6db..ff03760a5 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -13,7 +13,8 @@ def utf8open(loc, mode='r'):
 
 def read_lang_data(name):
     data_dir = path.join(DATA_DIR, name)
-    tokenization = read_tokenization(name)
+    with open(path.join(data_dir, 'specials.json')) as file_:
+        tokenization = ujson.load(file_)
     prefix = read_prefix(data_dir)
     suffix = read_suffix(data_dir)
     infix = read_infix(data_dir)
@@ -26,12 +27,17 @@ def read_prefix(data_dir):
     expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
     return expression
 
+
 def read_suffix(data_dir):
-    with utf8open(path.join(data_dir, 'suffix')) as file_:
+    with utf8open(path.join(data_dir, 'suffix')) as file_:
         entries = file_.read().split('\n')
     expression = '|'.join([re.escape(piece) + '$' for piece in entries if piece.strip()])
+    # TODO: Fix this hack!
+    expression += r'|(?<=[a-z0-9])\.$'
+    expression += r'|(?<=[0-9])km$'
     return expression
 
+
 def read_infix(data_dir):
     with utf8open(path.join(data_dir, 'infix')) as file_:
         entries = file_.read().split('\n')
diff --git a/tests/test_contractions.py b/tests/test_contractions.py
index 8334a74a9..1e697afd2 100644
--- a/tests/test_contractions.py
+++ b/tests/test_contractions.py
@@ -20,15 +20,18 @@ def test_apostrophe():
 def test_LL():
     tokens = EN.tokenize("we'll")
     assert len(tokens) == 2
-    assert tokens[1].string == "will"
+    assert tokens[1].string == "'ll"
+    assert tokens[1].lemma == "will"
     assert tokens[0].string == "we"
 
 
 def test_aint():
     tokens = EN.tokenize("ain't")
     assert len(tokens) == 2
-    assert tokens[0].string == "are"
-    assert tokens[1].string == "not"
+    assert tokens[0].string == "ai"
+    assert tokens[0].lemma == "be"
+    assert tokens[1].string == "n't"
+    assert tokens[1].lemma == "not"
 
 
 def test_capitalized():
@@ -38,7 +41,8 @@ def test_capitalized():
     assert len(tokens) == 2
     tokens = EN.tokenize("Ain't")
     assert len(tokens) == 2
-    assert tokens[0].string == "Are"
+    assert tokens[0].string == "Ai"
+    assert tokens[0].lemma == "be"
 
 
 def test_punct():
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index fb5f78ed7..21d115b9b 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -34,7 +34,7 @@ def test_digits():
 def test_contraction():
     tokens = EN.tokenize("don't giggle")
     assert len(tokens) == 3
-    assert tokens[1].sic == EN.lexicon["not"]['sic']
+    assert tokens[1].sic == EN.lexicon["n't"]['sic']
     tokens = EN.tokenize("i said don't!")
     assert len(tokens) == 5
     assert tokens[4].sic == EN.lexicon['!']['sic']
@@ -71,30 +71,39 @@ def test_cnts1():
     tokens = EN.tokenize(text)
     assert len(tokens) == 8
 
+
 def test_cnts2():
     text = u"""U.N. regulations are not a part of their concern."""
     tokens = EN.tokenize(text)
     assert len(tokens) == 10
 
+
 def test_cnts3():
     text = u"“Isn't it?”"
     tokens = EN.tokenize(text)
-    assert len(tokens) == 6
+    words = [t.string for t in tokens]
+    assert len(words) == 6
+
 
 def test_cnts4():
     text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """
     tokens = EN.tokenize(text)
-    assert len(tokens) == 15
+    words = [t.string for t in tokens]
+    assert len(words) == 15
+
 
 def test_cnts5():
     text = """'Me too!', Mr. P. Delaware cried. """
     tokens = EN.tokenize(text)
     assert len(tokens) == 11
 
+
 def test_cnts6():
     text = u'They ran about 10km.'
     tokens = EN.tokenize(text)
-    assert len(tokens) == 6
+    words = [t.string for t in tokens]
+    assert len(words) == 6
+
 
 #def test_cnts7():
#    text = 'But then the 6,000-year ice age came...'
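Note: the new specials.json data file itself is not part of this diff. Below is a minimal sketch of the rule format that _load_special_tokenization and set_morph_from_dict consume, using only the 'F' (form) and 'L' (lemma) keys exercised by the updated tests; any of the Morphology field names ('number', 'tenspect', 'mood', 'gender', 'person', 'case', 'misc') may also be supplied as integers and default to 0. The entries shown are illustrative, not the shipped data file.

import json

# Hypothetical specials.json content -- a sketch, not the real data file.
# Each chunk maps to a list of per-token property dicts: 'F' is the surface
# form, 'L' an optional lemma; the forms and lemmas here mirror the
# assertions in tests/test_contractions.py.
SPECIALS = {
    "ain't": [{"F": "ai", "L": "be"}, {"F": "n't", "L": "not"}],
    "Ain't": [{"F": "Ai", "L": "be"}, {"F": "n't", "L": "not"}],
    "we'll": [{"F": "we"}, {"F": "'ll", "L": "will"}],
}

if __name__ == '__main__':
    # read_lang_data() loads the file with ujson; plain json shows the same shape.
    print(json.dumps(SPECIALS, indent=4))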