diff --git a/data/en/prefix b/data/en/prefix index 64a3f1f2f..cb9bb4d7b 100644 --- a/data/en/prefix +++ b/data/en/prefix @@ -11,3 +11,8 @@ $ ' `` ` +# +US$ +C$ +A$ +a- diff --git a/data/en/suffix b/data/en/suffix index 77400d0fd..8ba48296d 100644 --- a/data/en/suffix +++ b/data/en/suffix @@ -1,13 +1,13 @@ , -" -) -] -} -* -! -? +\" +\) +\] +\} +\* +\! +\? % -$ +\$ > : ; @@ -16,7 +16,8 @@ $ '' 's 'S -. -.. -... -.... +\.\. +\.\.\. +\.\.\.\. +(?<=[a-z0-9])\. +(?<=[0-9])km diff --git a/data/en/tokenization b/data/en/tokenization index 6bf0d738b..382b7e383 100644 --- a/data/en/tokenization +++ b/data/en/tokenization @@ -4,101 +4,9 @@ #*---* --- #*'s 's -'s 's -'S 'S -ain't are not -aren't are not -can't can not -cannot can not -could've could have -couldn't could not -couldn't've could not have -didn't did not -doesn't does not -don't do not -hadn't had not -hadn't've had not have -hasn't has not -haven't have not -he'd he would -he'd've he would have -he'll he will -he's he 's -how'd he would -how'll he will -how's how 's -I'd I would -I'd've I would have -I'll I will -I'm I am -I'ma I will -I've I have -isn't is not -it'd it would -it'd've it would have -it'll it will -it's it 's -let's let 's -mightn't might not -mightn't've might not have -might've might have -mustn't must not -must've must have -needn't need not -not've not have -shan't shall not -she'd she would -she'd've she would have -she'll she will -she's she 's -should've should have -shouldn't should not -shouldn't've should not have -that's that 's -there'd there would -there'd've there would have -there's there is -they'd there would -they'd've they would have -they'll they will -they're they are -they've they have -wasn't was not -we'd we would -we'd've we would have -we'll we will -we're we are -we've we have -weren't were not -what'll what will -what're what are -what's what 's -what've what have -when's when 's -where'd where would -where's where 's -where've where have -who'd who would -who'll who will -who're who are -who's who 's -who've who have -why'll who will -why're why are -why's why 's -won't will not -would've would have -wouldn't would not -wouldn't've would not have -you'd you would -you'd've you would have -you'll you will -you're you are -you've you have -'em them -'ol old 10km 10 km U.S. U.S. +U.K. U.K. non-U.S. non-U.S. U.N. U.N. Co. Co. @@ -115,7 +23,12 @@ A.G. A.G. Rep. Rep. Ms. Ms. Mr. Mr. +Mrs. Mrs. a.m. a.m. +Sen. Sen. +INC. INC. +CO. CO. +COS. COS. p.m. p.m. Nos. Nos. a.k.a. a.k.a. @@ -127,6 +40,7 @@ E. E. F. F. G. G. H. H. +I. I. J. J. K. K. L. L. @@ -205,6 +119,9 @@ Wash. Wash. W.Va. W.Va. Wis. Wis. Wyo. Wyo. +L.A. L.A. +R.H. R.H. +Gov. Gov. '' '' :) :) <3 <3 @@ -262,3 +179,19 @@ V_V V_V o.O o.O ") ") .... .... +a- a - +Messrs. Messrs. +No. No. +vs. vs. +Gen. Gen. +Cos. Cos. +L.J. L.J. +D.T. D.T. +Prof. Prof. +Bros. Bros. +J.C. J.C. +Neb. Neb. +Adm. Adm. +U.S.S.R. U.S.S.R. +Rev. Rev. +H.F. H.F. diff --git a/docs/source/index.rst b/docs/source/index.rst index 97681bfd8..fb738aa32 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -3,45 +3,228 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. +================================ spaCy NLP Tokenizer and Lexicon ================================ -spaCy is a library for industrial strength NLP in Python. Its core -values are: +spaCy is a library for industrial-strength NLP in Python and Cython. 
spaCy's +take on NLP is that it's mostly about feature extraction --- that's the part +that's specific to NLP, so that's what an NLP library should focus on. -* **Efficiency**: You won't find faster NLP tools. For shallow analysis, it's 10x - faster than Stanford Core NLP, and over 200x faster than NLTK. Its parser is - over 100x faster than Stanford's. +spaCy also believes that for NLP, **efficiency is critical**. If you're +running batch jobs, you probably have an enormous amount of data; if you're +serving requests one-by-one, you want lower latency and fewer servers. Even if +you're doing exploratory research on relatively small samples, you should still +value efficiency, because it means you can run more experiments. -* **Accuracy**: All spaCy tools are within 0.5% of the current published - state-of-the-art, on both news and web text. NLP moves fast, so always check - the numbers --- and don't settle for tools that aren't backed by - rigorous recent evaluation. +Depending on the task, spaCy is between 10 and 200 times faster than NLTK, +often with much better accuracy. See Benchmarks for details, and +Why is spaCy so fast? for a discussion of the algorithms and implementation +that makes this possible. -* **Minimalism**: This isn't a library that covers 43 known algorithms to do X. You - get 1 --- the best one --- with a simple, low-level interface. This keeps the - code-base small and concrete. Our Python APIs use lists and - dictionaries, and our C/Cython APIs use arrays and simple structs. ++---------+----------+-------------+----------+ +| System | Tokenize | --> Counts | --> Stem | ++---------+----------+-------------+----------+ +| spaCy | 1m42s | 1m59s | 1m59s | ++---------+----------+-------------+----------+ +| NLTK | 20m2s | 28m24s | 52m28 | ++---------+----------+-------------+----------+ + +Times for 100m words of text. + + +Unique Lexicon-centric design +============================= + +spaCy helps you build models that generalise better, by making it easy to use +more robust features. Instead of a list of strings, the tokenizer returns +references to rich lexical types. Features which ask about the word's Brown cluster, +its typical part-of-speech tag, how it's usually cased etc require no extra effort: + + >>> from spacy.en import EN + >>> from spacy.feature_names import * + >>> feats = ( + SIC, # ID of the original word form + STEM, # ID of the stemmed word form + CLUSTER, # ID of the word's Brown cluster + IS_TITLE, # Was the word title-cased? + POS_TYPE # A cluster ID describing what POS tags the word is usually assigned + ) + >>> tokens = EN.tokenize(u'Split words, punctuation, emoticons etc.! ^_^') + >>> tokens.to_array(feats)[:5] + array([[ 1, 2, 3, 4], + [...], + [...], + [...]]) + + +spaCy is designed to **make the right thing easy**, where the right thing is to: + +* **Use rich distributional and orthographic features**. Without these, your model + will be very brittle and domain dependent. + +* **Compute features per type, not per token**. Because of Zipf's law, you can + expect this to be exponentially more efficient. + +* **Minimize string processing**, and instead compute with arrays of ID ints. +For the current list of lexical features, see `Lexical Features`_. -Comparison ----------- +.. _lexical features: features.html -+----------------+-------------+--------+---------------+--------------+ -| Tokenize & Tag | Speed (w/s) | Memory | % Acc. (news) | % Acc. 
(web) |
-+----------------+-------------+--------+---------------+--------------+
-| spaCy | 107,000 | 1.3gb | 96.7 | |
-+----------------+-------------+--------+---------------+--------------+
-| Stanford | 8,000 | 1.5gb | 96.7 | |
-+----------------+-------------+--------+---------------+--------------+
-| NLTK | 543 | 61mb | 94.0 | |
-+----------------+-------------+--------+---------------+--------------+
+Tokenization done right
+=======================
+
+Most tokenizers rely on complicated regular expressions. Often, they leave you
+with no way to align the tokens back to the original string --- a vital feature
+if you want to display some mark-up, such as spelling correction. The regular
+expressions also interact, making it hard to accommodate special cases.
+
+spaCy introduces a **novel tokenization algorithm** that's much faster and much
+more flexible:
+
+.. code-block:: python
+
+    def tokenize(string, prefixes={}, suffixes={}, specials={}):
+        '''Sketch of spaCy's tokenization algorithm.'''
+        tokens = []
+        cache = {}
+        for chunk in string.split():
+            # Because of Zipf's law, the cache serves the majority of "chunks".
+            if chunk in cache:
+                tokens.extend(cache[chunk])
+                continue
+            key = chunk
+
+            subtokens = []
+            # Process a chunk by splitting off prefixes e.g. ( " { and suffixes e.g. , . :
+            # If we split one off, check whether we're left with a special-case,
+            # e.g. contractions (can't, won't, etc), emoticons, abbreviations, etc.
+            # This makes the tokenization easy to update and customize.
+            while chunk:
+                prefix, chunk = _consume_prefix(chunk, prefixes)
+                if prefix:
+                    subtokens.append(prefix)
+                    if chunk in specials:
+                        subtokens.extend(specials[chunk])
+                        break
+                suffix, chunk = _consume_suffix(chunk, suffixes)
+                if suffix:
+                    subtokens.append(suffix)
+                    if chunk in specials:
+                        subtokens.extend(specials[chunk])
+                        break
+            cache[key] = subtokens
+
+Your data is going to have its own quirks, so it's really useful to have
+a tokenizer you can easily control. To see the limitations of the standard
+regex-based approach, check out `CMU's recent work on tokenizing tweets `_.
+Despite a lot of careful attention, they can't handle all of their
+known emoticons correctly --- doing so would interfere with the way they
+process other punctuation. This isn't a problem for spaCy: we just add them
+all to the special tokenization rules.
+
+spaCy's tokenizer is also incredibly efficient: it can create an inverted index
+of the 1.8 billion word Gigaword corpus in under half an hour --- on a MacBook
+Air. See the `inverted index tutorial`_.
+
+.. _inverted index tutorial: index_tutorial.html
+
+Comparison with NLTK
+====================
+
+`NLTK `_ provides interfaces to a wide variety of NLP
+tools and resources, and its own implementations of a few algorithms. It comes
+with comprehensive documentation, and a book introducing concepts in NLP. For
+these reasons, it's very widely known. However, if you're trying to make money
+or do cutting-edge research, NLTK is not a good choice.
+
+The `list of stuff in NLTK `_ looks impressive,
+but almost none of it is useful for real work. You're not going to make any money,
+or do top research, by using the NLTK chat bots, theorem provers, toy CCG implementation,
+etc. Most of NLTK is there to assist in the explanation of ideas in computational
+linguistics, at roughly an undergraduate level.
+But it also claims to support serious work, by wrapping external tools.
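+
+For a concrete picture of what "wrapping external tools" means in practice,
+here is a rough, hypothetical sketch of calling the Stanford tagger through
+NLTK's wrapper. It is not code from this repository: the exact class name and
+constructor arguments differ between NLTK versions, and the model and jar
+paths are placeholders.
+
+.. code-block:: python
+
+    from nltk.tag.stanford import POSTagger
+
+    # Looks like an ordinary, native Python API...
+    tagger = POSTagger('models/english-bidirectional-distsim.tagger',
+                       'stanford-postagger.jar')
+    tags = tagger.tag('This is a single document .'.split())
+    # ...but each call communicates with a Java subprocess and reloads the
+    # model, which is where the abstraction leaks (see the case study below).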
+ +In a pretty well known essay, Joel Spolsky discusses the pain of dealing with +`leaky abstractions `_. +An abstraction tells you to not care about implementation +details, but sometimes the implementation matters after all. When it +does, you have to waste time revising your assumptions. + +NLTK's wrappers call external tools via subprocesses, and wrap this up so +that it looks like a native API. This abstraction leaks *a lot*. The system +calls impose far more overhead than a normal Python function call, which makes +the most natural way to program against the API infeasible. + + +Case study: POS tagging +----------------------- + +Here's a quick comparison of the following POS taggers: + +* **Stanford (CLI)**: The Stanford POS tagger, invoked once as a batch process + from the command-line; +* **nltk.tag.stanford**: The Stanford tagger, invoked document-by-document via + NLTK's wrapper; +* **nltk.pos_tag**: NLTK's own POS tagger, invoked document-by-document. +* **spacy.en.pos_tag**: spaCy's POS tagger, invoked document-by-document. + + ++-------------------+-------------+--------+ +| System | Speed (w/s) | % Acc. | ++-------------------+-------------+--------+ +| spaCy | 107,000 | 96.7 | ++-------------------+-------------+--------+ +| Stanford (CLI) | 8,000 | 96.7 | ++-------------------+-------------+--------+ +| nltk.pos_tag | 543 | 94.0 | ++-------------------+-------------+--------+ +| nltk.tag.stanford | 209 | 96.7 | ++-------------------+-------------+--------+ + +Experimental details TODO. Three things are apparent from this comparison: + +1. The native NLTK tagger, nltk.pos_tag, is both slow and inaccurate; + +2. Calling the Stanford tagger document-by-document via NLTK is **40x** slower + than invoking the model once as a batch process, via the command-line; + +3. spaCy is over 10x faster than the Stanford tagger, even when called + **sentence-by-sentence**. + +The problem is that NLTK simply wraps the command-line +interfaces of these tools, so communication is via a subprocess. NLTK does not +even hold open a pipe for you --- the model is reloaded, again and again. + +To use the wrapper effectively, you should batch up your text as much as possible. +This probably isn't how you would like to structure your pipeline, and you +might not be able to batch up much text at all, e.g. if serving a single +request means processing a single document. +Technically, NLTK does give you Python functions to access lots of different +systems --- but, you can't use them as you would expect to use a normal Python +function. The abstraction leaks. + +Here's the bottom-line: the Stanford tools are written in Java, so using them +from Python sucks. You shouldn't settle for this. It's a problem that springs +purely from the tooling, rather than the domain. + +Summary +------- + +NLTK is a well-known Python library for NLP, but for the important bits, you +don't get actual Python modules. You get wrappers which throw to external +tools, via subprocesses. This is not at all the same thing. + +spaCy is implemented in Cython, just like numpy, scikit-learn, lxml and other +high-performance Python libraries. So you get a native Python API, but the +performance you expect from a program written in C. .. 
toctree:: :hidden: :maxdepth: 3 - - what/index.rst - why/index.rst - how/index.rst + + features.rst + license_stories.rst diff --git a/setup.py b/setup.py index c67bed4a1..827d44fc6 100644 --- a/setup.py +++ b/setup.py @@ -10,6 +10,8 @@ import os.path from os import path from glob import glob +import numpy + def clean(ext): for pyx in ext.sources: @@ -34,7 +36,7 @@ compile_args = [] link_args = [] libs = [] -includes = ['.'] +includes = ['.', numpy.get_include()] cython_includes = ['.'] @@ -50,18 +52,20 @@ exts = [ Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes), Extension("spacy.en", ["spacy/en.pyx"], language="c++", include_dirs=includes), Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes), - Extension("spacy.tagger", ["spacy/tagger.pyx"], language="c++", include_dirs=includes), - Extension("spacy.context", ["spacy/context.pyx"], language="c++", include_dirs=includes), Extension("spacy.utf8string", ["spacy/utf8string.pyx"], language="c++", include_dirs=includes), - Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes), - Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes), - Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes), - Extension("spacy.ner.io_moves", ["spacy/ner/io_moves.pyx"], language="c++", include_dirs=includes), + Extension("spacy.index", ["spacy/index.pyx"], language="c++", include_dirs=includes), + Extension("spacy.tagger", ["spacy/tagger.pyx"], language="c++", include_dirs=includes), + Extension("spacy.morphology", ["spacy/morphology.pyx"], language="c++", + include_dirs=includes), + #Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.ner.io_moves", ["spacy/ner/io_moves.pyx"], language="c++", include_dirs=includes), #Extension("spacy.ner.greedy_parser", ["spacy/ner/greedy_parser.pyx"], language="c++", include_dirs=includes), - Extension("spacy.ner.pystate", ["spacy/ner/pystate.pyx"], language="c++", include_dirs=includes), - Extension("spacy.ner.context", ["spacy/ner/context.pyx"], language="c++", include_dirs=includes), - Extension("spacy.ner.feats", ["spacy/ner/feats.pyx"], language="c++", include_dirs=includes), - Extension("spacy.ner.annot", ["spacy/ner/annot.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.ner.pystate", ["spacy/ner/pystate.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.ner.context", ["spacy/ner/context.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.ner.feats", ["spacy/ner/feats.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.ner.annot", ["spacy/ner/annot.pyx"], language="c++", include_dirs=includes), ] diff --git a/spacy/context.pxd b/spacy/context.pxd deleted file mode 100644 index 8f798d347..000000000 --- a/spacy/context.pxd +++ /dev/null @@ -1,66 +0,0 @@ -from thinc.typedefs cimport atom_t -from .typedefs cimport hash_t -from .tokens cimport Tokens -from .lexeme cimport Lexeme - - -cdef class Token: - cdef readonly atom_t sic - cdef readonly atom_t cluster - cdef readonly atom_t norm - cdef readonly atom_t shape - cdef readonly atom_t asciied - cdef readonly atom_t prefix - cdef readonly atom_t 
suffix - cdef readonly atom_t length - - cdef readonly atom_t postype - cdef readonly atom_t nertype - cdef readonly atom_t sensetype - - cdef readonly atom_t is_alpha - cdef readonly atom_t is_ascii - cdef readonly atom_t is_digit - cdef readonly atom_t is_lower - cdef readonly atom_t is_punct - cdef readonly atom_t is_space - cdef readonly atom_t is_title - cdef readonly atom_t is_upper - cdef readonly atom_t like_url - cdef readonly atom_t like_number - cdef readonly atom_t oft_lower - cdef readonly atom_t oft_title - cdef readonly atom_t oft_upper - - cdef readonly atom_t in_males - cdef readonly atom_t in_females - cdef readonly atom_t in_surnames - cdef readonly atom_t in_places - cdef readonly atom_t in_games - cdef readonly atom_t in_celebs - cdef readonly atom_t in_names - - cdef readonly atom_t pos - cdef readonly atom_t sense - cdef readonly atom_t ner - - -cdef class Slots: - cdef readonly Token P4 - cdef readonly Token P3 - cdef readonly Token P2 - cdef readonly Token P1 - cdef readonly Token N0 - cdef readonly Token N1 - cdef readonly Token N2 - cdef readonly Token N3 - cdef readonly Token N4 - - -cdef int N_FIELDS - - -cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1 - - -cpdef Slots FIELD_IDS diff --git a/spacy/context.pyx b/spacy/context.pyx deleted file mode 100644 index aeb78ae5c..000000000 --- a/spacy/context.pyx +++ /dev/null @@ -1,126 +0,0 @@ -from murmurhash.mrmr cimport hash64 -from .lexeme cimport * - - -cdef class Slots: - def __init__(self): - self.P4 = Token() - self.P3 = Token() - self.P2 = Token() - self.P1 = Token() - self.N0 = Token() - self.N1 = Token() - self.N2 = Token() - self.N3 = Token() - self.N4 = Token() - - -cdef void _number_token(Token t, int* n_fields): - cdef int i = n_fields[0] - t.sic = i; i += 1 - t.cluster = i; i += 1 - t.norm = i; i += 1 - t.shape = i; i += 1 - t.prefix = i; i += 1 - t.suffix = i; i += 1 - t.length = i; i += 1 - - t.postype = i; i += 1 - t.nertype = i; i += 1 - t.sensetype = i; i += 1 - - t.is_alpha = i; i += 1 - t.is_ascii = i; i += 1 - t.is_digit = i; i += 1 - t.is_lower = i; i += 1 - t.is_punct = i; i += 1 - t.is_space = i; i += 1 - t.is_title = i; i += 1 - t.is_upper = i; i += 1 - - t.like_number = i; i += 1 - t.like_url = i; i += 1 - - t.oft_lower = i; i += 1 - t.oft_title = i; i += 1 - t.oft_upper = i; i += 1 - - t.in_males = i; i += 1 - t.in_females = i; i += 1 - t.in_surnames = i; i += 1 - t.in_places = i; i += 1 - t.in_games = i; i += 1 - t.in_celebs = i; i += 1 - t.in_names = i; i += 1 - - t.pos = i; i += 1 - t.sense = i; i += 1 - t.ner = i; i += 1 - - n_fields[0] = i - - -cdef int _fill_token(atom_t* c, Token t, Lexeme* lex, atom_t pos, atom_t ner): - c[t.sic] = lex.sic - c[t.cluster] = lex.cluster - c[t.norm] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape - c[t.shape] = lex.shape - c[t.asciied] = lex.asciied - c[t.prefix] = lex.prefix - c[t.suffix] = lex.suffix - c[t.length] = lex.length - - c[t.postype] = lex.postype - c[t.nertype] = 0 - c[t.sensetype] = 0 - - c[t.is_alpha] = lex.flags & (1 << IS_ALPHA) - c[t.is_digit] = lex.flags & (1 << IS_DIGIT) - c[t.is_lower] = lex.flags & (1 << IS_LOWER) - c[t.is_punct] = lex.flags & (1 << IS_PUNCT) - c[t.is_space] = lex.flags & (1 << IS_SPACE) - c[t.is_title] = lex.flags & (1 << IS_TITLE) - c[t.is_upper] = lex.flags & (1 << IS_UPPER) - c[t.like_url] = lex.flags & (1 << LIKE_URL) - c[t.like_number] = lex.flags & (1 << LIKE_NUMBER) - c[t.oft_lower] = lex.flags & (1 << OFT_LOWER) - c[t.oft_title] = lex.flags & (1 << OFT_TITLE) - 
c[t.oft_upper] = lex.flags & (1 << OFT_UPPER) - - c[t.in_males] = lex.flags & (1 << IN_MALES) - c[t.in_females] = lex.flags & (1 << IN_FEMALES) - c[t.in_surnames] = lex.flags & (1 << IN_SURNAMES) - c[t.in_places] = lex.flags & (1 << IN_PLACES) - c[t.in_games] = lex.flags & (1 << IN_GAMES) - c[t.in_celebs] = lex.flags & (1 << IN_CELEBS) - c[t.in_names] = lex.flags & (1 << IN_NAMES) - - c[t.pos] = pos - c[t.sense] = 0 - c[t.ner] = ner - - -cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1: - _fill_token(context, FIELD_IDS.P4, tokens.lex[i-4], tokens.pos[i-4], tokens.ner[i-4]) - _fill_token(context, FIELD_IDS.P3, tokens.lex[i-3], tokens.pos[i-3], tokens.ner[i-3]) - _fill_token(context, FIELD_IDS.P2, tokens.lex[i-2], tokens.pos[i-2], tokens.ner[i-2]) - _fill_token(context, FIELD_IDS.P1, tokens.lex[i-1], tokens.pos[i-1], tokens.ner[i-1]) - _fill_token(context, FIELD_IDS.N0, tokens.lex[i], tokens.pos[i], tokens.ner[i]) - _fill_token(context, FIELD_IDS.N1, tokens.lex[i+1], tokens.pos[i+1], tokens.ner[i+1]) - _fill_token(context, FIELD_IDS.N2, tokens.lex[i+2], tokens.pos[i+2], tokens.ner[i+2]) - _fill_token(context, FIELD_IDS.N3, tokens.lex[i+3], tokens.pos[i+3], tokens.ner[i+3]) - _fill_token(context, FIELD_IDS.N4, tokens.lex[i+4], tokens.pos[i+4], tokens.ner[i+4]) - return 1 - - -N_FIELDS = 0 -FIELD_IDS = Slots() -_number_token(FIELD_IDS.P4, &N_FIELDS) -_number_token(FIELD_IDS.P3, &N_FIELDS) -_number_token(FIELD_IDS.P2, &N_FIELDS) -_number_token(FIELD_IDS.P1, &N_FIELDS) -_number_token(FIELD_IDS.N0, &N_FIELDS) -_number_token(FIELD_IDS.N1, &N_FIELDS) -_number_token(FIELD_IDS.N2, &N_FIELDS) -_number_token(FIELD_IDS.N3, &N_FIELDS) -_number_token(FIELD_IDS.N4, &N_FIELDS) diff --git a/spacy/en.pxd b/spacy/en.pxd index a7c643eba..2ca081e47 100644 --- a/spacy/en.pxd +++ b/spacy/en.pxd @@ -1,5 +1,133 @@ -from spacy.lang cimport Language -from spacy.tokens cimport Tokens +from thinc.typedefs cimport atom_t + +from .lang cimport Language +from .tokens cimport Tokens +from .tokens cimport TokenC + + +cpdef enum en_person_t: + NO_PERSON + FIRST + SECOND + THIRD + NON_THIRD + + +cpdef enum en_number_t: + NO_NUMBER + SINGULAR + PLURAL + MASS + + +cpdef enum en_gender_t: + NO_GENDER + MASCULINE + FEMININE + NEUTER + + +cpdef enum en_case_t: + NO_CASE + NOMINATIVE + GENITIVE + ACCUSATIVE + REFLEXIVE + DEMONYM + + +cpdef enum en_tenspect_t: + NO_TENSE + BASE_VERB + PRESENT + PAST + PASSIVE + ING + MODAL + + +cpdef enum misc_t: + NO_MISC + COMPARATIVE + SUPERLATIVE + RELATIVE + NAME + + +# Flags +cpdef enum FlagID: + IS_ALPHA + IS_ASCII + IS_DIGIT + IS_LOWER + IS_PUNCT + IS_SPACE + IS_TITLE + IS_UPPER + + LIKE_URL + LIKE_NUMBER + + OFT_LOWER + OFT_TITLE + OFT_UPPER + + IN_MALES + IN_FEMALES + IN_SURNAMES + IN_PLACES + IN_GAMES + IN_CELEBS + IN_NAMES + + +cpdef enum: + P2_sic + P2_cluster + P2_shape + P2_prefix + P2_suffix + P2_pos + P2_lemma + P2_pos_type + + P1_sic + P1_cluster + P1_shape + P1_prefix + P1_suffix + P1_pos + P1_lemma + P1_pos_type + + W_sic + W_cluster + W_shape + W_prefix + W_suffix + W_pos + W_lemma + W_pos_type + + N1_sic + N1_cluster + N1_shape + N1_prefix + N1_suffix + N1_pos + N1_lemma + N1_pos_type + + N2_sic + N2_cluster + N2_shape + N2_prefix + N2_suffix + N2_pos + N2_lemma + N2_pos_type + + N_CONTEXT_FIELDS cdef class English(Language): diff --git a/spacy/en.pyx b/spacy/en.pyx index 95c1cbd94..3ed0eaaa9 100644 --- a/spacy/en.pyx +++ b/spacy/en.pyx @@ -30,14 +30,101 @@ same scheme. Tokenization problems are a major cause of poor performance for NLP tools. 
If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module provides a fully Penn Treebank 3-compliant tokenizer. ''' -# TODO -#The script translate_treebank_tokenization can be used to transform a treebank's -#annotation to use one of the spacy tokenization schemes. - - from __future__ import unicode_literals cimport lang +from .typedefs cimport flags_t +import orth +from .morphology cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB +from .morphology cimport X, PUNCT, EOL + +from .tokens cimport Morphology + + +POS_TAGS = { + 'NULL': (NO_TAG, {}), + 'EOL': (EOL, {}), + 'CC': (CONJ, {}), + 'CD': (NUM, {}), + 'DT': (DET, {}), + 'EX': (DET, {}), + 'FW': (X, {}), + 'IN': (ADP, {}), + 'JJ': (ADJ, {}), + 'JJR': (ADJ, {'misc': COMPARATIVE}), + 'JJS': (ADJ, {'misc': SUPERLATIVE}), + 'LS': (X, {}), + 'MD': (VERB, {'tenspect': MODAL}), + 'NN': (NOUN, {}), + 'NNS': (NOUN, {'number': PLURAL}), + 'NNP': (NOUN, {'misc': NAME}), + 'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}), + 'PDT': (DET, {}), + 'POS': (PRT, {'case': GENITIVE}), + 'PRP': (NOUN, {}), + 'PRP$': (NOUN, {'case': GENITIVE}), + 'RB': (ADV, {}), + 'RBR': (ADV, {'misc': COMPARATIVE}), + 'RBS': (ADV, {'misc': SUPERLATIVE}), + 'RP': (PRT, {}), + 'SYM': (X, {}), + 'TO': (PRT, {}), + 'UH': (X, {}), + 'VB': (VERB, {}), + 'VBD': (VERB, {'tenspect': PAST}), + 'VBG': (VERB, {'tenspect': ING}), + 'VBN': (VERB, {'tenspect': PASSIVE}), + 'VBP': (VERB, {'tenspect': PRESENT}), + 'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}), + 'WDT': (DET, {'misc': RELATIVE}), + 'WP': (PRON, {'misc': RELATIVE}), + 'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}), + 'WRB': (ADV, {'misc': RELATIVE}), + '!': (PUNCT, {}), + '#': (PUNCT, {}), + '$': (PUNCT, {}), + "''": (PUNCT, {}), + "(": (PUNCT, {}), + ")": (PUNCT, {}), + "-LRB-": (PUNCT, {}), + "-RRB-": (PUNCT, {}), + ".": (PUNCT, {}), + ",": (PUNCT, {}), + "``": (PUNCT, {}), + ":": (PUNCT, {}), + "?": (PUNCT, {}), +} + + +POS_TEMPLATES = ( + (W_sic,), + (P1_lemma, P1_pos), + (P2_lemma, P2_pos), + (N1_sic,), + (N2_sic,), + + (W_suffix,), + (W_prefix,), + + (P1_pos,), + (P2_pos,), + (P1_pos, P2_pos), + (P1_pos, W_sic), + (P1_suffix,), + (N1_suffix,), + + (W_shape,), + (W_cluster,), + (N1_cluster,), + (N2_cluster,), + (P1_cluster,), + (P2_cluster,), + + (W_pos_type,), + (N1_pos_type,), + (N1_pos_type,), + (P1_pos, W_pos_type, N1_pos_type), +) cdef class English(Language): @@ -47,7 +134,68 @@ cdef class English(Language): name (unicode): The two letter code used by Wikipedia for the language. lexicon (Lexicon): The lexicon. Exposes the lookup method. 
""" - pass + def get_props(self, unicode string): + return {'flags': self.set_flags(string), 'dense': orth.word_shape(string)} + + def set_flags(self, unicode string): + cdef flags_t flags = 0 + flags |= orth.is_alpha(string) << IS_ALPHA + flags |= orth.is_ascii(string) << IS_ASCII + flags |= orth.is_digit(string) << IS_DIGIT + flags |= orth.is_lower(string) << IS_LOWER + flags |= orth.is_punct(string) << IS_PUNCT + flags |= orth.is_space(string) << IS_SPACE + flags |= orth.is_title(string) << IS_TITLE + flags |= orth.is_upper(string) << IS_UPPER + + flags |= orth.like_url(string) << LIKE_URL + flags |= orth.like_number(string) << LIKE_NUMBER + return flags + + def set_pos(self, Tokens tokens): + cdef int i + cdef atom_t[N_CONTEXT_FIELDS] context + cdef TokenC* t = tokens.data + assert self.morphologizer is not None + cdef dict tagdict = self.pos_tagger.tagdict + for i in range(tokens.length): + if t[i].lex.sic in tagdict: + t[i].pos = tagdict[t[i].lex.sic] + else: + fill_pos_context(context, i, t) + t[i].pos = self.pos_tagger.predict(context) + self.morphologizer.set_morph(i, t) + + def train_pos(self, Tokens tokens, golds): + cdef int i + cdef atom_t[N_CONTEXT_FIELDS] context + c = 0 + cdef TokenC* t = tokens.data + for i in range(tokens.length): + fill_pos_context(context, i, t) + t[i].pos = self.pos_tagger.predict(context, [golds[i]]) + self.morphologizer.set_morph(i, t) + c += t[i].pos == golds[i] + return c + + +cdef int fill_pos_context(atom_t* context, const int i, const TokenC* tokens) except -1: + _fill_from_token(&context[P2_sic], &tokens[i-2]) + _fill_from_token(&context[P1_sic], &tokens[i-1]) + _fill_from_token(&context[W_sic], &tokens[i]) + _fill_from_token(&context[N1_sic], &tokens[i+1]) + _fill_from_token(&context[N2_sic], &tokens[i+2]) + + +cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: + context[0] = t.lex.sic + context[1] = t.lex.cluster + context[2] = t.lex.shape + context[3] = t.lex.prefix + context[4] = t.lex.suffix + context[5] = t.pos + context[6] = t.lemma + context[7] = t.lex.pos_type EN = English('en') diff --git a/spacy/lang.pxd b/spacy/lang.pxd index 68f1ee58a..20986f134 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -1,38 +1,38 @@ from libcpp.vector cimport vector +from cpython cimport Py_UNICODE_ISSPACE, Py_UNICODE_ISALPHA, Py_UNICODE_ISUPPER + from preshed.maps cimport PreshMap from cymem.cymem cimport Pool from .typedefs cimport hash_t -from .tokens cimport Tokens +from .tokens cimport Tokens, TokenC from .lexeme cimport Lexeme from .tagger cimport Tagger -from .ner.greedy_parser cimport NERParser -from .utf8string cimport StringStore +from .utf8string cimport StringStore, UniStr +from .morphology cimport Morphologizer -cdef extern from "Python.h": - cdef bint Py_UNICODE_ISSPACE(Py_UNICODE ch) - cdef bint Py_UNICODE_ISALNUM(Py_UNICODE ch) - cdef bint Py_UNICODE_ISALPHA(Py_UNICODE ch) - cdef bint Py_UNICODE_ISUPPER(Py_UNICODE ch) +cdef union LexemesOrTokens: + const Lexeme* const* lexemes + TokenC* tokens -cdef struct String: - Py_UNICODE* chars - size_t n - hash_t key +cdef struct Cached: + LexemesOrTokens data + bint is_lex + int length cdef class Lexicon: + cpdef public get_lex_props cdef Pool mem - cpdef readonly size_t size cpdef readonly StringStore strings cdef vector[Lexeme*] lexemes - cdef Lexeme* get(self, String* s) except NULL + cdef const Lexeme* get(self, Pool mem, UniStr* s) except NULL - cdef PreshMap _dict + cdef PreshMap _map cdef class Language: @@ -41,9 +41,8 @@ cdef class Language: cdef PreshMap _cache 
cdef PreshMap _specials cpdef readonly Lexicon lexicon - cpdef readonly Tagger pos_tagger - cpdef readonly NERParser ner_tagger + cpdef readonly Morphologizer morphologizer cdef object _prefix_re cdef object _suffix_re @@ -52,13 +51,14 @@ cdef class Language: cpdef Tokens tokens_from_list(self, list strings) cpdef Tokens tokenize(self, unicode text) - cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1 - cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes, + cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1 + cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1 + cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except NULL - cdef int _attach_tokens(self, Tokens tokens, int idx, String* string, + cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string, vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except -1 cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1 cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1 cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1 - cdef int _save_cached(self, Lexeme** tokens, hash_t key, int n) except -1 + cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1 diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 79a84e936..4617c3853 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -18,13 +18,14 @@ from preshed.maps cimport PreshMap from .lexeme cimport Lexeme from .lexeme cimport EMPTY_LEXEME from .lexeme cimport init as lexeme_init +from .lexeme cimport check_flag + +from .utf8string cimport slice_unicode from . import util from .util import read_lang_data from .tokens import Tokens - -from .tagger cimport Tagger -from .ner.greedy_parser cimport NERParser +from .tokens cimport Morphology cdef class Language: @@ -37,29 +38,30 @@ cdef class Language: self._prefix_re = re.compile(prefix) self._suffix_re = re.compile(suffix) self._infix_re = re.compile(infix) - self.lexicon = Lexicon() - if path.exists(path.join(util.DATA_DIR, name, 'lexemes')): - self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes')) - self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings')) + self.lexicon = Lexicon(self.get_props) self._load_special_tokenization(rules) - if path.exists(path.join(util.DATA_DIR, name, 'pos')): - self.pos_tagger = Tagger(path.join(util.DATA_DIR, name, 'pos')) - else: - self.pos_tagger = None - if path.exists(path.join(util.DATA_DIR, name, 'ner')): - self.ner_tagger = NERParser(path.join(util.DATA_DIR, name, 'ner')) + self.pos_tagger = None + self.morphologizer = None + + def load(self): + self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes')) + self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings')) + if path.exists(path.join(util.DATA_DIR, self.name, 'pos')): + self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos')) + self.morphologizer = Morphologizer(self.lexicon.strings, + path.join(util.DATA_DIR, self.name)) cpdef Tokens tokens_from_list(self, list strings): cdef int length = sum([len(s) for s in strings]) - cdef Tokens tokens = Tokens(self.lexicon.strings, length) + cdef Tokens tokens = Tokens(self, length) if length == 0: return tokens - cdef String string_struct + cdef UniStr string_struct cdef unicode py_string cdef int idx = 0 for i, py_string in enumerate(strings): - string_from_unicode(&string_struct, py_string) - 
tokens.push_back(idx, self.lexicon.get(&string_struct)) + slice_unicode(&string_struct, py_string, 0, len(py_string)) + tokens.push_back(idx, self.lexicon.get(tokens.mem, &string_struct)) idx += len(py_string) + 1 return tokens @@ -79,22 +81,21 @@ cdef class Language: tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes. """ cdef int length = len(string) - cdef Tokens tokens = Tokens(self.lexicon.strings, length) + cdef Tokens tokens = Tokens(self, length) if length == 0: return tokens cdef int i = 0 cdef int start = 0 + cdef bint cache_hit cdef Py_UNICODE* chars = string cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0]) - cdef String span + cdef UniStr span for i in range(1, length): if Py_UNICODE_ISSPACE(chars[i]) != in_ws: if start < i: - string_slice(&span, chars, start, i) - lexemes = self._cache.get(span.key) - if lexemes != NULL: - tokens.extend(start, lexemes, 0) - else: + slice_unicode(&span, chars, start, i) + cache_hit = self._try_cache(start, span.key, tokens) + if not cache_hit: self._tokenize(tokens, &span, start, i) in_ws = not in_ws start = i @@ -102,15 +103,27 @@ cdef class Language: start += 1 i += 1 if start < i: - string_slice(&span, chars, start, i) - lexemes = self._cache.get(span.key) - if lexemes != NULL: - tokens.extend(start, lexemes, 0) - else: + slice_unicode(&span, chars, start, i) + cache_hit = self._try_cache(start, span.key, tokens) + if not cache_hit: self._tokenize(tokens, &span, start, i) return tokens - cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1: + cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1: + #cached = self._specials.get(key) + cached = self._cache.get(key) + if cached == NULL: + return False + cdef int i + if cached.is_lex: + for i in range(cached.length): + idx = tokens.push_back(idx, cached.data.lexemes[i]) + else: + for i in range(cached.length): + idx = tokens.push_back(idx, &cached.data.tokens[i]) + return True + + cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1: cdef vector[Lexeme*] prefixes cdef vector[Lexeme*] suffixes cdef hash_t orig_key @@ -119,88 +132,95 @@ cdef class Language: orig_size = tokens.length self._split_affixes(span, &prefixes, &suffixes) self._attach_tokens(tokens, start, span, &prefixes, &suffixes) - self._save_cached(&tokens.lex[orig_size], orig_key, tokens.length - orig_size) + self._save_cached(&tokens.data[orig_size], orig_key, tokens.length - orig_size) - cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes, - vector[Lexeme*] *suffixes) except NULL: + cdef UniStr* _split_affixes(self, UniStr* string, vector[const Lexeme*] *prefixes, + vector[const Lexeme*] *suffixes) except NULL: cdef size_t i - cdef String prefix - cdef String suffix - cdef String minus_pre - cdef String minus_suf + cdef UniStr prefix + cdef UniStr suffix + cdef UniStr minus_pre + cdef UniStr minus_suf cdef size_t last_size = 0 while string.n != 0 and string.n != last_size: last_size = string.n pre_len = self._find_prefix(string.chars, string.n) if pre_len != 0: - string_slice(&prefix, string.chars, 0, pre_len) - string_slice(&minus_pre, string.chars, pre_len, string.n) + slice_unicode(&prefix, string.chars, 0, pre_len) + slice_unicode(&minus_pre, string.chars, pre_len, string.n) # Check whether we've hit a special-case if minus_pre.n >= 1 and self._specials.get(minus_pre.key) != NULL: string[0] = minus_pre - prefixes.push_back(self.lexicon.get(&prefix)) + prefixes.push_back(self.lexicon.get(self.lexicon.mem, 
&prefix)) break suf_len = self._find_suffix(string.chars, string.n) if suf_len != 0: - string_slice(&suffix, string.chars, string.n - suf_len, string.n) - string_slice(&minus_suf, string.chars, 0, string.n - suf_len) + slice_unicode(&suffix, string.chars, string.n - suf_len, string.n) + slice_unicode(&minus_suf, string.chars, 0, string.n - suf_len) # Check whether we've hit a special-case if minus_suf.n >= 1 and self._specials.get(minus_suf.key) != NULL: string[0] = minus_suf - suffixes.push_back(self.lexicon.get(&suffix)) + suffixes.push_back(self.lexicon.get(self.lexicon.mem, &suffix)) break if pre_len and suf_len and (pre_len + suf_len) <= string.n: - string_slice(string, string.chars, pre_len, string.n - suf_len) - prefixes.push_back(self.lexicon.get(&prefix)) - suffixes.push_back(self.lexicon.get(&suffix)) + slice_unicode(string, string.chars, pre_len, string.n - suf_len) + prefixes.push_back(self.lexicon.get(self.lexicon.mem, &prefix)) + suffixes.push_back(self.lexicon.get(self.lexicon.mem, &suffix)) elif pre_len: string[0] = minus_pre - prefixes.push_back(self.lexicon.get(&prefix)) + prefixes.push_back(self.lexicon.get(self.lexicon.mem, &prefix)) elif suf_len: string[0] = minus_suf - suffixes.push_back(self.lexicon.get(&suffix)) + suffixes.push_back(self.lexicon.get(self.lexicon.mem, &suffix)) if self._specials.get(string.key): break return string - cdef int _attach_tokens(self, Tokens tokens, - int idx, String* string, - vector[Lexeme*] *prefixes, - vector[Lexeme*] *suffixes) except -1: + cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string, + vector[const Lexeme*] *prefixes, + vector[const Lexeme*] *suffixes) except -1: + cdef bint cache_hit cdef int split - cdef Lexeme** lexemes + cdef const Lexeme* const* lexemes cdef Lexeme* lexeme - cdef String span + cdef UniStr span + cdef int i if prefixes.size(): - idx = tokens.extend(idx, prefixes.data(), prefixes.size()) + for i in range(prefixes.size()): + idx = tokens.push_back(idx, prefixes[0][i]) if string.n != 0: - - lexemes = self._cache.get(string.key) - if lexemes != NULL: - idx = tokens.extend(idx, lexemes, 0) + cache_hit = self._try_cache(idx, string.key, tokens) + if cache_hit: + idx = tokens.data[tokens.length - 1].idx + 1 else: split = self._find_infix(string.chars, string.n) if split == 0 or split == -1: - idx = tokens.push_back(idx, self.lexicon.get(string)) + idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, string)) else: - string_slice(&span, string.chars, 0, split) - idx = tokens.push_back(idx, self.lexicon.get(&span)) - string_slice(&span, string.chars, split, split+1) - idx = tokens.push_back(idx, self.lexicon.get(&span)) - string_slice(&span, string.chars, split + 1, string.n) - idx = tokens.push_back(idx, self.lexicon.get(&span)) - cdef vector[Lexeme*].reverse_iterator it = suffixes.rbegin() + slice_unicode(&span, string.chars, 0, split) + idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, &span)) + slice_unicode(&span, string.chars, split, split+1) + idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, &span)) + slice_unicode(&span, string.chars, split + 1, string.n) + idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, &span)) + cdef vector[const Lexeme*].reverse_iterator it = suffixes.rbegin() while it != suffixes.rend(): idx = tokens.push_back(idx, deref(it)) preinc(it) - cdef int _save_cached(self, Lexeme** tokens, hash_t key, int n) except -1: - lexemes = self.mem.alloc(n + 1, sizeof(Lexeme**)) + cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except 
-1: cdef int i for i in range(n): - lexemes[i] = tokens[i] - lexemes[i + 1] = NULL - self._cache.set(key, lexemes) + if tokens[i].lex.id == 1: + return 0 + cached = self.mem.alloc(1, sizeof(Cached)) + cached.length = n + cached.is_lex = True + lexemes = self.mem.alloc(n, sizeof(Lexeme**)) + for i in range(n): + lexemes[i] = tokens[i].lex + cached.data.lexemes = lexemes + self._cache.set(key, cached) cdef int _find_infix(self, Py_UNICODE* chars, size_t length) except -1: cdef unicode string = chars[:length] @@ -217,66 +237,120 @@ cdef class Language: match = self._suffix_re.search(string) return (match.end() - match.start()) if match is not None else 0 - def _load_special_tokenization(self, token_rules): - '''Load special-case tokenization rules. - - Loads special-case tokenization rules into the Language._cache cache, - read from data//tokenization . The special cases are loaded before - any language data is tokenized, giving these priority. For instance, - the English tokenization rules map "ain't" to ["are", "not"]. - - Args: - token_rules (list): A list of (chunk, tokens) pairs, where chunk is - a string and tokens is a list of strings. + def _load_special_tokenization(self, object rules): + '''Add a special-case tokenization rule. ''' + cdef int i + cdef unicode chunk + cdef list substrings + cdef unicode form + cdef unicode lemma + cdef dict props cdef Lexeme** lexemes cdef hash_t hashed - cdef String string - for uni_string, substrings in token_rules: - lexemes = self.mem.alloc(len(substrings) + 1, sizeof(Lexeme*)) - for i, substring in enumerate(substrings): - string_from_unicode(&string, substring) - lexemes[i] = self.lexicon.get(&string) - lexemes[i + 1] = NULL - string_from_unicode(&string, uni_string) - self._specials.set(string.key, lexemes) - self._cache.set(string.key, lexemes) + cdef UniStr string + for chunk, substrings in sorted(rules.items()): + tokens = self.mem.alloc(len(substrings) + 1, sizeof(TokenC)) + for i, props in enumerate(substrings): + form = props['F'] + lemma = props.get("L", None) + slice_unicode(&string, form, 0, len(form)) + tokens[i].lex = self.lexicon.get(self.lexicon.mem, &string) + if lemma: + tokens[i].lemma = self.lexicon.strings[lemma] + set_morph_from_dict(&tokens[i].morph, props) + cached = self.mem.alloc(1, sizeof(Cached)) + cached.length = len(substrings) + cached.is_lex = False + cached.data.tokens = tokens + slice_unicode(&string, chunk, 0, len(chunk)) + self._specials.set(string.key, cached) + self._cache.set(string.key, cached) + + +cdef int set_morph_from_dict(Morphology* morph, dict props) except -1: + morph.number = props.get('number', 0) + morph.tenspect = props.get('tenspect', 0) + morph.mood = props.get('mood', 0) + morph.gender = props.get('gender', 0) + morph.person = props.get('person', 0) + morph.case = props.get('case', 0) + morph.misc = props.get('misc', 0) cdef class Lexicon: - def __init__(self): + '''A map container for a language's Lexeme structs. + + Also interns UTF-8 strings, and maps them to consecutive integer IDs. 
+ ''' + def __init__(self, object get_props): self.mem = Pool() - self._dict = PreshMap(2 ** 20) + self._map = PreshMap(2 ** 20) self.strings = StringStore() self.lexemes.push_back(&EMPTY_LEXEME) - self.size = 1 + self.get_lex_props = get_props - cdef Lexeme* get(self, String* string) except NULL: + def __len__(self): + return self.lexemes.size() + + cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL: + '''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme + if necessary, using memory acquired from the given pool. If the pool + is the lexicon's own memory, the lexeme is saved in the lexicon.''' cdef Lexeme* lex - lex = self._dict.get(string.key) + lex = self._map.get(string.key) if lex != NULL: return lex - lex = self.mem.alloc(sizeof(Lexeme), 1) - lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key, self.strings, {}) - self._dict.set(string.key, lex) - while self.lexemes.size() < (lex.id + 1): - self.lexemes.push_back(&EMPTY_LEXEME) - self.lexemes[lex.id] = lex - self.size += 1 + if string.n < 3: + mem = self.mem + cdef unicode py_string = string.chars[:string.n] + lex = mem.alloc(sizeof(Lexeme), 1) + lex[0] = lexeme_init(self.lexemes.size(), py_string, string.key, self.strings, + self.get_lex_props(py_string)) + if mem is self.mem: + self._map.set(string.key, lex) + while self.lexemes.size() < (lex.id + 1): + self.lexemes.push_back(&EMPTY_LEXEME) + self.lexemes[lex.id] = lex + else: + lex[0].id = 1 return lex def __getitem__(self, id_or_string): + '''Retrieve a lexeme, given an int ID or a unicode string. If a previously + unseen unicode string is given, a new Lexeme is created and stored. + + This function relies on Cython's struct-to-dict conversion. Python clients + receive a dict keyed by strings (byte or unicode, depending on Python 2/3), + with int values. Cython clients can instead receive a Lexeme struct value. + More efficient Cython access is provided by Lexicon.get, which returns + a Lexeme*. + + Args: + id_or_string (int or unicode): The integer ID of a word, or its unicode + string. If an int >= Lexicon.size, IndexError is raised. + If id_or_string is neither an int nor a unicode string, ValueError + is raised. + + Returns: + lexeme (dict): A Lexeme struct instance, which Cython translates into + a dict if the operator is called from Python. + ''' if type(id_or_string) == int: + if id_or_string >= self.lexemes.size(): + raise IndexError return self.lexemes.at(id_or_string)[0] - cdef String string - string_from_unicode(&string, id_or_string) - cdef Lexeme* lexeme = self.get(&string) + cdef UniStr string + slice_unicode(&string, id_or_string, 0, len(id_or_string)) + cdef const Lexeme* lexeme = self.get(self.mem, &string) return lexeme[0] def __setitem__(self, unicode uni_string, dict props): - cdef String s - string_from_unicode(&s, uni_string) - cdef Lexeme* lex = self.get(&s) + cdef UniStr s + slice_unicode(&s, uni_string, 0, len(uni_string)) + # Cast through the const here, since we're allowed to change our own + # Lexemes. 
+ lex = self.get(self.mem, &s) lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props) def dump(self, loc): @@ -287,11 +361,11 @@ cdef class Lexicon: assert fp != NULL cdef size_t st cdef hash_t key - for i in range(self._dict.length): - key = self._dict.c_map.cells[i].key + for i in range(self._map.length): + key = self._map.c_map.cells[i].key if key == 0: continue - lexeme = self._dict.c_map.cells[i].value + lexeme = self._map.c_map.cells[i].value st = fwrite(&key, sizeof(key), 1, fp) assert st == 1 st = fwrite(lexeme, sizeof(Lexeme), 1, fp) @@ -300,7 +374,8 @@ cdef class Lexicon: assert st == 0 def load(self, loc): - assert path.exists(loc) + if not path.exists(loc): + raise IOError('Lexemes file not found at %s' % loc) cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc cdef FILE* fp = fopen(bytes_loc, 'rb') assert fp != NULL @@ -316,21 +391,9 @@ cdef class Lexicon: st = fread(lexeme, sizeof(Lexeme), 1, fp) if st != 1: break - self._dict.set(key, lexeme) + self._map.set(key, lexeme) while self.lexemes.size() < (lexeme.id + 1): self.lexemes.push_back(&EMPTY_LEXEME) self.lexemes[lexeme.id] = lexeme i += 1 - self.size += 1 fclose(fp) - - -cdef void string_from_unicode(String* s, unicode uni): - cdef Py_UNICODE* c_uni = uni - string_slice(s, c_uni, 0, len(uni)) - - -cdef inline void string_slice(String* s, Py_UNICODE* chars, int start, int end) nogil: - s.chars = &chars[start] - s.n = end - start - s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py new file mode 100644 index 000000000..ce9bbefdc --- /dev/null +++ b/spacy/lemmatizer.py @@ -0,0 +1,90 @@ +from os import path + + +NOUN_RULES = ( + ('s', ''), + ('ses', 's'), + ('ves', 'f'), + ('xes', 'x'), + ('zes', 'z'), + ('ches', 'ch'), + ('shes', 'sh'), + ('men', 'man'), + ('ies', 'y') +) + + +VERB_RULES = ( + ("s", ""), + ("ies", "y"), + ("es", "e"), + ("es", ""), + ("ed", "e"), + ("ed", ""), + ("ing", "e"), + ("ing", "") +) + + +ADJ_RULES = ( + ("er", ""), + ("est", ""), + ("er", "e"), + ("est", "e") +) + + +class Lemmatizer(object): + def __init__(self, wn_dict_dir): + self.index = {} + self.exc = {} + for pos in ['adj', 'adv', 'noun', 'verb']: + self.index[pos] = read_index(path.join(wn_dict_dir, 'index.%s' % pos)) + self.exc[pos] = read_exc(path.join(wn_dict_dir, '%s.exc' % pos)) + + def noun(self, string): + return lemmatize(string, self.index['noun'], self.exc['noun'], NOUN_RULES) + + def verb(self, string): + return lemmatize(string, self.index['verb'], self.exc['verb'], VERB_RULES) + + def adj(self, string): + return lemmatize(string, self.index['adj'], self.exc['adj'], ADJ_RULES) + + +def lemmatize(string, index, exceptions, rules): + string = string.lower() + forms = [] + if string in index: + forms.append(string) + forms.extend(exceptions.get(string, [])) + for old, new in rules: + if string.endswith(old): + form = string[:len(string) - len(old)] + new + if form in index: + forms.append(form) + if not forms: + forms.append(string) + return set(forms) + + +def read_index(loc): + index = set() + for line in open(loc): + if line.startswith(' '): + continue + pieces = line.split() + word = pieces[0] + if word.count('_') == 0: + index.add(word) + return index + + +def read_exc(loc): + exceptions = {} + for line in open(loc): + if line.startswith(' '): + continue + pieces = line.split() + exceptions[pieces[0]] = tuple(pieces[1:]) + return exceptions diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 0d7d206e5..a6f20906b 100644 --- 
a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -1,61 +1,137 @@ -from .typedefs cimport hash_t, utf8_t, flag_t, id_t, len_t, tag_t +from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t from .utf8string cimport StringStore -from libc.stdint cimport uint16_t -cpdef flag_t OOV_DIST_FLAGS -# Flags -cpdef enum: - IS_ALPHA - IS_ASCII - IS_DIGIT - IS_LOWER - IS_PUNCT - IS_SPACE - IS_TITLE - IS_UPPER +# Reserve 64 values for flag features +cpdef enum attr_id_t: + FLAG0 + FLAG1 + FLAG2 + FLAG3 + FLAG4 + FLAG5 + FLAG6 + FLAG7 + FLAG8 + FLAG9 + FLAG10 + FLAG11 + FLAG12 + FLAG13 + FLAG14 + FLAG15 + FLAG16 + FLAG17 + FLAG18 + FLAG19 + FLAG20 + FLAG21 + FLAG22 + FLAG23 + FLAG24 + FLAG25 + FLAG26 + FLAG27 + FLAG28 + FLAG29 + FLAG30 + FLAG31 + FLAG32 + FLAG33 + FLAG34 + FLAG35 + FLAG36 + FLAG37 + FLAG38 + FLAG39 + FLAG40 + FLAG41 + FLAG42 + FLAG43 + FLAG44 + FLAG45 + FLAG46 + FLAG47 + FLAG48 + FLAG49 + FLAG50 + FLAG51 + FLAG52 + FLAG53 + FLAG54 + FLAG55 + FLAG56 + FLAG57 + FLAG58 + FLAG59 + FLAG60 + FLAG61 + FLAG62 + FLAG63 - LIKE_URL - LIKE_NUMBER + ID + SIC + DENSE + SHAPE + PREFIX + SUFFIX - OFT_LOWER - OFT_TITLE - OFT_UPPER - - IN_MALES - IN_FEMALES - IN_SURNAMES - IN_PLACES - IN_GAMES - IN_CELEBS - IN_NAMES + LENGTH + CLUSTER + POS_TYPE + LEMMA cdef struct Lexeme: - flag_t flags + flags_t flags - id_t id - id_t sic - id_t norm - id_t shape - id_t asciied - id_t prefix - id_t suffix + attr_t id + attr_t sic + attr_t dense + attr_t shape + attr_t prefix + attr_t suffix + + attr_t length + attr_t cluster + attr_t pos_type float prob - - len_t length - tag_t cluster - tag_t postype - tag_t supersense + float sentiment cdef Lexeme EMPTY_LEXEME -cpdef Lexeme init(id_t i, unicode string, hash_t hashed, - StringStore store, dict props) except * + +cpdef Lexeme init(id_t i, unicode string, hash_t hashed, StringStore store, + dict props) except * -cdef inline bint check_flag(Lexeme* lexeme, size_t flag_id) nogil: +cdef inline bint check_flag(const Lexeme* lexeme, attr_id_t flag_id) nogil: return lexeme.flags & (1 << flag_id) + + +cdef inline attr_t get_attr(const Lexeme* lex, attr_id_t feat_name) nogil: + if feat_name < (sizeof(flags_t) * 8): + return check_flag(lex, feat_name) + elif feat_name == ID: + return lex.id + elif feat_name == SIC: + return lex.sic + elif feat_name == DENSE: + return lex.dense + elif feat_name == SHAPE: + return lex.shape + elif feat_name == PREFIX: + return lex.prefix + elif feat_name == SUFFIX: + return lex.suffix + elif feat_name == LENGTH: + return lex.length + elif feat_name == CLUSTER: + return lex.cluster + elif feat_name == POS_TYPE: + return lex.pos_type + else: + return 0 diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 64eb699a6..f1974cbc9 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -6,67 +6,25 @@ from libc.string cimport memset import orth -from .utf8string cimport Utf8Str - -OOV_DIST_FLAGS = 0 memset(&EMPTY_LEXEME, 0, sizeof(Lexeme)) -def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc): - cdef flag_t flags = 0 - flags |= orth.is_alpha(string) << IS_ALPHA - flags |= orth.is_ascii(string) << IS_ASCII - flags |= orth.is_digit(string) << IS_DIGIT - flags |= orth.is_lower(string) << IS_LOWER - flags |= orth.is_punct(string) << IS_PUNCT - flags |= orth.is_space(string) << IS_SPACE - flags |= orth.is_title(string) << IS_TITLE - flags |= orth.is_upper(string) << IS_UPPER - - flags |= orth.like_url(string) << LIKE_URL - flags |= orth.like_number(string) << LIKE_NUMBER - return flags - - cpdef Lexeme init(id_t i, unicode 
string, hash_t hashed, - StringStore store, dict props) except *: + StringStore string_store, dict props) except *: cdef Lexeme lex lex.id = i lex.length = len(string) - lex.sic = get_string_id(string, store) + lex.sic = string_store[string] lex.cluster = props.get('cluster', 0) - lex.postype = props.get('postype', 0) - lex.supersense = props.get('supersense', 0) + lex.pos_type = props.get('pos_type', 0) lex.prob = props.get('prob', 0) - cdef float upper_pc = props.get('upper_pc', 0.0) - cdef float lower_pc = props.get('lower_pc', 0.0) - cdef float title_pc = props.get('title_pc', 0.0) - - lex.prefix = get_string_id(string[0], store) - lex.suffix = get_string_id(string[-3:], store) - if upper_pc or lower_pc or title_pc: - canon_cased = orth.canon_case(string, upper_pc, title_pc, lower_pc) - lex.norm = get_string_id(canon_cased, store) - else: - lex.norm = lex.sic - lex.shape = get_string_id(orth.word_shape(string), store) - lex.asciied = get_string_id(orth.asciied(string), store) - lex.flags = get_flags(string, upper_pc, title_pc, lower_pc) - - lex.flags |= props.get('in_males', 0) << IN_MALES - lex.flags |= props.get('in_females', 0) << IN_FEMALES - lex.flags |= props.get('in_surnames', 0) << IN_SURNAMES - lex.flags |= props.get('in_places', 0) << IN_PLACES - lex.flags |= props.get('in_celebs', 0) << IN_CELEBS - lex.flags |= props.get('in_games', 0) << IN_GAMES - lex.flags |= props.get('in_names', 0) << IN_NAMES + lex.prefix = string_store[string[:1]] + lex.suffix = string_store[string[-3:]] + lex.shape = string_store[orth.word_shape(string)] + lex.dense = string_store[props['dense']] + + lex.flags = props.get('flags', 0) return lex - - -cdef id_t get_string_id(unicode string, StringStore store) except 0: - cdef bytes byte_string = string.encode('utf8') - cdef Utf8Str* orig_str = store.intern(byte_string, len(byte_string)) - return orig_str.i diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd new file mode 100644 index 000000000..9c5d342e9 --- /dev/null +++ b/spacy/morphology.pxd @@ -0,0 +1,45 @@ + +from .tokens cimport TokenC +from .lexeme cimport Lexeme +from .utf8string cimport StringStore +from .typedefs cimport id_t, Morphology + +from preshed.maps cimport PreshMapArray +from cymem.cymem cimport Pool + + +# Google universal tag set +cpdef enum univ_tag_t: + NO_TAG + ADJ + ADV + ADP + CONJ + DET + NOUN + NUM + PRON + PRT + VERB + X + PUNCT + EOL + N_UNIV_TAGS + + +cdef struct PosTag: + Morphology morph + int id + univ_tag_t pos + + +cdef class Morphologizer: + cdef Pool mem + cdef StringStore strings + cdef object lemmatizer + cdef PosTag* tags + cdef readonly list tag_names + + cdef PreshMapArray _cache + cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1 + cdef int set_morph(self, const int i, TokenC* tokens) except -1 diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx new file mode 100644 index 000000000..346c778a9 --- /dev/null +++ b/spacy/morphology.pyx @@ -0,0 +1,117 @@ +# cython: profile=True +# cython: embedsignature=True +from os import path +import json + +from .lemmatizer import Lemmatizer +from .typedefs cimport id_t + +UNIV_TAGS = { + 'NULL': NO_TAG, + 'ADJ': ADJ, + 'ADV': ADV, + 'ADP': ADP, + 'CONJ': CONJ, + 'DET': DET, + 'NOUN': NOUN, + 'NUM': NUM, + 'PRON': PRON, + 'PRT': PRT, + 'VERB': VERB, + 'X': X, + '.': PUNCT, + 'EOL': EOL +} + + +cdef struct _Cached: + Morphology morph + int lemma + + +cdef class Morphologizer: + """Given a POS tag and a Lexeme, find its lemma and morphological analysis. 
+ """ + def __init__(self, StringStore strings, data_dir): + self.mem = Pool() + self.strings = strings + cfg = json.load(open(path.join(data_dir, 'pos', 'config.json'))) + tag_map = cfg['tag_map'] + self.tag_names = cfg['tag_names'] + self.lemmatizer = Lemmatizer(path.join(data_dir, '..', 'wordnet')) + self._cache = PreshMapArray(len(self.tag_names)) + self.tags = self.mem.alloc(len(self.tag_names), sizeof(PosTag)) + for i, tag in enumerate(self.tag_names): + pos, props = tag_map[tag] + self.tags[i].id = i + self.tags[i].pos = pos + self.tags[i].morph.number = props.get('number', 0) + self.tags[i].morph.tenspect = props.get('tenspect', 0) + self.tags[i].morph.mood = props.get('mood', 0) + self.tags[i].morph.gender = props.get('gender', 0) + self.tags[i].morph.person = props.get('person', 0) + self.tags[i].morph.case = props.get('case', 0) + self.tags[i].morph.misc = props.get('misc', 0) + if path.exists(path.join(data_dir, 'morphs.json')): + with open(path.join(data_dir, 'morphs.json')) as file_: + self.load_exceptions(json.load(file_)) + + cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1: + if self.lemmatizer is None: + return lex.sic + if pos != NOUN and pos != VERB and pos != ADJ: + return lex.sic + cdef bytes py_string = self.strings[lex.sic] + cdef set lemma_strings + cdef bytes lemma_string + if pos == NOUN: + lemma_strings = self.lemmatizer.noun(py_string) + elif pos == VERB: + lemma_strings = self.lemmatizer.verb(py_string) + else: + assert pos == ADJ + lemma_strings = self.lemmatizer.adj(py_string) + lemma_string = sorted(lemma_strings)[0] + lemma = self.strings.intern(lemma_string, len(lemma_string)).i + return lemma + + cdef int set_morph(self, const int i, TokenC* tokens) except -1: + cdef const PosTag* tag = &self.tags[tokens[i].pos] + cached = <_Cached*>self._cache.get(tag.id, tokens[i].lex.sic) + if cached is NULL: + cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) + cached.lemma = self.lemmatize(tag.pos, tokens[i].lex) + cached.morph = tag.morph + self._cache.set(tag.id, tokens[i].lex.sic, cached) + + tokens[i].lemma = cached.lemma + tokens[i].morph = cached.morph + + def load_exceptions(self, dict exc): + cdef unicode pos_str + cdef unicode form_str + cdef unicode lemma_str + cdef dict entries + cdef dict props + cdef int lemma + cdef id_t sic + cdef univ_tag_t pos + for pos_str, entries in exc.items(): + pos = self.tag_names.index(pos_str) + for form_str, props in entries.items(): + lemma_str = props.get('L', form_str) + sic = self.strings[form_str] + cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) + cached.lemma = self.strings[lemma_str] + set_morph_from_dict(&cached.morph, props) + self._cache.set(pos, sic, cached) + + +cdef int set_morph_from_dict(Morphology* morph, dict props) except -1: + morph.number = props.get('number', 0) + morph.tenspect = props.get('tenspect', 0) + morph.mood = props.get('mood', 0) + morph.gender = props.get('gender', 0) + morph.person = props.get('person', 0) + morph.case = props.get('case', 0) + morph.misc = props.get('misc', 0) diff --git a/spacy/orth.py b/spacy/orth.py index 0462d15df..2400b38a6 100644 --- a/spacy/orth.py +++ b/spacy/orth.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import unicodedata from unidecode import unidecode +import re import math diff --git a/spacy/pos_util.py b/spacy/pos_util.py index e5716665e..489f03dde 100644 --- a/spacy/pos_util.py +++ b/spacy/pos_util.py @@ -147,6 +147,7 @@ Y PRT Z NOUN ^ NOUN ~ X -`` .""".strip().split('\n')) +`` . 
+EOL EOL""".strip().split('\n')) return mapping[tag] diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd index 11d8d2a4c..33732f987 100644 --- a/spacy/tagger.pxd +++ b/spacy/tagger.pxd @@ -1,34 +1,23 @@ +from libc.stdint cimport uint8_t + from cymem.cymem cimport Pool from thinc.learner cimport LinearModel from thinc.features cimport Extractor from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t -from .typedefs cimport hash_t -from .context cimport Slots +from preshed.maps cimport PreshMapArray + +from .typedefs cimport hash_t, id_t from .tokens cimport Tokens -cpdef enum TagType: - POS - ENTITY - SENSE - - cdef class Tagger: - cpdef int set_tags(self, Tokens tokens) except -1 - cpdef class_t predict(self, int i, Tokens tokens) except 0 - cpdef int tell_answer(self, list gold) except -1 + cdef class_t predict(self, const atom_t* context, object golds=*) except * cpdef readonly Pool mem cpdef readonly Extractor extractor cpdef readonly LinearModel model - cpdef readonly TagType tag_type cpdef readonly list tag_names - - cdef class_t _guess - cdef atom_t* _context - cdef feat_t* _feats - cdef weight_t* _values - cdef weight_t* _scores + cdef dict tagdict diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 428814f70..9890e95e1 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -1,5 +1,4 @@ # cython: profile=True -from __future__ import print_function from __future__ import unicode_literals from __future__ import division @@ -10,155 +9,59 @@ import random import json import cython - -from .context cimport fill_context -from .context cimport N_FIELDS - -from thinc.features cimport ConjFeat +from thinc.features cimport Feature, count_feats -NULL_TAG = 0 - - -def setup_model_dir(tag_type, tag_names, templates, model_dir): +def setup_model_dir(tag_names, tag_map, tag_counts, templates, model_dir): if path.exists(model_dir): shutil.rmtree(model_dir) os.mkdir(model_dir) config = { - 'tag_type': tag_type, 'templates': templates, 'tag_names': tag_names, + 'tag_map': tag_map, + 'tag_counts': tag_counts, } with open(path.join(model_dir, 'config.json'), 'w') as file_: json.dump(config, file_) -def train(train_sents, model_dir, nr_iter=10): - cdef Tokens tokens - tagger = Tagger(model_dir) - for _ in range(nr_iter): - n_corr = 0 - total = 0 - for tokens, golds in train_sents: - assert len(tokens) == len(golds), [t.string for t in tokens] - for i in range(tokens.length): - if tagger.tag_type == POS: - gold = _get_gold_pos(i, golds, tokens.pos) - elif tagger.tag_type == ENTITY: - gold = _get_gold_ner(i, golds, tokens.ner) - guess = tagger.predict(i, tokens) - tokens.set_tag(i, tagger.tag_type, guess) - if gold is not None: - tagger.tell_answer(gold) - total += 1 - n_corr += guess in gold - #print('%s\t%d\t%d' % (tokens[i].string, guess, gold)) - print('%.4f' % ((n_corr / total) * 100)) - random.shuffle(train_sents) - tagger.model.end_training() - tagger.model.dump(path.join(model_dir, 'model')) - - -cdef object _get_gold_pos(i, golds, int* pred): - if golds[i] == 0: - return None - else: - return [golds[i]] - - -cdef object _get_gold_ner(i, golds, int* ner): - if golds[i] == 0: - return None - else: - return [golds[i]] - - -def evaluate(tagger, sents): - n_corr = 0 - total = 0 - for tokens, golds in sents: - for i, gold in enumerate(golds): - guess = tagger.predict(i, tokens) - tokens.set_tag(i, tagger.tag_type, guess) - if gold != NULL_TAG: - total += 1 - n_corr += guess == gold - return n_corr / total - - cdef class Tagger: - """Assign part-of-speech, named entity or supersense tags, using 
greedy - decoding. The tagger reads its model and configuration from disk. + """Predict some type of tag, using greedy decoding. The tagger reads its + model and configuration from disk. """ def __init__(self, model_dir): self.mem = Pool() cfg = json.load(open(path.join(model_dir, 'config.json'))) templates = cfg['templates'] + univ_counts = {} + cdef unicode tag + cdef unicode univ_tag self.tag_names = cfg['tag_names'] - self.tag_type = cfg['tag_type'] - self.extractor = Extractor(templates, [ConjFeat] * len(templates)) - self.model = LinearModel(len(self.tag_names)) + self.tagdict = _make_tag_dict(cfg['tag_counts']) + self.extractor = Extractor(templates) + self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2) if path.exists(path.join(model_dir, 'model')): self.model.load(path.join(model_dir, 'model')) - self._context = self.mem.alloc(N_FIELDS, sizeof(atom_t)) - self._feats = self.mem.alloc(self.extractor.n+1, sizeof(feat_t)) - self._values = self.mem.alloc(self.extractor.n+1, sizeof(weight_t)) - self._scores = self.mem.alloc(self.model.nr_class, sizeof(weight_t)) - self._guess = NULL_TAG - - cpdef int set_tags(self, Tokens tokens) except -1: - """Assign tags to a Tokens object. - - >>> tokens = EN.tokenize(u'An example sentence.') - >>> assert tokens[0].pos == 'NO_TAG' - >>> EN.pos_tagger.set_tags(tokens) - >>> assert tokens[0].pos == 'DT' - """ - cdef int i - for i in range(tokens.length): - tokens.set_tag(i, self.tag_type, self.predict(i, tokens)) - - cpdef class_t predict(self, int i, Tokens tokens) except 0: - """Predict the tag of tokens[i]. The tagger remembers the features and - prediction, in case you later call tell_answer. + cdef class_t predict(self, atom_t* context, object golds=None) except *: + """Predict the tag of tokens[i]. >>> tokens = EN.tokenize(u'An example sentence.') >>> tag = EN.pos_tagger.predict(0, tokens) >>> assert tag == EN.pos_tagger.tag_id('DT') == 5 """ - fill_context(self._context, i, tokens) - self.extractor.extract(self._feats, self._values, self._context, NULL) - self._guess = self.model.score(self._scores, self._feats, self._values) - return self._guess - - cpdef int tell_answer(self, list golds) except -1: - """Provide the correct tag for the word the tagger was last asked to predict. - During Tagger.predict, the tagger remembers the features and prediction - for the example. These are used to calculate a weight update given the - correct label. 
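The update step that the removed tell_answer method used to perform is now folded into predict itself: when golds is passed, the tagger scores the extracted features, takes the argmax as its guess, and if the guess is not a gold tag it counts the features +1 for the best-scoring gold class and -1 for the guess before calling model.update, as the replacement predict body further below shows. A minimal pure-Python sketch of that rule, with a hypothetical dict-of-dicts weight table and a plain feature list standing in for thinc's Extractor and LinearModel::

    from collections import defaultdict

    # feature -> class -> weight; an illustrative stand-in, not the thinc API
    weights = defaultdict(lambda: defaultdict(float))

    def predict(features, golds=None):
        scores = defaultdict(float)
        for feat in features:
            for clas, weight in weights[feat].items():
                scores[clas] += weight
        guess = max(scores, key=scores.get) if scores else 0
        if golds is not None and guess not in golds:
            # reward the best-scoring gold class, penalise the wrong guess
            best = max(golds, key=lambda clas: scores[clas])
            for feat in features:
                weights[feat][best] += 1.0
                weights[feat][guess] -= 1.0
        return guess
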
- - >>> tokens = EN.tokenize('An example sentence.') - >>> guess = EN.pos_tagger.predict(1, tokens) - >>> JJ = EN.pos_tagger.tag_id('JJ') - >>> JJ - 7 - >>> EN.pos_tagger.tell_answer(JJ) - """ - cdef class_t guess = self._guess - if guess in golds: - self.model.update({}) - return 0 - best_gold = golds[0] - best_score = self._scores[best_gold-1] - for gold in golds[1:]: - if self._scores[gold-1] > best_gold: - best_score = self._scores[best_gold-1] - best_gold = gold - counts = {guess: {}, best_gold: {}} - self.extractor.count(counts[best_gold], self._feats, 1) - self.extractor.count(counts[guess], self._feats, -1) - self.model.update(counts) + cdef int n_feats + cdef Feature* feats = self.extractor.get_feats(context, &n_feats) + cdef weight_t* scores = self.model.get_scores(feats, n_feats) + guess = _arg_max(scores, self.model.nr_class) + if golds is not None and guess not in golds: + best = _arg_max_among(scores, golds) + counts = {guess: {}, best: {}} + count_feats(counts[guess], feats, n_feats, -1) + count_feats(counts[best], feats, n_feats, 1) + self.model.update(counts) + return guess def tag_id(self, object tag_name): """Encode tag_name into a tag ID integer.""" @@ -167,3 +70,41 @@ cdef class Tagger: tag_id = len(self.tag_names) self.tag_names.append(tag_name) return tag_id + + +def _make_tag_dict(counts): + freq_thresh = 20 + ambiguity_thresh = 0.97 + tagdict = {} + cdef atom_t word + cdef atom_t tag + for word_str, tag_freqs in counts.items(): + tag_str, mode = max(tag_freqs.items(), key=lambda item: item[1]) + n = sum(tag_freqs.values()) + word = int(word_str) + tag = int(tag_str) + if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh: + tagdict[word] = tag + return tagdict + + +cdef class_t _arg_max(weight_t* scores, int n_classes) except 9000: + cdef int best = 0 + cdef weight_t score = scores[best] + cdef int i + for i in range(1, n_classes): + if scores[i] >= score: + score = scores[i] + best = i + return best + + +cdef class_t _arg_max_among(weight_t* scores, list classes): + cdef int best = classes[0] + cdef weight_t score = scores[best] + cdef class_t clas + for clas in classes: + if scores[clas] > score: + score = scores[clas] + best = clas + return best diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index d1b2ef10b..43aa7b442 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -1,40 +1,55 @@ +import numpy as np +cimport numpy as np + from cymem.cymem cimport Pool +from thinc.typedefs cimport atom_t from .lexeme cimport Lexeme -from .typedefs cimport flag_t -from .utf8string cimport StringStore -from .tagger cimport TagType -from thinc.typedefs cimport atom_t +from .typedefs cimport flags_t +from .typedefs cimport Morphology +from .lang cimport Language + + + +cdef struct TokenC: + const Lexeme* lex + Morphology morph + int idx + int pos + int lemma + int sense + + +ctypedef const Lexeme* const_Lexeme_ptr +ctypedef TokenC* TokenC_ptr + +ctypedef fused LexemeOrToken: + const_Lexeme_ptr + TokenC_ptr cdef class Tokens: cdef Pool mem - cdef StringStore _string_store + cdef Language lang + cdef list tag_names - cdef Lexeme** _lex_ptr - cdef int* _idx_ptr - cdef int* _pos_ptr - cdef int* _ner_ptr - cdef Lexeme** lex - cdef int* idx - cdef int* pos - cdef int* ner + cdef TokenC* data cdef int length cdef int max_length - cdef int extend(self, int i, Lexeme** lexemes, int n) except -1 - cdef int push_back(self, int i, Lexeme* lexeme) except -1 - cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1 + cdef int push_back(self, int i, LexemeOrToken 
lex_or_tok) except -1 + + cpdef np.ndarray[long, ndim=2] get_array(self, list features) cdef class Token: - cdef StringStore _string_store + cdef public Language lang cdef public int i cdef public int idx - cdef public int pos - cdef public int ner + cdef int pos + cdef int lemma cdef public atom_t id cdef public atom_t cluster @@ -51,4 +66,4 @@ cdef class Token: cdef public float prob - cdef public flag_t flags + cdef public flags_t flags diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 721e6bb80..617feb269 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -1,7 +1,15 @@ # cython: profile=True +from preshed.maps cimport PreshMap +from preshed.counter cimport PreshCounter + from .lexeme cimport * cimport cython -from .tagger cimport POS, ENTITY + +import numpy as np +cimport numpy as np + +POS = 0 +ENTITY = 0 DEF PADDING = 5 @@ -17,23 +25,13 @@ cdef class Tokens: """A sequence of references to Lexeme objects. The Tokens class provides fast and memory-efficient access to lexical features, - and can efficiently export the data to a numpy array. Specific languages - create their own Tokens subclasses, to provide more convenient access to - language-specific features. + and can efficiently export the data to a numpy array. >>> from spacy.en import EN >>> tokens = EN.tokenize('An example sentence.') - >>> tokens.string(0) - 'An' - >>> tokens.prob(0) > tokens.prob(1) - True - >>> tokens.can_noun(0) - False - >>> tokens.can_noun(1) - True """ - def __init__(self, StringStore string_store, string_length=0): - self._string_store = string_store + def __init__(self, Language lang, string_length=0): + self.lang = lang if string_length >= 3: size = int(string_length / 3.0) else: @@ -42,28 +40,18 @@ cdef class Tokens: # Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds # However, we need to remember the true starting places, so that we can # realloc. 
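That is, the buffer is over-allocated by PADDING slots on each side, every slot starts out pointing at the empty sentinel lexeme, and the pointer the rest of the class uses is offset forward by PADDING, so feature code can read a few tokens to the left of position 0 or past the end and simply see the sentinel rather than stray memory; _realloc then has to rewind by PADDING to recover the true allocation start, as the code below does. A rough pure-Python analogue of the layout, with illustrative names only::

    PADDING = 5
    EMPTY = object()   # stands in for &EMPTY_LEXEME

    def alloc_padded(size):
        # the true allocation, with PADDING sentinel slots on either side
        true_start = [EMPTY] * (size + 2 * PADDING)

        def data(i):
            # like self.data = data_start + PADDING in the Cython code
            return true_start[PADDING + i]

        return true_start, data

    true_start, data = alloc_padded(8)
    assert data(-PADDING) is EMPTY          # reading left of token 0 is safe
    assert data(8 + PADDING - 1) is EMPTY   # and so is reading past the end
    # To grow the buffer, realloc must be handed true_start, i.e. the data
    # pointer rewound by PADDING, and the offset is then applied again.
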
- self._lex_ptr = self.mem.alloc(size + (PADDING*2), sizeof(Lexeme*)) - self._idx_ptr = self.mem.alloc(size + (PADDING*2), sizeof(int)) - self._pos_ptr = self.mem.alloc(size + (PADDING*2), sizeof(int)) - self._ner_ptr = self.mem.alloc(size + (PADDING*2), sizeof(int)) - self.lex = self._lex_ptr - self.idx = self._idx_ptr - self.pos = self._pos_ptr - self.ner = self._ner_ptr + data_start = self.mem.alloc(size + (PADDING*2), sizeof(TokenC)) cdef int i for i in range(size + (PADDING*2)): - self.lex[i] = &EMPTY_LEXEME - self.lex += PADDING - self.idx += PADDING - self.pos += PADDING - self.ner += PADDING + data_start[i].lex = &EMPTY_LEXEME + self.data = data_start + PADDING self.max_length = size self.length = 0 def __getitem__(self, i): bounds_check(i, self.length, PADDING) - return Token(self._string_store, i, self.idx[i], self.pos[i], self.ner[i], - self.lex[i][0]) + return Token(self.lang, i, self.data[i].idx, self.data[i].pos, + self.data[i].lemma, self.data[i].lex[0]) def __iter__(self): for i in range(self.length): @@ -72,70 +60,78 @@ cdef class Tokens: def __len__(self): return self.length - cdef int push_back(self, int idx, Lexeme* lexeme) except -1: + cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1: if self.length == self.max_length: self._realloc(self.length * 2) - self.lex[self.length] = lexeme - self.idx[self.length] = idx - self.pos[self.length] = 0 - self.ner[self.length] = 0 - self.length += 1 - return idx + lexeme.length - - cdef int extend(self, int idx, Lexeme** lexemes, int n) except -1: - cdef int i - if lexemes == NULL: - return idx - elif n == 0: - i = 0 - while lexemes[i] != NULL: - idx = self.push_back(idx, lexemes[i]) - i += 1 + cdef TokenC* t = &self.data[self.length] + if LexemeOrToken is TokenC_ptr: + t[0] = lex_or_tok[0] else: - for i in range(n): - idx = self.push_back(idx, lexemes[i]) - return idx + t.lex = lex_or_tok + self.length += 1 + return idx + t.lex.length - cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1: - if tag_type == POS: - self.pos[i] = tag - elif tag_type == ENTITY: - self.ner[i] = tag + @cython.boundscheck(False) + cpdef np.ndarray[long, ndim=2] get_array(self, list attr_ids): + cdef int i, j + cdef attr_id_t feature + cdef np.ndarray[long, ndim=2] output + output = np.ndarray(shape=(self.length, len(attr_ids)), dtype=int) + for i in range(self.length): + for j, feature in enumerate(attr_ids): + output[i, j] = get_attr(self.data[i].lex, feature) + return output + + def count_by(self, attr_id_t attr_id): + cdef int i + cdef attr_t attr + cdef size_t count + + cdef PreshCounter counts = PreshCounter(2 ** 8) + for i in range(self.length): + if attr_id == LEMMA: + attr = self.data[i].lemma + else: + attr = get_attr(self.data[i].lex, attr_id) + counts.inc(attr, 1) + return dict(counts) def _realloc(self, new_size): self.max_length = new_size n = new_size + (PADDING * 2) - self._lex_ptr = self.mem.realloc(self._lex_ptr, n * sizeof(Lexeme*)) - self._idx_ptr = self.mem.realloc(self._idx_ptr, n * sizeof(int)) - self._pos_ptr = self.mem.realloc(self._pos_ptr, n * sizeof(int)) - self._ner_ptr = self.mem.realloc(self._ner_ptr, n * sizeof(int)) - self.lex = self._lex_ptr + PADDING - self.idx = self._idx_ptr + PADDING - self.pos = self._pos_ptr + PADDING - self.ner = self._ner_ptr + PADDING + # What we're storing is a "padded" array. We've jumped forward PADDING + # places, and are storing the pointer to that. This way, we can access + # words out-of-bounds, and get out-of-bounds markers. 
+ # Now that we want to realloc, we need the address of the true start, + # so we jump the pointer back PADDING places. + cdef TokenC* data_start = self.data - PADDING + data_start = self.mem.realloc(data_start, n * sizeof(TokenC)) + self.data = data_start + PADDING + cdef int i for i in range(self.length, self.max_length + PADDING): - self.lex[i] = &EMPTY_LEXEME + self.data[i].lex = &EMPTY_LEXEME @cython.freelist(64) cdef class Token: - def __init__(self, StringStore string_store, int i, int idx, int pos, int ner, - dict lex): - self._string_store = string_store + def __init__(self, Language lang, int i, int idx, + int pos, int lemma, dict lex): + self.lang = lang self.idx = idx self.pos = pos - self.ner = ner self.i = i self.id = lex['id'] + + self.lemma = lemma self.cluster = lex['cluster'] self.length = lex['length'] - self.postype = lex['postype'] - self.sensetype = lex['supersense'] + self.postype = lex['pos_type'] + self.sensetype = 0 self.sic = lex['sic'] - self.norm = lex['norm'] + self.norm = lex['dense'] self.shape = lex['shape'] - self.suffix = lex['asciied'] + self.suffix = lex['suffix'] self.prefix = lex['prefix'] self.prob = lex['prob'] @@ -145,5 +141,16 @@ cdef class Token: def __get__(self): if self.sic == 0: return '' - cdef bytes utf8string = self._string_store[self.sic] + cdef bytes utf8string = self.lang.lexicon.strings[self.sic] return utf8string.decode('utf8') + + property lemma: + def __get__(self): + if self.lemma == 0: + return self.string + cdef bytes utf8string = self.lang.lexicon.strings[self.lemma] + return utf8string.decode('utf8') + + property pos: + def __get__(self): + return self.lang.pos_tagger.tag_names[self.pos] diff --git a/spacy/typedefs.pxd b/spacy/typedefs.pxd index 21818f05e..02d327b72 100644 --- a/spacy/typedefs.pxd +++ b/spacy/typedefs.pxd @@ -1,8 +1,20 @@ from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t +from libc.stdint cimport uint8_t ctypedef uint64_t hash_t ctypedef char* utf8_t -ctypedef uint64_t flag_t +ctypedef uint32_t attr_t +ctypedef uint64_t flags_t ctypedef uint32_t id_t ctypedef uint16_t len_t ctypedef uint16_t tag_t + + +cdef struct Morphology: + uint8_t number + uint8_t tenspect # Tense/aspect/voice + uint8_t mood + uint8_t gender + uint8_t person + uint8_t case + uint8_t misc diff --git a/spacy/utf8string.pxd b/spacy/utf8string.pxd index 82ae50022..5ef4113d5 100644 --- a/spacy/utf8string.pxd +++ b/spacy/utf8string.pxd @@ -1,5 +1,6 @@ from preshed.maps cimport PreshMap from cymem.cymem cimport Pool +from murmurhash.mrmr cimport hash64 from .typedefs cimport utf8_t, id_t, hash_t @@ -11,11 +12,23 @@ cdef struct Utf8Str: int length +cdef struct UniStr: + Py_UNICODE* chars + size_t n + hash_t key + + +cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end) nogil: + s.chars = &chars[start] + s.n = end - start + s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0) + + cdef class StringStore: cdef Pool mem - cdef PreshMap table + cdef PreshMap _map cdef Utf8Str* strings cdef int size cdef int _resize_at - cdef Utf8Str* intern(self, char* chars, int length) except NULL + cdef const Utf8Str* intern(self, char* chars, int length) except NULL diff --git a/spacy/utf8string.pyx b/spacy/utf8string.pyx index 18d4a4e5e..1d2b7a264 100644 --- a/spacy/utf8string.pyx +++ b/spacy/utf8string.pyx @@ -5,10 +5,11 @@ import codecs SEPARATOR = '\n|-SEP-|\n' + cdef class StringStore: def __init__(self): self.mem = Pool() - self.table = PreshMap() + self._map = PreshMap() self._resize_at = 10000 self.strings = 
self.mem.alloc(self._resize_at, sizeof(Utf8Str)) self.size = 1 @@ -17,26 +18,30 @@ cdef class StringStore: def __get__(self): return self.size-1 - def __getitem__(self, string_or_id): + def __getitem__(self, object string_or_id): cdef bytes byte_string - cdef Utf8Str* utf8str - if type(string_or_id) == int or type(string_or_id) == long: + cdef const Utf8Str* utf8str + if isinstance(string_or_id, int) or isinstance(string_or_id, long): if string_or_id < 1 or string_or_id >= self.size: raise IndexError(string_or_id) utf8str = &self.strings[string_or_id] return utf8str.chars[:utf8str.length] - elif type(string_or_id) == bytes: + elif isinstance(string_or_id, bytes): utf8str = self.intern(string_or_id, len(string_or_id)) return utf8str.i + elif isinstance(string_or_id, unicode): + byte_string = string_or_id.encode('utf8') + utf8str = self.intern(byte_string, len(byte_string)) + return utf8str.i else: raise TypeError(type(string_or_id)) - cdef Utf8Str* intern(self, char* chars, int length) except NULL: + cdef const Utf8Str* intern(self, char* chars, int length) except NULL: # 0 means missing, but we don't bother offsetting the index. We waste # slot 0 to simplify the code, because it doesn't matter. assert length != 0 cdef hash_t key = hash64(chars, length * sizeof(char), 0) - cdef void* value = self.table.get(key) + cdef void* value = self._map.get(key) cdef size_t i if value == NULL: if self.size == self._resize_at: @@ -48,7 +53,7 @@ cdef class StringStore: self.strings[i].chars = self.mem.alloc(length, sizeof(char)) memcpy(self.strings[i].chars, chars, length) self.strings[i].length = length - self.table.set(key, self.size) + self._map.set(key, self.size) self.size += 1 else: i = value diff --git a/spacy/util.py b/spacy/util.py index 5062ca6db..1c25aeaf2 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -13,7 +13,8 @@ def utf8open(loc, mode='r'): def read_lang_data(name): data_dir = path.join(DATA_DIR, name) - tokenization = read_tokenization(name) + with open(path.join(data_dir, 'specials.json')) as file_: + tokenization = ujson.load(file_) prefix = read_prefix(data_dir) suffix = read_suffix(data_dir) infix = read_infix(data_dir) @@ -26,12 +27,14 @@ def read_prefix(data_dir): expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()]) return expression + def read_suffix(data_dir): - with utf8open(path.join(data_dir, 'suffix')) as file_: + with utf8open(path.join(data_dir, 'suffix')) as file_: entries = file_.read().split('\n') - expression = '|'.join([re.escape(piece) + '$' for piece in entries if piece.strip()]) + expression = '|'.join([piece + '$' for piece in entries if piece.strip()]) return expression + def read_infix(data_dir): with utf8open(path.join(data_dir, 'infix')) as file_: entries = file_.read().split('\n') diff --git a/tests/test_ner.py b/tests/depr_test_ner.py similarity index 100% rename from tests/test_ner.py rename to tests/depr_test_ner.py diff --git a/tests/test_contractions.py b/tests/test_contractions.py index b7347a617..1e697afd2 100644 --- a/tests/test_contractions.py +++ b/tests/test_contractions.py @@ -20,15 +20,18 @@ def test_apostrophe(): def test_LL(): tokens = EN.tokenize("we'll") assert len(tokens) == 2 - assert tokens[1].string == "will" + assert tokens[1].string == "'ll" + assert tokens[1].lemma == "will" assert tokens[0].string == "we" def test_aint(): tokens = EN.tokenize("ain't") assert len(tokens) == 2 - assert tokens[0].string == "are" - assert tokens[1].string == "not" + assert tokens[0].string == "ai" + assert 
tokens[0].lemma == "be" + assert tokens[1].string == "n't" + assert tokens[1].lemma == "not" def test_capitalized(): @@ -38,4 +41,12 @@ def test_capitalized(): assert len(tokens) == 2 tokens = EN.tokenize("Ain't") assert len(tokens) == 2 - assert tokens[0].string == "Are" + assert tokens[0].string == "Ai" + assert tokens[0].lemma == "be" + + +def test_punct(): + tokens = EN.tokenize("We've") + assert len(tokens) == 2 + tokens = EN.tokenize("``We've") + assert len(tokens) == 3 diff --git a/tests/test_emoticons.py b/tests/test_emoticons.py index 6bb58e661..143be607d 100644 --- a/tests/test_emoticons.py +++ b/tests/test_emoticons.py @@ -27,3 +27,9 @@ def test_tweebo_challenge(): assert tokens[19].string == '")' assert tokens[20].string == ':>' assert tokens[21].string == '....' + + +def test_false_positive(): + text = "example:)" + tokens = EN.tokenize(text) + assert len(tokens) == 3 diff --git a/tests/test_intern.py b/tests/test_intern.py index 63b4b3433..a7a801b05 100644 --- a/tests/test_intern.py +++ b/tests/test_intern.py @@ -19,8 +19,12 @@ def test_save_bytes(sstore): def test_save_unicode(sstore): - with pytest.raises(TypeError): - A_i = sstore['A'] + Hello_i = sstore[u'Hello'] + assert Hello_i == 1 + assert sstore[u'Hello'] == 1 + assert sstore[u'goodbye'] != Hello_i + assert sstore[u'hello'] != Hello_i + assert Hello_i == 1 def test_zero_id(sstore): diff --git a/tests/test_iter_lexicon.py b/tests/test_iter_lexicon.py new file mode 100644 index 000000000..379ebd3bb --- /dev/null +++ b/tests/test_iter_lexicon.py @@ -0,0 +1,15 @@ +import pytest + +from spacy.en import EN + +def test_range_iter(): + EN.load() + for i in range(len(EN.lexicon)): + lex = EN.lexicon[i] + + +def test_iter(): + EN.load() + i = 0 + for lex in EN.lexicon: + i += 1 diff --git a/tests/test_lemmatizer.py b/tests/test_lemmatizer.py new file mode 100644 index 000000000..2047e4d2c --- /dev/null +++ b/tests/test_lemmatizer.py @@ -0,0 +1,34 @@ +from spacy.lemmatizer import Lemmatizer, read_index, read_exc +from spacy.util import DATA_DIR +from os import path + +import pytest + + +def test_read_index(): + wn = path.join(DATA_DIR, 'wordnet') + index = read_index(path.join(wn, 'index.noun')) + assert 'man' in index + assert 'plantes' not in index + assert 'plant' in index + + +def test_read_exc(): + wn = path.join(DATA_DIR, 'wordnet') + exc = read_exc(path.join(wn, 'verb.exc')) + assert exc['was'] == ('be',) + + +@pytest.fixture +def lemmatizer(): + return Lemmatizer(path.join(DATA_DIR, 'wordnet')) + + +def test_noun_lemmas(lemmatizer): + do = lemmatizer.noun + + assert do('aardwolves') == set(['aardwolf']) + assert do('aardwolf') == set(['aardwolf']) + assert do('planets') == set(['planet']) + assert do('ring') == set(['ring']) + assert do('axes') == set(['axis', 'axe', 'ax']) diff --git a/tests/test_lexeme_flags.py b/tests/test_lexeme_flags.py index 10276d8ea..c1fe2d847 100644 --- a/tests/test_lexeme_flags.py +++ b/tests/test_lexeme_flags.py @@ -7,6 +7,7 @@ from spacy.lexeme import * def test_is_alpha(): + EN.load() the = EN.lexicon['the'] assert the['flags'] & (1 << IS_ALPHA) year = EN.lexicon['1999'] @@ -16,6 +17,7 @@ def test_is_alpha(): def test_is_digit(): + EN.load() the = EN.lexicon['the'] assert not the['flags'] & (1 << IS_DIGIT) year = EN.lexicon['1999'] diff --git a/tests/test_rules.py b/tests/test_rules.py deleted file mode 100644 index b19a1c3f1..000000000 --- a/tests/test_rules.py +++ /dev/null @@ -1,11 +0,0 @@ -from spacy import util - - -def test_load_en(): - rules = util.read_tokenization('en') - assert 
len(rules) != 0 - aint = [rule for rule in rules if rule[0] == "ain't"][0] - chunk, pieces = aint - assert chunk == "ain't" - assert pieces[0] == "are" - assert pieces[1] == "not" diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index fb5f78ed7..21d115b9b 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -34,7 +34,7 @@ def test_digits(): def test_contraction(): tokens = EN.tokenize("don't giggle") assert len(tokens) == 3 - assert tokens[1].sic == EN.lexicon["not"]['sic'] + assert tokens[1].sic == EN.lexicon["n't"]['sic'] tokens = EN.tokenize("i said don't!") assert len(tokens) == 5 assert tokens[4].sic == EN.lexicon['!']['sic'] @@ -71,30 +71,39 @@ def test_cnts1(): tokens = EN.tokenize(text) assert len(tokens) == 8 + def test_cnts2(): text = u"""U.N. regulations are not a part of their concern.""" tokens = EN.tokenize(text) assert len(tokens) == 10 + def test_cnts3(): text = u"“Isn't it?”" tokens = EN.tokenize(text) - assert len(tokens) == 6 + words = [t.string for t in tokens] + assert len(words) == 6 + def test_cnts4(): text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """ tokens = EN.tokenize(text) - assert len(tokens) == 15 + words = [t.string for t in tokens] + assert len(words) == 15 + def test_cnts5(): text = """'Me too!', Mr. P. Delaware cried. """ tokens = EN.tokenize(text) assert len(tokens) == 11 + def test_cnts6(): text = u'They ran about 10km.' tokens = EN.tokenize(text) - assert len(tokens) == 6 + words = [t.string for t in tokens] + assert len(words) == 6 + #def test_cnts7(): # text = 'But then the 6,000-year ice age came...'
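test_cnts2 through test_cnts6 all follow the same tokenize-then-count pattern, so they could be collapsed into one parametrized test. A possible sketch; the texts and expected counts are copied from the tests above, and the parametrized form itself is only a suggestion rather than part of this change::

    # -*- coding: utf-8 -*-
    from __future__ import unicode_literals

    import pytest

    from spacy.en import EN


    @pytest.mark.parametrize("text,n_tokens", [
        ("U.N. regulations are not a part of their concern.", 10),
        ("“Isn't it?”", 6),
        ('Yes! "I\'d rather have a walk", Ms. Comble sighed. ', 15),
        ("'Me too!', Mr. P. Delaware cried. ", 11),
        ("They ran about 10km.", 6),
    ])
    def test_token_counts(text, n_tokens):
        tokens = EN.tokenize(text)
        assert len(tokens) == n_tokens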