mirror of https://github.com/explosion/spaCy.git
synced 2025-02-04 21:50:35 +03:00

Commit ca54d58638: Merge setup.py
@@ -11,3 +11,8 @@ $
 '
 ``
 `
+#
+US$
+C$
+A$
+a-
@@ -1,13 +1,13 @@
 ,
-"
-)
-]
-}
-*
-!
-?
+\"
+\)
+\]
+\}
+\*
+\!
+\?
 %
-$
+\$
 >
 :
 ;
@@ -16,7 +16,8 @@ $
 ''
 's
 'S
-.
-..
-...
-....
+\.\.
+\.\.\.
+\.\.\.\.
+(?<=[a-z0-9])\.
+(?<=[0-9])km
@@ -4,101 +4,9 @@
 #*---* ---
 #*'s 's
 
-'s 's
-'S 'S
-ain't are not
-aren't are not
-can't can not
-cannot can not
-could've could have
-couldn't could not
-couldn't've could not have
-didn't did not
-doesn't does not
-don't do not
-hadn't had not
-hadn't've had not have
-hasn't has not
-haven't have not
-he'd he would
-he'd've he would have
-he'll he will
-he's he 's
-how'd he would
-how'll he will
-how's how 's
-I'd I would
-I'd've I would have
-I'll I will
-I'm I am
-I'ma I will
-I've I have
-isn't is not
-it'd it would
-it'd've it would have
-it'll it will
-it's it 's
-let's let 's
-mightn't might not
-mightn't've might not have
-might've might have
-mustn't must not
-must've must have
-needn't need not
-not've not have
-shan't shall not
-she'd she would
-she'd've she would have
-she'll she will
-she's she 's
-should've should have
-shouldn't should not
-shouldn't've should not have
-that's that 's
-there'd there would
-there'd've there would have
-there's there is
-they'd there would
-they'd've they would have
-they'll they will
-they're they are
-they've they have
-wasn't was not
-we'd we would
-we'd've we would have
-we'll we will
-we're we are
-we've we have
-weren't were not
-what'll what will
-what're what are
-what's what 's
-what've what have
-when's when 's
-where'd where would
-where's where 's
-where've where have
-who'd who would
-who'll who will
-who're who are
-who's who 's
-who've who have
-why'll who will
-why're why are
-why's why 's
-won't will not
-would've would have
-wouldn't would not
-wouldn't've would not have
-you'd you would
-you'd've you would have
-you'll you will
-you're you are
-you've you have
-'em them
-'ol old
 10km 10 km
 U.S. U.S.
+U.K. U.K.
 non-U.S. non-U.S.
 U.N. U.N.
 Co. Co.
@@ -115,7 +23,12 @@ A.G. A.G.
 Rep. Rep.
 Ms. Ms.
 Mr. Mr.
+Mrs. Mrs.
 a.m. a.m.
+Sen. Sen.
+INC. INC.
+CO. CO.
+COS. COS.
 p.m. p.m.
 Nos. Nos.
 a.k.a. a.k.a.
@@ -127,6 +40,7 @@ E. E.
 F. F.
 G. G.
 H. H.
+I. I.
 J. J.
 K. K.
 L. L.
@@ -205,6 +119,9 @@ Wash. Wash.
 W.Va. W.Va.
 Wis. Wis.
 Wyo. Wyo.
+L.A. L.A.
+R.H. R.H.
+Gov. Gov.
 '' ''
 :) :)
 <3 <3
@@ -262,3 +179,19 @@ V_V V_V
 o.O o.O
 ") ")
 .... ....
+a- a -
+Messrs. Messrs.
+No. No.
+vs. vs.
+Gen. Gen.
+Cos. Cos.
+L.J. L.J.
+D.T. D.T.
+Prof. Prof.
+Bros. Bros.
+J.C. J.C.
+Neb. Neb.
+Adm. Adm.
+U.S.S.R. U.S.S.R.
+Rev. Rev.
+H.F. H.F.
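The entries above pair a raw chunk with the space-separated token strings it should produce. As an illustration only --- the real loader lives in the Cython tokenizer further down this page, and the function name here is made up --- a rules table in this format could be read like this:

    def load_specials(path):
        """Read 'chunk token token ...' lines into a dict of special-case splits."""
        rules = {}
        with open(path, encoding='utf8') as file_:
            for line in file_:
                fields = line.split()
                if not fields:
                    continue
                chunk, tokens = fields[0], fields[1:]
                rules[chunk] = tokens
        return rules

    # e.g. {'U.K.': ['U.K.'], 'a-': ['a', '-'], 'Gov.': ['Gov.']}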
@@ -3,45 +3,228 @@
 You can adapt this file completely to your liking, but it should at least
 contain the root `toctree` directive.
 
+================================
 spaCy NLP Tokenizer and Lexicon
 ================================
 
-spaCy is a library for industrial strength NLP in Python. Its core
-values are:
-
-* **Efficiency**: You won't find faster NLP tools. For shallow analysis, it's 10x
-  faster than Stanford Core NLP, and over 200x faster than NLTK. Its parser is
-  over 100x faster than Stanford's.
-
-* **Accuracy**: All spaCy tools are within 0.5% of the current published
-  state-of-the-art, on both news and web text. NLP moves fast, so always check
-  the numbers --- and don't settle for tools that aren't backed by
-  rigorous recent evaluation.
-
-* **Minimalism**: This isn't a library that covers 43 known algorithms to do X. You
-  get 1 --- the best one --- with a simple, low-level interface. This keeps the
-  code-base small and concrete. Our Python APIs use lists and
-  dictionaries, and our C/Cython APIs use arrays and simple structs.
+spaCy is a library for industrial-strength NLP in Python and Cython. spaCy's
+take on NLP is that it's mostly about feature extraction --- that's the part
+that's specific to NLP, so that's what an NLP library should focus on.
+
+spaCy also believes that for NLP, **efficiency is critical**. If you're
+running batch jobs, you probably have an enormous amount of data; if you're
+serving requests one by one, you want lower latency and fewer servers. Even if
+you're doing exploratory research on relatively small samples, you should still
+value efficiency, because it means you can run more experiments.
+
+Depending on the task, spaCy is between 10 and 200 times faster than NLTK,
+often with much better accuracy. See Benchmarks for details, and
+Why is spaCy so fast? for a discussion of the algorithms and implementation
+that make this possible.
+
++---------+----------+------------+----------+
+| System  | Tokenize | --> Counts | --> Stem |
++---------+----------+------------+----------+
+| spaCy   | 1m42s    | 1m59s      | 1m59s    |
++---------+----------+------------+----------+
+| NLTK    | 20m2s    | 28m24s     | 52m28s   |
++---------+----------+------------+----------+
+
+Times for 100m words of text.
-Comparison
-----------
-
-+----------------+-------------+--------+---------------+--------------+
-| Tokenize & Tag | Speed (w/s) | Memory | % Acc. (news) | % Acc. (web) |
-+----------------+-------------+--------+---------------+--------------+
-| spaCy          | 107,000     | 1.3gb  | 96.7          |              |
-+----------------+-------------+--------+---------------+--------------+
-| Stanford       | 8,000       | 1.5gb  | 96.7          |              |
-+----------------+-------------+--------+---------------+--------------+
-| NLTK           | 543         | 61mb   | 94.0          |              |
-+----------------+-------------+--------+---------------+--------------+
+Unique Lexicon-centric design
+=============================
+
+spaCy helps you build models that generalise better, by making it easy to use
+more robust features. Instead of a list of strings, the tokenizer returns
+references to rich lexical types. Features which ask about the word's Brown
+cluster, its typical part-of-speech tag, or how it's usually cased require no
+extra effort::
+
+    >>> from spacy.en import EN
+    >>> from spacy.feature_names import *
+    >>> feats = (
+        SIC,       # ID of the original word form
+        STEM,      # ID of the stemmed word form
+        CLUSTER,   # ID of the word's Brown cluster
+        IS_TITLE,  # Was the word title-cased?
+        POS_TYPE   # A cluster ID describing what POS tags the word is usually assigned
+    )
+    >>> tokens = EN.tokenize(u'Split words, punctuation, emoticons etc.! ^_^')
+    >>> tokens.to_array(feats)[:5]
+    array([[ 1,  2,  3,  4],
+           [...],
+           [...],
+           [...]])
+
+spaCy is designed to **make the right thing easy**, where the right thing is to:
+
+* **Use rich distributional and orthographic features**. Without these, your
+  model will be very brittle and domain dependent.
+
+* **Compute features per type, not per token**. Because of Zipf's law, you can
+  expect this to be exponentially more efficient; see the sketch after this
+  section.
+
+* **Minimize string processing**, and instead compute with arrays of ID ints.
+
+For the current list of lexical features, see `Lexical Features`_.
+
+.. _lexical features: features.html
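An aside on the second bullet above ("compute features per type, not per token"): the sketch below is editorial, not spaCy code --- the LexType class and featurise helper are invented names --- but it shows why doing the expensive work once per distinct word form pays off under Zipf's law.

    from collections import Counter

    class LexType:
        """Hypothetical lexical type: computed once per distinct word form."""
        def __init__(self, form):
            self.form = form
            self.lower = form.lower()
            self.is_title = form.istitle()
            # Word shape, e.g. "Apple" -> "Xxxxx", "10km" -> "ddxx"
            self.shape = ''.join(
                'X' if c.isupper() else 'x' if c.islower() else
                'd' if c.isdigit() else c
                for c in form)

    def featurise(words, lexicon):
        out = []
        for w in words:
            # Zipf's law: the number of distinct types grows far more slowly than
            # the number of tokens, so LexType.__init__ is amortised over repeats.
            if w not in lexicon:
                lexicon[w] = LexType(w)
            out.append(lexicon[w])
        return out

    lexicon = {}
    tokens = featurise("the cat sat on the mat near the other cat".split(), lexicon)
    print(len(tokens), "tokens but only", len(lexicon), "types")
    print(Counter(t.shape for t in tokens).most_common(1))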
+Tokenization done right
+=======================
+
+Most tokenizers rely on complicated regular expressions. Often, they leave you
+with no way to align the tokens back to the original string --- a vital feature
+if you want to display some mark-up, such as spelling correction. The regular
+expressions also interact, making it hard to accommodate special cases.
+
+spaCy introduces a **novel tokenization algorithm** that's much faster and much
+more flexible:
+
+.. code-block:: python
+
+    def tokenize(string, prefixes={}, suffixes={}, specials={}):
+        '''Sketch of spaCy's tokenization algorithm.'''
+        tokens = []
+        cache = {}
+        for chunk in string.split():
+            # Because of Zipf's law, the cache serves the majority of "chunks".
+            if chunk in cache:
+                tokens.extend(cache[chunk])
+                continue
+            key = chunk
+
+            subtokens = []
+            # Process a chunk by splitting off prefixes e.g. ( " { and suffixes e.g. , . :
+            # If we split one off, check whether we're left with a special-case,
+            # e.g. contractions (can't, won't, etc), emoticons, abbreviations, etc.
+            # This makes the tokenization easy to update and customize.
+            while chunk:
+                prefix, chunk = _consume_prefix(chunk, prefixes)
+                if prefix:
+                    subtokens.append(prefix)
+                    if chunk in specials:
+                        subtokens.extend(specials[chunk])
+                        break
+                suffix, chunk = _consume_suffix(chunk, suffixes)
+                if suffix:
+                    subtokens.append(suffix)
+                    if chunk in specials:
+                        subtokens.extend(specials[chunk])
+                        break
+            cache[key] = subtokens
+            tokens.extend(subtokens)
+        return tokens
|
Your data is going to have its own quirks, so it's really useful to have
|
||||||
|
a tokenizer you can easily control. To see the limitations of the standard
|
||||||
|
regex-based approach, check out `CMU's recent work on tokenizing tweets <http://www.ark.cs.cmu.edu/TweetNLP/>`_. Despite a lot of careful attention, they can't handle all of their
|
||||||
|
known emoticons correctly --- doing so would interfere with the way they
|
||||||
|
process other punctuation. This isn't a problem for spaCy: we just add them
|
||||||
|
all to the special tokenization rules.
|
||||||
|
|
||||||
|
spaCy's tokenizer is also incredibly efficient:
|
||||||
|
|
||||||
|
spaCy can create an inverted index of the 1.8 billion word Gigaword corpus,
|
||||||
|
in under half an hour --- on a Macbook Air. See the `inverted
|
||||||
|
index tutorial`_.
|
||||||
|
|
||||||
|
.. _inverted index tutorial: index_tutorial.html
|
||||||
|
|
||||||
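The _consume_prefix and _consume_suffix helpers are left undefined in the sketch above. A minimal stand-in --- single-character affix tables and toy specials entries, not the real data files or regular expressions shown earlier on this page --- might look like:

    def _consume_prefix(chunk, prefixes):
        # Split one leading character off if it is a known prefix, e.g. ( " {
        if chunk and chunk[0] in prefixes:
            return chunk[0], chunk[1:]
        return None, chunk

    def _consume_suffix(chunk, suffixes):
        # Split one trailing character off if it is a known suffix, e.g. , . : !
        if chunk and chunk[-1] in suffixes:
            return chunk[-1], chunk[:-1]
        return None, chunk

    PREFIXES = {'(', '"', '{'}
    SUFFIXES = {',', '.', ':', ')', '"', '!'}
    # Emoticons and contractions are just more entries in the specials table:
    SPECIALS = {"can't": ['can', 'not'], ':)': [':)'], '<3': ['<3']}

    print(_consume_prefix('"hello', PREFIXES))   # ('"', 'hello')
    print(_consume_suffix('world!', SUFFIXES))   # ('!', 'world')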
|
Comparison with NLTK
|
||||||
|
====================
|
||||||
|
|
||||||
|
`NLTK <http://nltk.org>`_ provides interfaces to a wide-variety of NLP
|
||||||
|
tools and resources, and its own implementations of a few algorithms. It comes
|
||||||
|
with comprehensive documentation, and a book introducing concepts in NLP. For
|
||||||
|
these reasons, it's very widely known. However, if you're trying to make money
|
||||||
|
or do cutting-edge research, NLTK is not a good choice.
|
||||||
|
|
||||||
|
The `list of stuff in NLTK <http://www.nltk.org/py-modindex.html>`_ looks impressive,
|
||||||
|
but almost none of it is useful for real work. You're not going to make any money,
|
||||||
|
or do top research, by using the NLTK chat bots, theorem provers, toy CCG implementation,
|
||||||
|
etc. Most of NLTK is there to assist in the explanation ideas in computational
|
||||||
|
linguistics, at roughly an undergraduate level.
|
||||||
|
But it also claims to support serious work, by wrapping external tools.
|
||||||
|
|
||||||
|
In a pretty well known essay, Joel Spolsky discusses the pain of dealing with
|
||||||
|
`leaky abstractions <http://www.joelonsoftware.com/articles/LeakyAbstractions.html>`_.
|
||||||
|
An abstraction tells you to not care about implementation
|
||||||
|
details, but sometimes the implementation matters after all. When it
|
||||||
|
does, you have to waste time revising your assumptions.
|
||||||
|
|
||||||
|
NLTK's wrappers call external tools via subprocesses, and wrap this up so
|
||||||
|
that it looks like a native API. This abstraction leaks *a lot*. The system
|
||||||
|
calls impose far more overhead than a normal Python function call, which makes
|
||||||
|
the most natural way to program against the API infeasible.
|
||||||
|
|
||||||
|
|
||||||
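To make that overhead concrete, here is a small editorial benchmark sketch --- it spawns a trivial child process rather than the real Stanford tagger, and the numbers will vary by machine --- comparing an in-process call with a per-document subprocess call:

    import subprocess
    import sys
    import time

    def tag_in_process(text):
        # Stand-in for a native, in-process tagger.
        return [(w, 'NN') for w in text.split()]

    def tag_via_subprocess(text):
        # Stand-in for a wrapper that launches an external tool per document:
        # even a do-nothing child process costs milliseconds before any tagging
        # work can happen.
        proc = subprocess.run(
            [sys.executable, '-c', 'import sys; sys.stdout.write(sys.stdin.read())'],
            input=text, capture_output=True, text=True)
        return [(w, 'NN') for w in proc.stdout.split()]

    doc = "This is a short document ."
    for fn in (tag_in_process, tag_via_subprocess):
        start = time.perf_counter()
        for _ in range(20):
            fn(doc)
        per_call = (time.perf_counter() - start) / 20
        print(f"{fn.__name__}: {per_call * 1000:.2f} ms per call")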
|
Case study: POS tagging
|
||||||
|
-----------------------
|
||||||
|
|
||||||
|
Here's a quick comparison of the following POS taggers:
|
||||||
|
|
||||||
|
* **Stanford (CLI)**: The Stanford POS tagger, invoked once as a batch process
|
||||||
|
from the command-line;
|
||||||
|
* **nltk.tag.stanford**: The Stanford tagger, invoked document-by-document via
|
||||||
|
NLTK's wrapper;
|
||||||
|
* **nltk.pos_tag**: NLTK's own POS tagger, invoked document-by-document.
|
||||||
|
* **spacy.en.pos_tag**: spaCy's POS tagger, invoked document-by-document.
|
||||||
|
|
||||||
|
|
||||||
|
+-------------------+-------------+--------+
|
||||||
|
| System | Speed (w/s) | % Acc. |
|
||||||
|
+-------------------+-------------+--------+
|
||||||
|
| spaCy | 107,000 | 96.7 |
|
||||||
|
+-------------------+-------------+--------+
|
||||||
|
| Stanford (CLI) | 8,000 | 96.7 |
|
||||||
|
+-------------------+-------------+--------+
|
||||||
|
| nltk.pos_tag | 543 | 94.0 |
|
||||||
|
+-------------------+-------------+--------+
|
||||||
|
| nltk.tag.stanford | 209 | 96.7 |
|
||||||
|
+-------------------+-------------+--------+
|
||||||
|
|
||||||
|
Experimental details TODO. Three things are apparent from this comparison:
|
||||||
|
|
||||||
|
1. The native NLTK tagger, nltk.pos_tag, is both slow and inaccurate;
|
||||||
|
|
||||||
|
2. Calling the Stanford tagger document-by-document via NLTK is **40x** slower
|
||||||
|
than invoking the model once as a batch process, via the command-line;
|
||||||
|
|
||||||
|
3. spaCy is over 10x faster than the Stanford tagger, even when called
|
||||||
|
**sentence-by-sentence**.
|
||||||
|
|
||||||
|
The problem is that NLTK simply wraps the command-line
|
||||||
|
interfaces of these tools, so communication is via a subprocess. NLTK does not
|
||||||
|
even hold open a pipe for you --- the model is reloaded, again and again.
|
||||||
|
|
||||||
|
To use the wrapper effectively, you should batch up your text as much as possible.
|
||||||
|
This probably isn't how you would like to structure your pipeline, and you
|
||||||
|
might not be able to batch up much text at all, e.g. if serving a single
|
||||||
|
request means processing a single document.
|
||||||
|
Technically, NLTK does give you Python functions to access lots of different
|
||||||
|
systems --- but, you can't use them as you would expect to use a normal Python
|
||||||
|
function. The abstraction leaks.
|
||||||
|
|
||||||
|
Here's the bottom-line: the Stanford tools are written in Java, so using them
|
||||||
|
from Python sucks. You shouldn't settle for this. It's a problem that springs
|
||||||
|
purely from the tooling, rather than the domain.
|
||||||
|
|
+Summary
+-------
+
+NLTK is a well-known Python library for NLP, but for the important bits, you
+don't get actual Python modules. You get wrappers which throw to external
+tools, via subprocesses. This is not at all the same thing.
+
+spaCy is implemented in Cython, just like numpy, scikit-learn, lxml and other
+high-performance Python libraries. So you get a native Python API, but the
+performance you expect from a program written in C.
+
 .. toctree::
     :hidden:
     :maxdepth: 3
 
-    what/index.rst
-    why/index.rst
-    how/index.rst
+    features.rst
+    license_stories.rst
26  setup.py

@@ -10,6 +10,8 @@ import os.path
 from os import path
 from glob import glob
 
+import numpy
+
 
 def clean(ext):
     for pyx in ext.sources:
@@ -34,7 +36,7 @@ compile_args = []
 link_args = []
 libs = []
 
-includes = ['.']
+includes = ['.', numpy.get_include()]
 cython_includes = ['.']
@@ -50,18 +52,20 @@ exts = [
     Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.en", ["spacy/en.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.tagger", ["spacy/tagger.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.context", ["spacy/context.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.utf8string", ["spacy/utf8string.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.ner.io_moves", ["spacy/ner/io_moves.pyx"], language="c++", include_dirs=includes),
+    Extension("spacy.index", ["spacy/index.pyx"], language="c++", include_dirs=includes),
+    Extension("spacy.tagger", ["spacy/tagger.pyx"], language="c++", include_dirs=includes),
+    Extension("spacy.morphology", ["spacy/morphology.pyx"], language="c++",
+              include_dirs=includes),
+    #Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes),
+    #Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes),
+    #Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes),
+    #Extension("spacy.ner.io_moves", ["spacy/ner/io_moves.pyx"], language="c++", include_dirs=includes),
     #Extension("spacy.ner.greedy_parser", ["spacy/ner/greedy_parser.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.ner.pystate", ["spacy/ner/pystate.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.ner.context", ["spacy/ner/context.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.ner.feats", ["spacy/ner/feats.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.ner.annot", ["spacy/ner/annot.pyx"], language="c++", include_dirs=includes),
+    #Extension("spacy.ner.pystate", ["spacy/ner/pystate.pyx"], language="c++", include_dirs=includes),
+    #Extension("spacy.ner.context", ["spacy/ner/context.pyx"], language="c++", include_dirs=includes),
+    #Extension("spacy.ner.feats", ["spacy/ner/feats.pyx"], language="c++", include_dirs=includes),
+    #Extension("spacy.ner.annot", ["spacy/ner/annot.pyx"], language="c++", include_dirs=includes),
 ]
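For readers outside this code base: the include-path change above is the standard pattern for building Cython extensions that use numpy's C headers. A minimal, self-contained example of the same pattern (the package and module names are placeholders, and Cython and numpy must be installed):

    # Minimal illustration of the include-path change above: any Cython extension
    # that cimports numpy needs numpy.get_include() on its include path.
    import numpy
    from setuptools import setup, Extension
    from Cython.Build import cythonize

    ext = Extension(
        "mypkg.fast_ops",                    # placeholder module name
        ["mypkg/fast_ops.pyx"],
        language="c++",
        include_dirs=[".", numpy.get_include()],
    )

    setup(name="mypkg", ext_modules=cythonize([ext]))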
@@ -1,66 +0,0 @@ (deleted file)

The first deleted file declared the old context feature-extraction API (apparently the spacy.context module whose Extension entry was removed from setup.py above): a `cdef class Token` with readonly `atom_t` fields for sic, cluster, norm, shape, asciied, prefix, suffix and length; postype, nertype and sensetype; the is_alpha/is_ascii/is_digit/is_lower/is_punct/is_space/is_title/is_upper orthographic flags; like_url and like_number; oft_lower, oft_title and oft_upper; the in_males/in_females/in_surnames/in_places/in_games/in_celebs/in_names gazetteer flags; and pos, sense and ner. It also declared a `cdef class Slots` holding Tokens P4..P1, N0, N1..N4, plus:

-cdef int N_FIELDS
-cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1
-cpdef Slots FIELD_IDS

@@ -1,126 +0,0 @@ (deleted file)

The matching implementation file is deleted as well. It numbered each Token field into a flat feature array (`_number_token`), copied lexeme attributes and flag bits for one token into that array (`_fill_token`, using expressions such as `c[t.is_alpha] = lex.flags & (1 << IS_ALPHA)`), defined `N_FIELDS` and the module-level `FIELD_IDS` slots, and filled a nine-token window around position i:

-cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1:
-    _fill_token(context, FIELD_IDS.P4, tokens.lex[i-4], tokens.pos[i-4], tokens.ner[i-4])
-    ...
-    _fill_token(context, FIELD_IDS.N4, tokens.lex[i+4], tokens.pos[i+4], tokens.ner[i+4])
-    return 1
132  spacy/en.pxd

@@ -1,5 +1,133 @@
-from spacy.lang cimport Language
-from spacy.tokens cimport Tokens
+from thinc.typedefs cimport atom_t
+
+from .lang cimport Language
+from .tokens cimport Tokens
+from .tokens cimport TokenC

The header then gains the English-specific enums and context-field IDs: `en_person_t` (NO_PERSON, FIRST, SECOND, THIRD, NON_THIRD), `en_number_t` (NO_NUMBER, SINGULAR, PLURAL, MASS), `en_gender_t` (NO_GENDER, MASCULINE, FEMININE, NEUTER), `en_case_t` (NO_CASE, NOMINATIVE, GENITIVE, ACCUSATIVE, REFLEXIVE, DEMONYM), `en_tenspect_t` (NO_TENSE, BASE_VERB, PRESENT, PAST, PASSIVE, ING, MODAL), `misc_t` (NO_MISC, COMPARATIVE, SUPERLATIVE, RELATIVE, NAME), the `FlagID` bit-flag enum (IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE, IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUMBER, OFT_LOWER, OFT_TITLE, OFT_UPPER, IN_MALES, IN_FEMALES, IN_SURNAMES, IN_PLACES, IN_GAMES, IN_CELEBS, IN_NAMES), and an anonymous enum of POS-context slots (P2_sic .. P2_pos_type, P1_sic .. P1_pos_type, W_sic .. W_pos_type, N1_sic .. N1_pos_type, N2_sic .. N2_pos_type, N_CONTEXT_FIELDS). The `cdef class English(Language)` declaration itself is unchanged.
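The FlagID enum above assigns each boolean lexical property a bit position, and a lexeme's properties are packed into a single integer. A plain-Python sketch of the same pattern --- Python's built-in string predicates stand in for the functions in spaCy's orth module, and the LIKE_NUMBER test here is deliberately crude:

    from enum import IntEnum

    class FlagID(IntEnum):
        IS_ALPHA = 0
        IS_DIGIT = 1
        IS_LOWER = 2
        IS_TITLE = 3
        LIKE_NUMBER = 4

    def set_flags(string: str) -> int:
        # Each predicate contributes one bit; the whole word costs one integer.
        flags = 0
        flags |= string.isalpha() << FlagID.IS_ALPHA
        flags |= string.isdigit() << FlagID.IS_DIGIT
        flags |= string.islower() << FlagID.IS_LOWER
        flags |= string.istitle() << FlagID.IS_TITLE
        flags |= string.replace('.', '', 1).isdigit() << FlagID.LIKE_NUMBER
        return flags

    def check_flag(flags: int, flag: FlagID) -> bool:
        return bool(flags & (1 << flag))

    print(check_flag(set_flags("Apple"), FlagID.IS_TITLE))    # True
    print(check_flag(set_flags("3.14"), FlagID.LIKE_NUMBER))  # True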
160  spacy/en.pyx

@@ -30,14 +30,101 @@ same scheme. Tokenization problems are a major cause of poor performance for
The trailing TODO comment about translate_treebank_tokenization is dropped from the module docstring, and new imports are added:

+from .typedefs cimport flags_t
+import orth
+from .morphology cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
+from .morphology cimport X, PUNCT, EOL
+from .tokens cimport Morphology

followed by a `POS_TAGS` dict mapping each Penn Treebank tag to a coarse class plus morphological features, for example:

+POS_TAGS = {
+    'NULL': (NO_TAG, {}),
+    'CC': (CONJ, {}),
+    'JJR': (ADJ, {'misc': COMPARATIVE}),
+    'MD': (VERB, {'tenspect': MODAL}),
+    'NNS': (NOUN, {'number': PLURAL}),
+    'POS': (PRT, {'case': GENITIVE}),
+    'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),
+    ...
+}

and a `POS_TEMPLATES` tuple of context-feature templates over the P2..N2 window (word form, prefix, suffix, Brown cluster, shape, pos_type and neighbouring tags).

@@ -47,7 +134,68 @@ cdef class English(Language):
The previously empty English class gains its implementation:

+    def get_props(self, unicode string):
+        return {'flags': self.set_flags(string), 'dense': orth.word_shape(string)}
+
+    def set_flags(self, unicode string):
+        cdef flags_t flags = 0
+        flags |= orth.is_alpha(string) << IS_ALPHA
+        ...
+        flags |= orth.like_number(string) << LIKE_NUMBER
+        return flags

plus `set_pos` (consult the tagger's tag dictionary first, otherwise fill a POS context window, predict, and call `self.morphologizer.set_morph(i, t)`), `train_pos` (the same loop, counting correct predictions against gold tags), and the module-level helpers `fill_pos_context` / `_fill_from_token`, which copy sic, cluster, shape, prefix, suffix, pos, lemma and pos_type for the P2, P1, W, N1 and N2 slots into a flat `atom_t` array. `EN = English('en')` is unchanged.
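A plain-Python sketch of the tagging loop described above --- tag dictionary first, context-window features otherwise. The tagger is a stub, and the field layout and helper names are illustrative, not spaCy's:

    FIELDS_PER_TOKEN = 3  # form, shape, index

    def fill_context(context, i, tokens):
        # Copy features for the P1, W, N1 slots into one flat list.
        for slot, j in enumerate((i - 1, i, i + 1)):
            base = slot * FIELDS_PER_TOKEN
            if 0 <= j < len(tokens):
                form = tokens[j]
                context[base + 0] = form
                context[base + 1] = ''.join('X' if c.isupper() else 'x' for c in form)
                context[base + 2] = j
            else:
                context[base:base + FIELDS_PER_TOKEN] = ['', '', -1]

    def tag(tokens, tagdict, predict):
        tags = []
        context = [''] * (3 * FIELDS_PER_TOKEN)
        for i, word in enumerate(tokens):
            if word in tagdict:            # unambiguous frequent word: skip the model
                tags.append(tagdict[word])
            else:
                fill_context(context, i, tokens)
                tags.append(predict(context))
        return tags

    print(tag("the cat sat".split(), {"the": "DT"}, predict=lambda ctx: "NN"))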
@@ -1,38 +1,38 @@
The matching declarations file changes in step with the tokenizer rewrite: the Py_UNICODE character predicates are now cimported from cpython instead of being declared by hand against "Python.h", `String` is replaced by `UniStr` (from .utf8string, alongside StringStore), the NERParser import gives way to the new Morphologizer, and the tokenizer cache stores a tagged union instead of a bare lexeme array:

+cdef union LexemesOrTokens:
+    const Lexeme* const* lexemes
+    TokenC* tokens
+
+cdef struct Cached:
+    LexemesOrTokens data
+    bint is_lex
+    int length

Lexicon gains a `cpdef public get_lex_props` hook and a memory-pool-aware lookup, and drops its `size` attribute:

-    cdef Lexeme* get(self, String* s) except NULL
-    cdef PreshMap _dict
+    cdef const Lexeme* get(self, Pool mem, UniStr* s) except NULL
+    cdef PreshMap _map

@@ -41,9 +41,8 @@ cdef class Language:
-    cpdef readonly NERParser ner_tagger
+    cpdef readonly Morphologizer morphologizer

@@ -52,13 +51,14 @@ cdef class Language:
-    cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1
-    cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
+    cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1
+    cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
+    cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
                                 vector[Lexeme*] *suffixes) except NULL
-    cdef int _attach_tokens(self, Tokens tokens, int idx, String* string,
+    cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
                             vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except -1
-    cdef int _save_cached(self, Lexeme** tokens, hash_t key, int n) except -1
+    cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1
311
spacy/lang.pyx
311
spacy/lang.pyx
|
@ -18,13 +18,14 @@ from preshed.maps cimport PreshMap
|
||||||
from .lexeme cimport Lexeme
|
from .lexeme cimport Lexeme
|
||||||
from .lexeme cimport EMPTY_LEXEME
|
from .lexeme cimport EMPTY_LEXEME
|
||||||
from .lexeme cimport init as lexeme_init
|
from .lexeme cimport init as lexeme_init
|
||||||
|
from .lexeme cimport check_flag
|
||||||
|
|
||||||
|
from .utf8string cimport slice_unicode
|
||||||
|
|
||||||
from . import util
|
from . import util
|
||||||
from .util import read_lang_data
|
from .util import read_lang_data
|
||||||
from .tokens import Tokens
|
from .tokens import Tokens
|
||||||
|
from .tokens cimport Morphology
|
||||||
from .tagger cimport Tagger
|
|
||||||
from .ner.greedy_parser cimport NERParser
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Language:
|
cdef class Language:
|
||||||
|
@ -37,29 +38,30 @@ cdef class Language:
|
||||||
self._prefix_re = re.compile(prefix)
|
self._prefix_re = re.compile(prefix)
|
||||||
self._suffix_re = re.compile(suffix)
|
self._suffix_re = re.compile(suffix)
|
||||||
self._infix_re = re.compile(infix)
|
self._infix_re = re.compile(infix)
|
||||||
self.lexicon = Lexicon()
|
self.lexicon = Lexicon(self.get_props)
|
||||||
if path.exists(path.join(util.DATA_DIR, name, 'lexemes')):
|
|
||||||
self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes'))
|
|
||||||
self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))
|
|
||||||
self._load_special_tokenization(rules)
|
self._load_special_tokenization(rules)
|
||||||
if path.exists(path.join(util.DATA_DIR, name, 'pos')):
|
|
||||||
self.pos_tagger = Tagger(path.join(util.DATA_DIR, name, 'pos'))
|
|
||||||
else:
|
|
||||||
self.pos_tagger = None
|
self.pos_tagger = None
|
||||||
if path.exists(path.join(util.DATA_DIR, name, 'ner')):
|
self.morphologizer = None
|
||||||
self.ner_tagger = NERParser(path.join(util.DATA_DIR, name, 'ner'))
|
|
||||||
|
def load(self):
|
||||||
|
self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes'))
|
||||||
|
self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings'))
|
||||||
|
if path.exists(path.join(util.DATA_DIR, self.name, 'pos')):
|
||||||
|
self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos'))
|
||||||
|
self.morphologizer = Morphologizer(self.lexicon.strings,
|
||||||
|
path.join(util.DATA_DIR, self.name))
|
||||||
|
|
||||||
cpdef Tokens tokens_from_list(self, list strings):
|
cpdef Tokens tokens_from_list(self, list strings):
|
||||||
cdef int length = sum([len(s) for s in strings])
|
cdef int length = sum([len(s) for s in strings])
|
||||||
cdef Tokens tokens = Tokens(self.lexicon.strings, length)
|
cdef Tokens tokens = Tokens(self, length)
|
||||||
if length == 0:
|
if length == 0:
|
||||||
return tokens
|
return tokens
|
||||||
cdef String string_struct
|
cdef UniStr string_struct
|
||||||
cdef unicode py_string
|
cdef unicode py_string
|
||||||
cdef int idx = 0
|
cdef int idx = 0
|
||||||
for i, py_string in enumerate(strings):
|
for i, py_string in enumerate(strings):
|
||||||
string_from_unicode(&string_struct, py_string)
|
slice_unicode(&string_struct, py_string, 0, len(py_string))
|
||||||
tokens.push_back(idx, self.lexicon.get(&string_struct))
|
tokens.push_back(idx, <const Lexeme*>self.lexicon.get(tokens.mem, &string_struct))
|
||||||
idx += len(py_string) + 1
|
idx += len(py_string) + 1
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
|
@ -79,22 +81,21 @@ cdef class Language:
|
||||||
tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes.
|
tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes.
|
||||||
"""
|
"""
|
||||||
cdef int length = len(string)
|
cdef int length = len(string)
|
||||||
cdef Tokens tokens = Tokens(self.lexicon.strings, length)
|
cdef Tokens tokens = Tokens(self, length)
|
||||||
if length == 0:
|
if length == 0:
|
||||||
return tokens
|
return tokens
|
||||||
cdef int i = 0
|
cdef int i = 0
|
||||||
cdef int start = 0
|
cdef int start = 0
|
||||||
|
cdef bint cache_hit
|
||||||
cdef Py_UNICODE* chars = string
|
cdef Py_UNICODE* chars = string
|
||||||
cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
|
cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
|
||||||
cdef String span
|
cdef UniStr span
|
||||||
for i in range(1, length):
|
for i in range(1, length):
|
||||||
if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
|
if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
|
||||||
if start < i:
|
if start < i:
|
||||||
string_slice(&span, chars, start, i)
|
slice_unicode(&span, chars, start, i)
|
||||||
lexemes = <Lexeme**>self._cache.get(span.key)
|
cache_hit = self._try_cache(start, span.key, tokens)
|
||||||
if lexemes != NULL:
|
if not cache_hit:
|
||||||
tokens.extend(start, lexemes, 0)
|
|
||||||
else:
|
|
||||||
self._tokenize(tokens, &span, start, i)
|
self._tokenize(tokens, &span, start, i)
|
||||||
in_ws = not in_ws
|
in_ws = not in_ws
|
||||||
start = i
|
start = i
|
||||||
|
@ -102,15 +103,27 @@ cdef class Language:
|
||||||
start += 1
|
start += 1
|
||||||
i += 1
|
i += 1
|
||||||
if start < i:
|
if start < i:
|
||||||
string_slice(&span, chars, start, i)
|
slice_unicode(&span, chars, start, i)
|
||||||
lexemes = <Lexeme**>self._cache.get(span.key)
|
cache_hit = self._try_cache(start, span.key, tokens)
|
||||||
if lexemes != NULL:
|
if not cache_hit:
|
||||||
tokens.extend(start, lexemes, 0)
|
|
||||||
else:
|
|
||||||
self._tokenize(tokens, &span, start, i)
|
self._tokenize(tokens, &span, start, i)
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1:
|
cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
|
||||||
|
#cached = <Cached*>self._specials.get(key)
|
||||||
|
cached = <Cached*>self._cache.get(key)
|
||||||
|
if cached == NULL:
|
||||||
|
return False
|
||||||
|
cdef int i
|
||||||
|
if cached.is_lex:
|
||||||
|
for i in range(cached.length):
|
||||||
|
idx = tokens.push_back(idx, cached.data.lexemes[i])
|
||||||
|
else:
|
||||||
|
for i in range(cached.length):
|
||||||
|
idx = tokens.push_back(idx, &cached.data.tokens[i])
|
||||||
|
return True
|
||||||
|
|
||||||
|
cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
|
||||||
cdef vector[Lexeme*] prefixes
|
cdef vector[Lexeme*] prefixes
|
||||||
cdef vector[Lexeme*] suffixes
|
cdef vector[Lexeme*] suffixes
|
||||||
cdef hash_t orig_key
|
cdef hash_t orig_key
|
||||||
|
@ -119,88 +132,95 @@ cdef class Language:
|
||||||
orig_size = tokens.length
|
orig_size = tokens.length
|
||||||
self._split_affixes(span, &prefixes, &suffixes)
|
self._split_affixes(span, &prefixes, &suffixes)
|
||||||
self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
|
self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
|
||||||
self._save_cached(&tokens.lex[orig_size], orig_key, tokens.length - orig_size)
|
self._save_cached(&tokens.data[orig_size], orig_key, tokens.length - orig_size)
|
||||||
|
|
||||||
cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
|
cdef UniStr* _split_affixes(self, UniStr* string, vector[const Lexeme*] *prefixes,
|
||||||
vector[Lexeme*] *suffixes) except NULL:
|
vector[const Lexeme*] *suffixes) except NULL:
|
||||||
cdef size_t i
|
cdef size_t i
|
||||||
cdef String prefix
|
cdef UniStr prefix
|
||||||
cdef String suffix
|
cdef UniStr suffix
|
||||||
cdef String minus_pre
|
cdef UniStr minus_pre
|
||||||
cdef String minus_suf
|
cdef UniStr minus_suf
|
||||||
cdef size_t last_size = 0
|
cdef size_t last_size = 0
|
||||||
while string.n != 0 and string.n != last_size:
|
while string.n != 0 and string.n != last_size:
|
||||||
last_size = string.n
|
last_size = string.n
|
||||||
pre_len = self._find_prefix(string.chars, string.n)
|
pre_len = self._find_prefix(string.chars, string.n)
|
||||||
if pre_len != 0:
|
if pre_len != 0:
|
||||||
string_slice(&prefix, string.chars, 0, pre_len)
|
slice_unicode(&prefix, string.chars, 0, pre_len)
|
||||||
string_slice(&minus_pre, string.chars, pre_len, string.n)
|
slice_unicode(&minus_pre, string.chars, pre_len, string.n)
|
||||||
# Check whether we've hit a special-case
|
# Check whether we've hit a special-case
|
||||||
if minus_pre.n >= 1 and self._specials.get(minus_pre.key) != NULL:
|
if minus_pre.n >= 1 and self._specials.get(minus_pre.key) != NULL:
|
||||||
string[0] = minus_pre
|
string[0] = minus_pre
|
||||||
prefixes.push_back(self.lexicon.get(&prefix))
|
prefixes.push_back(self.lexicon.get(self.lexicon.mem, &prefix))
|
||||||
break
|
break
|
||||||
suf_len = self._find_suffix(string.chars, string.n)
|
suf_len = self._find_suffix(string.chars, string.n)
|
||||||
if suf_len != 0:
|
if suf_len != 0:
|
||||||
string_slice(&suffix, string.chars, string.n - suf_len, string.n)
|
slice_unicode(&suffix, string.chars, string.n - suf_len, string.n)
|
||||||
string_slice(&minus_suf, string.chars, 0, string.n - suf_len)
|
slice_unicode(&minus_suf, string.chars, 0, string.n - suf_len)
|
||||||
# Check whether we've hit a special-case
|
# Check whether we've hit a special-case
|
||||||
if minus_suf.n >= 1 and self._specials.get(minus_suf.key) != NULL:
|
if minus_suf.n >= 1 and self._specials.get(minus_suf.key) != NULL:
|
||||||
string[0] = minus_suf
|
string[0] = minus_suf
|
||||||
suffixes.push_back(self.lexicon.get(&suffix))
|
suffixes.push_back(self.lexicon.get(self.lexicon.mem, &suffix))
|
||||||
break
|
break
|
||||||
if pre_len and suf_len and (pre_len + suf_len) <= string.n:
|
if pre_len and suf_len and (pre_len + suf_len) <= string.n:
|
||||||
string_slice(string, string.chars, pre_len, string.n - suf_len)
|
slice_unicode(string, string.chars, pre_len, string.n - suf_len)
|
||||||
prefixes.push_back(self.lexicon.get(&prefix))
|
prefixes.push_back(self.lexicon.get(self.lexicon.mem, &prefix))
|
||||||
suffixes.push_back(self.lexicon.get(&suffix))
|
suffixes.push_back(self.lexicon.get(self.lexicon.mem, &suffix))
|
||||||
elif pre_len:
|
elif pre_len:
|
||||||
string[0] = minus_pre
|
string[0] = minus_pre
|
||||||
prefixes.push_back(self.lexicon.get(&prefix))
|
prefixes.push_back(self.lexicon.get(self.lexicon.mem, &prefix))
|
||||||
elif suf_len:
|
elif suf_len:
|
||||||
string[0] = minus_suf
|
string[0] = minus_suf
|
||||||
suffixes.push_back(self.lexicon.get(&suffix))
|
suffixes.push_back(self.lexicon.get(self.lexicon.mem, &suffix))
|
||||||
if self._specials.get(string.key):
|
if self._specials.get(string.key):
|
||||||
break
|
break
|
||||||
return string
|
return string
|
||||||
|
|
||||||
cdef int _attach_tokens(self, Tokens tokens,
|
cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
|
||||||
int idx, String* string,
|
vector[const Lexeme*] *prefixes,
|
||||||
vector[Lexeme*] *prefixes,
|
vector[const Lexeme*] *suffixes) except -1:
|
||||||
vector[Lexeme*] *suffixes) except -1:
|
cdef bint cache_hit
|
||||||
cdef int split
|
cdef int split
|
||||||
cdef Lexeme** lexemes
|
cdef const Lexeme* const* lexemes
|
||||||
cdef Lexeme* lexeme
|
cdef Lexeme* lexeme
|
||||||
cdef String span
|
cdef UniStr span
|
||||||
|
cdef int i
|
||||||
if prefixes.size():
|
if prefixes.size():
|
||||||
idx = tokens.extend(idx, prefixes.data(), prefixes.size())
|
for i in range(prefixes.size()):
|
||||||
|
idx = tokens.push_back(idx, prefixes[0][i])
|
||||||
if string.n != 0:
|
if string.n != 0:
|
||||||
|
cache_hit = self._try_cache(idx, string.key, tokens)
|
||||||
lexemes = <Lexeme**>self._cache.get(string.key)
|
if cache_hit:
|
||||||
if lexemes != NULL:
|
idx = tokens.data[tokens.length - 1].idx + 1
|
||||||
idx = tokens.extend(idx, lexemes, 0)
|
|
||||||
else:
|
else:
|
||||||
split = self._find_infix(string.chars, string.n)
|
split = self._find_infix(string.chars, string.n)
|
||||||
if split == 0 or split == -1:
|
if split == 0 or split == -1:
|
||||||
idx = tokens.push_back(idx, self.lexicon.get(string))
|
idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, string))
|
||||||
else:
|
else:
|
||||||
string_slice(&span, string.chars, 0, split)
|
slice_unicode(&span, string.chars, 0, split)
|
||||||
idx = tokens.push_back(idx, self.lexicon.get(&span))
|
idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, &span))
|
||||||
string_slice(&span, string.chars, split, split+1)
|
slice_unicode(&span, string.chars, split, split+1)
|
||||||
idx = tokens.push_back(idx, self.lexicon.get(&span))
|
idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, &span))
|
||||||
string_slice(&span, string.chars, split + 1, string.n)
|
slice_unicode(&span, string.chars, split + 1, string.n)
|
||||||
idx = tokens.push_back(idx, self.lexicon.get(&span))
|
idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, &span))
|
||||||
cdef vector[Lexeme*].reverse_iterator it = suffixes.rbegin()
|
cdef vector[const Lexeme*].reverse_iterator it = suffixes.rbegin()
|
||||||
while it != suffixes.rend():
|
while it != suffixes.rend():
|
||||||
idx = tokens.push_back(idx, deref(it))
|
idx = tokens.push_back(idx, deref(it))
|
||||||
preinc(it)
|
preinc(it)
|
||||||
|
|
||||||
cdef int _save_cached(self, Lexeme** tokens, hash_t key, int n) except -1:
|
cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1:
|
||||||
lexemes = <Lexeme**>self.mem.alloc(n + 1, sizeof(Lexeme**))
|
|
||||||
cdef int i
|
cdef int i
|
||||||
for i in range(n):
|
for i in range(n):
|
||||||
lexemes[i] = tokens[i]
|
if tokens[i].lex.id == 1:
|
||||||
lexemes[i + 1] = NULL
|
return 0
|
||||||
self._cache.set(key, lexemes)
|
cached = <Cached*>self.mem.alloc(1, sizeof(Cached))
|
||||||
|
cached.length = n
|
||||||
|
cached.is_lex = True
|
||||||
|
lexemes = <const Lexeme**>self.mem.alloc(n, sizeof(Lexeme**))
|
||||||
|
for i in range(n):
|
||||||
|
lexemes[i] = tokens[i].lex
|
||||||
|
cached.data.lexemes = <const Lexeme* const*>lexemes
|
||||||
|
self._cache.set(key, cached)
|
||||||
|
|
||||||
cdef int _find_infix(self, Py_UNICODE* chars, size_t length) except -1:
|
cdef int _find_infix(self, Py_UNICODE* chars, size_t length) except -1:
|
||||||
cdef unicode string = chars[:length]
|
cdef unicode string = chars[:length]
|
||||||
|
@ -217,66 +237,120 @@ cdef class Language:
|
||||||
match = self._suffix_re.search(string)
|
match = self._suffix_re.search(string)
|
||||||
return (match.end() - match.start()) if match is not None else 0
|
return (match.end() - match.start()) if match is not None else 0
|
||||||
|
|
||||||
def _load_special_tokenization(self, token_rules):
|
def _load_special_tokenization(self, object rules):
|
||||||
'''Load special-case tokenization rules.
|
'''Add a special-case tokenization rule.
|
||||||
|
|
||||||
Loads special-case tokenization rules into the Language._cache cache,
|
|
||||||
read from data/<lang>/tokenization . The special cases are loaded before
|
|
||||||
any language data is tokenized, giving these priority. For instance,
|
|
||||||
the English tokenization rules map "ain't" to ["are", "not"].
|
|
||||||
|
|
||||||
Args:
|
|
||||||
token_rules (list): A list of (chunk, tokens) pairs, where chunk is
|
|
||||||
a string and tokens is a list of strings.
|
|
||||||
'''
|
'''
|
||||||
|
cdef int i
|
||||||
|
cdef unicode chunk
|
||||||
|
cdef list substrings
|
||||||
|
cdef unicode form
|
||||||
|
cdef unicode lemma
|
||||||
|
cdef dict props
|
||||||
cdef Lexeme** lexemes
|
cdef Lexeme** lexemes
|
||||||
cdef hash_t hashed
|
cdef hash_t hashed
|
||||||
cdef String string
|
cdef UniStr string
|
||||||
for uni_string, substrings in token_rules:
|
for chunk, substrings in sorted(rules.items()):
|
||||||
lexemes = <Lexeme**>self.mem.alloc(len(substrings) + 1, sizeof(Lexeme*))
|
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
|
||||||
for i, substring in enumerate(substrings):
|
for i, props in enumerate(substrings):
|
||||||
string_from_unicode(&string, substring)
|
form = props['F']
|
||||||
lexemes[i] = <Lexeme*>self.lexicon.get(&string)
|
lemma = props.get("L", None)
|
||||||
lexemes[i + 1] = NULL
|
slice_unicode(&string, form, 0, len(form))
|
||||||
string_from_unicode(&string, uni_string)
|
tokens[i].lex = <Lexeme*>self.lexicon.get(self.lexicon.mem, &string)
|
||||||
self._specials.set(string.key, lexemes)
|
if lemma:
|
||||||
self._cache.set(string.key, lexemes)
|
tokens[i].lemma = self.lexicon.strings[lemma]
|
||||||
|
set_morph_from_dict(&tokens[i].morph, props)
|
||||||
|
cached = <Cached*>self.mem.alloc(1, sizeof(Cached))
|
||||||
|
cached.length = len(substrings)
|
||||||
|
cached.is_lex = False
|
||||||
|
cached.data.tokens = tokens
|
||||||
|
slice_unicode(&string, chunk, 0, len(chunk))
|
||||||
|
self._specials.set(string.key, cached)
|
||||||
|
self._cache.set(string.key, cached)
|
||||||
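For reference, a minimal sketch of the rule format this method consumes: each special-case chunk maps to a list of per-token property dicts, where 'F' is the surface form, 'L' an optional lemma, and any remaining keys are read by set_morph_from_dict. The concrete entry below is illustrative only (it mirrors the "ain't" behaviour exercised by the tests later in this diff), not a dump of the data files.

    rules = {
        u"ain't": [
            {'F': u"ai", 'L': u"be"},    # first token keeps the surface "ai", lemma "be"
            {'F': u"n't", 'L': u"not"},  # second token "n't", lemma "not"
        ],
    }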
|
|
||||||
|
|
||||||
|
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
|
||||||
|
morph.number = props.get('number', 0)
|
||||||
|
morph.tenspect = props.get('tenspect', 0)
|
||||||
|
morph.mood = props.get('mood', 0)
|
||||||
|
morph.gender = props.get('gender', 0)
|
||||||
|
morph.person = props.get('person', 0)
|
||||||
|
morph.case = props.get('case', 0)
|
||||||
|
morph.misc = props.get('misc', 0)
|
||||||
|
|
||||||
|
|
||||||
cdef class Lexicon:
|
cdef class Lexicon:
|
||||||
def __init__(self):
|
'''A map container for a language's Lexeme structs.
|
||||||
|
|
||||||
|
Also interns UTF-8 strings, and maps them to consecutive integer IDs.
|
||||||
|
'''
|
||||||
|
def __init__(self, object get_props):
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self._dict = PreshMap(2 ** 20)
|
self._map = PreshMap(2 ** 20)
|
||||||
self.strings = StringStore()
|
self.strings = StringStore()
|
||||||
self.lexemes.push_back(&EMPTY_LEXEME)
|
self.lexemes.push_back(&EMPTY_LEXEME)
|
||||||
self.size = 1
|
self.get_lex_props = get_props
|
||||||
|
|
||||||
cdef Lexeme* get(self, String* string) except NULL:
|
def __len__(self):
|
||||||
|
return self.lexemes.size()
|
||||||
|
|
||||||
|
cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL:
|
||||||
|
'''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme
|
||||||
|
if necessary, using memory acquired from the given pool. If the pool
|
||||||
|
is the lexicon's own memory, the lexeme is saved in the lexicon.'''
|
||||||
cdef Lexeme* lex
|
cdef Lexeme* lex
|
||||||
lex = <Lexeme*>self._dict.get(string.key)
|
lex = <Lexeme*>self._map.get(string.key)
|
||||||
if lex != NULL:
|
if lex != NULL:
|
||||||
return lex
|
return lex
|
||||||
lex = <Lexeme*>self.mem.alloc(sizeof(Lexeme), 1)
|
if string.n < 3:
|
||||||
lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key, self.strings, {})
|
mem = self.mem
|
||||||
self._dict.set(string.key, lex)
|
cdef unicode py_string = string.chars[:string.n]
|
||||||
|
lex = <Lexeme*>mem.alloc(sizeof(Lexeme), 1)
|
||||||
|
lex[0] = lexeme_init(self.lexemes.size(), py_string, string.key, self.strings,
|
||||||
|
self.get_lex_props(py_string))
|
||||||
|
if mem is self.mem:
|
||||||
|
self._map.set(string.key, lex)
|
||||||
while self.lexemes.size() < (lex.id + 1):
|
while self.lexemes.size() < (lex.id + 1):
|
||||||
self.lexemes.push_back(&EMPTY_LEXEME)
|
self.lexemes.push_back(&EMPTY_LEXEME)
|
||||||
self.lexemes[lex.id] = lex
|
self.lexemes[lex.id] = lex
|
||||||
self.size += 1
|
else:
|
||||||
|
lex[0].id = 1
|
||||||
return lex
|
return lex
|
||||||
|
|
||||||
def __getitem__(self, id_or_string):
|
def __getitem__(self, id_or_string):
|
||||||
|
'''Retrieve a lexeme, given an int ID or a unicode string. If a previously
|
||||||
|
unseen unicode string is given, a new Lexeme is created and stored.
|
||||||
|
|
||||||
|
This function relies on Cython's struct-to-dict conversion. Python clients
|
||||||
|
receive a dict keyed by strings (byte or unicode, depending on Python 2/3),
|
||||||
|
with int values. Cython clients can instead receive a Lexeme struct value.
|
||||||
|
More efficient Cython access is provided by Lexicon.get, which returns
|
||||||
|
a Lexeme*.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
id_or_string (int or unicode): The integer ID of a word, or its unicode
|
||||||
|
string. If an int >= Lexicon.size, IndexError is raised.
|
||||||
|
If id_or_string is neither an int nor a unicode string, ValueError
|
||||||
|
is raised.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
lexeme (dict): A Lexeme struct instance, which Cython translates into
|
||||||
|
a dict if the operator is called from Python.
|
||||||
|
'''
|
||||||
if type(id_or_string) == int:
|
if type(id_or_string) == int:
|
||||||
|
if id_or_string >= self.lexemes.size():
|
||||||
|
raise IndexError
|
||||||
return self.lexemes.at(id_or_string)[0]
|
return self.lexemes.at(id_or_string)[0]
|
||||||
cdef String string
|
cdef UniStr string
|
||||||
string_from_unicode(&string, id_or_string)
|
slice_unicode(&string, id_or_string, 0, len(id_or_string))
|
||||||
cdef Lexeme* lexeme = self.get(&string)
|
cdef const Lexeme* lexeme = self.get(self.mem, &string)
|
||||||
return lexeme[0]
|
return lexeme[0]
|
||||||
|
|
||||||
def __setitem__(self, unicode uni_string, dict props):
|
def __setitem__(self, unicode uni_string, dict props):
|
||||||
cdef String s
|
cdef UniStr s
|
||||||
string_from_unicode(&s, uni_string)
|
slice_unicode(&s, uni_string, 0, len(uni_string))
|
||||||
cdef Lexeme* lex = self.get(&s)
|
# Cast through the const here, since we're allowed to change our own
|
||||||
|
# Lexemes.
|
||||||
|
lex = <Lexeme*><void*>self.get(self.mem, &s)
|
||||||
lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props)
|
lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props)
|
||||||
|
|
||||||
def dump(self, loc):
|
def dump(self, loc):
|
||||||
|
@ -287,11 +361,11 @@ cdef class Lexicon:
|
||||||
assert fp != NULL
|
assert fp != NULL
|
||||||
cdef size_t st
|
cdef size_t st
|
||||||
cdef hash_t key
|
cdef hash_t key
|
||||||
for i in range(self._dict.length):
|
for i in range(self._map.length):
|
||||||
key = self._dict.c_map.cells[i].key
|
key = self._map.c_map.cells[i].key
|
||||||
if key == 0:
|
if key == 0:
|
||||||
continue
|
continue
|
||||||
lexeme = <Lexeme*>self._dict.c_map.cells[i].value
|
lexeme = <Lexeme*>self._map.c_map.cells[i].value
|
||||||
st = fwrite(&key, sizeof(key), 1, fp)
|
st = fwrite(&key, sizeof(key), 1, fp)
|
||||||
assert st == 1
|
assert st == 1
|
||||||
st = fwrite(lexeme, sizeof(Lexeme), 1, fp)
|
st = fwrite(lexeme, sizeof(Lexeme), 1, fp)
|
||||||
|
@ -300,7 +374,8 @@ cdef class Lexicon:
|
||||||
assert st == 0
|
assert st == 0
|
||||||
|
|
||||||
def load(self, loc):
|
def load(self, loc):
|
||||||
assert path.exists(loc)
|
if not path.exists(loc):
|
||||||
|
raise IOError('Lexemes file not found at %s' % loc)
|
||||||
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
|
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
|
||||||
cdef FILE* fp = fopen(<char*>bytes_loc, 'rb')
|
cdef FILE* fp = fopen(<char*>bytes_loc, 'rb')
|
||||||
assert fp != NULL
|
assert fp != NULL
|
||||||
|
@ -316,21 +391,9 @@ cdef class Lexicon:
|
||||||
st = fread(lexeme, sizeof(Lexeme), 1, fp)
|
st = fread(lexeme, sizeof(Lexeme), 1, fp)
|
||||||
if st != 1:
|
if st != 1:
|
||||||
break
|
break
|
||||||
self._dict.set(key, lexeme)
|
self._map.set(key, lexeme)
|
||||||
while self.lexemes.size() < (lexeme.id + 1):
|
while self.lexemes.size() < (lexeme.id + 1):
|
||||||
self.lexemes.push_back(&EMPTY_LEXEME)
|
self.lexemes.push_back(&EMPTY_LEXEME)
|
||||||
self.lexemes[lexeme.id] = lexeme
|
self.lexemes[lexeme.id] = lexeme
|
||||||
i += 1
|
i += 1
|
||||||
self.size += 1
|
|
||||||
fclose(fp)
|
fclose(fp)
|
||||||
|
|
||||||
|
|
||||||
cdef void string_from_unicode(String* s, unicode uni):
|
|
||||||
cdef Py_UNICODE* c_uni = <Py_UNICODE*>uni
|
|
||||||
string_slice(s, c_uni, 0, len(uni))
|
|
||||||
|
|
||||||
|
|
||||||
cdef inline void string_slice(String* s, Py_UNICODE* chars, int start, int end) nogil:
|
|
||||||
s.chars = &chars[start]
|
|
||||||
s.n = end - start
|
|
||||||
s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
|
|
||||||
|
|
90  spacy/lemmatizer.py  (new file)
@@ -0,0 +1,90 @@
from os import path


NOUN_RULES = (
    ('s', ''),
    ('ses', 's'),
    ('ves', 'f'),
    ('xes', 'x'),
    ('zes', 'z'),
    ('ches', 'ch'),
    ('shes', 'sh'),
    ('men', 'man'),
    ('ies', 'y')
)


VERB_RULES = (
    ("s", ""),
    ("ies", "y"),
    ("es", "e"),
    ("es", ""),
    ("ed", "e"),
    ("ed", ""),
    ("ing", "e"),
    ("ing", "")
)


ADJ_RULES = (
    ("er", ""),
    ("est", ""),
    ("er", "e"),
    ("est", "e")
)


class Lemmatizer(object):
    def __init__(self, wn_dict_dir):
        self.index = {}
        self.exc = {}
        for pos in ['adj', 'adv', 'noun', 'verb']:
            self.index[pos] = read_index(path.join(wn_dict_dir, 'index.%s' % pos))
            self.exc[pos] = read_exc(path.join(wn_dict_dir, '%s.exc' % pos))

    def noun(self, string):
        return lemmatize(string, self.index['noun'], self.exc['noun'], NOUN_RULES)

    def verb(self, string):
        return lemmatize(string, self.index['verb'], self.exc['verb'], VERB_RULES)

    def adj(self, string):
        return lemmatize(string, self.index['adj'], self.exc['adj'], ADJ_RULES)


def lemmatize(string, index, exceptions, rules):
    string = string.lower()
    forms = []
    if string in index:
        forms.append(string)
    forms.extend(exceptions.get(string, []))
    for old, new in rules:
        if string.endswith(old):
            form = string[:len(string) - len(old)] + new
            if form in index:
                forms.append(form)
    if not forms:
        forms.append(string)
    return set(forms)


def read_index(loc):
    index = set()
    for line in open(loc):
        if line.startswith(' '):
            continue
        pieces = line.split()
        word = pieces[0]
        if word.count('_') == 0:
            index.add(word)
    return index


def read_exc(loc):
    exceptions = {}
    for line in open(loc):
        if line.startswith(' '):
            continue
        pieces = line.split()
        exceptions[pieces[0]] = tuple(pieces[1:])
    return exceptions
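The lemmatizer above is purely list- and suffix-rule driven. A minimal sketch of how lemmatize behaves, using a hypothetical two-word index in place of the WordNet index files:

    index = {'apple', 'wolf'}          # toy stand-in for the WordNet noun index
    exceptions = {}
    assert lemmatize('apples', index, exceptions, NOUN_RULES) == set(['apple'])   # 's' -> ''
    assert lemmatize('wolves', index, exceptions, NOUN_RULES) == set(['wolf'])    # 'ves' -> 'f'
    assert lemmatize('zzz', index, exceptions, NOUN_RULES) == set(['zzz'])        # falls back to the input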
160  spacy/lexeme.pxd
|
@ -1,61 +1,137 @@
|
||||||
from .typedefs cimport hash_t, utf8_t, flag_t, id_t, len_t, tag_t
|
from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t
|
||||||
|
|
||||||
from .utf8string cimport StringStore
|
from .utf8string cimport StringStore
|
||||||
from libc.stdint cimport uint16_t
|
|
||||||
|
|
||||||
cpdef flag_t OOV_DIST_FLAGS
|
|
||||||
|
|
||||||
# Flags
|
# Reserve 64 values for flag features
|
||||||
cpdef enum:
|
cpdef enum attr_id_t:
|
||||||
IS_ALPHA
|
FLAG0
|
||||||
IS_ASCII
|
FLAG1
|
||||||
IS_DIGIT
|
FLAG2
|
||||||
IS_LOWER
|
FLAG3
|
||||||
IS_PUNCT
|
FLAG4
|
||||||
IS_SPACE
|
FLAG5
|
||||||
IS_TITLE
|
FLAG6
|
||||||
IS_UPPER
|
FLAG7
|
||||||
|
FLAG8
|
||||||
|
FLAG9
|
||||||
|
FLAG10
|
||||||
|
FLAG11
|
||||||
|
FLAG12
|
||||||
|
FLAG13
|
||||||
|
FLAG14
|
||||||
|
FLAG15
|
||||||
|
FLAG16
|
||||||
|
FLAG17
|
||||||
|
FLAG18
|
||||||
|
FLAG19
|
||||||
|
FLAG20
|
||||||
|
FLAG21
|
||||||
|
FLAG22
|
||||||
|
FLAG23
|
||||||
|
FLAG24
|
||||||
|
FLAG25
|
||||||
|
FLAG26
|
||||||
|
FLAG27
|
||||||
|
FLAG28
|
||||||
|
FLAG29
|
||||||
|
FLAG30
|
||||||
|
FLAG31
|
||||||
|
FLAG32
|
||||||
|
FLAG33
|
||||||
|
FLAG34
|
||||||
|
FLAG35
|
||||||
|
FLAG36
|
||||||
|
FLAG37
|
||||||
|
FLAG38
|
||||||
|
FLAG39
|
||||||
|
FLAG40
|
||||||
|
FLAG41
|
||||||
|
FLAG42
|
||||||
|
FLAG43
|
||||||
|
FLAG44
|
||||||
|
FLAG45
|
||||||
|
FLAG46
|
||||||
|
FLAG47
|
||||||
|
FLAG48
|
||||||
|
FLAG49
|
||||||
|
FLAG50
|
||||||
|
FLAG51
|
||||||
|
FLAG52
|
||||||
|
FLAG53
|
||||||
|
FLAG54
|
||||||
|
FLAG55
|
||||||
|
FLAG56
|
||||||
|
FLAG57
|
||||||
|
FLAG58
|
||||||
|
FLAG59
|
||||||
|
FLAG60
|
||||||
|
FLAG61
|
||||||
|
FLAG62
|
||||||
|
FLAG63
|
||||||
|
|
||||||
LIKE_URL
|
ID
|
||||||
LIKE_NUMBER
|
SIC
|
||||||
|
DENSE
|
||||||
|
SHAPE
|
||||||
|
PREFIX
|
||||||
|
SUFFIX
|
||||||
|
|
||||||
OFT_LOWER
|
LENGTH
|
||||||
OFT_TITLE
|
CLUSTER
|
||||||
OFT_UPPER
|
POS_TYPE
|
||||||
|
LEMMA
|
||||||
IN_MALES
|
|
||||||
IN_FEMALES
|
|
||||||
IN_SURNAMES
|
|
||||||
IN_PLACES
|
|
||||||
IN_GAMES
|
|
||||||
IN_CELEBS
|
|
||||||
IN_NAMES
|
|
||||||
|
|
||||||
|
|
||||||
cdef struct Lexeme:
|
cdef struct Lexeme:
|
||||||
flag_t flags
|
flags_t flags
|
||||||
|
|
||||||
id_t id
|
attr_t id
|
||||||
id_t sic
|
attr_t sic
|
||||||
id_t norm
|
attr_t dense
|
||||||
id_t shape
|
attr_t shape
|
||||||
id_t asciied
|
attr_t prefix
|
||||||
id_t prefix
|
attr_t suffix
|
||||||
id_t suffix
|
|
||||||
|
attr_t length
|
||||||
|
attr_t cluster
|
||||||
|
attr_t pos_type
|
||||||
|
|
||||||
float prob
|
float prob
|
||||||
|
float sentiment
|
||||||
len_t length
|
|
||||||
tag_t cluster
|
|
||||||
tag_t postype
|
|
||||||
tag_t supersense
|
|
||||||
|
|
||||||
|
|
||||||
cdef Lexeme EMPTY_LEXEME
|
cdef Lexeme EMPTY_LEXEME
|
||||||
|
|
||||||
cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
|
|
||||||
StringStore store, dict props) except *
|
cpdef Lexeme init(id_t i, unicode string, hash_t hashed, StringStore store,
|
||||||
|
dict props) except *
|
||||||
|
|
||||||
|
|
||||||
cdef inline bint check_flag(Lexeme* lexeme, size_t flag_id) nogil:
|
cdef inline bint check_flag(const Lexeme* lexeme, attr_id_t flag_id) nogil:
|
||||||
return lexeme.flags & (1 << flag_id)
|
return lexeme.flags & (1 << flag_id)
|
||||||
|
|
||||||
|
|
||||||
|
cdef inline attr_t get_attr(const Lexeme* lex, attr_id_t feat_name) nogil:
|
||||||
|
if feat_name < (sizeof(flags_t) * 8):
|
||||||
|
return check_flag(lex, feat_name)
|
||||||
|
elif feat_name == ID:
|
||||||
|
return lex.id
|
||||||
|
elif feat_name == SIC:
|
||||||
|
return lex.sic
|
||||||
|
elif feat_name == DENSE:
|
||||||
|
return lex.dense
|
||||||
|
elif feat_name == SHAPE:
|
||||||
|
return lex.shape
|
||||||
|
elif feat_name == PREFIX:
|
||||||
|
return lex.prefix
|
||||||
|
elif feat_name == SUFFIX:
|
||||||
|
return lex.suffix
|
||||||
|
elif feat_name == LENGTH:
|
||||||
|
return lex.length
|
||||||
|
elif feat_name == CLUSTER:
|
||||||
|
return lex.cluster
|
||||||
|
elif feat_name == POS_TYPE:
|
||||||
|
return lex.pos_type
|
||||||
|
else:
|
||||||
|
return 0
|
||||||
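The first 64 attribute IDs are interpreted as bit positions in the 64-bit flags field, which is what check_flag and the first branch of get_attr rely on. A tiny illustration with hypothetical flag values:

    flags = (1 << 3) | (1 << 7)    # hypothetical lexeme with FLAG3 and FLAG7 set
    assert flags & (1 << 3)        # check_flag(lex, FLAG3) would be non-zero
    assert not flags & (1 << 5)    # FLAG5 is unset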
|
|
|
@ -6,67 +6,25 @@ from libc.string cimport memset
|
||||||
|
|
||||||
import orth
|
import orth
|
||||||
|
|
||||||
from .utf8string cimport Utf8Str
|
|
||||||
|
|
||||||
OOV_DIST_FLAGS = 0
|
|
||||||
|
|
||||||
memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))
|
memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))
|
||||||
|
|
||||||
|
|
||||||
def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc):
|
|
||||||
cdef flag_t flags = 0
|
|
||||||
flags |= orth.is_alpha(string) << IS_ALPHA
|
|
||||||
flags |= orth.is_ascii(string) << IS_ASCII
|
|
||||||
flags |= orth.is_digit(string) << IS_DIGIT
|
|
||||||
flags |= orth.is_lower(string) << IS_LOWER
|
|
||||||
flags |= orth.is_punct(string) << IS_PUNCT
|
|
||||||
flags |= orth.is_space(string) << IS_SPACE
|
|
||||||
flags |= orth.is_title(string) << IS_TITLE
|
|
||||||
flags |= orth.is_upper(string) << IS_UPPER
|
|
||||||
|
|
||||||
flags |= orth.like_url(string) << LIKE_URL
|
|
||||||
flags |= orth.like_number(string) << LIKE_NUMBER
|
|
||||||
return flags
|
|
||||||
|
|
||||||
|
|
||||||
cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
|
cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
|
||||||
StringStore store, dict props) except *:
|
StringStore string_store, dict props) except *:
|
||||||
cdef Lexeme lex
|
cdef Lexeme lex
|
||||||
lex.id = i
|
lex.id = i
|
||||||
lex.length = len(string)
|
lex.length = len(string)
|
||||||
lex.sic = get_string_id(string, store)
|
lex.sic = string_store[string]
|
||||||
|
|
||||||
lex.cluster = props.get('cluster', 0)
|
lex.cluster = props.get('cluster', 0)
|
||||||
lex.postype = props.get('postype', 0)
|
lex.pos_type = props.get('pos_type', 0)
|
||||||
lex.supersense = props.get('supersense', 0)
|
|
||||||
lex.prob = props.get('prob', 0)
|
lex.prob = props.get('prob', 0)
|
||||||
|
|
||||||
cdef float upper_pc = props.get('upper_pc', 0.0)
|
lex.prefix = string_store[string[:1]]
|
||||||
cdef float lower_pc = props.get('lower_pc', 0.0)
|
lex.suffix = string_store[string[-3:]]
|
||||||
cdef float title_pc = props.get('title_pc', 0.0)
|
lex.shape = string_store[orth.word_shape(string)]
|
||||||
|
lex.dense = string_store[props['dense']]
|
||||||
|
|
||||||
lex.prefix = get_string_id(string[0], store)
|
lex.flags = props.get('flags', 0)
|
||||||
lex.suffix = get_string_id(string[-3:], store)
|
|
||||||
if upper_pc or lower_pc or title_pc:
|
|
||||||
canon_cased = orth.canon_case(string, upper_pc, title_pc, lower_pc)
|
|
||||||
lex.norm = get_string_id(canon_cased, store)
|
|
||||||
else:
|
|
||||||
lex.norm = lex.sic
|
|
||||||
lex.shape = get_string_id(orth.word_shape(string), store)
|
|
||||||
lex.asciied = get_string_id(orth.asciied(string), store)
|
|
||||||
lex.flags = get_flags(string, upper_pc, title_pc, lower_pc)
|
|
||||||
|
|
||||||
lex.flags |= props.get('in_males', 0) << IN_MALES
|
|
||||||
lex.flags |= props.get('in_females', 0) << IN_FEMALES
|
|
||||||
lex.flags |= props.get('in_surnames', 0) << IN_SURNAMES
|
|
||||||
lex.flags |= props.get('in_places', 0) << IN_PLACES
|
|
||||||
lex.flags |= props.get('in_celebs', 0) << IN_CELEBS
|
|
||||||
lex.flags |= props.get('in_games', 0) << IN_GAMES
|
|
||||||
lex.flags |= props.get('in_names', 0) << IN_NAMES
|
|
||||||
return lex
|
return lex
|
||||||
|
|
||||||
|
|
||||||
cdef id_t get_string_id(unicode string, StringStore store) except 0:
|
|
||||||
cdef bytes byte_string = string.encode('utf8')
|
|
||||||
cdef Utf8Str* orig_str = store.intern(<char*>byte_string, len(byte_string))
|
|
||||||
return orig_str.i
|
|
||||||
|
|
45  spacy/morphology.pxd  (new file)
@@ -0,0 +1,45 @@
from .tokens cimport TokenC
from .lexeme cimport Lexeme
from .utf8string cimport StringStore
from .typedefs cimport id_t, Morphology

from preshed.maps cimport PreshMapArray
from cymem.cymem cimport Pool


# Google universal tag set
cpdef enum univ_tag_t:
    NO_TAG
    ADJ
    ADV
    ADP
    CONJ
    DET
    NOUN
    NUM
    PRON
    PRT
    VERB
    X
    PUNCT
    EOL
    N_UNIV_TAGS


cdef struct PosTag:
    Morphology morph
    int id
    univ_tag_t pos


cdef class Morphologizer:
    cdef Pool mem
    cdef StringStore strings
    cdef object lemmatizer
    cdef PosTag* tags
    cdef readonly list tag_names

    cdef PreshMapArray _cache
    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
    cdef int set_morph(self, const int i, TokenC* tokens) except -1
117  spacy/morphology.pyx  (new file)
@@ -0,0 +1,117 @@
# cython: profile=True
# cython: embedsignature=True
from os import path
import json

from .lemmatizer import Lemmatizer
from .typedefs cimport id_t

UNIV_TAGS = {
    'NULL': NO_TAG,
    'ADJ': ADJ,
    'ADV': ADV,
    'ADP': ADP,
    'CONJ': CONJ,
    'DET': DET,
    'NOUN': NOUN,
    'NUM': NUM,
    'PRON': PRON,
    'PRT': PRT,
    'VERB': VERB,
    'X': X,
    '.': PUNCT,
    'EOL': EOL
}


cdef struct _Cached:
    Morphology morph
    int lemma


cdef class Morphologizer:
    """Given a POS tag and a Lexeme, find its lemma and morphological analysis.
    """
    def __init__(self, StringStore strings, data_dir):
        self.mem = Pool()
        self.strings = strings
        cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
        tag_map = cfg['tag_map']
        self.tag_names = cfg['tag_names']
        self.lemmatizer = Lemmatizer(path.join(data_dir, '..', 'wordnet'))
        self._cache = PreshMapArray(len(self.tag_names))
        self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag))
        for i, tag in enumerate(self.tag_names):
            pos, props = tag_map[tag]
            self.tags[i].id = i
            self.tags[i].pos = pos
            self.tags[i].morph.number = props.get('number', 0)
            self.tags[i].morph.tenspect = props.get('tenspect', 0)
            self.tags[i].morph.mood = props.get('mood', 0)
            self.tags[i].morph.gender = props.get('gender', 0)
            self.tags[i].morph.person = props.get('person', 0)
            self.tags[i].morph.case = props.get('case', 0)
            self.tags[i].morph.misc = props.get('misc', 0)
        if path.exists(path.join(data_dir, 'morphs.json')):
            with open(path.join(data_dir, 'morphs.json')) as file_:
                self.load_exceptions(json.load(file_))

    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
        if self.lemmatizer is None:
            return lex.sic
        if pos != NOUN and pos != VERB and pos != ADJ:
            return lex.sic
        cdef bytes py_string = self.strings[lex.sic]
        cdef set lemma_strings
        cdef bytes lemma_string
        if pos == NOUN:
            lemma_strings = self.lemmatizer.noun(py_string)
        elif pos == VERB:
            lemma_strings = self.lemmatizer.verb(py_string)
        else:
            assert pos == ADJ
            lemma_strings = self.lemmatizer.adj(py_string)
        lemma_string = sorted(lemma_strings)[0]
        lemma = self.strings.intern(lemma_string, len(lemma_string)).i
        return lemma

    cdef int set_morph(self, const int i, TokenC* tokens) except -1:
        cdef const PosTag* tag = &self.tags[tokens[i].pos]
        cached = <_Cached*>self._cache.get(tag.id, tokens[i].lex.sic)
        if cached is NULL:
            cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
            cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
            cached.morph = tag.morph
            self._cache.set(tag.id, tokens[i].lex.sic, <void*>cached)

        tokens[i].lemma = cached.lemma
        tokens[i].morph = cached.morph

    def load_exceptions(self, dict exc):
        cdef unicode pos_str
        cdef unicode form_str
        cdef unicode lemma_str
        cdef dict entries
        cdef dict props
        cdef int lemma
        cdef id_t sic
        cdef univ_tag_t pos
        for pos_str, entries in exc.items():
            pos = self.tag_names.index(pos_str)
            for form_str, props in entries.items():
                lemma_str = props.get('L', form_str)
                sic = self.strings[form_str]
                cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
                cached.lemma = self.strings[lemma_str]
                set_morph_from_dict(&cached.morph, props)
                self._cache.set(pos, sic, <void*>cached)


cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
    morph.number = props.get('number', 0)
    morph.tenspect = props.get('tenspect', 0)
    morph.mood = props.get('mood', 0)
    morph.gender = props.get('gender', 0)
    morph.person = props.get('person', 0)
    morph.case = props.get('case', 0)
    morph.misc = props.get('misc', 0)
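A minimal sketch of the exception data load_exceptions consumes: the file name morphs.json and the 'L' key come from the code above, while the concrete tag name and entry below are hypothetical, for illustration only.

    # hypothetical morphs.json content: tag name -> surface form -> properties;
    # 'L' overrides the lemma, the remaining keys feed set_morph_from_dict.
    morph_exceptions = {
        "VBZ": {
            "is": {"L": "be", "number": 1, "person": 3},
        },
    }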
|
@ -2,6 +2,7 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
import unicodedata
|
import unicodedata
|
||||||
from unidecode import unidecode
|
from unidecode import unidecode
|
||||||
|
import re
|
||||||
|
|
||||||
import math
|
import math
|
||||||
|
|
||||||
|
|
|
@ -147,6 +147,7 @@ Y PRT
|
||||||
Z NOUN
|
Z NOUN
|
||||||
^ NOUN
|
^ NOUN
|
||||||
~ X
|
~ X
|
||||||
`` .""".strip().split('\n'))
|
`` .
|
||||||
|
EOL EOL""".strip().split('\n'))
|
||||||
return mapping[tag]
|
return mapping[tag]
|
||||||
|
|
||||||
|
|
|
@ -1,34 +1,23 @@
|
||||||
|
from libc.stdint cimport uint8_t
|
||||||
|
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
|
||||||
from thinc.learner cimport LinearModel
|
from thinc.learner cimport LinearModel
|
||||||
from thinc.features cimport Extractor
|
from thinc.features cimport Extractor
|
||||||
from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
|
from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
|
||||||
|
|
||||||
from .typedefs cimport hash_t
|
from preshed.maps cimport PreshMapArray
|
||||||
from .context cimport Slots
|
|
||||||
|
from .typedefs cimport hash_t, id_t
|
||||||
from .tokens cimport Tokens
|
from .tokens cimport Tokens
|
||||||
|
|
||||||
|
|
||||||
cpdef enum TagType:
|
|
||||||
POS
|
|
||||||
ENTITY
|
|
||||||
SENSE
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Tagger:
|
cdef class Tagger:
|
||||||
cpdef int set_tags(self, Tokens tokens) except -1
|
cdef class_t predict(self, const atom_t* context, object golds=*) except *
|
||||||
cpdef class_t predict(self, int i, Tokens tokens) except 0
|
|
||||||
cpdef int tell_answer(self, list gold) except -1
|
|
||||||
|
|
||||||
cpdef readonly Pool mem
|
cpdef readonly Pool mem
|
||||||
cpdef readonly Extractor extractor
|
cpdef readonly Extractor extractor
|
||||||
cpdef readonly LinearModel model
|
cpdef readonly LinearModel model
|
||||||
|
|
||||||
cpdef readonly TagType tag_type
|
|
||||||
cpdef readonly list tag_names
|
cpdef readonly list tag_names
|
||||||
|
cdef dict tagdict
|
||||||
cdef class_t _guess
|
|
||||||
cdef atom_t* _context
|
|
||||||
cdef feat_t* _feats
|
|
||||||
cdef weight_t* _values
|
|
||||||
cdef weight_t* _scores
|
|
||||||
|
|
183  spacy/tagger.pyx
|
@ -1,5 +1,4 @@
|
||||||
# cython: profile=True
|
# cython: profile=True
|
||||||
from __future__ import print_function
|
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
from __future__ import division
|
from __future__ import division
|
||||||
|
|
||||||
|
@ -10,155 +9,59 @@ import random
|
||||||
import json
|
import json
|
||||||
import cython
|
import cython
|
||||||
|
|
||||||
|
from thinc.features cimport Feature, count_feats
|
||||||
from .context cimport fill_context
|
|
||||||
from .context cimport N_FIELDS
|
|
||||||
|
|
||||||
from thinc.features cimport ConjFeat
|
|
||||||
|
|
||||||
|
|
||||||
NULL_TAG = 0
|
def setup_model_dir(tag_names, tag_map, tag_counts, templates, model_dir):
|
||||||
|
|
||||||
|
|
||||||
def setup_model_dir(tag_type, tag_names, templates, model_dir):
|
|
||||||
if path.exists(model_dir):
|
if path.exists(model_dir):
|
||||||
shutil.rmtree(model_dir)
|
shutil.rmtree(model_dir)
|
||||||
os.mkdir(model_dir)
|
os.mkdir(model_dir)
|
||||||
config = {
|
config = {
|
||||||
'tag_type': tag_type,
|
|
||||||
'templates': templates,
|
'templates': templates,
|
||||||
'tag_names': tag_names,
|
'tag_names': tag_names,
|
||||||
|
'tag_map': tag_map,
|
||||||
|
'tag_counts': tag_counts,
|
||||||
}
|
}
|
||||||
with open(path.join(model_dir, 'config.json'), 'w') as file_:
|
with open(path.join(model_dir, 'config.json'), 'w') as file_:
|
||||||
json.dump(config, file_)
|
json.dump(config, file_)
|
||||||
|
|
||||||
|
|
||||||
def train(train_sents, model_dir, nr_iter=10):
|
|
||||||
cdef Tokens tokens
|
|
||||||
tagger = Tagger(model_dir)
|
|
||||||
for _ in range(nr_iter):
|
|
||||||
n_corr = 0
|
|
||||||
total = 0
|
|
||||||
for tokens, golds in train_sents:
|
|
||||||
assert len(tokens) == len(golds), [t.string for t in tokens]
|
|
||||||
for i in range(tokens.length):
|
|
||||||
if tagger.tag_type == POS:
|
|
||||||
gold = _get_gold_pos(i, golds, tokens.pos)
|
|
||||||
elif tagger.tag_type == ENTITY:
|
|
||||||
gold = _get_gold_ner(i, golds, tokens.ner)
|
|
||||||
guess = tagger.predict(i, tokens)
|
|
||||||
tokens.set_tag(i, tagger.tag_type, guess)
|
|
||||||
if gold is not None:
|
|
||||||
tagger.tell_answer(gold)
|
|
||||||
total += 1
|
|
||||||
n_corr += guess in gold
|
|
||||||
#print('%s\t%d\t%d' % (tokens[i].string, guess, gold))
|
|
||||||
print('%.4f' % ((n_corr / total) * 100))
|
|
||||||
random.shuffle(train_sents)
|
|
||||||
tagger.model.end_training()
|
|
||||||
tagger.model.dump(path.join(model_dir, 'model'))
|
|
||||||
|
|
||||||
|
|
||||||
cdef object _get_gold_pos(i, golds, int* pred):
|
|
||||||
if golds[i] == 0:
|
|
||||||
return None
|
|
||||||
else:
|
|
||||||
return [golds[i]]
|
|
||||||
|
|
||||||
|
|
||||||
cdef object _get_gold_ner(i, golds, int* ner):
|
|
||||||
if golds[i] == 0:
|
|
||||||
return None
|
|
||||||
else:
|
|
||||||
return [golds[i]]
|
|
||||||
|
|
||||||
|
|
||||||
def evaluate(tagger, sents):
|
|
||||||
n_corr = 0
|
|
||||||
total = 0
|
|
||||||
for tokens, golds in sents:
|
|
||||||
for i, gold in enumerate(golds):
|
|
||||||
guess = tagger.predict(i, tokens)
|
|
||||||
tokens.set_tag(i, tagger.tag_type, guess)
|
|
||||||
if gold != NULL_TAG:
|
|
||||||
total += 1
|
|
||||||
n_corr += guess == gold
|
|
||||||
return n_corr / total
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Tagger:
|
cdef class Tagger:
|
||||||
"""Assign part-of-speech, named entity or supersense tags, using greedy
|
"""Predict some type of tag, using greedy decoding. The tagger reads its
|
||||||
decoding. The tagger reads its model and configuration from disk.
|
model and configuration from disk.
|
||||||
"""
|
"""
|
||||||
def __init__(self, model_dir):
|
def __init__(self, model_dir):
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
cfg = json.load(open(path.join(model_dir, 'config.json')))
|
cfg = json.load(open(path.join(model_dir, 'config.json')))
|
||||||
templates = cfg['templates']
|
templates = cfg['templates']
|
||||||
|
univ_counts = {}
|
||||||
|
cdef unicode tag
|
||||||
|
cdef unicode univ_tag
|
||||||
self.tag_names = cfg['tag_names']
|
self.tag_names = cfg['tag_names']
|
||||||
self.tag_type = cfg['tag_type']
|
self.tagdict = _make_tag_dict(cfg['tag_counts'])
|
||||||
self.extractor = Extractor(templates, [ConjFeat] * len(templates))
|
self.extractor = Extractor(templates)
|
||||||
self.model = LinearModel(len(self.tag_names))
|
self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2)
|
||||||
if path.exists(path.join(model_dir, 'model')):
|
if path.exists(path.join(model_dir, 'model')):
|
||||||
self.model.load(path.join(model_dir, 'model'))
|
self.model.load(path.join(model_dir, 'model'))
|
||||||
|
|
||||||
self._context = <atom_t*>self.mem.alloc(N_FIELDS, sizeof(atom_t))
|
cdef class_t predict(self, atom_t* context, object golds=None) except *:
|
||||||
self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
|
"""Predict the tag of tokens[i].
|
||||||
self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
|
|
||||||
self._scores = <weight_t*>self.mem.alloc(self.model.nr_class, sizeof(weight_t))
|
|
||||||
self._guess = NULL_TAG
|
|
||||||
|
|
||||||
cpdef int set_tags(self, Tokens tokens) except -1:
|
|
||||||
"""Assign tags to a Tokens object.
|
|
||||||
|
|
||||||
>>> tokens = EN.tokenize(u'An example sentence.')
|
|
||||||
>>> assert tokens[0].pos == 'NO_TAG'
|
|
||||||
>>> EN.pos_tagger.set_tags(tokens)
|
|
||||||
>>> assert tokens[0].pos == 'DT'
|
|
||||||
"""
|
|
||||||
cdef int i
|
|
||||||
for i in range(tokens.length):
|
|
||||||
tokens.set_tag(i, self.tag_type, self.predict(i, tokens))
|
|
||||||
|
|
||||||
cpdef class_t predict(self, int i, Tokens tokens) except 0:
|
|
||||||
"""Predict the tag of tokens[i]. The tagger remembers the features and
|
|
||||||
prediction, in case you later call tell_answer.
|
|
||||||
|
|
||||||
>>> tokens = EN.tokenize(u'An example sentence.')
|
>>> tokens = EN.tokenize(u'An example sentence.')
|
||||||
>>> tag = EN.pos_tagger.predict(0, tokens)
|
>>> tag = EN.pos_tagger.predict(0, tokens)
|
||||||
>>> assert tag == EN.pos_tagger.tag_id('DT') == 5
|
>>> assert tag == EN.pos_tagger.tag_id('DT') == 5
|
||||||
"""
|
"""
|
||||||
fill_context(self._context, i, tokens)
|
cdef int n_feats
|
||||||
self.extractor.extract(self._feats, self._values, self._context, NULL)
|
cdef Feature* feats = self.extractor.get_feats(context, &n_feats)
|
||||||
self._guess = self.model.score(self._scores, self._feats, self._values)
|
cdef weight_t* scores = self.model.get_scores(feats, n_feats)
|
||||||
return self._guess
|
guess = _arg_max(scores, self.model.nr_class)
|
||||||
|
if golds is not None and guess not in golds:
|
||||||
cpdef int tell_answer(self, list golds) except -1:
|
best = _arg_max_among(scores, golds)
|
||||||
"""Provide the correct tag for the word the tagger was last asked to predict.
|
counts = {guess: {}, best: {}}
|
||||||
During Tagger.predict, the tagger remembers the features and prediction
|
count_feats(counts[guess], feats, n_feats, -1)
|
||||||
for the example. These are used to calculate a weight update given the
|
count_feats(counts[best], feats, n_feats, 1)
|
||||||
correct label.
|
|
||||||
|
|
||||||
>>> tokens = EN.tokenize('An example sentence.')
|
|
||||||
>>> guess = EN.pos_tagger.predict(1, tokens)
|
|
||||||
>>> JJ = EN.pos_tagger.tag_id('JJ')
|
|
||||||
>>> JJ
|
|
||||||
7
|
|
||||||
>>> EN.pos_tagger.tell_answer(JJ)
|
|
||||||
"""
|
|
||||||
cdef class_t guess = self._guess
|
|
||||||
if guess in golds:
|
|
||||||
self.model.update({})
|
|
||||||
return 0
|
|
||||||
best_gold = golds[0]
|
|
||||||
best_score = self._scores[best_gold-1]
|
|
||||||
for gold in golds[1:]:
|
|
||||||
if self._scores[gold-1] > best_gold:
|
|
||||||
best_score = self._scores[best_gold-1]
|
|
||||||
best_gold = gold
|
|
||||||
counts = {guess: {}, best_gold: {}}
|
|
||||||
self.extractor.count(counts[best_gold], self._feats, 1)
|
|
||||||
self.extractor.count(counts[guess], self._feats, -1)
|
|
||||||
self.model.update(counts)
|
self.model.update(counts)
|
||||||
|
return guess
|
||||||
|
|
||||||
def tag_id(self, object tag_name):
|
def tag_id(self, object tag_name):
|
||||||
"""Encode tag_name into a tag ID integer."""
|
"""Encode tag_name into a tag ID integer."""
|
||||||
|
@ -167,3 +70,41 @@ cdef class Tagger:
|
||||||
tag_id = len(self.tag_names)
|
tag_id = len(self.tag_names)
|
||||||
self.tag_names.append(tag_name)
|
self.tag_names.append(tag_name)
|
||||||
return tag_id
|
return tag_id
|
||||||
|
|
||||||
|
|
||||||
|
def _make_tag_dict(counts):
|
||||||
|
freq_thresh = 20
|
||||||
|
ambiguity_thresh = 0.97
|
||||||
|
tagdict = {}
|
||||||
|
cdef atom_t word
|
||||||
|
cdef atom_t tag
|
||||||
|
for word_str, tag_freqs in counts.items():
|
||||||
|
tag_str, mode = max(tag_freqs.items(), key=lambda item: item[1])
|
||||||
|
n = sum(tag_freqs.values())
|
||||||
|
word = int(word_str)
|
||||||
|
tag = int(tag_str)
|
||||||
|
if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
|
||||||
|
tagdict[word] = tag
|
||||||
|
return tagdict
|
||||||
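A worked sketch of the thresholds in _make_tag_dict above: a word only enters the tag dictionary if it was seen at least 20 times and its most frequent tag accounts for at least 97% of those occurrences. The counts below are hypothetical word-ID/tag-ID frequencies, not real model data.

    counts = {'1012': {'13': 998, '7': 2},    # frequent and nearly unambiguous: kept
              '2044': {'13': 6, '7': 5}}      # too rare and too ambiguous: dropped
    tagdict = {}
    for word_str, tag_freqs in counts.items():
        tag_str, mode = max(tag_freqs.items(), key=lambda item: item[1])
        n = sum(tag_freqs.values())
        if n >= 20 and float(mode) / n >= 0.97:
            tagdict[int(word_str)] = int(tag_str)
    assert tagdict == {1012: 13}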
|
|
||||||
|
|
||||||
|
cdef class_t _arg_max(weight_t* scores, int n_classes) except 9000:
|
||||||
|
cdef int best = 0
|
||||||
|
cdef weight_t score = scores[best]
|
||||||
|
cdef int i
|
||||||
|
for i in range(1, n_classes):
|
||||||
|
if scores[i] >= score:
|
||||||
|
score = scores[i]
|
||||||
|
best = i
|
||||||
|
return best
|
||||||
|
|
||||||
|
|
||||||
|
cdef class_t _arg_max_among(weight_t* scores, list classes):
|
||||||
|
cdef int best = classes[0]
|
||||||
|
cdef weight_t score = scores[best]
|
||||||
|
cdef class_t clas
|
||||||
|
for clas in classes:
|
||||||
|
if scores[clas] > score:
|
||||||
|
score = scores[clas]
|
||||||
|
best = clas
|
||||||
|
return best
|
||||||
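A minimal pure-Python sketch of the greedy decision the new predict method makes: take the highest-scoring class overall, and, when gold classes are supplied, also find the best-scoring gold class to drive the perceptron update. The score values are made up.

    def arg_max(scores):
        return max(range(len(scores)), key=lambda i: scores[i])

    def arg_max_among(scores, classes):
        return max(classes, key=lambda c: scores[c])

    scores = [0.1, 2.5, -0.3, 1.9]
    assert arg_max(scores) == 1
    assert arg_max_among(scores, [2, 3]) == 3   # best among the allowed (gold) classes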
|
|
|
@ -1,40 +1,55 @@
|
||||||
|
import numpy as np
|
||||||
|
cimport numpy as np
|
||||||
|
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
from thinc.typedefs cimport atom_t
|
||||||
|
|
||||||
from .lexeme cimport Lexeme
|
from .lexeme cimport Lexeme
|
||||||
from .typedefs cimport flag_t
|
|
||||||
from .utf8string cimport StringStore
|
|
||||||
from .tagger cimport TagType
|
|
||||||
|
|
||||||
from thinc.typedefs cimport atom_t
|
from .typedefs cimport flags_t
|
||||||
|
from .typedefs cimport Morphology
|
||||||
|
from .lang cimport Language
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
cdef struct TokenC:
|
||||||
|
const Lexeme* lex
|
||||||
|
Morphology morph
|
||||||
|
int idx
|
||||||
|
int pos
|
||||||
|
int lemma
|
||||||
|
int sense
|
||||||
|
|
||||||
|
|
||||||
|
ctypedef const Lexeme* const_Lexeme_ptr
|
||||||
|
ctypedef TokenC* TokenC_ptr
|
||||||
|
|
||||||
|
ctypedef fused LexemeOrToken:
|
||||||
|
const_Lexeme_ptr
|
||||||
|
TokenC_ptr
|
||||||
|
|
||||||
|
|
||||||
cdef class Tokens:
|
cdef class Tokens:
|
||||||
cdef Pool mem
|
cdef Pool mem
|
||||||
cdef StringStore _string_store
|
cdef Language lang
|
||||||
|
cdef list tag_names
|
||||||
|
|
||||||
cdef Lexeme** _lex_ptr
|
cdef TokenC* data
|
||||||
cdef int* _idx_ptr
|
|
||||||
cdef int* _pos_ptr
|
|
||||||
cdef int* _ner_ptr
|
|
||||||
cdef Lexeme** lex
|
|
||||||
cdef int* idx
|
|
||||||
cdef int* pos
|
|
||||||
cdef int* ner
|
|
||||||
|
|
||||||
cdef int length
|
cdef int length
|
||||||
cdef int max_length
|
cdef int max_length
|
||||||
|
|
||||||
cdef int extend(self, int i, Lexeme** lexemes, int n) except -1
|
cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1
|
||||||
cdef int push_back(self, int i, Lexeme* lexeme) except -1
|
|
||||||
cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1
|
cpdef np.ndarray[long, ndim=2] get_array(self, list features)
|
||||||
|
|
||||||
|
|
||||||
cdef class Token:
|
cdef class Token:
|
||||||
cdef StringStore _string_store
|
cdef public Language lang
|
||||||
cdef public int i
|
cdef public int i
|
||||||
cdef public int idx
|
cdef public int idx
|
||||||
cdef public int pos
|
cdef int pos
|
||||||
cdef public int ner
|
cdef int lemma
|
||||||
|
|
||||||
cdef public atom_t id
|
cdef public atom_t id
|
||||||
cdef public atom_t cluster
|
cdef public atom_t cluster
|
||||||
|
@ -51,4 +66,4 @@ cdef class Token:
|
||||||
|
|
||||||
cdef public float prob
|
cdef public float prob
|
||||||
|
|
||||||
cdef public flag_t flags
|
cdef public flags_t flags
|
||||||
|
|
151  spacy/tokens.pyx
|
@ -1,7 +1,15 @@
|
||||||
# cython: profile=True
|
# cython: profile=True
|
||||||
|
from preshed.maps cimport PreshMap
|
||||||
|
from preshed.counter cimport PreshCounter
|
||||||
|
|
||||||
from .lexeme cimport *
|
from .lexeme cimport *
|
||||||
cimport cython
|
cimport cython
|
||||||
from .tagger cimport POS, ENTITY
|
|
||||||
|
import numpy as np
|
||||||
|
cimport numpy as np
|
||||||
|
|
||||||
|
POS = 0
|
||||||
|
ENTITY = 0
|
||||||
|
|
||||||
DEF PADDING = 5
|
DEF PADDING = 5
|
||||||
|
|
||||||
|
@ -17,23 +25,13 @@ cdef class Tokens:
|
||||||
"""A sequence of references to Lexeme objects.
|
"""A sequence of references to Lexeme objects.
|
||||||
|
|
||||||
The Tokens class provides fast and memory-efficient access to lexical features,
|
The Tokens class provides fast and memory-efficient access to lexical features,
|
||||||
and can efficiently export the data to a numpy array. Specific languages
|
and can efficiently export the data to a numpy array.
|
||||||
create their own Tokens subclasses, to provide more convenient access to
|
|
||||||
language-specific features.
|
|
||||||
|
|
||||||
>>> from spacy.en import EN
|
>>> from spacy.en import EN
|
||||||
>>> tokens = EN.tokenize('An example sentence.')
|
>>> tokens = EN.tokenize('An example sentence.')
|
||||||
>>> tokens.string(0)
|
|
||||||
'An'
|
|
||||||
>>> tokens.prob(0) > tokens.prob(1)
|
|
||||||
True
|
|
||||||
>>> tokens.can_noun(0)
|
|
||||||
False
|
|
||||||
>>> tokens.can_noun(1)
|
|
||||||
True
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, StringStore string_store, string_length=0):
|
def __init__(self, Language lang, string_length=0):
|
||||||
self._string_store = string_store
|
self.lang = lang
|
||||||
if string_length >= 3:
|
if string_length >= 3:
|
||||||
size = int(string_length / 3.0)
|
size = int(string_length / 3.0)
|
||||||
else:
|
else:
|
||||||
|
@ -42,28 +40,18 @@ cdef class Tokens:
|
||||||
# Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
|
# Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
|
||||||
# However, we need to remember the true starting places, so that we can
|
# However, we need to remember the true starting places, so that we can
|
||||||
# realloc.
|
# realloc.
|
||||||
self._lex_ptr = <Lexeme**>self.mem.alloc(size + (PADDING*2), sizeof(Lexeme*))
|
data_start = <TokenC*>self.mem.alloc(size + (PADDING*2), sizeof(TokenC))
|
||||||
self._idx_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
|
|
||||||
self._pos_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
|
|
||||||
self._ner_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
|
|
||||||
self.lex = self._lex_ptr
|
|
||||||
self.idx = self._idx_ptr
|
|
||||||
self.pos = self._pos_ptr
|
|
||||||
self.ner = self._ner_ptr
|
|
||||||
cdef int i
|
cdef int i
|
||||||
for i in range(size + (PADDING*2)):
|
for i in range(size + (PADDING*2)):
|
||||||
self.lex[i] = &EMPTY_LEXEME
|
data_start[i].lex = &EMPTY_LEXEME
|
||||||
self.lex += PADDING
|
self.data = data_start + PADDING
|
||||||
self.idx += PADDING
|
|
||||||
self.pos += PADDING
|
|
||||||
self.ner += PADDING
|
|
||||||
self.max_length = size
|
self.max_length = size
|
||||||
self.length = 0
|
self.length = 0
|
||||||
|
|
||||||
def __getitem__(self, i):
|
def __getitem__(self, i):
|
||||||
bounds_check(i, self.length, PADDING)
|
bounds_check(i, self.length, PADDING)
|
||||||
return Token(self._string_store, i, self.idx[i], self.pos[i], self.ner[i],
|
return Token(self.lang, i, self.data[i].idx, self.data[i].pos,
|
||||||
self.lex[i][0])
|
self.data[i].lemma, self.data[i].lex[0])
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
|
@ -72,70 +60,78 @@ cdef class Tokens:
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return self.length
|
return self.length
|
||||||
|
|
||||||
cdef int push_back(self, int idx, Lexeme* lexeme) except -1:
|
cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
|
||||||
if self.length == self.max_length:
|
if self.length == self.max_length:
|
||||||
self._realloc(self.length * 2)
|
self._realloc(self.length * 2)
|
||||||
self.lex[self.length] = lexeme
|
cdef TokenC* t = &self.data[self.length]
|
||||||
self.idx[self.length] = idx
|
if LexemeOrToken is TokenC_ptr:
|
||||||
self.pos[self.length] = 0
|
t[0] = lex_or_tok[0]
|
||||||
self.ner[self.length] = 0
|
|
||||||
self.length += 1
|
|
||||||
return idx + lexeme.length
|
|
||||||
|
|
||||||
cdef int extend(self, int idx, Lexeme** lexemes, int n) except -1:
|
|
||||||
cdef int i
|
|
||||||
if lexemes == NULL:
|
|
||||||
return idx
|
|
||||||
elif n == 0:
|
|
||||||
i = 0
|
|
||||||
while lexemes[i] != NULL:
|
|
||||||
idx = self.push_back(idx, lexemes[i])
|
|
||||||
i += 1
|
|
||||||
else:
|
else:
|
||||||
for i in range(n):
|
t.lex = lex_or_tok
|
||||||
idx = self.push_back(idx, lexemes[i])
|
self.length += 1
|
||||||
return idx
|
return idx + t.lex.length
|
||||||
|
|
||||||
cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1:
|
@cython.boundscheck(False)
|
||||||
if tag_type == POS:
|
cpdef np.ndarray[long, ndim=2] get_array(self, list attr_ids):
|
||||||
self.pos[i] = tag
|
cdef int i, j
|
||||||
elif tag_type == ENTITY:
|
cdef attr_id_t feature
|
||||||
self.ner[i] = tag
|
cdef np.ndarray[long, ndim=2] output
|
||||||
|
output = np.ndarray(shape=(self.length, len(attr_ids)), dtype=int)
|
||||||
|
for i in range(self.length):
|
||||||
|
for j, feature in enumerate(attr_ids):
|
||||||
|
output[i, j] = get_attr(self.data[i].lex, feature)
|
||||||
|
return output
|
||||||
|
|
||||||
|
def count_by(self, attr_id_t attr_id):
|
||||||
|
cdef int i
|
||||||
|
cdef attr_t attr
|
||||||
|
cdef size_t count
|
||||||
|
|
||||||
|
cdef PreshCounter counts = PreshCounter(2 ** 8)
|
||||||
|
for i in range(self.length):
|
||||||
|
if attr_id == LEMMA:
|
||||||
|
attr = self.data[i].lemma
|
||||||
|
else:
|
||||||
|
attr = get_attr(self.data[i].lex, attr_id)
|
||||||
|
counts.inc(attr, 1)
|
||||||
|
return dict(counts)
|
||||||
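A usage sketch of the two export methods above. It assumes the English pipeline object EN used in this diff's doctests, and that the cpdef attribute IDs (SIC, CLUSTER, LEMMA) are importable from the compiled spacy.lexeme module; treat the import paths as assumptions rather than confirmed API.

    from spacy.en import EN
    from spacy.lexeme import SIC, CLUSTER, LEMMA   # assumed import path for the attr IDs

    tokens = EN.tokenize(u'An example sentence.')
    arr = tokens.get_array([SIC, CLUSTER])   # numpy array of shape (len(tokens), 2)
    freqs = tokens.count_by(LEMMA)           # dict mapping lemma string-IDs to counts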
|
|
||||||
def _realloc(self, new_size):
|
def _realloc(self, new_size):
|
||||||
self.max_length = new_size
|
self.max_length = new_size
|
||||||
n = new_size + (PADDING * 2)
|
n = new_size + (PADDING * 2)
|
||||||
self._lex_ptr = <Lexeme**>self.mem.realloc(self._lex_ptr, n * sizeof(Lexeme*))
|
# What we're storing is a "padded" array. We've jumped forward PADDING
|
||||||
self._idx_ptr = <int*>self.mem.realloc(self._idx_ptr, n * sizeof(int))
|
# places, and are storing the pointer to that. This way, we can access
|
||||||
self._pos_ptr = <int*>self.mem.realloc(self._pos_ptr, n * sizeof(int))
|
# words out-of-bounds, and get out-of-bounds markers.
|
||||||
self._ner_ptr = <int*>self.mem.realloc(self._ner_ptr, n * sizeof(int))
|
# Now that we want to realloc, we need the address of the true start,
|
||||||
self.lex = self._lex_ptr + PADDING
|
# so we jump the pointer back PADDING places.
|
||||||
self.idx = self._idx_ptr + PADDING
|
cdef TokenC* data_start = self.data - PADDING
|
||||||
self.pos = self._pos_ptr + PADDING
|
data_start = <TokenC*>self.mem.realloc(data_start, n * sizeof(TokenC))
|
||||||
self.ner = self._ner_ptr + PADDING
|
self.data = data_start + PADDING
|
||||||
|
cdef int i
|
||||||
for i in range(self.length, self.max_length + PADDING):
|
for i in range(self.length, self.max_length + PADDING):
|
||||||
self.lex[i] = &EMPTY_LEXEME
|
self.data[i].lex = &EMPTY_LEXEME
|
||||||
|
|
||||||
|
|
||||||
@cython.freelist(64)
|
@cython.freelist(64)
|
||||||
cdef class Token:
|
cdef class Token:
|
||||||
def __init__(self, StringStore string_store, int i, int idx, int pos, int ner,
|
def __init__(self, Language lang, int i, int idx,
|
||||||
dict lex):
|
int pos, int lemma, dict lex):
|
||||||
self._string_store = string_store
|
self.lang = lang
|
||||||
self.idx = idx
|
self.idx = idx
|
||||||
self.pos = pos
|
self.pos = pos
|
||||||
self.ner = ner
|
|
||||||
self.i = i
|
self.i = i
|
||||||
self.id = lex['id']
|
self.id = lex['id']
|
||||||
|
|
||||||
|
self.lemma = lemma
|
||||||
|
|
||||||
self.cluster = lex['cluster']
|
self.cluster = lex['cluster']
|
||||||
self.length = lex['length']
|
self.length = lex['length']
|
||||||
self.postype = lex['postype']
|
self.postype = lex['pos_type']
|
||||||
self.sensetype = lex['supersense']
|
self.sensetype = 0
|
||||||
self.sic = lex['sic']
|
self.sic = lex['sic']
|
||||||
self.norm = lex['norm']
|
self.norm = lex['dense']
|
||||||
self.shape = lex['shape']
|
self.shape = lex['shape']
|
||||||
self.suffix = lex['asciied']
|
self.suffix = lex['suffix']
|
||||||
self.prefix = lex['prefix']
|
self.prefix = lex['prefix']
|
||||||
|
|
||||||
self.prob = lex['prob']
|
self.prob = lex['prob']
|
||||||
|
@ -145,5 +141,16 @@ cdef class Token:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if self.sic == 0:
|
if self.sic == 0:
|
||||||
return ''
|
return ''
|
||||||
cdef bytes utf8string = self._string_store[self.sic]
|
cdef bytes utf8string = self.lang.lexicon.strings[self.sic]
|
||||||
return utf8string.decode('utf8')
|
return utf8string.decode('utf8')
|
||||||
|
|
||||||
|
property lemma:
|
||||||
|
def __get__(self):
|
||||||
|
if self.lemma == 0:
|
||||||
|
return self.string
|
||||||
|
cdef bytes utf8string = self.lang.lexicon.strings[self.lemma]
|
||||||
|
return utf8string.decode('utf8')
|
||||||
|
|
||||||
|
property pos:
|
||||||
|
def __get__(self):
|
||||||
|
return self.lang.pos_tagger.tag_names[self.pos]
|
||||||
|
|
|
@ -1,8 +1,20 @@
|
||||||
from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t
|
from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t
|
||||||
|
from libc.stdint cimport uint8_t
|
||||||
|
|
||||||
ctypedef uint64_t hash_t
|
ctypedef uint64_t hash_t
|
||||||
ctypedef char* utf8_t
|
ctypedef char* utf8_t
|
||||||
ctypedef uint64_t flag_t
|
ctypedef uint32_t attr_t
|
||||||
|
ctypedef uint64_t flags_t
|
||||||
ctypedef uint32_t id_t
|
ctypedef uint32_t id_t
|
||||||
ctypedef uint16_t len_t
|
ctypedef uint16_t len_t
|
||||||
ctypedef uint16_t tag_t
|
ctypedef uint16_t tag_t
|
||||||
|
|
||||||
|
|
||||||
|
cdef struct Morphology:
|
||||||
|
uint8_t number
|
||||||
|
uint8_t tenspect # Tense/aspect/voice
|
||||||
|
uint8_t mood
|
||||||
|
uint8_t gender
|
||||||
|
uint8_t person
|
||||||
|
uint8_t case
|
||||||
|
uint8_t misc
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
from preshed.maps cimport PreshMap
|
from preshed.maps cimport PreshMap
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
from murmurhash.mrmr cimport hash64
|
||||||
|
|
||||||
from .typedefs cimport utf8_t, id_t, hash_t
|
from .typedefs cimport utf8_t, id_t, hash_t
|
||||||
|
|
||||||
|
@ -11,11 +12,23 @@ cdef struct Utf8Str:
|
||||||
int length
|
int length
|
||||||
|
|
||||||
|
|
||||||
|
cdef struct UniStr:
|
||||||
|
Py_UNICODE* chars
|
||||||
|
size_t n
|
||||||
|
hash_t key
|
||||||
|
|
||||||
|
|
||||||
|
cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end) nogil:
|
||||||
|
s.chars = &chars[start]
|
||||||
|
s.n = end - start
|
||||||
|
s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
|
||||||
|
|
||||||
|
|
||||||
cdef class StringStore:
|
cdef class StringStore:
|
||||||
cdef Pool mem
|
cdef Pool mem
|
||||||
cdef PreshMap table
|
cdef PreshMap _map
|
||||||
cdef Utf8Str* strings
|
cdef Utf8Str* strings
|
||||||
cdef int size
|
cdef int size
|
||||||
cdef int _resize_at
|
cdef int _resize_at
|
||||||
|
|
||||||
cdef Utf8Str* intern(self, char* chars, int length) except NULL
|
cdef const Utf8Str* intern(self, char* chars, int length) except NULL
|
||||||
|
|
|
@ -5,10 +5,11 @@ import codecs
|
||||||
|
|
||||||
SEPARATOR = '\n|-SEP-|\n'
|
SEPARATOR = '\n|-SEP-|\n'
|
||||||
|
|
||||||
|
|
||||||
cdef class StringStore:
|
cdef class StringStore:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self.table = PreshMap()
|
self._map = PreshMap()
|
||||||
self._resize_at = 10000
|
self._resize_at = 10000
|
||||||
self.strings = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
|
self.strings = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
|
||||||
self.size = 1
|
self.size = 1
|
||||||
|
@ -17,26 +18,30 @@ cdef class StringStore:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.size-1
|
return self.size-1
|
||||||
|
|
||||||
def __getitem__(self, string_or_id):
|
def __getitem__(self, object string_or_id):
|
||||||
cdef bytes byte_string
|
cdef bytes byte_string
|
||||||
cdef Utf8Str* utf8str
|
cdef const Utf8Str* utf8str
|
||||||
if type(string_or_id) == int or type(string_or_id) == long:
|
if isinstance(string_or_id, int) or isinstance(string_or_id, long):
|
||||||
if string_or_id < 1 or string_or_id >= self.size:
|
if string_or_id < 1 or string_or_id >= self.size:
|
||||||
raise IndexError(string_or_id)
|
raise IndexError(string_or_id)
|
||||||
utf8str = &self.strings[<int>string_or_id]
|
utf8str = &self.strings[<int>string_or_id]
|
||||||
return utf8str.chars[:utf8str.length]
|
return utf8str.chars[:utf8str.length]
|
||||||
elif type(string_or_id) == bytes:
|
elif isinstance(string_or_id, bytes):
|
||||||
utf8str = self.intern(<char*>string_or_id, len(string_or_id))
|
utf8str = self.intern(<char*>string_or_id, len(string_or_id))
|
||||||
return utf8str.i
|
return utf8str.i
|
||||||
|
elif isinstance(string_or_id, unicode):
|
||||||
|
byte_string = string_or_id.encode('utf8')
|
||||||
|
utf8str = self.intern(<char*>byte_string, len(byte_string))
|
||||||
|
return utf8str.i
|
||||||
else:
|
else:
|
||||||
raise TypeError(type(string_or_id))
|
raise TypeError(type(string_or_id))
|
||||||
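A minimal sketch of the StringStore contract implemented above: unseen strings are interned to consecutive integer IDs starting at 1 (slot 0 is reserved for "missing"), and an ID maps back to the original UTF-8 bytes. The import path is assumed from this diff's module layout.

    from spacy.utf8string import StringStore   # assumed module path

    store = StringStore()
    i = store[b'hello']            # new string: interned, ID returned
    assert i == 1                  # IDs are consecutive; 0 is never handed out
    assert store[i] == b'hello'    # int lookup returns the bytes
    assert store[u'hello'] == i    # unicode input is encoded to UTF-8 first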
|
|
||||||
cdef Utf8Str* intern(self, char* chars, int length) except NULL:
|
cdef const Utf8Str* intern(self, char* chars, int length) except NULL:
|
||||||
# 0 means missing, but we don't bother offsetting the index. We waste
|
# 0 means missing, but we don't bother offsetting the index. We waste
|
||||||
# slot 0 to simplify the code, because it doesn't matter.
|
# slot 0 to simplify the code, because it doesn't matter.
|
||||||
assert length != 0
|
assert length != 0
|
||||||
cdef hash_t key = hash64(chars, length * sizeof(char), 0)
|
cdef hash_t key = hash64(chars, length * sizeof(char), 0)
|
||||||
cdef void* value = self.table.get(key)
|
cdef void* value = self._map.get(key)
|
||||||
cdef size_t i
|
cdef size_t i
|
||||||
if value == NULL:
|
if value == NULL:
|
||||||
if self.size == self._resize_at:
|
if self.size == self._resize_at:
|
||||||
|
@ -48,7 +53,7 @@ cdef class StringStore:
|
||||||
self.strings[i].chars = <char*>self.mem.alloc(length, sizeof(char))
|
self.strings[i].chars = <char*>self.mem.alloc(length, sizeof(char))
|
||||||
memcpy(self.strings[i].chars, chars, length)
|
memcpy(self.strings[i].chars, chars, length)
|
||||||
self.strings[i].length = length
|
self.strings[i].length = length
|
||||||
self.table.set(key, <void*>self.size)
|
self._map.set(key, <void*>self.size)
|
||||||
self.size += 1
|
self.size += 1
|
||||||
else:
|
else:
|
||||||
i = <size_t>value
|
i = <size_t>value
|
||||||
|
|
|
@@ -13,7 +13,8 @@ def utf8open(loc, mode='r'):

 def read_lang_data(name):
     data_dir = path.join(DATA_DIR, name)
-    tokenization = read_tokenization(name)
+    with open(path.join(data_dir, 'specials.json')) as file_:
+        tokenization = ujson.load(file_)
     prefix = read_prefix(data_dir)
     suffix = read_suffix(data_dir)
     infix = read_infix(data_dir)
@@ -26,12 +27,14 @@ def read_prefix(data_dir):
     expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
     return expression

+
 def read_suffix(data_dir):
     with utf8open(path.join(data_dir, 'suffix')) as file_:
         entries = file_.read().split('\n')
-    expression = '|'.join([re.escape(piece) + '$' for piece in entries if piece.strip()])
+    expression = '|'.join([piece + '$' for piece in entries if piece.strip()])
     return expression

+
 def read_infix(data_dir):
     with utf8open(path.join(data_dir, 'infix')) as file_:
         entries = file_.read().split('\n')
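Note that read_suffix no longer re.escape()s its entries, so a suffix entry may itself be a regular-expression fragment, while read_prefix still escapes. A small standalone sketch of the resulting expression (the entries below are invented for illustration; the real ones live in the language's 'suffix' data file):

import re

# Hypothetical suffix entries; real data comes from the 'suffix' file on disk.
entries = ["km", r"\)", "'s"]

# Mirrors read_suffix above: pieces are used verbatim and anchored at the end.
expression = '|'.join([piece + '$' for piece in entries if piece.strip()])
suffix_re = re.compile(expression)

assert suffix_re.search("10km").group() == "km"
assert suffix_re.search("(easy)").group() == ")"
assert suffix_re.search("Bill's").group() == "'s"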
@@ -20,15 +20,18 @@ def test_apostrophe():
 def test_LL():
     tokens = EN.tokenize("we'll")
     assert len(tokens) == 2
-    assert tokens[1].string == "will"
+    assert tokens[1].string == "'ll"
+    assert tokens[1].lemma == "will"
     assert tokens[0].string == "we"


 def test_aint():
     tokens = EN.tokenize("ain't")
     assert len(tokens) == 2
-    assert tokens[0].string == "are"
-    assert tokens[1].string == "not"
+    assert tokens[0].string == "ai"
+    assert tokens[0].lemma == "be"
+    assert tokens[1].string == "n't"
+    assert tokens[1].lemma == "not"


 def test_capitalized():
@@ -38,4 +41,12 @@ def test_capitalized():
     assert len(tokens) == 2
     tokens = EN.tokenize("Ain't")
     assert len(tokens) == 2
-    assert tokens[0].string == "Are"
+    assert tokens[0].string == "Ai"
+    assert tokens[0].lemma == "be"
+
+
+def test_punct():
+    tokens = EN.tokenize("We've")
+    assert len(tokens) == 2
+    tokens = EN.tokenize("``We've")
+    assert len(tokens) == 3
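The updated assertions reflect the new contraction handling: the surface .string keeps the original pieces while .lemma carries the normalised form. A hedged sketch of one more case in the same style (the expected split for "don't" follows the pattern above and the "n't" lexicon entry used later in this diff, but this exact test is not part of the commit):

from spacy.en import EN


def test_dont_split():
    # Assumed to behave like "ain't" above: pieces preserved, lemmas normalised.
    tokens = EN.tokenize("don't")
    assert [t.string for t in tokens] == ["do", "n't"]
    assert tokens[1].lemma == "not"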
@@ -27,3 +27,9 @@ def test_tweebo_challenge():
     assert tokens[19].string == '")'
     assert tokens[20].string == ':>'
     assert tokens[21].string == '....'
+
+
+def test_false_positive():
+    text = "example:)"
+    tokens = EN.tokenize(text)
+    assert len(tokens) == 3
@@ -19,8 +19,12 @@ def test_save_bytes(sstore):


 def test_save_unicode(sstore):
-    with pytest.raises(TypeError):
-        A_i = sstore['A']
+    Hello_i = sstore[u'Hello']
+    assert Hello_i == 1
+    assert sstore[u'Hello'] == 1
+    assert sstore[u'goodbye'] != Hello_i
+    assert sstore[u'hello'] != Hello_i
+    assert Hello_i == 1


 def test_zero_id(sstore):
tests/test_iter_lexicon.py (new file, 15 lines)
@@ -0,0 +1,15 @@
+import pytest
+
+from spacy.en import EN
+
+def test_range_iter():
+    EN.load()
+    for i in range(len(EN.lexicon)):
+        lex = EN.lexicon[i]
+
+
+def test_iter():
+    EN.load()
+    i = 0
+    for lex in EN.lexicon:
+        i += 1
tests/test_lemmatizer.py (new file, 34 lines)
@@ -0,0 +1,34 @@
+from spacy.lemmatizer import Lemmatizer, read_index, read_exc
+from spacy.util import DATA_DIR
+from os import path
+
+import pytest
+
+
+def test_read_index():
+    wn = path.join(DATA_DIR, 'wordnet')
+    index = read_index(path.join(wn, 'index.noun'))
+    assert 'man' in index
+    assert 'plantes' not in index
+    assert 'plant' in index
+
+
+def test_read_exc():
+    wn = path.join(DATA_DIR, 'wordnet')
+    exc = read_exc(path.join(wn, 'verb.exc'))
+    assert exc['was'] == ('be',)
+
+
+@pytest.fixture
+def lemmatizer():
+    return Lemmatizer(path.join(DATA_DIR, 'wordnet'))
+
+
+def test_noun_lemmas(lemmatizer):
+    do = lemmatizer.noun
+
+    assert do('aardwolves') == set(['aardwolf'])
+    assert do('aardwolf') == set(['aardwolf'])
+    assert do('planets') == set(['planet'])
+    assert do('ring') == set(['ring'])
+    assert do('axes') == set(['axis', 'axe', 'ax'])
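test_noun_lemmas exercises WordNet-style lemmatisation: exception lists plus suffix-substitution rules checked against the index. A self-contained sketch of that general scheme (this follows the standard WordNet "morphy" idea; the rules, index and exception table below are trimmed stand-ins, not spaCy's Lemmatizer or its data):

# Morphy-style noun lemmatisation sketch; the data below is invented for illustration.
NOUN_RULES = [("s", ""), ("ses", "s"), ("ves", "f"), ("xes", "x"),
              ("zes", "z"), ("ches", "ch"), ("shes", "sh")]
INDEX = set(["aardwolf", "planet", "ring", "axis", "axe", "ax"])
EXC = {"axes": ("axe", "ax", "axis")}  # hypothetical exception entries


def noun_lemmas(string):
    forms = set(EXC.get(string, ()))
    if string in INDEX:
        forms.add(string)
    for old, new in NOUN_RULES:
        if string.endswith(old):
            candidate = string[:len(string) - len(old)] + new
            if candidate in INDEX:
                forms.add(candidate)
    return forms or set([string])


assert noun_lemmas("aardwolves") == set(["aardwolf"])
assert noun_lemmas("planets") == set(["planet"])
assert noun_lemmas("axes") == set(["axis", "axe", "ax"])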
@@ -7,6 +7,7 @@ from spacy.lexeme import *


 def test_is_alpha():
+    EN.load()
     the = EN.lexicon['the']
     assert the['flags'] & (1 << IS_ALPHA)
     year = EN.lexicon['1999']
@@ -16,6 +17,7 @@ def test_is_alpha():


 def test_is_digit():
+    EN.load()
     the = EN.lexicon['the']
     assert not the['flags'] & (1 << IS_DIGIT)
     year = EN.lexicon['1999']
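The flag assertions above test single bits in the lexeme's integer 'flags' field. A tiny standalone illustration of that bitfield pattern (the bit positions here are placeholders, not the real IS_ALPHA/IS_DIGIT values from spacy.lexeme):

IS_ALPHA = 1  # placeholder bit position, for illustration only
IS_DIGIT = 2  # placeholder bit position, for illustration only

flags = 1 << IS_ALPHA               # a lexeme flagged as alphabetic
assert flags & (1 << IS_ALPHA)      # non-zero: the ALPHA bit is set
assert not flags & (1 << IS_DIGIT)  # zero: the DIGIT bit is clear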
@@ -1,11 +0,0 @@
-from spacy import util
-
-
-def test_load_en():
-    rules = util.read_tokenization('en')
-    assert len(rules) != 0
-    aint = [rule for rule in rules if rule[0] == "ain't"][0]
-    chunk, pieces = aint
-    assert chunk == "ain't"
-    assert pieces[0] == "are"
-    assert pieces[1] == "not"
@@ -34,7 +34,7 @@ def test_digits():
 def test_contraction():
     tokens = EN.tokenize("don't giggle")
     assert len(tokens) == 3
-    assert tokens[1].sic == EN.lexicon["not"]['sic']
+    assert tokens[1].sic == EN.lexicon["n't"]['sic']
     tokens = EN.tokenize("i said don't!")
     assert len(tokens) == 5
     assert tokens[4].sic == EN.lexicon['!']['sic']
@@ -71,30 +71,39 @@ def test_cnts1():
     tokens = EN.tokenize(text)
     assert len(tokens) == 8


 def test_cnts2():
     text = u"""U.N. regulations are not a part of their concern."""
     tokens = EN.tokenize(text)
     assert len(tokens) == 10


 def test_cnts3():
     text = u"“Isn't it?”"
     tokens = EN.tokenize(text)
-    assert len(tokens) == 6
+    words = [t.string for t in tokens]
+    assert len(words) == 6


 def test_cnts4():
     text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """
     tokens = EN.tokenize(text)
-    assert len(tokens) == 15
+    words = [t.string for t in tokens]
+    assert len(words) == 15


 def test_cnts5():
     text = """'Me too!', Mr. P. Delaware cried. """
     tokens = EN.tokenize(text)
     assert len(tokens) == 11


 def test_cnts6():
     text = u'They ran about 10km.'
     tokens = EN.tokenize(text)
-    assert len(tokens) == 6
+    words = [t.string for t in tokens]
+    assert len(words) == 6


 #def test_cnts7():
 #    text = 'But then the 6,000-year ice age came...'
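The count assertions above leave the expected segmentation implicit. As a concrete illustration, here is one of them spelled out (the exact token strings are an inference from the count and the suffix handling, not something the commit asserts):

from spacy.en import EN

# test_cnts6 expects 6 tokens; presumably the number, the unit and the final
# period are each split off. This expected word list is an assumption.
tokens = EN.tokenize(u'They ran about 10km.')
words = [t.string for t in tokens]
assert words == [u'They', u'ran', u'about', u'10', u'km', u'.']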