From 33dfb4933c3c82c1ed60f7fff9228b44a945817e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Nov 2014 19:53:29 +1100 Subject: [PATCH 01/56] * Remove taggers from Language class. Work on doc strings --- spacy/lang.pyx | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 79a84e936..bc9677e6c 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -23,9 +23,6 @@ from . import util from .util import read_lang_data from .tokens import Tokens -from .tagger cimport Tagger -from .ner.greedy_parser cimport NERParser - cdef class Language: def __init__(self, name): @@ -42,12 +39,6 @@ cdef class Language: self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes')) self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings')) self._load_special_tokenization(rules) - if path.exists(path.join(util.DATA_DIR, name, 'pos')): - self.pos_tagger = Tagger(path.join(util.DATA_DIR, name, 'pos')) - else: - self.pos_tagger = None - if path.exists(path.join(util.DATA_DIR, name, 'ner')): - self.ner_tagger = NERParser(path.join(util.DATA_DIR, name, 'ner')) cpdef Tokens tokens_from_list(self, list strings): cdef int length = sum([len(s) for s in strings]) @@ -244,6 +235,10 @@ cdef class Language: cdef class Lexicon: + '''A map container for a language's Lexeme structs. + + Also interns UTF-8 strings, and maps them to consecutive integer IDs. + ''' def __init__(self): self.mem = Pool() self._dict = PreshMap(2 ** 20) @@ -252,6 +247,7 @@ cdef class Lexicon: self.size = 1 cdef Lexeme* get(self, String* string) except NULL: + '''Retrieve a pointer to a Lexeme from the lexicon.''' cdef Lexeme* lex lex = self._dict.get(string.key) if lex != NULL: @@ -266,6 +262,25 @@ cdef class Lexicon: return lex def __getitem__(self, id_or_string): + '''Retrieve a lexeme, given an int ID or a unicode string. If a previously + unseen unicode string is given, a new Lexeme is created and stored. + + This function relies on Cython's struct-to-dict conversion. Python clients + receive a dict keyed by strings (byte or unicode, depending on Python 2/3), + with int values. Cython clients can instead receive a Lexeme struct value. + More efficient Cython access is provided by Lexicon.get, which returns + a Lexeme*. + + Args: + id_or_string (int or unicode): The integer ID of a word, or its unicode + string. If an int >= Lexicon.size, IndexError is raised. + If id_or_string is neither an int nor a unicode string, ValueError + is raised. + + Returns: + lexeme (dict): A Lexeme struct instance, which Cython translates into + a dict if the operator is called from Python. + ''' if type(id_or_string) == int: return self.lexemes.at(id_or_string)[0] cdef String string From 3430d5f629cc40d48c4e65eb0d4338395f5d94a1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 1 Dec 2014 22:55:13 +1100 Subject: [PATCH 02/56] * Revise intro copy. Add NLTK comparison --- docs/source/index.rst | 167 +++++++++++++++++++++++++++++++++++------- 1 file changed, 140 insertions(+), 27 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 97681bfd8..dbadd9fc3 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -3,45 +3,158 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. +================================ spaCy NLP Tokenizer and Lexicon ================================ -spaCy is a library for industrial strength NLP in Python. 
Its core -values are: +spaCy is a library for industrial-strength NLP in Python and Cython. It +assumes that NLP is mostly about solving machine learning problems, and that +solving these problems is mostly about feature extraction. So, spaCy helps you +do feature extraction --- it helps you represent a linguistic context as +a vector of numbers. It's also a great way to create an inverted index, +particularly if you want to index documents on fancier properties. -* **Efficiency**: You won't find faster NLP tools. For shallow analysis, it's 10x - faster than Stanford Core NLP, and over 200x faster than NLTK. Its parser is - over 100x faster than Stanford's. +For commercial users, a trial license costs $0, with a one-time license fee of +$1,000 to use spaCy in production. For non-commercial users, a GPL license is +available. To quickly get the gist of the license terms, check out the license +user stories. -* **Accuracy**: All spaCy tools are within 0.5% of the current published - state-of-the-art, on both news and web text. NLP moves fast, so always check - the numbers --- and don't settle for tools that aren't backed by - rigorous recent evaluation. -* **Minimalism**: This isn't a library that covers 43 known algorithms to do X. You - get 1 --- the best one --- with a simple, low-level interface. This keeps the - code-base small and concrete. Our Python APIs use lists and - dictionaries, and our C/Cython APIs use arrays and simple structs. +Unique Lexicon-centric design +============================= + +spaCy takes care of all string-processing, efficiently and accurately. This +makes a night-and-day difference to your feature extraction code. +Instead of a list of strings, spaCy's tokenizer gives you references to feature-rich +lexeme objects: + + >>> from spacy.en import EN + >>> from spacy.feature_names import SIC, NORM, SHAPE, ASCIIED, PREFIX, SUFFIX, \ + LENGTH, CLUSTER, POS_TYPE, SENSE_TYPE, \ + IS_ALPHA, IS_ASCII, IS_DIGIT, IS_PUNCT, IS_SPACE, IS_TITLE, IS_UPPER, \ + LIKE_URL, LIKE_NUMBER + >>> feats = ( + SIC, # ID of the original word form + NORM, # ID of the normalized word form + CLUSTER, # ID of the word's Brown cluster + IS_TITLE, # Was the word title-cased? + POS_TYPE # A cluster ID describing what POS tags the word is usually assigned + ) + >>> tokens = EN.tokenize(u'Split words, punctuation, emoticons etc.! ^_^') + >>> tokens.to_strings() + [u'Split', u'words', u',', u'punctuation', u',', u'emoticons', u'etc.', u'!', u'^_^'] + >>> tokens.to_array(feats)[:5] + array([[ 1, 2, 3, 4], + [...], + [...], + [...]]) + + +spaCy is designed to **make the right thing easy**, where the right thing is to: + +* **Use rich distributional and orthographic features**. Without these, your model + will be very brittle and domain dependent. + +* **Compute features per type, not per token**. Because of Zipf's law, you can + expect this to be exponentially more efficient. + +* **Minimize string processing**, and instead compute with arrays of ID ints. -Comparison ----------- +Comparison with NLTK +==================== -+----------------+-------------+--------+---------------+--------------+ -| Tokenize & Tag | Speed (w/s) | Memory | % Acc. (news) | % Acc. 
(web) | -+----------------+-------------+--------+---------------+--------------+ -| spaCy | 107,000 | 1.3gb | 96.7 | | -+----------------+-------------+--------+---------------+--------------+ -| Stanford | 8,000 | 1.5gb | 96.7 | | -+----------------+-------------+--------+---------------+--------------+ -| NLTK | 543 | 61mb | 94.0 | | -+----------------+-------------+--------+---------------+--------------+ +`NLTK `_ provides interfaces to a wide-variety of NLP +tools and resources, and its own implementations of a few algorithms. It comes +with comprehensive documentation, and a book introducing concepts in NLP. For +these reasons, it's very widely known. However, if you're trying to make money +or do cutting-edge research, NLTK is not a good choice. + +The `list of stuff in NLTK `_ looks impressive, +but almost none of it is useful for real work. You're not going to make any money, +or do top research, by using the NLTK chat bots, theorem provers, toy CCG implementation, +etc. Most of NLTK is there to assist in the explanation ideas in computational +linguistics, at roughly an undergraduate level. +But it also claims to support serious work, by wrapping external tools. + +In a pretty well known essay, Joel Spolsky discusses the pain of dealing with +`leaky abstractions `_. +An abstraction tells you to not care about implementation +details, but sometimes the implementation matters after all. When it +does, you have to waste time revising your assumptions. + +NLTK's wrappers call external tools via subprocesses, and wrap this up so +that it looks like a native API. This abstraction leaks *a lot*. The system +calls impose far more overhead than a normal Python function call, which makes +the most natural way to program against the API infeasible. + + +Case study: POS tagging +----------------------- + +Here's a quick comparison of the following POS taggers: + +* **Stanford (CLI)**: The Stanford POS tagger, invoked once as a batch process + from the command-line; +* **nltk.tag.stanford**: The Stanford tagger, invoked document-by-document via + NLTK's wrapper; +* **nltk.pos_tag**: NLTK's own POS tagger, invoked document-by-document. +* **spacy.en.pos_tag**: spaCy's POS tagger, invoked document-by-document. + + ++-------------------+-------------+--------+ +| System | Speed (w/s) | % Acc. | ++-------------------+-------------+--------+ +| spaCy | 107,000 | 96.7 | ++-------------------+-------------+--------+ +| Stanford (CLI) | 8,000 | 96.7 | ++-------------------+-------------+--------+ +| nltk.pos_tag | 543 | 94.0 | ++-------------------+-------------+--------+ +| nltk.tag.stanford | 209 | 96.7 | ++-------------------+-------------+--------+ + +Experimental details here. Three things are apparent from this comparison: + +1. The native NLTK tagger, nltk.pos_tag, is both slow and inaccurate; + +2. Calling the Stanford tagger document-by-document via NLTK is **40x** slower + than invoking the model once as a batch process, via the command-line; + +3. spaCy is over 10x faster than the Stanford tagger, even when called + **sentence-by-sentence**. + +The problem is that NLTK simply wraps the command-line +interfaces of these tools, so communication is via a subprocess. NLTK does not +even hold open a pipe for you --- the model is reloaded, again and again. + +To use the wrapper effectively, you should batch up your text as much as possible. +This probably isn't how you would like to structure your pipeline, and you +might not be able to batch up much text at all, e.g. 
if serving a single +request means processing a single document. +Technically, NLTK does give you Python functions to access lots of different +systems --- but, you can't use them as you would expect to use a normal Python +function. The abstraction leaks. + +Here's the bottom-line: the Stanford tools are written in Java, so using them +from Python sucks. You shouldn't settle for this. It's a problem that springs +purely from the tooling, rather than the domain. + +Summary +------- + +NLTK is a well-known Python library for NLP, but for the important bits, you +don't get actual Python modules. You get wrappers which throw to external +tools, via subprocesses. This is not at all the same thing. + +spaCy is implemented in Cython, just like numpy, scikit-learn, lxml and other +high-performance Python libraries. So you get a native Python API, but the +performance you expect from a program written in C. .. toctree:: :hidden: :maxdepth: 3 + + features.rst - what/index.rst - why/index.rst - how/index.rst From ea19850a69291df678aefc4bab4deb7a143aa42b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 2 Dec 2014 04:39:12 +1100 Subject: [PATCH 03/56] * Add tokenizer section --- docs/source/index.rst | 74 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 69 insertions(+), 5 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index dbadd9fc3..b0dd08417 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -8,11 +8,11 @@ spaCy NLP Tokenizer and Lexicon ================================ spaCy is a library for industrial-strength NLP in Python and Cython. It -assumes that NLP is mostly about solving machine learning problems, and that +assumes that NLP is mostly about solving large machine learning problems, and that solving these problems is mostly about feature extraction. So, spaCy helps you -do feature extraction --- it helps you represent a linguistic context as -a vector of numbers. It's also a great way to create an inverted index, -particularly if you want to index documents on fancier properties. +do feature extraction --- it includes an excellent set of distributional and +orthographic features, memoizes them efficiently, and maps strings to +consecutive integer values. For commercial users, a trial license costs $0, with a one-time license fee of $1,000 to use spaCy in production. For non-commercial users, a GPL license is @@ -20,6 +20,70 @@ available. To quickly get the gist of the license terms, check out the license user stories. +Tokenization done right +======================= + +Most tokenizers rely on complicated regular expressions. Often, they leave you +with no way to align the tokens back to the original string --- a vital feature +if you want to display some mark-up, such as spelling correction. The regular +expressions also interact, making it hard to accommodate special cases. + +spaCy introduces a **novel tokenization algorithm** that's much faster and much +more flexible: + +.. code-block:: python + + def tokenize(string, prefixes={}, suffixes={}, specials={}): + '''Sketch of spaCy's tokenization algorithm.''' + tokens = [] + cache = {} + for chunk in string.split(): + # Because of Zipf's law, the cache serves the majority of "chunks". + if chunk in cache: + tokens.extend(cache[chunl]) + continue + key = chunk + + subtokens = [] + # Process a chunk by splitting off prefixes e.g. ( " { and suffixes e.g. , . : + # If we split one off, check whether we're left with a special-case, + # e.g. 
contractions (can't, won't, etc), emoticons, abbreviations, etc. + # This makes the tokenization easy to update and customize. + while chunk: + prefix, chunk = _consume_prefix(chunk, prefixes) + if prefix: + subtokens.append(prefix) + if chunk in specials: + subtokens.extend(specials[chunk]) + break + suffix, chunk = _consume_suffix(chunk, suffixes) + if suffix: + subtokens.append(suffix) + if chunk in specials: + subtokens.extend(specials[chunk]) + break + cache[key] = subtokens + +Your data is going to have its own quirks, so it's really useful to have +a tokenizer you can easily control. To see the limitations of the standard +regex-based approach, check out `CMU's recent work on tokenizing tweets `_. Despite a lot of careful attention, they can't handle all of their +known emoticons correctly --- doing so would interfere with the way they +process other punctuation. This isn't a problem for spaCy: we just add them +all to the special tokenization rules. + +spaCy's tokenizer is also incredibly efficient: + ++--------+---------------+--------------+ +| System | Tokens/second | Speed Factor | ++--------+---------------+--------------+ +| NLTK | 89 000 | 1.00 | ++--------+---------------+--------------+ +| spaCy | 3 093 000 | 38.30 | ++--------+---------------+--------------+ + +spaCy can create an inverted index of the 1.8 billion word Gigaword corpus, +keyed by lemmas, in under half an hour --- on a Macbook Air. + Unique Lexicon-centric design ============================= @@ -114,7 +178,7 @@ Here's a quick comparison of the following POS taggers: | nltk.tag.stanford | 209 | 96.7 | +-------------------+-------------+--------+ -Experimental details here. Three things are apparent from this comparison: +Experimental details TODO. Three things are apparent from this comparison: 1. The native NLTK tagger, nltk.pos_tag, is both slow and inaccurate; From 2ee8a1e61fcf295b83129cee5ec3f402239fd911 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 2 Dec 2014 15:20:18 +1100 Subject: [PATCH 04/56] * Make intro chattier, explain philosophy better --- docs/source/index.rst | 106 +++++++++++++++++++++--------------------- 1 file changed, 54 insertions(+), 52 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index b0dd08417..808455fd0 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -7,19 +7,59 @@ spaCy NLP Tokenizer and Lexicon ================================ -spaCy is a library for industrial-strength NLP in Python and Cython. It -assumes that NLP is mostly about solving large machine learning problems, and that -solving these problems is mostly about feature extraction. So, spaCy helps you -do feature extraction --- it includes an excellent set of distributional and -orthographic features, memoizes them efficiently, and maps strings to -consecutive integer values. +spaCy is a library for industrial-strength NLP in Python and Cython. spaCy's +take on NLP is that it's mostly about feature extraction --- that's the part +that's specific to NLP, so that's what an NLP library should focus on. +It should tell you what the current best-practice is, and help you do exactly +that, quickly and efficiently. -For commercial users, a trial license costs $0, with a one-time license fee of -$1,000 to use spaCy in production. For non-commercial users, a GPL license is -available. To quickly get the gist of the license terms, check out the license -user stories. +Best-practice is to **use lots of large lexicons**. Let's say you hit the word +*belieber* in production. 
What will your system know about this word? A bad +system will only know things about the words in its training corpus, which +probably consists of texts written before Justin Bieber was even born. +It doesn't have to be like that. +Unique Lexicon-centric design +============================= + +spaCy helps you build models that generalise better, by making it easy to use +more robust features. Instead of a list of strings, the tokenizer returns +references to rich lexical types. Its tokenizer returns sequence of references +to rich lexical types. Features which ask about the word's Brown cluster, its +typical part-of-speech tag, how it's usually cased etc require no extra effort: + + >>> from spacy.en import EN + >>> from spacy.feature_names import * + >>> feats = ( + SIC, # ID of the original word form + NORM, # ID of the normalized word form + CLUSTER, # ID of the word's Brown cluster + IS_TITLE, # Was the word title-cased? + POS_TYPE # A cluster ID describing what POS tags the word is usually assigned + ) + >>> tokens = EN.tokenize(u'Split words, punctuation, emoticons etc.! ^_^') + >>> tokens.to_array(feats)[:5] + array([[ 1, 2, 3, 4], + [...], + [...], + [...]]) + + +spaCy is designed to **make the right thing easy**, where the right thing is to: + +* **Use rich distributional and orthographic features**. Without these, your model + will be very brittle and domain dependent. + +* **Compute features per type, not per token**. Because of Zipf's law, you can + expect this to be exponentially more efficient. + +* **Minimize string processing**, and instead compute with arrays of ID ints. + +For the current list of lexical features, see `Lexical Features`_. + +.. _lexical features: features.html + Tokenization done right ======================= @@ -82,48 +122,10 @@ spaCy's tokenizer is also incredibly efficient: +--------+---------------+--------------+ spaCy can create an inverted index of the 1.8 billion word Gigaword corpus, -keyed by lemmas, in under half an hour --- on a Macbook Air. +in under half an hour --- on a Macbook Air. See the `inverted +index tutorial`_. -Unique Lexicon-centric design -============================= - -spaCy takes care of all string-processing, efficiently and accurately. This -makes a night-and-day difference to your feature extraction code. -Instead of a list of strings, spaCy's tokenizer gives you references to feature-rich -lexeme objects: - - >>> from spacy.en import EN - >>> from spacy.feature_names import SIC, NORM, SHAPE, ASCIIED, PREFIX, SUFFIX, \ - LENGTH, CLUSTER, POS_TYPE, SENSE_TYPE, \ - IS_ALPHA, IS_ASCII, IS_DIGIT, IS_PUNCT, IS_SPACE, IS_TITLE, IS_UPPER, \ - LIKE_URL, LIKE_NUMBER - >>> feats = ( - SIC, # ID of the original word form - NORM, # ID of the normalized word form - CLUSTER, # ID of the word's Brown cluster - IS_TITLE, # Was the word title-cased? - POS_TYPE # A cluster ID describing what POS tags the word is usually assigned - ) - >>> tokens = EN.tokenize(u'Split words, punctuation, emoticons etc.! ^_^') - >>> tokens.to_strings() - [u'Split', u'words', u',', u'punctuation', u',', u'emoticons', u'etc.', u'!', u'^_^'] - >>> tokens.to_array(feats)[:5] - array([[ 1, 2, 3, 4], - [...], - [...], - [...]]) - - -spaCy is designed to **make the right thing easy**, where the right thing is to: - -* **Use rich distributional and orthographic features**. Without these, your model - will be very brittle and domain dependent. - -* **Compute features per type, not per token**. 
Because of Zipf's law, you can - expect this to be exponentially more efficient. - -* **Minimize string processing**, and instead compute with arrays of ID ints. - +.. _inverted index tutorial: index_tutorial.html Comparison with NLTK ==================== @@ -221,4 +223,4 @@ performance you expect from a program written in C. :maxdepth: 3 features.rst - + license_stories.rst From 8c2938fe0157af0368a7eaf2671106702232e018 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 2 Dec 2014 23:46:59 +1100 Subject: [PATCH 05/56] * Rename Lexicon._dict to Lexicon._map --- spacy/lang.pxd | 15 +++------------ spacy/lang.pyx | 14 +++++++------- 2 files changed, 10 insertions(+), 19 deletions(-) diff --git a/spacy/lang.pxd b/spacy/lang.pxd index 68f1ee58a..dc3262771 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -1,5 +1,7 @@ from libcpp.vector cimport vector +from cpython cimport Py_UNICODE_ISSPACE, Py_UNICODE_ISALPHA, Py_UNICODE_ISUPPER + from preshed.maps cimport PreshMap from cymem.cymem cimport Pool @@ -7,17 +9,9 @@ from .typedefs cimport hash_t from .tokens cimport Tokens from .lexeme cimport Lexeme from .tagger cimport Tagger -from .ner.greedy_parser cimport NERParser from .utf8string cimport StringStore -cdef extern from "Python.h": - cdef bint Py_UNICODE_ISSPACE(Py_UNICODE ch) - cdef bint Py_UNICODE_ISALNUM(Py_UNICODE ch) - cdef bint Py_UNICODE_ISALPHA(Py_UNICODE ch) - cdef bint Py_UNICODE_ISUPPER(Py_UNICODE ch) - - cdef struct String: Py_UNICODE* chars size_t n @@ -32,7 +26,7 @@ cdef class Lexicon: cdef Lexeme* get(self, String* s) except NULL - cdef PreshMap _dict + cdef PreshMap _map cdef class Language: @@ -42,9 +36,6 @@ cdef class Language: cdef PreshMap _specials cpdef readonly Lexicon lexicon - cpdef readonly Tagger pos_tagger - cpdef readonly NERParser ner_tagger - cdef object _prefix_re cdef object _suffix_re cdef object _infix_re diff --git a/spacy/lang.pyx b/spacy/lang.pyx index bc9677e6c..df9cf3166 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -241,7 +241,7 @@ cdef class Lexicon: ''' def __init__(self): self.mem = Pool() - self._dict = PreshMap(2 ** 20) + self._map = PreshMap(2 ** 20) self.strings = StringStore() self.lexemes.push_back(&EMPTY_LEXEME) self.size = 1 @@ -249,12 +249,12 @@ cdef class Lexicon: cdef Lexeme* get(self, String* string) except NULL: '''Retrieve a pointer to a Lexeme from the lexicon.''' cdef Lexeme* lex - lex = self._dict.get(string.key) + lex = self._map.get(string.key) if lex != NULL: return lex lex = self.mem.alloc(sizeof(Lexeme), 1) lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key, self.strings, {}) - self._dict.set(string.key, lex) + self._map.set(string.key, lex) while self.lexemes.size() < (lex.id + 1): self.lexemes.push_back(&EMPTY_LEXEME) self.lexemes[lex.id] = lex @@ -302,11 +302,11 @@ cdef class Lexicon: assert fp != NULL cdef size_t st cdef hash_t key - for i in range(self._dict.length): - key = self._dict.c_map.cells[i].key + for i in range(self._map.length): + key = self._map.c_map.cells[i].key if key == 0: continue - lexeme = self._dict.c_map.cells[i].value + lexeme = self._map.c_map.cells[i].value st = fwrite(&key, sizeof(key), 1, fp) assert st == 1 st = fwrite(lexeme, sizeof(Lexeme), 1, fp) @@ -331,7 +331,7 @@ cdef class Lexicon: st = fread(lexeme, sizeof(Lexeme), 1, fp) if st != 1: break - self._dict.set(key, lexeme) + self._map.set(key, lexeme) while self.lexemes.size() < (lexeme.id + 1): self.lexemes.push_back(&EMPTY_LEXEME) self.lexemes[lexeme.id] = lexeme From 522bb0346e038c475c16f94109f302ba0df3c2bb 
Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 2 Dec 2014 23:48:05 +1100 Subject: [PATCH 06/56] * Work on get_array method of Tokens --- spacy/tokens.pxd | 5 +++++ spacy/tokens.pyx | 10 ++++++++++ 2 files changed, 15 insertions(+) diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index d1b2ef10b..36dee698e 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -1,3 +1,6 @@ +import numpy as np +cimport numpy as np + from cymem.cymem cimport Pool from .lexeme cimport Lexeme @@ -28,6 +31,8 @@ cdef class Tokens: cdef int push_back(self, int i, Lexeme* lexeme) except -1 cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1 + cpdef np.ndarray[atom_t, ndim=2] get_array(self, list features) + cdef class Token: cdef StringStore _string_store diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 721e6bb80..ba8812f2e 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -102,6 +102,16 @@ cdef class Tokens: elif tag_type == ENTITY: self.ner[i] = tag + cpdef np.ndarray[atom_t, ndim=2] get_array(self, list features): + cdef int i, j + cdef np.ndarray[atom_t, ndim=2] output + output = np.ndarray(shape=(self.length, len(features)), dtype=int) + for i in range(self.length): + for j, feature in enumerate(features): + output[i, j] = self.lex[i].sic + #output[i, j] = lexeme_get_feature(self.lex[i], feature) + return output + def _realloc(self, new_size): self.max_length = new_size n = new_size + (PADDING * 2) From 14097311ae77afdbe46dc859b23a6da9bf61a124 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 3 Dec 2014 01:33:20 +1100 Subject: [PATCH 07/56] * Make StringStore.__getitem__ accept unicode-typed keys. --- spacy/utf8string.pxd | 2 +- spacy/utf8string.pyx | 16 ++++++++++------ tests/test_intern.py | 8 ++++++-- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/spacy/utf8string.pxd b/spacy/utf8string.pxd index 82ae50022..16488b899 100644 --- a/spacy/utf8string.pxd +++ b/spacy/utf8string.pxd @@ -13,7 +13,7 @@ cdef struct Utf8Str: cdef class StringStore: cdef Pool mem - cdef PreshMap table + cdef PreshMap _map cdef Utf8Str* strings cdef int size cdef int _resize_at diff --git a/spacy/utf8string.pyx b/spacy/utf8string.pyx index 18d4a4e5e..426b531f4 100644 --- a/spacy/utf8string.pyx +++ b/spacy/utf8string.pyx @@ -8,7 +8,7 @@ SEPARATOR = '\n|-SEP-|\n' cdef class StringStore: def __init__(self): self.mem = Pool() - self.table = PreshMap() + self._map = PreshMap() self._resize_at = 10000 self.strings = self.mem.alloc(self._resize_at, sizeof(Utf8Str)) self.size = 1 @@ -17,17 +17,21 @@ cdef class StringStore: def __get__(self): return self.size-1 - def __getitem__(self, string_or_id): + def __getitem__(self, object string_or_id): cdef bytes byte_string cdef Utf8Str* utf8str - if type(string_or_id) == int or type(string_or_id) == long: + if isinstance(string_or_id, int): if string_or_id < 1 or string_or_id >= self.size: raise IndexError(string_or_id) utf8str = &self.strings[string_or_id] return utf8str.chars[:utf8str.length] - elif type(string_or_id) == bytes: + elif isinstance(string_or_id, bytes): utf8str = self.intern(string_or_id, len(string_or_id)) return utf8str.i + elif isinstance(string_or_id, unicode): + byte_string = string_or_id.encode('utf8') + utf8str = self.intern(byte_string, len(byte_string)) + return utf8str.i else: raise TypeError(type(string_or_id)) @@ -36,7 +40,7 @@ cdef class StringStore: # slot 0 to simplify the code, because it doesn't matter. 
assert length != 0 cdef hash_t key = hash64(chars, length * sizeof(char), 0) - cdef void* value = self.table.get(key) + cdef void* value = self._map.get(key) cdef size_t i if value == NULL: if self.size == self._resize_at: @@ -48,7 +52,7 @@ cdef class StringStore: self.strings[i].chars = self.mem.alloc(length, sizeof(char)) memcpy(self.strings[i].chars, chars, length) self.strings[i].length = length - self.table.set(key, self.size) + self._map.set(key, self.size) self.size += 1 else: i = value diff --git a/tests/test_intern.py b/tests/test_intern.py index 63b4b3433..a7a801b05 100644 --- a/tests/test_intern.py +++ b/tests/test_intern.py @@ -19,8 +19,12 @@ def test_save_bytes(sstore): def test_save_unicode(sstore): - with pytest.raises(TypeError): - A_i = sstore['A'] + Hello_i = sstore[u'Hello'] + assert Hello_i == 1 + assert sstore[u'Hello'] == 1 + assert sstore[u'goodbye'] != Hello_i + assert sstore[u'hello'] != Hello_i + assert Hello_i == 1 def test_zero_id(sstore): From 71b009e3232d923b8719a21d57ddc6ce3ba63c5c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 3 Dec 2014 11:02:24 +1100 Subject: [PATCH 08/56] * Fix bug in refactored StringStore.__getitem__ --- spacy/utf8string.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/utf8string.pyx b/spacy/utf8string.pyx index 426b531f4..0384a150c 100644 --- a/spacy/utf8string.pyx +++ b/spacy/utf8string.pyx @@ -5,6 +5,7 @@ import codecs SEPARATOR = '\n|-SEP-|\n' + cdef class StringStore: def __init__(self): self.mem = Pool() @@ -20,7 +21,7 @@ cdef class StringStore: def __getitem__(self, object string_or_id): cdef bytes byte_string cdef Utf8Str* utf8str - if isinstance(string_or_id, int): + if isinstance(string_or_id, int) or isinstance(string_or_id, long): if string_or_id < 1 or string_or_id >= self.size: raise IndexError(string_or_id) utf8str = &self.strings[string_or_id] From b463a7eb8632663fded2d21591b36d78e00d8242 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 3 Dec 2014 11:04:00 +1100 Subject: [PATCH 09/56] * Make flag-setting a language-specific thing --- spacy/en.pxd | 26 +++++++++ spacy/en.pyx | 17 +++++- spacy/lang.pxd | 18 ++---- spacy/lang.pyx | 79 ++++++++++++-------------- spacy/lexeme.pxd | 142 +++++++++++++++++++++++++++++++++-------------- spacy/lexeme.pyx | 88 +++++++++++++---------------- 6 files changed, 224 insertions(+), 146 deletions(-) diff --git a/spacy/en.pxd b/spacy/en.pxd index a7c643eba..cccfb60a8 100644 --- a/spacy/en.pxd +++ b/spacy/en.pxd @@ -1,6 +1,32 @@ from spacy.lang cimport Language from spacy.tokens cimport Tokens +# Flags +cpdef enum FlagID: + IS_ALPHA + IS_ASCII + IS_DIGIT + IS_LOWER + IS_PUNCT + IS_SPACE + IS_TITLE + IS_UPPER + + LIKE_URL + LIKE_NUMBER + + OFT_LOWER + OFT_TITLE + OFT_UPPER + + IN_MALES + IN_FEMALES + IN_SURNAMES + IN_PLACES + IN_GAMES + IN_CELEBS + IN_NAMES + cdef class English(Language): pass diff --git a/spacy/en.pyx b/spacy/en.pyx index 95c1cbd94..92be97aad 100644 --- a/spacy/en.pyx +++ b/spacy/en.pyx @@ -38,6 +38,8 @@ provides a fully Penn Treebank 3-compliant tokenizer. from __future__ import unicode_literals cimport lang +from .typedefs cimport flags_t +import orth cdef class English(Language): @@ -47,7 +49,20 @@ cdef class English(Language): name (unicode): The two letter code used by Wikipedia for the language. lexicon (Lexicon): The lexicon. Exposes the lookup method. 
""" - pass + def set_flags(self, unicode string): + cdef flags_t flags = 0 + flags |= orth.is_alpha(string) << IS_ALPHA + flags |= orth.is_ascii(string) << IS_ASCII + flags |= orth.is_digit(string) << IS_DIGIT + flags |= orth.is_lower(string) << IS_LOWER + flags |= orth.is_punct(string) << IS_PUNCT + flags |= orth.is_space(string) << IS_SPACE + flags |= orth.is_title(string) << IS_TITLE + flags |= orth.is_upper(string) << IS_UPPER + + flags |= orth.like_url(string) << LIKE_URL + flags |= orth.like_number(string) << LIKE_NUMBER + return flags EN = English('en') diff --git a/spacy/lang.pxd b/spacy/lang.pxd index dc3262771..9e4bc7b5d 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -8,23 +8,17 @@ from cymem.cymem cimport Pool from .typedefs cimport hash_t from .tokens cimport Tokens from .lexeme cimport Lexeme -from .tagger cimport Tagger -from .utf8string cimport StringStore - - -cdef struct String: - Py_UNICODE* chars - size_t n - hash_t key +from .utf8string cimport StringStore, UniStr cdef class Lexicon: + cpdef public set_flags cdef Pool mem cpdef readonly size_t size cpdef readonly StringStore strings cdef vector[Lexeme*] lexemes - cdef Lexeme* get(self, String* s) except NULL + cdef Lexeme* get(self, UniStr* s) except NULL cdef PreshMap _map @@ -43,10 +37,10 @@ cdef class Language: cpdef Tokens tokens_from_list(self, list strings) cpdef Tokens tokenize(self, unicode text) - cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1 - cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes, + cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1 + cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except NULL - cdef int _attach_tokens(self, Tokens tokens, int idx, String* string, + cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string, vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except -1 cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1 cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1 diff --git a/spacy/lang.pyx b/spacy/lang.pyx index df9cf3166..2a284b9df 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -19,6 +19,8 @@ from .lexeme cimport Lexeme from .lexeme cimport EMPTY_LEXEME from .lexeme cimport init as lexeme_init +from .utf8string cimport slice_unicode + from . 
import util from .util import read_lang_data from .tokens import Tokens @@ -34,7 +36,7 @@ cdef class Language: self._prefix_re = re.compile(prefix) self._suffix_re = re.compile(suffix) self._infix_re = re.compile(infix) - self.lexicon = Lexicon() + self.lexicon = Lexicon(self.set_flags) if path.exists(path.join(util.DATA_DIR, name, 'lexemes')): self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes')) self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings')) @@ -45,11 +47,11 @@ cdef class Language: cdef Tokens tokens = Tokens(self.lexicon.strings, length) if length == 0: return tokens - cdef String string_struct + cdef UniStr string_struct cdef unicode py_string cdef int idx = 0 for i, py_string in enumerate(strings): - string_from_unicode(&string_struct, py_string) + slice_unicode(&string_struct, py_string, 0, len(py_string)) tokens.push_back(idx, self.lexicon.get(&string_struct)) idx += len(py_string) + 1 return tokens @@ -77,11 +79,11 @@ cdef class Language: cdef int start = 0 cdef Py_UNICODE* chars = string cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0]) - cdef String span + cdef UniStr span for i in range(1, length): if Py_UNICODE_ISSPACE(chars[i]) != in_ws: if start < i: - string_slice(&span, chars, start, i) + slice_unicode(&span, chars, start, i) lexemes = self._cache.get(span.key) if lexemes != NULL: tokens.extend(start, lexemes, 0) @@ -93,7 +95,7 @@ cdef class Language: start += 1 i += 1 if start < i: - string_slice(&span, chars, start, i) + slice_unicode(&span, chars, start, i) lexemes = self._cache.get(span.key) if lexemes != NULL: tokens.extend(start, lexemes, 0) @@ -101,7 +103,7 @@ cdef class Language: self._tokenize(tokens, &span, start, i) return tokens - cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1: + cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1: cdef vector[Lexeme*] prefixes cdef vector[Lexeme*] suffixes cdef hash_t orig_key @@ -112,20 +114,20 @@ cdef class Language: self._attach_tokens(tokens, start, span, &prefixes, &suffixes) self._save_cached(&tokens.lex[orig_size], orig_key, tokens.length - orig_size) - cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes, + cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except NULL: cdef size_t i - cdef String prefix - cdef String suffix - cdef String minus_pre - cdef String minus_suf + cdef UniStr prefix + cdef UniStr suffix + cdef UniStr minus_pre + cdef UniStr minus_suf cdef size_t last_size = 0 while string.n != 0 and string.n != last_size: last_size = string.n pre_len = self._find_prefix(string.chars, string.n) if pre_len != 0: - string_slice(&prefix, string.chars, 0, pre_len) - string_slice(&minus_pre, string.chars, pre_len, string.n) + slice_unicode(&prefix, string.chars, 0, pre_len) + slice_unicode(&minus_pre, string.chars, pre_len, string.n) # Check whether we've hit a special-case if minus_pre.n >= 1 and self._specials.get(minus_pre.key) != NULL: string[0] = minus_pre @@ -133,15 +135,15 @@ cdef class Language: break suf_len = self._find_suffix(string.chars, string.n) if suf_len != 0: - string_slice(&suffix, string.chars, string.n - suf_len, string.n) - string_slice(&minus_suf, string.chars, 0, string.n - suf_len) + slice_unicode(&suffix, string.chars, string.n - suf_len, string.n) + slice_unicode(&minus_suf, string.chars, 0, string.n - suf_len) # Check whether we've hit a special-case if minus_suf.n >= 1 and self._specials.get(minus_suf.key) != NULL: 
string[0] = minus_suf suffixes.push_back(self.lexicon.get(&suffix)) break if pre_len and suf_len and (pre_len + suf_len) <= string.n: - string_slice(string, string.chars, pre_len, string.n - suf_len) + slice_unicode(string, string.chars, pre_len, string.n - suf_len) prefixes.push_back(self.lexicon.get(&prefix)) suffixes.push_back(self.lexicon.get(&suffix)) elif pre_len: @@ -155,13 +157,13 @@ cdef class Language: return string cdef int _attach_tokens(self, Tokens tokens, - int idx, String* string, + int idx, UniStr* string, vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except -1: cdef int split cdef Lexeme** lexemes cdef Lexeme* lexeme - cdef String span + cdef UniStr span if prefixes.size(): idx = tokens.extend(idx, prefixes.data(), prefixes.size()) if string.n != 0: @@ -174,11 +176,11 @@ cdef class Language: if split == 0 or split == -1: idx = tokens.push_back(idx, self.lexicon.get(string)) else: - string_slice(&span, string.chars, 0, split) + slice_unicode(&span, string.chars, 0, split) idx = tokens.push_back(idx, self.lexicon.get(&span)) - string_slice(&span, string.chars, split, split+1) + slice_unicode(&span, string.chars, split, split+1) idx = tokens.push_back(idx, self.lexicon.get(&span)) - string_slice(&span, string.chars, split + 1, string.n) + slice_unicode(&span, string.chars, split + 1, string.n) idx = tokens.push_back(idx, self.lexicon.get(&span)) cdef vector[Lexeme*].reverse_iterator it = suffixes.rbegin() while it != suffixes.rend(): @@ -222,14 +224,14 @@ cdef class Language: ''' cdef Lexeme** lexemes cdef hash_t hashed - cdef String string + cdef UniStr string for uni_string, substrings in token_rules: lexemes = self.mem.alloc(len(substrings) + 1, sizeof(Lexeme*)) for i, substring in enumerate(substrings): - string_from_unicode(&string, substring) + slice_unicode(&string, substring, 0, len(substring)) lexemes[i] = self.lexicon.get(&string) lexemes[i + 1] = NULL - string_from_unicode(&string, uni_string) + slice_unicode(&string, uni_string, 0, len(uni_string)) self._specials.set(string.key, lexemes) self._cache.set(string.key, lexemes) @@ -239,21 +241,23 @@ cdef class Lexicon: Also interns UTF-8 strings, and maps them to consecutive integer IDs. 
''' - def __init__(self): + def __init__(self, object set_flags=None): self.mem = Pool() self._map = PreshMap(2 ** 20) self.strings = StringStore() self.lexemes.push_back(&EMPTY_LEXEME) self.size = 1 + self.set_flags = set_flags - cdef Lexeme* get(self, String* string) except NULL: + cdef Lexeme* get(self, UniStr* string) except NULL: '''Retrieve a pointer to a Lexeme from the lexicon.''' cdef Lexeme* lex lex = self._map.get(string.key) if lex != NULL: return lex lex = self.mem.alloc(sizeof(Lexeme), 1) - lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key, self.strings, {}) + lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key, + self.strings, {'flags': self.set_flags(string.chars[:string.n])}) self._map.set(string.key, lex) while self.lexemes.size() < (lex.id + 1): self.lexemes.push_back(&EMPTY_LEXEME) @@ -283,14 +287,14 @@ cdef class Lexicon: ''' if type(id_or_string) == int: return self.lexemes.at(id_or_string)[0] - cdef String string - string_from_unicode(&string, id_or_string) + cdef UniStr string + slice_unicode(&string, id_or_string, 0, len(id_or_string)) cdef Lexeme* lexeme = self.get(&string) return lexeme[0] def __setitem__(self, unicode uni_string, dict props): - cdef String s - string_from_unicode(&s, uni_string) + cdef UniStr s + slice_unicode(&s, uni_string, 0, len(uni_string)) cdef Lexeme* lex = self.get(&s) lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props) @@ -338,14 +342,3 @@ cdef class Lexicon: i += 1 self.size += 1 fclose(fp) - - -cdef void string_from_unicode(String* s, unicode uni): - cdef Py_UNICODE* c_uni = uni - string_slice(s, c_uni, 0, len(uni)) - - -cdef inline void string_slice(String* s, Py_UNICODE* chars, int start, int end) nogil: - s.chars = &chars[start] - s.n = end - start - s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0) diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 0d7d206e5..9d5dddd6d 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -1,61 +1,119 @@ -from .typedefs cimport hash_t, utf8_t, flag_t, id_t, len_t, tag_t +from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t from .utf8string cimport StringStore -from libc.stdint cimport uint16_t -cpdef flag_t OOV_DIST_FLAGS -# Flags -cpdef enum: - IS_ALPHA - IS_ASCII - IS_DIGIT - IS_LOWER - IS_PUNCT - IS_SPACE - IS_TITLE - IS_UPPER +# Reserve 64 values for flag features +cpdef enum attr_id_t: + FLAG0 + FLAG1 + FLAG2 + FLAG3 + FLAG4 + FLAG5 + FLAG6 + FLAG7 + FLAG8 + FLAG9 + FLAG10 + FLAG11 + FLAG12 + FLAG13 + FLAG14 + FLAG15 + FLAG16 + FLAG17 + FLAG18 + FLAG19 + FLAG20 + FLAG21 + FLAG22 + FLAG23 + FLAG24 + FLAG25 + FLAG26 + FLAG27 + FLAG28 + FLAG29 + FLAG30 + FLAG31 + FLAG32 + FLAG33 + FLAG34 + FLAG35 + FLAG36 + FLAG37 + FLAG38 + FLAG39 + FLAG40 + FLAG41 + FLAG42 + FLAG43 + FLAG44 + FLAG45 + FLAG46 + FLAG47 + FLAG48 + FLAG49 + FLAG50 + FLAG51 + FLAG52 + FLAG53 + FLAG54 + FLAG55 + FLAG56 + FLAG57 + FLAG58 + FLAG59 + FLAG60 + FLAG61 + FLAG62 + FLAG63 - LIKE_URL - LIKE_NUMBER + ID + SIC + NORM + SHAPE + ASCIIED + PREFIX + SUFFIX - OFT_LOWER - OFT_TITLE - OFT_UPPER - - IN_MALES - IN_FEMALES - IN_SURNAMES - IN_PLACES - IN_GAMES - IN_CELEBS - IN_NAMES + LENGTH + CLUSTER + POS_TYPE + SENSE_TYPE cdef struct Lexeme: - flag_t flags + flags_t flags - id_t id - id_t sic - id_t norm - id_t shape - id_t asciied - id_t prefix - id_t suffix + attr_t id + attr_t sic + attr_t norm + attr_t shape + attr_t asciied + attr_t prefix + attr_t suffix + + attr_t length + attr_t cluster + attr_t pos_type + attr_t sense_type float prob - - len_t length 
- tag_t cluster - tag_t postype - tag_t supersense + float upper_pc + float title_pc cdef Lexeme EMPTY_LEXEME -cpdef Lexeme init(id_t i, unicode string, hash_t hashed, - StringStore store, dict props) except * + +cpdef Lexeme init(id_t i, unicode string, hash_t hashed, StringStore store, + dict props) except * -cdef inline bint check_flag(Lexeme* lexeme, size_t flag_id) nogil: +cdef inline bint check_flag(Lexeme* lexeme, attr_id_t flag_id) nogil: return lexeme.flags & (1 << flag_id) + + +cdef attr_t get_attr(Lexeme* lex, attr_id_t attr_id) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 64eb699a6..888edc07b 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -6,67 +6,59 @@ from libc.string cimport memset import orth -from .utf8string cimport Utf8Str - -OOV_DIST_FLAGS = 0 memset(&EMPTY_LEXEME, 0, sizeof(Lexeme)) -def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc): - cdef flag_t flags = 0 - flags |= orth.is_alpha(string) << IS_ALPHA - flags |= orth.is_ascii(string) << IS_ASCII - flags |= orth.is_digit(string) << IS_DIGIT - flags |= orth.is_lower(string) << IS_LOWER - flags |= orth.is_punct(string) << IS_PUNCT - flags |= orth.is_space(string) << IS_SPACE - flags |= orth.is_title(string) << IS_TITLE - flags |= orth.is_upper(string) << IS_UPPER - - flags |= orth.like_url(string) << LIKE_URL - flags |= orth.like_number(string) << LIKE_NUMBER - return flags - - cpdef Lexeme init(id_t i, unicode string, hash_t hashed, - StringStore store, dict props) except *: + StringStore string_store, dict props) except *: cdef Lexeme lex lex.id = i lex.length = len(string) - lex.sic = get_string_id(string, store) + lex.sic = string_store[string] lex.cluster = props.get('cluster', 0) - lex.postype = props.get('postype', 0) - lex.supersense = props.get('supersense', 0) + lex.pos_type = props.get('pos_type', 0) + lex.sense_type = props.get('sense_type', 0) lex.prob = props.get('prob', 0) - cdef float upper_pc = props.get('upper_pc', 0.0) - cdef float lower_pc = props.get('lower_pc', 0.0) - cdef float title_pc = props.get('title_pc', 0.0) + lex.upper_pc = props.get('upper_pc', 0.0) + lex.title_pc = props.get('lower_pc', 0.0) - lex.prefix = get_string_id(string[0], store) - lex.suffix = get_string_id(string[-3:], store) - if upper_pc or lower_pc or title_pc: - canon_cased = orth.canon_case(string, upper_pc, title_pc, lower_pc) - lex.norm = get_string_id(canon_cased, store) - else: - lex.norm = lex.sic - lex.shape = get_string_id(orth.word_shape(string), store) - lex.asciied = get_string_id(orth.asciied(string), store) - lex.flags = get_flags(string, upper_pc, title_pc, lower_pc) - - lex.flags |= props.get('in_males', 0) << IN_MALES - lex.flags |= props.get('in_females', 0) << IN_FEMALES - lex.flags |= props.get('in_surnames', 0) << IN_SURNAMES - lex.flags |= props.get('in_places', 0) << IN_PLACES - lex.flags |= props.get('in_celebs', 0) << IN_CELEBS - lex.flags |= props.get('in_games', 0) << IN_GAMES - lex.flags |= props.get('in_names', 0) << IN_NAMES + lex.prefix = string_store[string[:1]] + lex.suffix = string_store[string[-3:]] + lex.norm = lex.sic # TODO + lex.shape = string_store[orth.word_shape(string)] + lex.asciied = string_store[orth.asciied(string)] + + lex.flags = props.get('flags', 0) return lex -cdef id_t get_string_id(unicode string, StringStore store) except 0: - cdef bytes byte_string = string.encode('utf8') - cdef Utf8Str* orig_str = store.intern(byte_string, len(byte_string)) - return orig_str.i +cdef attr_t get_attr(Lexeme* lex, attr_id_t feat_name): + if 
feat_name < (sizeof(flags_t) * 8): + return check_flag(lex, feat_name) + elif feat_name == ID: + return lex.id + elif feat_name == SIC: + return lex.sic + elif feat_name == NORM: + return lex.norm + elif feat_name == SHAPE: + return lex.shape + elif feat_name == ASCIIED: + return lex.asciied + elif feat_name == PREFIX: + return lex.prefix + elif feat_name == SUFFIX: + return lex.suffix + elif feat_name == LENGTH: + return lex.length + elif feat_name == CLUSTER: + return lex.cluster + elif feat_name == POS_TYPE: + return lex.pos_type + elif feat_name == SENSE_TYPE: + return lex.sense_type + else: + raise StandardError('Feature ID: %d not found' % feat_name) From e170faf5b0d949dde0beb35e02e790d9678f8b67 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 3 Dec 2014 11:05:15 +1100 Subject: [PATCH 10/56] * Hack Tokens to work without tagger.pyx --- spacy/tokens.pxd | 7 +++---- spacy/tokens.pyx | 13 +++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index 36dee698e..2c97a3163 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -4,9 +4,8 @@ cimport numpy as np from cymem.cymem cimport Pool from .lexeme cimport Lexeme -from .typedefs cimport flag_t +from .typedefs cimport flags_t from .utf8string cimport StringStore -from .tagger cimport TagType from thinc.typedefs cimport atom_t @@ -29,7 +28,7 @@ cdef class Tokens: cdef int extend(self, int i, Lexeme** lexemes, int n) except -1 cdef int push_back(self, int i, Lexeme* lexeme) except -1 - cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1 + cpdef int set_tag(self, int i, int tag_type, int tag) except -1 cpdef np.ndarray[atom_t, ndim=2] get_array(self, list features) @@ -56,4 +55,4 @@ cdef class Token: cdef public float prob - cdef public flag_t flags + cdef public flags_t flags diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index ba8812f2e..e8e016944 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -1,7 +1,9 @@ # cython: profile=True from .lexeme cimport * cimport cython -from .tagger cimport POS, ENTITY + +POS = 0 +ENTITY = 0 DEF PADDING = 5 @@ -96,7 +98,7 @@ cdef class Tokens: idx = self.push_back(idx, lexemes[i]) return idx - cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1: + cpdef int set_tag(self, int i, int tag_type, int tag) except -1: if tag_type == POS: self.pos[i] = tag elif tag_type == ENTITY: @@ -108,8 +110,7 @@ cdef class Tokens: output = np.ndarray(shape=(self.length, len(features)), dtype=int) for i in range(self.length): for j, feature in enumerate(features): - output[i, j] = self.lex[i].sic - #output[i, j] = lexeme_get_feature(self.lex[i], feature) + output[i, j] = get_attr(self.lex[i], feature) return output def _realloc(self, new_size): @@ -140,8 +141,8 @@ cdef class Token: self.cluster = lex['cluster'] self.length = lex['length'] - self.postype = lex['postype'] - self.sensetype = lex['supersense'] + self.postype = lex['pos_type'] + self.sensetype = lex['sense_type'] self.sic = lex['sic'] self.norm = lex['norm'] self.shape = lex['shape'] From e600f7b3275a6c976dcc67ff6f3fddf07a241777 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 3 Dec 2014 11:06:00 +1100 Subject: [PATCH 11/56] * Move String struct stuff into the utf8string module, from spacy.lang --- spacy/utf8string.pxd | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/spacy/utf8string.pxd b/spacy/utf8string.pxd index 16488b899..6bd5c6757 100644 --- a/spacy/utf8string.pxd +++ b/spacy/utf8string.pxd @@ -1,5 +1,6 @@ from preshed.maps 
cimport PreshMap from cymem.cymem cimport Pool +from murmurhash.mrmr cimport hash64 from .typedefs cimport utf8_t, id_t, hash_t @@ -11,6 +12,18 @@ cdef struct Utf8Str: int length +cdef struct UniStr: + Py_UNICODE* chars + size_t n + hash_t key + + +cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end) nogil: + s.chars = &chars[start] + s.n = end - start + s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0) + + cdef class StringStore: cdef Pool mem cdef PreshMap _map From 4560ada85b6cec7e6828c8b2669f300f87f9b9be Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 3 Dec 2014 11:06:31 +1100 Subject: [PATCH 12/56] * Add typedef for attr_t. Change flag_t to flags_t --- spacy/typedefs.pxd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/typedefs.pxd b/spacy/typedefs.pxd index 21818f05e..893865133 100644 --- a/spacy/typedefs.pxd +++ b/spacy/typedefs.pxd @@ -2,7 +2,8 @@ from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t ctypedef uint64_t hash_t ctypedef char* utf8_t -ctypedef uint64_t flag_t +ctypedef uint32_t attr_t +ctypedef uint64_t flags_t ctypedef uint32_t id_t ctypedef uint16_t len_t ctypedef uint16_t tag_t From d0d812c548f09ded9436ff584b9615cc5727eb42 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 3 Dec 2014 11:06:57 +1100 Subject: [PATCH 13/56] * Hack setup.py to exclude tagger stuff --- setup.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/setup.py b/setup.py index 10ba5b1ae..ae6d5a99d 100644 --- a/setup.py +++ b/setup.py @@ -10,6 +10,8 @@ import os.path from os import path from glob import glob +import numpy + def clean(ext): for pyx in ext.sources: @@ -34,7 +36,7 @@ compile_args = [] link_args = [] libs = [] -includes = ['.'] +includes = ['.', numpy.get_include()] cython_includes = ['.'] @@ -50,18 +52,18 @@ exts = [ Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes), Extension("spacy.en", ["spacy/en.pyx"], language="c++", include_dirs=includes), Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes), - Extension("spacy.tagger", ["spacy/tagger.pyx"], language="c++", include_dirs=includes), - Extension("spacy.context", ["spacy/context.pyx"], language="c++", include_dirs=includes), Extension("spacy.utf8string", ["spacy/utf8string.pyx"], language="c++", include_dirs=includes), - Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes), - Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes), - Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes), - Extension("spacy.ner.io_moves", ["spacy/ner/io_moves.pyx"], language="c++", include_dirs=includes), - Extension("spacy.ner.greedy_parser", ["spacy/ner/greedy_parser.pyx"], language="c++", include_dirs=includes), - Extension("spacy.ner.pystate", ["spacy/ner/pystate.pyx"], language="c++", include_dirs=includes), - Extension("spacy.ner.context", ["spacy/ner/context.pyx"], language="c++", include_dirs=includes), - Extension("spacy.ner.feats", ["spacy/ner/feats.pyx"], language="c++", include_dirs=includes), - Extension("spacy.ner.annot", ["spacy/ner/annot.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.tagger", ["spacy/tagger.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.context", ["spacy/context.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.pos_feats", 
["spacy/pos_feats.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.ner.io_moves", ["spacy/ner/io_moves.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.ner.greedy_parser", ["spacy/ner/greedy_parser.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.ner.pystate", ["spacy/ner/pystate.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.ner.context", ["spacy/ner/context.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.ner.feats", ["spacy/ner/feats.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.ner.annot", ["spacy/ner/annot.pyx"], language="c++", include_dirs=includes), ] From d70d31aa45a4bafa868424c84867b7c187aae71f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 3 Dec 2014 15:44:25 +1100 Subject: [PATCH 14/56] * Introduce first attempt at const-ness --- spacy/lang.pxd | 4 ++-- spacy/lang.pyx | 40 +++++++++++++++++++++------------------- spacy/lexeme.pxd | 4 ++-- spacy/lexeme.pyx | 2 +- spacy/tokens.pxd | 8 ++++---- spacy/tokens.pyx | 8 ++++---- 6 files changed, 34 insertions(+), 32 deletions(-) diff --git a/spacy/lang.pxd b/spacy/lang.pxd index 9e4bc7b5d..d4b587a6b 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -18,7 +18,7 @@ cdef class Lexicon: cpdef readonly StringStore strings cdef vector[Lexeme*] lexemes - cdef Lexeme* get(self, UniStr* s) except NULL + cdef const Lexeme* get(self, UniStr* s) except NULL cdef PreshMap _map @@ -45,5 +45,5 @@ cdef class Language: cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1 cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1 cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1 - cdef int _save_cached(self, Lexeme** tokens, hash_t key, int n) except -1 + cdef int _save_cached(self, const Lexeme* const* tokens, hash_t key, int n) except -1 diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 2a284b9df..8d4ea7802 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -37,11 +37,12 @@ cdef class Language: self._suffix_re = re.compile(suffix) self._infix_re = re.compile(infix) self.lexicon = Lexicon(self.set_flags) - if path.exists(path.join(util.DATA_DIR, name, 'lexemes')): - self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes')) - self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings')) self._load_special_tokenization(rules) + def load(self): + self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes')) + self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings')) + cpdef Tokens tokens_from_list(self, list strings): cdef int length = sum([len(s) for s in strings]) cdef Tokens tokens = Tokens(self.lexicon.strings, length) @@ -84,7 +85,7 @@ cdef class Language: if Py_UNICODE_ISSPACE(chars[i]) != in_ws: if start < i: slice_unicode(&span, chars, start, i) - lexemes = self._cache.get(span.key) + lexemes = self._cache.get(span.key) if lexemes != NULL: tokens.extend(start, lexemes, 0) else: @@ -96,7 +97,7 @@ cdef class Language: i += 1 if start < i: slice_unicode(&span, chars, start, i) - lexemes = self._cache.get(span.key) + lexemes = self._cache.get(span.key) if lexemes != NULL: tokens.extend(start, lexemes, 0) else: @@ -114,8 +115,8 @@ cdef class Language: self._attach_tokens(tokens, start, span, &prefixes, &suffixes) 
self._save_cached(&tokens.lex[orig_size], orig_key, tokens.length - orig_size) - cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes, - vector[Lexeme*] *suffixes) except NULL: + cdef UniStr* _split_affixes(self, UniStr* string, vector[const Lexeme*] *prefixes, + vector[const Lexeme*] *suffixes) except NULL: cdef size_t i cdef UniStr prefix cdef UniStr suffix @@ -158,17 +159,17 @@ cdef class Language: cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string, - vector[Lexeme*] *prefixes, - vector[Lexeme*] *suffixes) except -1: + vector[const Lexeme*] *prefixes, + vector[const Lexeme*] *suffixes) except -1: cdef int split - cdef Lexeme** lexemes + cdef const Lexeme* const* lexemes cdef Lexeme* lexeme cdef UniStr span if prefixes.size(): idx = tokens.extend(idx, prefixes.data(), prefixes.size()) if string.n != 0: - lexemes = self._cache.get(string.key) + lexemes = self._cache.get(string.key) if lexemes != NULL: idx = tokens.extend(idx, lexemes, 0) else: @@ -182,13 +183,13 @@ cdef class Language: idx = tokens.push_back(idx, self.lexicon.get(&span)) slice_unicode(&span, string.chars, split + 1, string.n) idx = tokens.push_back(idx, self.lexicon.get(&span)) - cdef vector[Lexeme*].reverse_iterator it = suffixes.rbegin() + cdef vector[const Lexeme*].reverse_iterator it = suffixes.rbegin() while it != suffixes.rend(): idx = tokens.push_back(idx, deref(it)) preinc(it) - cdef int _save_cached(self, Lexeme** tokens, hash_t key, int n) except -1: - lexemes = self.mem.alloc(n + 1, sizeof(Lexeme**)) + cdef int _save_cached(self, const Lexeme* const* tokens, hash_t key, int n) except -1: + lexemes = self.mem.alloc(n + 1, sizeof(Lexeme**)) cdef int i for i in range(n): lexemes[i] = tokens[i] @@ -249,7 +250,7 @@ cdef class Lexicon: self.size = 1 self.set_flags = set_flags - cdef Lexeme* get(self, UniStr* string) except NULL: + cdef const Lexeme* get(self, UniStr* string) except NULL: '''Retrieve a pointer to a Lexeme from the lexicon.''' cdef Lexeme* lex lex = self._map.get(string.key) @@ -289,14 +290,14 @@ cdef class Lexicon: return self.lexemes.at(id_or_string)[0] cdef UniStr string slice_unicode(&string, id_or_string, 0, len(id_or_string)) - cdef Lexeme* lexeme = self.get(&string) + cdef const Lexeme* lexeme = self.get(&string) return lexeme[0] def __setitem__(self, unicode uni_string, dict props): cdef UniStr s slice_unicode(&s, uni_string, 0, len(uni_string)) - cdef Lexeme* lex = self.get(&s) - lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props) + cdef const Lexeme* lex = self.get(&s) + self.lexemes[lex.id][0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props) def dump(self, loc): if path.exists(loc): @@ -319,7 +320,8 @@ cdef class Lexicon: assert st == 0 def load(self, loc): - assert path.exists(loc) + if not path.exists(loc): + raise IOError('Lexemes file not found at %s' % loc) cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc cdef FILE* fp = fopen(bytes_loc, 'rb') assert fp != NULL diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 9d5dddd6d..a998aeedb 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -112,8 +112,8 @@ cpdef Lexeme init(id_t i, unicode string, hash_t hashed, StringStore store, dict props) except * -cdef inline bint check_flag(Lexeme* lexeme, attr_id_t flag_id) nogil: +cdef inline bint check_flag(const Lexeme* lexeme, attr_id_t flag_id) nogil: return lexeme.flags & (1 << flag_id) -cdef attr_t get_attr(Lexeme* lex, attr_id_t attr_id) +cdef attr_t get_attr(const Lexeme* lex, 
attr_id_t attr_id) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 888edc07b..2090ece50 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -35,7 +35,7 @@ cpdef Lexeme init(id_t i, unicode string, hash_t hashed, return lex -cdef attr_t get_attr(Lexeme* lex, attr_id_t feat_name): +cdef attr_t get_attr(const Lexeme* lex, attr_id_t feat_name): if feat_name < (sizeof(flags_t) * 8): return check_flag(lex, feat_name) elif feat_name == ID: diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index 2c97a3163..f91aa16ba 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -14,11 +14,11 @@ cdef class Tokens: cdef Pool mem cdef StringStore _string_store - cdef Lexeme** _lex_ptr + cdef const Lexeme** _lex_ptr cdef int* _idx_ptr cdef int* _pos_ptr cdef int* _ner_ptr - cdef Lexeme** lex + cdef const Lexeme** lex cdef int* idx cdef int* pos cdef int* ner @@ -26,8 +26,8 @@ cdef class Tokens: cdef int length cdef int max_length - cdef int extend(self, int i, Lexeme** lexemes, int n) except -1 - cdef int push_back(self, int i, Lexeme* lexeme) except -1 + cdef int extend(self, int i, const Lexeme* const* lexemes, int n) except -1 + cdef int push_back(self, int i, const Lexeme* lexeme) except -1 cpdef int set_tag(self, int i, int tag_type, int tag) except -1 cpdef np.ndarray[atom_t, ndim=2] get_array(self, list features) diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index e8e016944..7f79dcda9 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -44,7 +44,7 @@ cdef class Tokens: # Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds # However, we need to remember the true starting places, so that we can # realloc. - self._lex_ptr = self.mem.alloc(size + (PADDING*2), sizeof(Lexeme*)) + self._lex_ptr = self.mem.alloc(size + (PADDING*2), sizeof(Lexeme*)) self._idx_ptr = self.mem.alloc(size + (PADDING*2), sizeof(int)) self._pos_ptr = self.mem.alloc(size + (PADDING*2), sizeof(int)) self._ner_ptr = self.mem.alloc(size + (PADDING*2), sizeof(int)) @@ -74,7 +74,7 @@ cdef class Tokens: def __len__(self): return self.length - cdef int push_back(self, int idx, Lexeme* lexeme) except -1: + cdef int push_back(self, int idx, const Lexeme* lexeme) except -1: if self.length == self.max_length: self._realloc(self.length * 2) self.lex[self.length] = lexeme @@ -84,7 +84,7 @@ cdef class Tokens: self.length += 1 return idx + lexeme.length - cdef int extend(self, int idx, Lexeme** lexemes, int n) except -1: + cdef int extend(self, int idx, const Lexeme* const* lexemes, int n) except -1: cdef int i if lexemes == NULL: return idx @@ -116,7 +116,7 @@ cdef class Tokens: def _realloc(self, new_size): self.max_length = new_size n = new_size + (PADDING * 2) - self._lex_ptr = self.mem.realloc(self._lex_ptr, n * sizeof(Lexeme*)) + self._lex_ptr = self.mem.realloc(self._lex_ptr, n * sizeof(Lexeme*)) self._idx_ptr = self.mem.realloc(self._idx_ptr, n * sizeof(int)) self._pos_ptr = self.mem.realloc(self._pos_ptr, n * sizeof(int)) self._ner_ptr = self.mem.realloc(self._ner_ptr, n * sizeof(int)) From 7e04c22f8feb2f71bb417ef87887a5654496a09e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 3 Dec 2014 15:58:17 +1100 Subject: [PATCH 15/56] * const added to Lexicon interface. Seems to work. 
--- spacy/lang.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 8d4ea7802..9013de9d0 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -296,8 +296,10 @@ cdef class Lexicon: def __setitem__(self, unicode uni_string, dict props): cdef UniStr s slice_unicode(&s, uni_string, 0, len(uni_string)) - cdef const Lexeme* lex = self.get(&s) - self.lexemes[lex.id][0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props) + # Cast through the const here, since we're allowed to change our own + # Lexemes. + lex = self.get(&s) + lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props) def dump(self, loc): if path.exists(loc): From d7952634cae783948b7f81543c37ec583a9a9d67 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 3 Dec 2014 16:01:47 +1100 Subject: [PATCH 16/56] * Make the string-store serve const pointers to Utf8Str --- spacy/utf8string.pxd | 2 +- spacy/utf8string.pyx | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/utf8string.pxd b/spacy/utf8string.pxd index 6bd5c6757..5ef4113d5 100644 --- a/spacy/utf8string.pxd +++ b/spacy/utf8string.pxd @@ -31,4 +31,4 @@ cdef class StringStore: cdef int size cdef int _resize_at - cdef Utf8Str* intern(self, char* chars, int length) except NULL + cdef const Utf8Str* intern(self, char* chars, int length) except NULL diff --git a/spacy/utf8string.pyx b/spacy/utf8string.pyx index 0384a150c..1d2b7a264 100644 --- a/spacy/utf8string.pyx +++ b/spacy/utf8string.pyx @@ -20,7 +20,7 @@ cdef class StringStore: def __getitem__(self, object string_or_id): cdef bytes byte_string - cdef Utf8Str* utf8str + cdef const Utf8Str* utf8str if isinstance(string_or_id, int) or isinstance(string_or_id, long): if string_or_id < 1 or string_or_id >= self.size: raise IndexError(string_or_id) @@ -36,7 +36,7 @@ cdef class StringStore: else: raise TypeError(type(string_or_id)) - cdef Utf8Str* intern(self, char* chars, int length) except NULL: + cdef const Utf8Str* intern(self, char* chars, int length) except NULL: # 0 means missing, but we don't bother offsetting the index. We waste # slot 0 to simplify the code, because it doesn't matter. 
assert length != 0 From e1b1f45cc942d59bf68513176669f9021695bae7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 4 Dec 2014 20:46:20 +1100 Subject: [PATCH 17/56] * Add STEM attribute to lexeme --- spacy/lexeme.pxd | 36 +++++++++++++++++++++++++++++++++--- spacy/lexeme.pyx | 32 ++------------------------------ 2 files changed, 35 insertions(+), 33 deletions(-) diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index a998aeedb..ef0e8fb12 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -72,7 +72,8 @@ cpdef enum attr_id_t: ID SIC - NORM + STEM + DENSE SHAPE ASCIIED PREFIX @@ -89,7 +90,8 @@ cdef struct Lexeme: attr_t id attr_t sic - attr_t norm + attr_t stem + attr_t dense attr_t shape attr_t asciied attr_t prefix @@ -116,4 +118,32 @@ cdef inline bint check_flag(const Lexeme* lexeme, attr_id_t flag_id) nogil: return lexeme.flags & (1 << flag_id) -cdef attr_t get_attr(const Lexeme* lex, attr_id_t attr_id) +cdef inline attr_t get_attr(const Lexeme* lex, attr_id_t feat_name) nogil: + if feat_name < (sizeof(flags_t) * 8): + return check_flag(lex, feat_name) + elif feat_name == ID: + return lex.id + elif feat_name == SIC: + return lex.sic + elif feat_name == DENSE: + return lex.dense + elif feat_name == STEM: + return lex.stem + elif feat_name == SHAPE: + return lex.shape + elif feat_name == ASCIIED: + return lex.asciied + elif feat_name == PREFIX: + return lex.prefix + elif feat_name == SUFFIX: + return lex.suffix + elif feat_name == LENGTH: + return lex.length + elif feat_name == CLUSTER: + return lex.cluster + elif feat_name == POS_TYPE: + return lex.pos_type + elif feat_name == SENSE_TYPE: + return lex.sense_type + else: + return 0 diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 2090ece50..5c8d7a60e 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -27,38 +27,10 @@ cpdef Lexeme init(id_t i, unicode string, hash_t hashed, lex.prefix = string_store[string[:1]] lex.suffix = string_store[string[-3:]] - lex.norm = lex.sic # TODO lex.shape = string_store[orth.word_shape(string)] + lex.dense = lex.sic if lex.prob >= -10 else lex.shape + lex.stem = string_store[props.get('stem', string)] lex.asciied = string_store[orth.asciied(string)] lex.flags = props.get('flags', 0) return lex - - -cdef attr_t get_attr(const Lexeme* lex, attr_id_t feat_name): - if feat_name < (sizeof(flags_t) * 8): - return check_flag(lex, feat_name) - elif feat_name == ID: - return lex.id - elif feat_name == SIC: - return lex.sic - elif feat_name == NORM: - return lex.norm - elif feat_name == SHAPE: - return lex.shape - elif feat_name == ASCIIED: - return lex.asciied - elif feat_name == PREFIX: - return lex.prefix - elif feat_name == SUFFIX: - return lex.suffix - elif feat_name == LENGTH: - return lex.length - elif feat_name == CLUSTER: - return lex.cluster - elif feat_name == POS_TYPE: - return lex.pos_type - elif feat_name == SENSE_TYPE: - return lex.sense_type - else: - raise StandardError('Feature ID: %d not found' % feat_name) From 69bb0222041f8d43febc7648c04b903804a6b299 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 4 Dec 2014 20:46:55 +1100 Subject: [PATCH 18/56] * Add as_array and count_by method --- spacy/tokens.pxd | 5 ++--- spacy/tokens.pyx | 39 ++++++++++++++++++++++++--------------- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index f91aa16ba..90356b74e 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -2,13 +2,12 @@ import numpy as np cimport numpy as np from cymem.cymem cimport Pool +from thinc.typedefs cimport 
atom_t from .lexeme cimport Lexeme from .typedefs cimport flags_t from .utf8string cimport StringStore -from thinc.typedefs cimport atom_t - cdef class Tokens: cdef Pool mem @@ -30,7 +29,7 @@ cdef class Tokens: cdef int push_back(self, int i, const Lexeme* lexeme) except -1 cpdef int set_tag(self, int i, int tag_type, int tag) except -1 - cpdef np.ndarray[atom_t, ndim=2] get_array(self, list features) + cpdef np.ndarray[long, ndim=2] get_array(self, list features) cdef class Token: diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 7f79dcda9..7fdfa8e1e 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -1,7 +1,13 @@ # cython: profile=True +from preshed.maps cimport PreshMap +from preshed.counter cimport PreshCounter + from .lexeme cimport * cimport cython +import numpy as np +cimport numpy as np + POS = 0 ENTITY = 0 @@ -19,20 +25,10 @@ cdef class Tokens: """A sequence of references to Lexeme objects. The Tokens class provides fast and memory-efficient access to lexical features, - and can efficiently export the data to a numpy array. Specific languages - create their own Tokens subclasses, to provide more convenient access to - language-specific features. + and can efficiently export the data to a numpy array. >>> from spacy.en import EN >>> tokens = EN.tokenize('An example sentence.') - >>> tokens.string(0) - 'An' - >>> tokens.prob(0) > tokens.prob(1) - True - >>> tokens.can_noun(0) - False - >>> tokens.can_noun(1) - True """ def __init__(self, StringStore string_store, string_length=0): self._string_store = string_store @@ -104,15 +100,28 @@ cdef class Tokens: elif tag_type == ENTITY: self.ner[i] = tag - cpdef np.ndarray[atom_t, ndim=2] get_array(self, list features): + @cython.boundscheck(False) + cpdef np.ndarray[long, ndim=2] get_array(self, list attr_ids): cdef int i, j - cdef np.ndarray[atom_t, ndim=2] output - output = np.ndarray(shape=(self.length, len(features)), dtype=int) + cdef attr_id_t feature + cdef np.ndarray[long, ndim=2] output + output = np.ndarray(shape=(self.length, len(attr_ids)), dtype=int) for i in range(self.length): - for j, feature in enumerate(features): + for j, feature in enumerate(attr_ids): output[i, j] = get_attr(self.lex[i], feature) return output + def count_by(self, attr_id_t attr_id): + cdef int i + cdef attr_t attr + cdef size_t count + + cdef PreshCounter counts = PreshCounter(2 ** 8) + for i in range(self.length): + attr = get_attr(self.lex[i], attr_id) + counts.inc(attr, 1) + return dict(counts) + def _realloc(self, new_size): self.max_length = new_size n = new_size + (PADDING * 2) From 564082e48e91efcf5f6dcb30b27b2252a7364ab2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 4 Dec 2014 20:51:29 +1100 Subject: [PATCH 19/56] * Hack Token class to take lex.dense inplace of the old lex.norm. This needs to be fixed... 
--- spacy/tokens.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 7fdfa8e1e..c06a1b4d8 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -153,7 +153,7 @@ cdef class Token: self.postype = lex['pos_type'] self.sensetype = lex['sense_type'] self.sic = lex['sic'] - self.norm = lex['norm'] + self.norm = lex['dense'] self.shape = lex['shape'] self.suffix = lex['asciied'] self.prefix = lex['prefix'] From 49f3780ff5fc34343fe40ab0ad0f8e0b44b54ca9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 4 Dec 2014 21:22:38 +1100 Subject: [PATCH 20/56] * Fiddle with lexeme attrs --- spacy/lexeme.pxd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index ef0e8fb12..e35bde61e 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -103,8 +103,8 @@ cdef struct Lexeme: attr_t sense_type float prob - float upper_pc - float title_pc + float lower_pc + float sentiment cdef Lexeme EMPTY_LEXEME From a14f9eaf6355e28dcb4ba8180831e0470dcb2388 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 4 Dec 2014 22:14:11 +1100 Subject: [PATCH 21/56] * Add index.pyx to setup --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index ae6d5a99d..35c411d38 100644 --- a/setup.py +++ b/setup.py @@ -53,6 +53,7 @@ exts = [ Extension("spacy.en", ["spacy/en.pyx"], language="c++", include_dirs=includes), Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes), Extension("spacy.utf8string", ["spacy/utf8string.pyx"], language="c++", include_dirs=includes), + Extension("spacy.index", ["spacy/index.pyx"], language="c++", include_dirs=includes) #Extension("spacy.tagger", ["spacy/tagger.pyx"], language="c++", include_dirs=includes), #Extension("spacy.context", ["spacy/context.pyx"], language="c++", include_dirs=includes), #Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes), From 75b8dfb3484e89be88dd10fa9ab7ebdedf949efe Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 4 Dec 2014 22:14:34 +1100 Subject: [PATCH 22/56] * Remove upper_pc from lexeme.pyx --- spacy/lexeme.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 5c8d7a60e..cd92c4845 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -22,8 +22,7 @@ cpdef Lexeme init(id_t i, unicode string, hash_t hashed, lex.sense_type = props.get('sense_type', 0) lex.prob = props.get('prob', 0) - lex.upper_pc = props.get('upper_pc', 0.0) - lex.title_pc = props.get('lower_pc', 0.0) + lex.lower_pc = props.get('lower_pc', 0.0) lex.prefix = string_store[string[:1]] lex.suffix = string_store[string[-3:]] From 187372c7f338e23b29994956c434da48c6af467c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 5 Dec 2014 03:29:50 +1100 Subject: [PATCH 23/56] * Allow the lexicon to create lexemes using an external memory pool, so that it can decide to make some lexemes temporary, rather than cached --- spacy/lang.pxd | 2 +- spacy/lang.pyx | 55 +++++++++++++++++++++++++++++--------------------- 2 files changed, 33 insertions(+), 24 deletions(-) diff --git a/spacy/lang.pxd b/spacy/lang.pxd index d4b587a6b..d27378816 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -18,7 +18,7 @@ cdef class Lexicon: cpdef readonly StringStore strings cdef vector[Lexeme*] lexemes - cdef const Lexeme* get(self, UniStr* s) except NULL + cdef const Lexeme* get(self, Pool mem, UniStr* s) except NULL cdef PreshMap _map 
diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 9013de9d0..100b51a98 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -18,6 +18,7 @@ from preshed.maps cimport PreshMap from .lexeme cimport Lexeme from .lexeme cimport EMPTY_LEXEME from .lexeme cimport init as lexeme_init +from .lexeme cimport check_flag, IS_ALPHA from .utf8string cimport slice_unicode @@ -53,7 +54,7 @@ cdef class Language: cdef int idx = 0 for i, py_string in enumerate(strings): slice_unicode(&string_struct, py_string, 0, len(py_string)) - tokens.push_back(idx, self.lexicon.get(&string_struct)) + tokens.push_back(idx, self.lexicon.get(tokens.mem, &string_struct)) idx += len(py_string) + 1 return tokens @@ -132,7 +133,7 @@ cdef class Language: # Check whether we've hit a special-case if minus_pre.n >= 1 and self._specials.get(minus_pre.key) != NULL: string[0] = minus_pre - prefixes.push_back(self.lexicon.get(&prefix)) + prefixes.push_back(self.lexicon.get(self.lexicon.mem, &prefix)) break suf_len = self._find_suffix(string.chars, string.n) if suf_len != 0: @@ -141,18 +142,18 @@ cdef class Language: # Check whether we've hit a special-case if minus_suf.n >= 1 and self._specials.get(minus_suf.key) != NULL: string[0] = minus_suf - suffixes.push_back(self.lexicon.get(&suffix)) + suffixes.push_back(self.lexicon.get(self.lexicon.mem, &suffix)) break if pre_len and suf_len and (pre_len + suf_len) <= string.n: slice_unicode(string, string.chars, pre_len, string.n - suf_len) - prefixes.push_back(self.lexicon.get(&prefix)) - suffixes.push_back(self.lexicon.get(&suffix)) + prefixes.push_back(self.lexicon.get(self.lexicon.mem, &prefix)) + suffixes.push_back(self.lexicon.get(self.lexicon.mem, &suffix)) elif pre_len: string[0] = minus_pre - prefixes.push_back(self.lexicon.get(&prefix)) + prefixes.push_back(self.lexicon.get(self.lexicon.mem, &prefix)) elif suf_len: string[0] = minus_suf - suffixes.push_back(self.lexicon.get(&suffix)) + suffixes.push_back(self.lexicon.get(self.lexicon.mem, &suffix)) if self._specials.get(string.key): break return string @@ -175,22 +176,25 @@ cdef class Language: else: split = self._find_infix(string.chars, string.n) if split == 0 or split == -1: - idx = tokens.push_back(idx, self.lexicon.get(string)) + idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, string)) else: slice_unicode(&span, string.chars, 0, split) - idx = tokens.push_back(idx, self.lexicon.get(&span)) + idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, &span)) slice_unicode(&span, string.chars, split, split+1) - idx = tokens.push_back(idx, self.lexicon.get(&span)) + idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, &span)) slice_unicode(&span, string.chars, split + 1, string.n) - idx = tokens.push_back(idx, self.lexicon.get(&span)) + idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, &span)) cdef vector[const Lexeme*].reverse_iterator it = suffixes.rbegin() while it != suffixes.rend(): idx = tokens.push_back(idx, deref(it)) preinc(it) cdef int _save_cached(self, const Lexeme* const* tokens, hash_t key, int n) except -1: - lexemes = self.mem.alloc(n + 1, sizeof(Lexeme**)) cdef int i + for i in range(n): + if tokens[i].id == 1: + return 0 + lexemes = self.mem.alloc(n + 1, sizeof(Lexeme**)) for i in range(n): lexemes[i] = tokens[i] lexemes[i + 1] = NULL @@ -230,7 +234,7 @@ cdef class Language: lexemes = self.mem.alloc(len(substrings) + 1, sizeof(Lexeme*)) for i, substring in enumerate(substrings): slice_unicode(&string, substring, 0, len(substring)) - lexemes[i] = self.lexicon.get(&string) + lexemes[i] = 
self.lexicon.get(self.lexicon.mem, &string) lexemes[i + 1] = NULL slice_unicode(&string, uni_string, 0, len(uni_string)) self._specials.set(string.key, lexemes) @@ -247,23 +251,28 @@ cdef class Lexicon: self._map = PreshMap(2 ** 20) self.strings = StringStore() self.lexemes.push_back(&EMPTY_LEXEME) - self.size = 1 + self.size = 2 self.set_flags = set_flags - cdef const Lexeme* get(self, UniStr* string) except NULL: + cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL: '''Retrieve a pointer to a Lexeme from the lexicon.''' cdef Lexeme* lex lex = self._map.get(string.key) if lex != NULL: return lex - lex = self.mem.alloc(sizeof(Lexeme), 1) + if string.n < 3: + mem = self.mem + lex = mem.alloc(sizeof(Lexeme), 1) lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key, self.strings, {'flags': self.set_flags(string.chars[:string.n])}) - self._map.set(string.key, lex) - while self.lexemes.size() < (lex.id + 1): - self.lexemes.push_back(&EMPTY_LEXEME) - self.lexemes[lex.id] = lex - self.size += 1 + if mem is self.mem: + self._map.set(string.key, lex) + while self.lexemes.size() < (lex.id + 1): + self.lexemes.push_back(&EMPTY_LEXEME) + self.lexemes[lex.id] = lex + self.size += 1 + else: + lex[0].id = 1 return lex def __getitem__(self, id_or_string): @@ -290,7 +299,7 @@ cdef class Lexicon: return self.lexemes.at(id_or_string)[0] cdef UniStr string slice_unicode(&string, id_or_string, 0, len(id_or_string)) - cdef const Lexeme* lexeme = self.get(&string) + cdef const Lexeme* lexeme = self.get(self.mem, &string) return lexeme[0] def __setitem__(self, unicode uni_string, dict props): @@ -298,7 +307,7 @@ cdef class Lexicon: slice_unicode(&s, uni_string, 0, len(uni_string)) # Cast through the const here, since we're allowed to change our own # Lexemes. 
- lex = self.get(&s) + lex = self.get(self.mem, &s) lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props) def dump(self, loc): From 1c9253701daeeafac166608204c44a2db0e9e1fa Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 5 Dec 2014 15:56:14 +1100 Subject: [PATCH 24/56] * Introduce a TokenC struct, to handle token indices, pos tags and sense tags --- spacy/lang.pxd | 4 ++-- spacy/lang.pyx | 14 ++++++++------ spacy/tokens.pxd | 20 ++++++++++++-------- spacy/tokens.pyx | 46 ++++++++++++++++------------------------------ 4 files changed, 38 insertions(+), 46 deletions(-) diff --git a/spacy/lang.pxd b/spacy/lang.pxd index d27378816..fd4cf6e70 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -6,7 +6,7 @@ from preshed.maps cimport PreshMap from cymem.cymem cimport Pool from .typedefs cimport hash_t -from .tokens cimport Tokens +from .tokens cimport Tokens, TokenC from .lexeme cimport Lexeme from .utf8string cimport StringStore, UniStr @@ -45,5 +45,5 @@ cdef class Language: cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1 cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1 cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1 - cdef int _save_cached(self, const Lexeme* const* tokens, hash_t key, int n) except -1 + cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1 diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 100b51a98..1fdd683f3 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -18,7 +18,7 @@ from preshed.maps cimport PreshMap from .lexeme cimport Lexeme from .lexeme cimport EMPTY_LEXEME from .lexeme cimport init as lexeme_init -from .lexeme cimport check_flag, IS_ALPHA +from .lexeme cimport check_flag from .utf8string cimport slice_unicode @@ -114,7 +114,7 @@ cdef class Language: orig_size = tokens.length self._split_affixes(span, &prefixes, &suffixes) self._attach_tokens(tokens, start, span, &prefixes, &suffixes) - self._save_cached(&tokens.lex[orig_size], orig_key, tokens.length - orig_size) + self._save_cached(&tokens.data[orig_size], orig_key, tokens.length - orig_size) cdef UniStr* _split_affixes(self, UniStr* string, vector[const Lexeme*] *prefixes, vector[const Lexeme*] *suffixes) except NULL: @@ -189,14 +189,14 @@ cdef class Language: idx = tokens.push_back(idx, deref(it)) preinc(it) - cdef int _save_cached(self, const Lexeme* const* tokens, hash_t key, int n) except -1: + cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1: cdef int i for i in range(n): - if tokens[i].id == 1: + if tokens[i].lex.id == 1: return 0 lexemes = self.mem.alloc(n + 1, sizeof(Lexeme**)) for i in range(n): - lexemes[i] = tokens[i] + lexemes[i] = tokens[i].lex lexemes[i + 1] = NULL self._cache.set(key, lexemes) @@ -255,7 +255,9 @@ cdef class Lexicon: self.set_flags = set_flags cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL: - '''Retrieve a pointer to a Lexeme from the lexicon.''' + '''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme + if necessary, using memory acquired from the given pool. 
If the pool + is the lexicon's own memory, the lexeme is saved in the lexicon.''' cdef Lexeme* lex lex = self._map.get(string.key) if lex != NULL: diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index 90356b74e..a219c707f 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -9,18 +9,22 @@ from .typedefs cimport flags_t from .utf8string cimport StringStore +cdef struct TokenC: + const Lexeme* lex + int idx + int pos + int sense + + +cdef TokenC EMPTY_TOKEN = TokenC(&EMPTY_LEXEME, 0, 0, 0) + + cdef class Tokens: cdef Pool mem cdef StringStore _string_store - cdef const Lexeme** _lex_ptr - cdef int* _idx_ptr - cdef int* _pos_ptr - cdef int* _ner_ptr - cdef const Lexeme** lex - cdef int* idx - cdef int* pos - cdef int* ner + cdef TokenC* _data + cdef TokenC* data cdef int length cdef int max_length diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index c06a1b4d8..06d3eeb99 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -40,28 +40,18 @@ cdef class Tokens: # Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds # However, we need to remember the true starting places, so that we can # realloc. - self._lex_ptr = self.mem.alloc(size + (PADDING*2), sizeof(Lexeme*)) - self._idx_ptr = self.mem.alloc(size + (PADDING*2), sizeof(int)) - self._pos_ptr = self.mem.alloc(size + (PADDING*2), sizeof(int)) - self._ner_ptr = self.mem.alloc(size + (PADDING*2), sizeof(int)) - self.lex = self._lex_ptr - self.idx = self._idx_ptr - self.pos = self._pos_ptr - self.ner = self._ner_ptr + self._data = self.mem.alloc(size + (PADDING*2), sizeof(TokenC)) cdef int i for i in range(size + (PADDING*2)): - self.lex[i] = &EMPTY_LEXEME - self.lex += PADDING - self.idx += PADDING - self.pos += PADDING - self.ner += PADDING + self._data[i] = EMPTY_TOKEN + self.data = self._data + PADDING self.max_length = size self.length = 0 def __getitem__(self, i): bounds_check(i, self.length, PADDING) - return Token(self._string_store, i, self.idx[i], self.pos[i], self.ner[i], - self.lex[i][0]) + return Token(self._string_store, i, self.data[i].idx, self.data[i].pos, + self.data[i].sense, self.data[i].lex[0]) def __iter__(self): for i in range(self.length): @@ -73,10 +63,11 @@ cdef class Tokens: cdef int push_back(self, int idx, const Lexeme* lexeme) except -1: if self.length == self.max_length: self._realloc(self.length * 2) - self.lex[self.length] = lexeme - self.idx[self.length] = idx - self.pos[self.length] = 0 - self.ner[self.length] = 0 + cdef TokenC* t = &self.data[self.length] + t.lex = lexeme + t.idx = idx + t.pos = 0 + t.sense = 0 self.length += 1 return idx + lexeme.length @@ -108,7 +99,7 @@ cdef class Tokens: output = np.ndarray(shape=(self.length, len(attr_ids)), dtype=int) for i in range(self.length): for j, feature in enumerate(attr_ids): - output[i, j] = get_attr(self.lex[i], feature) + output[i, j] = get_attr(self.data[i].lex, feature) return output def count_by(self, attr_id_t attr_id): @@ -118,23 +109,18 @@ cdef class Tokens: cdef PreshCounter counts = PreshCounter(2 ** 8) for i in range(self.length): - attr = get_attr(self.lex[i], attr_id) + attr = get_attr(self.data[i].lex, attr_id) counts.inc(attr, 1) return dict(counts) def _realloc(self, new_size): self.max_length = new_size n = new_size + (PADDING * 2) - self._lex_ptr = self.mem.realloc(self._lex_ptr, n * sizeof(Lexeme*)) - self._idx_ptr = self.mem.realloc(self._idx_ptr, n * sizeof(int)) - self._pos_ptr = self.mem.realloc(self._pos_ptr, n * sizeof(int)) - self._ner_ptr = self.mem.realloc(self._ner_ptr, n * sizeof(int)) - self.lex = 
self._lex_ptr + PADDING - self.idx = self._idx_ptr + PADDING - self.pos = self._pos_ptr + PADDING - self.ner = self._ner_ptr + PADDING + self._data = self.mem.realloc(self._data, n * sizeof(TokenC)) + self.data = self._data + PADDING + cdef int i for i in range(self.length, self.max_length + PADDING): - self.lex[i] = &EMPTY_LEXEME + self.data[i] = EMPTY_TOKEN @cython.freelist(64) From e27b912ef98ac974181309b3d3be056ea4c9393b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 5 Dec 2014 16:31:30 +1100 Subject: [PATCH 25/56] * Remove need for confusing _data pointer to be stored on Tokens --- spacy/tokens.pxd | 1 - spacy/tokens.pyx | 11 ++++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index a219c707f..addb1e3e5 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -23,7 +23,6 @@ cdef class Tokens: cdef Pool mem cdef StringStore _string_store - cdef TokenC* _data cdef TokenC* data cdef int length diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 06d3eeb99..b474ff6fb 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -40,11 +40,11 @@ cdef class Tokens: # Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds # However, we need to remember the true starting places, so that we can # realloc. - self._data = self.mem.alloc(size + (PADDING*2), sizeof(TokenC)) + data_start = self.mem.alloc(size + (PADDING*2), sizeof(TokenC)) cdef int i for i in range(size + (PADDING*2)): - self._data[i] = EMPTY_TOKEN - self.data = self._data + PADDING + data_start[i] = EMPTY_TOKEN + self.data = data_start + PADDING self.max_length = size self.length = 0 @@ -116,8 +116,9 @@ cdef class Tokens: def _realloc(self, new_size): self.max_length = new_size n = new_size + (PADDING * 2) - self._data = self.mem.realloc(self._data, n * sizeof(TokenC)) - self.data = self._data + PADDING + cdef TokenC* data_start = self.data - PADDING + data_start = self.mem.realloc(data_start, n * sizeof(TokenC)) + self.data = data_start + PADDING cdef int i for i in range(self.length, self.max_length + PADDING): self.data[i] = EMPTY_TOKEN From f5c4f2eb52c3618c4dda056c0171b21b1b7a0e63 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 7 Dec 2014 15:28:22 +1100 Subject: [PATCH 26/56] * Revise context, focussing on POS tagging for now --- spacy/context.pxd | 97 ++++++++++++++------------------ spacy/context.pyx | 138 +++++----------------------------------------- 2 files changed, 54 insertions(+), 181 deletions(-) diff --git a/spacy/context.pxd b/spacy/context.pxd index 8f798d347..3dd842b6e 100644 --- a/spacy/context.pxd +++ b/spacy/context.pxd @@ -1,66 +1,49 @@ from thinc.typedefs cimport atom_t -from .typedefs cimport hash_t -from .tokens cimport Tokens -from .lexeme cimport Lexeme +from .tokens cimport TokenC -cdef class Token: - cdef readonly atom_t sic - cdef readonly atom_t cluster - cdef readonly atom_t norm - cdef readonly atom_t shape - cdef readonly atom_t asciied - cdef readonly atom_t prefix - cdef readonly atom_t suffix - cdef readonly atom_t length +cpdef enum: + P2_sic + P2_cluster + P2_shape + P2_prefix + P2_suffix + P2_pos + P2_sense - cdef readonly atom_t postype - cdef readonly atom_t nertype - cdef readonly atom_t sensetype + P1_sic + P1_cluster + P1_shape + P1_prefix + P1_suffix + P1_pos + P1_sense - cdef readonly atom_t is_alpha - cdef readonly atom_t is_ascii - cdef readonly atom_t is_digit - cdef readonly atom_t is_lower - cdef readonly atom_t is_punct - cdef readonly atom_t is_space - cdef readonly atom_t is_title - cdef 
readonly atom_t is_upper - cdef readonly atom_t like_url - cdef readonly atom_t like_number - cdef readonly atom_t oft_lower - cdef readonly atom_t oft_title - cdef readonly atom_t oft_upper + W_sic + W_cluster + W_shape + W_prefix + W_suffix + W_pos + W_sense - cdef readonly atom_t in_males - cdef readonly atom_t in_females - cdef readonly atom_t in_surnames - cdef readonly atom_t in_places - cdef readonly atom_t in_games - cdef readonly atom_t in_celebs - cdef readonly atom_t in_names + N1_sic + N1_cluster + N1_shape + N1_prefix + N1_suffix + N1_pos + N1_sense - cdef readonly atom_t pos - cdef readonly atom_t sense - cdef readonly atom_t ner + N2_sic + N2_cluster + N2_shape + N2_prefix + N2_suffix + N2_pos + N2_sense + + N_FIELDS -cdef class Slots: - cdef readonly Token P4 - cdef readonly Token P3 - cdef readonly Token P2 - cdef readonly Token P1 - cdef readonly Token N0 - cdef readonly Token N1 - cdef readonly Token N2 - cdef readonly Token N3 - cdef readonly Token N4 - - -cdef int N_FIELDS - - -cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1 - - -cpdef Slots FIELD_IDS +cdef int fill_context(atom_t[N_FIELDS] context, const int i, TokenC* tokens) except -1 diff --git a/spacy/context.pyx b/spacy/context.pyx index aeb78ae5c..c81daef2c 100644 --- a/spacy/context.pyx +++ b/spacy/context.pyx @@ -1,126 +1,16 @@ -from murmurhash.mrmr cimport hash64 -from .lexeme cimport * +cdef int fill_context(atom_t[N_FIELDS] context, const int i, TokenC* tokens) except -1: + _fill_from_token(&context[P2_sic], &tokens[i-2]) + _fill_from_token(&context[P1_sic], &tokens[i-1]) + _fill_from_token(&context[W_sic], &tokens[i]) + _fill_from_token(&context[N1_sic], &tokens[i+1]) + _fill_from_token(&context[N2_sic], &tokens[i+2]) -cdef class Slots: - def __init__(self): - self.P4 = Token() - self.P3 = Token() - self.P2 = Token() - self.P1 = Token() - self.N0 = Token() - self.N1 = Token() - self.N2 = Token() - self.N3 = Token() - self.N4 = Token() - - -cdef void _number_token(Token t, int* n_fields): - cdef int i = n_fields[0] - t.sic = i; i += 1 - t.cluster = i; i += 1 - t.norm = i; i += 1 - t.shape = i; i += 1 - t.prefix = i; i += 1 - t.suffix = i; i += 1 - t.length = i; i += 1 - - t.postype = i; i += 1 - t.nertype = i; i += 1 - t.sensetype = i; i += 1 - - t.is_alpha = i; i += 1 - t.is_ascii = i; i += 1 - t.is_digit = i; i += 1 - t.is_lower = i; i += 1 - t.is_punct = i; i += 1 - t.is_space = i; i += 1 - t.is_title = i; i += 1 - t.is_upper = i; i += 1 - - t.like_number = i; i += 1 - t.like_url = i; i += 1 - - t.oft_lower = i; i += 1 - t.oft_title = i; i += 1 - t.oft_upper = i; i += 1 - - t.in_males = i; i += 1 - t.in_females = i; i += 1 - t.in_surnames = i; i += 1 - t.in_places = i; i += 1 - t.in_games = i; i += 1 - t.in_celebs = i; i += 1 - t.in_names = i; i += 1 - - t.pos = i; i += 1 - t.sense = i; i += 1 - t.ner = i; i += 1 - - n_fields[0] = i - - -cdef int _fill_token(atom_t* c, Token t, Lexeme* lex, atom_t pos, atom_t ner): - c[t.sic] = lex.sic - c[t.cluster] = lex.cluster - c[t.norm] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape - c[t.shape] = lex.shape - c[t.asciied] = lex.asciied - c[t.prefix] = lex.prefix - c[t.suffix] = lex.suffix - c[t.length] = lex.length - - c[t.postype] = lex.postype - c[t.nertype] = 0 - c[t.sensetype] = 0 - - c[t.is_alpha] = lex.flags & (1 << IS_ALPHA) - c[t.is_digit] = lex.flags & (1 << IS_DIGIT) - c[t.is_lower] = lex.flags & (1 << IS_LOWER) - c[t.is_punct] = lex.flags & (1 << IS_PUNCT) - c[t.is_space] = lex.flags & (1 << IS_SPACE) - 
c[t.is_title] = lex.flags & (1 << IS_TITLE) - c[t.is_upper] = lex.flags & (1 << IS_UPPER) - c[t.like_url] = lex.flags & (1 << LIKE_URL) - c[t.like_number] = lex.flags & (1 << LIKE_NUMBER) - c[t.oft_lower] = lex.flags & (1 << OFT_LOWER) - c[t.oft_title] = lex.flags & (1 << OFT_TITLE) - c[t.oft_upper] = lex.flags & (1 << OFT_UPPER) - - c[t.in_males] = lex.flags & (1 << IN_MALES) - c[t.in_females] = lex.flags & (1 << IN_FEMALES) - c[t.in_surnames] = lex.flags & (1 << IN_SURNAMES) - c[t.in_places] = lex.flags & (1 << IN_PLACES) - c[t.in_games] = lex.flags & (1 << IN_GAMES) - c[t.in_celebs] = lex.flags & (1 << IN_CELEBS) - c[t.in_names] = lex.flags & (1 << IN_NAMES) - - c[t.pos] = pos - c[t.sense] = 0 - c[t.ner] = ner - - -cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1: - _fill_token(context, FIELD_IDS.P4, tokens.lex[i-4], tokens.pos[i-4], tokens.ner[i-4]) - _fill_token(context, FIELD_IDS.P3, tokens.lex[i-3], tokens.pos[i-3], tokens.ner[i-3]) - _fill_token(context, FIELD_IDS.P2, tokens.lex[i-2], tokens.pos[i-2], tokens.ner[i-2]) - _fill_token(context, FIELD_IDS.P1, tokens.lex[i-1], tokens.pos[i-1], tokens.ner[i-1]) - _fill_token(context, FIELD_IDS.N0, tokens.lex[i], tokens.pos[i], tokens.ner[i]) - _fill_token(context, FIELD_IDS.N1, tokens.lex[i+1], tokens.pos[i+1], tokens.ner[i+1]) - _fill_token(context, FIELD_IDS.N2, tokens.lex[i+2], tokens.pos[i+2], tokens.ner[i+2]) - _fill_token(context, FIELD_IDS.N3, tokens.lex[i+3], tokens.pos[i+3], tokens.ner[i+3]) - _fill_token(context, FIELD_IDS.N4, tokens.lex[i+4], tokens.pos[i+4], tokens.ner[i+4]) - return 1 - - -N_FIELDS = 0 -FIELD_IDS = Slots() -_number_token(FIELD_IDS.P4, &N_FIELDS) -_number_token(FIELD_IDS.P3, &N_FIELDS) -_number_token(FIELD_IDS.P2, &N_FIELDS) -_number_token(FIELD_IDS.P1, &N_FIELDS) -_number_token(FIELD_IDS.N0, &N_FIELDS) -_number_token(FIELD_IDS.N1, &N_FIELDS) -_number_token(FIELD_IDS.N2, &N_FIELDS) -_number_token(FIELD_IDS.N3, &N_FIELDS) -_number_token(FIELD_IDS.N4, &N_FIELDS) +cdef inline void _fill_from_token(atom_t[N_FIELDS] context, const TokenC* t) nogil: + context[0] = t.lex.sic + context[1] = t.lex.cluster + context[2] = t.lex.shape + context[3] = t.lex.prefix + context[4] = t.lex.suffix + context[5] = t.pos + context[6] = t.sense From 0c7aeb9de7105e513d38544c15c860764a521bcb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 7 Dec 2014 15:29:04 +1100 Subject: [PATCH 27/56] * Begin revising tagger, focussing on POS tagging --- spacy/tagger.pxd | 11 +----- spacy/tagger.pyx | 100 +++++++++++++++++++++++------------------------ 2 files changed, 50 insertions(+), 61 deletions(-) diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd index 11d8d2a4c..0a9b4a0c4 100644 --- a/spacy/tagger.pxd +++ b/spacy/tagger.pxd @@ -5,20 +5,17 @@ from thinc.features cimport Extractor from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t from .typedefs cimport hash_t -from .context cimport Slots from .tokens cimport Tokens cpdef enum TagType: POS - ENTITY SENSE cdef class Tagger: cpdef int set_tags(self, Tokens tokens) except -1 - cpdef class_t predict(self, int i, Tokens tokens) except 0 - cpdef int tell_answer(self, list gold) except -1 + cpdef class_t predict(self, int i, Tokens tokens, object golds=*) except 0 cpdef readonly Pool mem cpdef readonly Extractor extractor @@ -26,9 +23,3 @@ cdef class Tagger: cpdef readonly TagType tag_type cpdef readonly list tag_names - - cdef class_t _guess - cdef atom_t* _context - cdef feat_t* _feats - cdef weight_t* _values - cdef weight_t* _scores diff --git 
a/spacy/tagger.pyx b/spacy/tagger.pyx index 428814f70..22732843d 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -1,8 +1,10 @@ # cython: profile=True -from __future__ import print_function from __future__ import unicode_literals from __future__ import division +from .context cimport fill_context +from .context cimport N_FIELDS + from os import path import os import shutil @@ -10,11 +12,7 @@ import random import json import cython - -from .context cimport fill_context -from .context cimport N_FIELDS - -from thinc.features cimport ConjFeat +from thinc.features cimport Feature, count_feats NULL_TAG = 0 @@ -35,7 +33,8 @@ def setup_model_dir(tag_type, tag_names, templates, model_dir): def train(train_sents, model_dir, nr_iter=10): cdef Tokens tokens - tagger = Tagger(model_dir) + cdef Tagger tagger = Tagger(model_dir) + cdef int i for _ in range(nr_iter): n_corr = 0 total = 0 @@ -43,9 +42,10 @@ def train(train_sents, model_dir, nr_iter=10): assert len(tokens) == len(golds), [t.string for t in tokens] for i in range(tokens.length): if tagger.tag_type == POS: - gold = _get_gold_pos(i, golds, tokens.pos) - elif tagger.tag_type == ENTITY: - gold = _get_gold_ner(i, golds, tokens.ner) + gold = _get_gold_pos(i, golds) + else: + raise StandardError + guess = tagger.predict(i, tokens) tokens.set_tag(i, tagger.tag_type, guess) if gold is not None: @@ -59,7 +59,7 @@ def train(train_sents, model_dir, nr_iter=10): tagger.model.dump(path.join(model_dir, 'model')) -cdef object _get_gold_pos(i, golds, int* pred): +cdef object _get_gold_pos(i, golds): if golds[i] == 0: return None else: @@ -96,17 +96,11 @@ cdef class Tagger: templates = cfg['templates'] self.tag_names = cfg['tag_names'] self.tag_type = cfg['tag_type'] - self.extractor = Extractor(templates, [ConjFeat] * len(templates)) + self.extractor = Extractor(templates) self.model = LinearModel(len(self.tag_names)) if path.exists(path.join(model_dir, 'model')): self.model.load(path.join(model_dir, 'model')) - self._context = self.mem.alloc(N_FIELDS, sizeof(atom_t)) - self._feats = self.mem.alloc(self.extractor.n+1, sizeof(feat_t)) - self._values = self.mem.alloc(self.extractor.n+1, sizeof(weight_t)) - self._scores = self.mem.alloc(self.model.nr_class, sizeof(weight_t)) - self._guess = NULL_TAG - cpdef int set_tags(self, Tokens tokens) except -1: """Assign tags to a Tokens object. @@ -119,7 +113,7 @@ cdef class Tagger: for i in range(tokens.length): tokens.set_tag(i, self.tag_type, self.predict(i, tokens)) - cpdef class_t predict(self, int i, Tokens tokens) except 0: + cpdef class_t predict(self, int i, Tokens tokens, object golds=None) except 0: """Predict the tag of tokens[i]. The tagger remembers the features and prediction, in case you later call tell_answer. @@ -127,38 +121,20 @@ cdef class Tagger: >>> tag = EN.pos_tagger.predict(0, tokens) >>> assert tag == EN.pos_tagger.tag_id('DT') == 5 """ - fill_context(self._context, i, tokens) - self.extractor.extract(self._feats, self._values, self._context, NULL) - self._guess = self.model.score(self._scores, self._feats, self._values) - return self._guess - - cpdef int tell_answer(self, list golds) except -1: - """Provide the correct tag for the word the tagger was last asked to predict. - During Tagger.predict, the tagger remembers the features and prediction - for the example. These are used to calculate a weight update given the - correct label. 
- - >>> tokens = EN.tokenize('An example sentence.') - >>> guess = EN.pos_tagger.predict(1, tokens) - >>> JJ = EN.pos_tagger.tag_id('JJ') - >>> JJ - 7 - >>> EN.pos_tagger.tell_answer(JJ) - """ - cdef class_t guess = self._guess - if guess in golds: - self.model.update({}) - return 0 - best_gold = golds[0] - best_score = self._scores[best_gold-1] - for gold in golds[1:]: - if self._scores[gold-1] > best_gold: - best_score = self._scores[best_gold-1] - best_gold = gold - counts = {guess: {}, best_gold: {}} - self.extractor.count(counts[best_gold], self._feats, 1) - self.extractor.count(counts[guess], self._feats, -1) - self.model.update(counts) + cdef int n_feats + cdef atom_t[N_FIELDS] context + print sizeof(context) + fill_context(context, i, tokens.data) + cdef Feature* feats = self.extractor.get_feats(context, &n_feats) + cdef weight_t* scores = self.model.get_scores(feats, n_feats) + cdef class_t guess = _arg_max(scores, self.nr_class) + if golds is not None and guess not in golds: + best = _arg_max_among(scores, golds) + counts = {} + count_feats(counts[guess], feats, n_feats, -1) + count_feats(counts[best], feats, n_feats, 1) + self.model.update(counts) + return guess def tag_id(self, object tag_name): """Encode tag_name into a tag ID integer.""" @@ -167,3 +143,25 @@ cdef class Tagger: tag_id = len(self.tag_names) self.tag_names.append(tag_name) return tag_id + + +cdef class_t _arg_max(weight_t* scores, int n_classes): + cdef int best = 0 + cdef weight_t score = scores[best] + cdef int i + for i in range(1, n_classes): + if scores[i] > score: + score = scores[i] + best = i + return best + + +cdef class_t _arg_max_among(weight_t* scores, list classes): + cdef int best = classes[0] + cdef weight_t score = scores[best] + cdef class_t clas + for clas in classes: + if scores[clas] > score: + score = scores[clas] + best = clas + return best From 5caabec789a767a6d9876fc4b79f3c302afceb0a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 7 Dec 2014 15:29:41 +1100 Subject: [PATCH 28/56] * Link in tagger, to work on integrating POS tagging --- spacy/lang.pxd | 2 ++ spacy/lang.pyx | 2 ++ 2 files changed, 4 insertions(+) diff --git a/spacy/lang.pxd b/spacy/lang.pxd index fd4cf6e70..54f317ce8 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -8,6 +8,7 @@ from cymem.cymem cimport Pool from .typedefs cimport hash_t from .tokens cimport Tokens, TokenC from .lexeme cimport Lexeme +from .tagger cimport Tagger from .utf8string cimport StringStore, UniStr @@ -29,6 +30,7 @@ cdef class Language: cdef PreshMap _cache cdef PreshMap _specials cpdef readonly Lexicon lexicon + cpdef readonly Tagger pos_tagger cdef object _prefix_re cdef object _suffix_re diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 1fdd683f3..0ca5f08d2 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -39,10 +39,12 @@ cdef class Language: self._infix_re = re.compile(infix) self.lexicon = Lexicon(self.set_flags) self._load_special_tokenization(rules) + self.pos_tagger = None def load(self): self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes')) self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings')) + self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos')) cpdef Tokens tokens_from_list(self, list strings): cdef int length = sum([len(s) for s in strings]) From 91e8d9ea1c89da2cc2e339213771d6622c8ad3fc Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 7 Dec 2014 15:29:54 +1100 Subject: [PATCH 29/56] * Compile context.pyx and tagger.pyx modules --- setup.py | 6 +++--- 1 file 
changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 35c411d38..1199ba613 100644 --- a/setup.py +++ b/setup.py @@ -53,9 +53,9 @@ exts = [ Extension("spacy.en", ["spacy/en.pyx"], language="c++", include_dirs=includes), Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes), Extension("spacy.utf8string", ["spacy/utf8string.pyx"], language="c++", include_dirs=includes), - Extension("spacy.index", ["spacy/index.pyx"], language="c++", include_dirs=includes) - #Extension("spacy.tagger", ["spacy/tagger.pyx"], language="c++", include_dirs=includes), - #Extension("spacy.context", ["spacy/context.pyx"], language="c++", include_dirs=includes), + Extension("spacy.index", ["spacy/index.pyx"], language="c++", include_dirs=includes), + Extension("spacy.tagger", ["spacy/tagger.pyx"], language="c++", include_dirs=includes), + Extension("spacy.context", ["spacy/context.pyx"], language="c++", include_dirs=includes), #Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes), #Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes), #Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes), From 5fe5e6e66b3d3770214264671bf2c46315abd1c1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 7 Dec 2014 21:59:04 +1100 Subject: [PATCH 30/56] * Move context functions to header, inlining them. --- spacy/context.pxd | 17 ++++++++++++++++- spacy/context.pyx | 15 --------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/spacy/context.pxd b/spacy/context.pxd index 3dd842b6e..3c7764846 100644 --- a/spacy/context.pxd +++ b/spacy/context.pxd @@ -46,4 +46,19 @@ cpdef enum: N_FIELDS -cdef int fill_context(atom_t[N_FIELDS] context, const int i, TokenC* tokens) except -1 +cdef inline void fill_context(atom_t* context, const int i, const TokenC* tokens) nogil: + _fill_from_token(&context[P2_sic], &tokens[i-2]) + _fill_from_token(&context[P1_sic], &tokens[i-1]) + _fill_from_token(&context[W_sic], &tokens[i]) + _fill_from_token(&context[N1_sic], &tokens[i+1]) + _fill_from_token(&context[N2_sic], &tokens[i+2]) + + +cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: + context[0] = t.lex.sic + context[1] = t.lex.cluster + context[2] = t.lex.shape + context[3] = t.lex.prefix + context[4] = t.lex.suffix + context[5] = t.pos + context[6] = t.sense diff --git a/spacy/context.pyx b/spacy/context.pyx index c81daef2c..8b1378917 100644 --- a/spacy/context.pyx +++ b/spacy/context.pyx @@ -1,16 +1 @@ -cdef int fill_context(atom_t[N_FIELDS] context, const int i, TokenC* tokens) except -1: - _fill_from_token(&context[P2_sic], &tokens[i-2]) - _fill_from_token(&context[P1_sic], &tokens[i-1]) - _fill_from_token(&context[W_sic], &tokens[i]) - _fill_from_token(&context[N1_sic], &tokens[i+1]) - _fill_from_token(&context[N2_sic], &tokens[i+2]) - -cdef inline void _fill_from_token(atom_t[N_FIELDS] context, const TokenC* t) nogil: - context[0] = t.lex.sic - context[1] = t.lex.cluster - context[2] = t.lex.shape - context[3] = t.lex.prefix - context[4] = t.lex.suffix - context[5] = t.pos - context[6] = t.sense From 677e111ee7c033cb4cb9a2b1cc41dd282c3a74a8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 7 Dec 2014 22:04:47 +1100 Subject: [PATCH 31/56] * Revise tokenization rules to match PTB. Rules are pretty messy around periods, need better support for these. 
--- data/en/prefix | 5 ++ data/en/tokenization | 182 ++++++++++++++++++++++++------------------- 2 files changed, 109 insertions(+), 78 deletions(-) diff --git a/data/en/prefix b/data/en/prefix index 64a3f1f2f..cb9bb4d7b 100644 --- a/data/en/prefix +++ b/data/en/prefix @@ -11,3 +11,8 @@ $ ' `` ` +# +US$ +C$ +A$ +a- diff --git a/data/en/tokenization b/data/en/tokenization index 6bf0d738b..e2b78dd28 100644 --- a/data/en/tokenization +++ b/data/en/tokenization @@ -6,99 +6,100 @@ 's 's 'S 'S -ain't are not -aren't are not -can't can not +ain't ai n't +aren't are n't +can't ca n't cannot can not -could've could have -couldn't could not -couldn't've could not have -didn't did not -doesn't does not -don't do not -hadn't had not -hadn't've had not have -hasn't has not -haven't have not -he'd he would -he'd've he would have -he'll he will +could've could 've +couldn't could n't +couldn't've could n't 've +didn't did n't +doesn't does n't +don't do n't +hadn't had n't +hadn't've had n't 've +hasn't has n't +haven't have n't +he'd he 'd +he'd've he 'd 've +he'll he 'll he's he 's -how'd he would -how'll he will +how'd he 'd +how'll he 'll how's how 's -I'd I would -I'd've I would have -I'll I will -I'm I am -I'ma I will -I've I have -isn't is not -it'd it would -it'd've it would have -it'll it will +I'd I 'd +I'd've I 'd 've +I'll I 'll +I'm I 'm +I'ma I 'ma +I've I 've +isn't is n't +it'd it 'd +it'd've it 'd 've +it'll it 'll it's it 's let's let 's -mightn't might not -mightn't've might not have -might've might have -mustn't must not -must've must have -needn't need not -not've not have -shan't shall not -she'd she would -she'd've she would have +mightn't might n't +mightn't've might n't 've +might've might 've +mustn't must n't +must've must 've +needn't need n't +not've not 've +shan't sha n't +she'd she 'd +she'd've she 'd 've she'll she will she's she 's -should've should have -shouldn't should not -shouldn't've should not have +should've should 've +shouldn't should n't +shouldn't've should n't 've that's that 's -there'd there would -there'd've there would have -there's there is -they'd there would -they'd've they would have -they'll they will -they're they are -they've they have -wasn't was not -we'd we would -we'd've we would have -we'll we will -we're we are -we've we have -weren't were not -what'll what will -what're what are +there'd there 'd +there'd've there 'd 've +there's there 's +they'd there 'd +they'd've they 'd 've +they'll they 'll +they're they 're +they've they 've +wasn't was n't +we'd we 'd +we'd've we 'd 've +we'll we 'll +we're we 're +we've we 've +weren't were n't +what'll what 'll +what're what 're what's what 's -what've what have +what've what 've when's when 's -where'd where would +where'd where 'd where's where 's -where've where have -who'd who would -who'll who will -who're who are +where've where 've +who'd who 'd +who'll who 'll +who're who 're who's who 's -who've who have -why'll who will -why're why are +who've who 've +why'll why 'll +why're why 're why's why 's -won't will not -would've would have -wouldn't would not -wouldn't've would not have -you'd you would -you'd've you would have -you'll you will -you're you are -you've you have -'em them -'ol old +won't wo n't +would've would 've +wouldn't would n't +wouldn't've would n't 've +you'd you 'd +you'd've you 'd 've +you'll you 'll +you're you 're +you've you 've +'em 'em +'ol 'ol 10km 10 km U.S. U.S. +U.K. U.K. non-U.S. non-U.S. U.N. U.N. Co. Co. @@ -115,7 +116,12 @@ A.G. A.G. Rep. Rep. Ms. Ms. Mr. Mr. 
+Mrs. Mrs. a.m. a.m. +Sen. Sen. +INC. INC. +CO. CO. +COS. COS. p.m. p.m. Nos. Nos. a.k.a. a.k.a. @@ -127,6 +133,7 @@ E. E. F. F. G. G. H. H. +I. I. J. J. K. K. L. L. @@ -205,6 +212,9 @@ Wash. Wash. W.Va. W.Va. Wis. Wis. Wyo. Wyo. +L.A. L.A. +R.H. R.H. +Gov. Gov. '' '' :) :) <3 <3 @@ -262,3 +272,19 @@ V_V V_V o.O o.O ") ") .... .... +a- a - +Messrs. Messrs. +No. No. +vs. vs. +Gen. Gen. +Cos. Cos. +L.J. L.J. +D.T. D.T. +Prof. Prof. +Bros. Bros. +J.C. J.C. +Neb. Neb. +Adm. Adm. +U.S.S.R. U.S.S.R. +Rev. Rev. +H.F. H.F. From f00afe12c4af10d49f64d3bbc32734b94eb09df1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 7 Dec 2014 22:05:57 +1100 Subject: [PATCH 32/56] * Load POS tagger in load() function if path exists --- spacy/lang.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 0ca5f08d2..d0f5e6944 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -44,7 +44,8 @@ cdef class Language: def load(self): self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes')) self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings')) - self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos')) + if path.exists(path.join(util.DATA_DIR, self.name, 'pos')): + self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos')) cpdef Tokens tokens_from_list(self, list strings): cdef int length = sum([len(s) for s in strings]) From 3819a88e1b313d868c60ab4815fca1c4e5eefd76 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 7 Dec 2014 22:07:16 +1100 Subject: [PATCH 33/56] * Add support for tag dictionary, and fix error-code for predict method --- spacy/tagger.pxd | 4 +++- spacy/tagger.pyx | 55 ++++++++++++++++++++++++++++++------------------ 2 files changed, 38 insertions(+), 21 deletions(-) diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd index 0a9b4a0c4..772086926 100644 --- a/spacy/tagger.pxd +++ b/spacy/tagger.pxd @@ -3,6 +3,7 @@ from cymem.cymem cimport Pool from thinc.learner cimport LinearModel from thinc.features cimport Extractor from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t +from preshed.maps cimport PreshMap from .typedefs cimport hash_t from .tokens cimport Tokens @@ -15,7 +16,7 @@ cpdef enum TagType: cdef class Tagger: cpdef int set_tags(self, Tokens tokens) except -1 - cpdef class_t predict(self, int i, Tokens tokens, object golds=*) except 0 + cpdef class_t predict(self, int i, Tokens tokens, object golds=*) except * cpdef readonly Pool mem cpdef readonly Extractor extractor @@ -23,3 +24,4 @@ cdef class Tagger: cpdef readonly TagType tag_type cpdef readonly list tag_names + cdef dict tagdict diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 22732843d..0ae66a844 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -18,7 +18,7 @@ from thinc.features cimport Feature, count_feats NULL_TAG = 0 -def setup_model_dir(tag_type, tag_names, templates, model_dir): +def setup_model_dir(tag_type, tag_names, tag_counts, templates, model_dir): if path.exists(model_dir): shutil.rmtree(model_dir) os.mkdir(model_dir) @@ -26,6 +26,7 @@ def setup_model_dir(tag_type, tag_names, templates, model_dir): 'tag_type': tag_type, 'templates': templates, 'tag_names': tag_names, + 'tag_counts': tag_counts, } with open(path.join(model_dir, 'config.json'), 'w') as file_: json.dump(config, file_) @@ -35,24 +36,19 @@ def train(train_sents, model_dir, nr_iter=10): cdef Tokens tokens cdef Tagger tagger = Tagger(model_dir) cdef int i + cdef class_t guess = 0 + cdef class_t gold for _ in range(nr_iter): n_corr = 0 
total = 0 for tokens, golds in train_sents: assert len(tokens) == len(golds), [t.string for t in tokens] for i in range(tokens.length): - if tagger.tag_type == POS: - gold = _get_gold_pos(i, golds) - else: - raise StandardError - - guess = tagger.predict(i, tokens) + gold = golds[i] + guess = tagger.predict(i, tokens, [gold]) tokens.set_tag(i, tagger.tag_type, guess) - if gold is not None: - tagger.tell_answer(gold) - total += 1 - n_corr += guess in gold - #print('%s\t%d\t%d' % (tokens[i].string, guess, gold)) + total += 1 + n_corr += guess == gold print('%.4f' % ((n_corr / total) * 100)) random.shuffle(train_sents) tagger.model.end_training() @@ -96,8 +92,9 @@ cdef class Tagger: templates = cfg['templates'] self.tag_names = cfg['tag_names'] self.tag_type = cfg['tag_type'] + self.tagdict = _make_tag_dict(cfg['tag_counts']) self.extractor = Extractor(templates) - self.model = LinearModel(len(self.tag_names)) + self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2) if path.exists(path.join(model_dir, 'model')): self.model.load(path.join(model_dir, 'model')) @@ -113,7 +110,7 @@ cdef class Tagger: for i in range(tokens.length): tokens.set_tag(i, self.tag_type, self.predict(i, tokens)) - cpdef class_t predict(self, int i, Tokens tokens, object golds=None) except 0: + cpdef class_t predict(self, int i, Tokens tokens, object golds=None) except *: """Predict the tag of tokens[i]. The tagger remembers the features and prediction, in case you later call tell_answer. @@ -121,16 +118,18 @@ cdef class Tagger: >>> tag = EN.pos_tagger.predict(0, tokens) >>> assert tag == EN.pos_tagger.tag_id('DT') == 5 """ - cdef int n_feats + cdef atom_t sic = tokens.data[i].lex.sic + if sic in self.tagdict: + return self.tagdict[sic] cdef atom_t[N_FIELDS] context - print sizeof(context) fill_context(context, i, tokens.data) + cdef int n_feats cdef Feature* feats = self.extractor.get_feats(context, &n_feats) cdef weight_t* scores = self.model.get_scores(feats, n_feats) - cdef class_t guess = _arg_max(scores, self.nr_class) + guess = _arg_max(scores, self.model.nr_class) if golds is not None and guess not in golds: best = _arg_max_among(scores, golds) - counts = {} + counts = {guess: {}, best: {}} count_feats(counts[guess], feats, n_feats, -1) count_feats(counts[best], feats, n_feats, 1) self.model.update(counts) @@ -145,12 +144,28 @@ cdef class Tagger: return tag_id -cdef class_t _arg_max(weight_t* scores, int n_classes): +def _make_tag_dict(counts): + freq_thresh = 50 + ambiguity_thresh = 0.98 + tagdict = {} + cdef atom_t word + cdef atom_t tag + for word_str, tag_freqs in counts.items(): + tag_str, mode = max(tag_freqs.items(), key=lambda item: item[1]) + n = sum(tag_freqs.values()) + word = int(word_str) + tag = int(tag_str) + if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh: + tagdict[word] = tag + return tagdict + + +cdef class_t _arg_max(weight_t* scores, int n_classes) except 9000: cdef int best = 0 cdef weight_t score = scores[best] cdef int i for i in range(1, n_classes): - if scores[i] > score: + if scores[i] >= score: score = scores[i] best = i return best From 9f17467c2e1a1d14d775e3da90aa1d668cbe7d15 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 7 Dec 2014 22:07:41 +1100 Subject: [PATCH 34/56] * Fix EMPTY_TOKEN --- spacy/tokens.pxd | 3 --- spacy/tokens.pyx | 14 ++++++++------ 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index addb1e3e5..e6bc0a46a 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -16,9 +16,6 
@@ cdef struct TokenC: int sense -cdef TokenC EMPTY_TOKEN = TokenC(&EMPTY_LEXEME, 0, 0, 0) - - cdef class Tokens: cdef Pool mem cdef StringStore _string_store diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index b474ff6fb..407ffcb8b 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -43,7 +43,7 @@ cdef class Tokens: data_start = self.mem.alloc(size + (PADDING*2), sizeof(TokenC)) cdef int i for i in range(size + (PADDING*2)): - data_start[i] = EMPTY_TOKEN + data_start[i].lex = &EMPTY_LEXEME self.data = data_start + PADDING self.max_length = size self.length = 0 @@ -86,10 +86,7 @@ cdef class Tokens: return idx cpdef int set_tag(self, int i, int tag_type, int tag) except -1: - if tag_type == POS: - self.pos[i] = tag - elif tag_type == ENTITY: - self.ner[i] = tag + self.data[i].pos = tag @cython.boundscheck(False) cpdef np.ndarray[long, ndim=2] get_array(self, list attr_ids): @@ -116,12 +113,17 @@ cdef class Tokens: def _realloc(self, new_size): self.max_length = new_size n = new_size + (PADDING * 2) + # What we're storing is a "padded" array. We've jumped forward PADDING + # places, and are storing the pointer to that. This way, we can access + # words out-of-bounds, and get out-of-bounds markers. + # Now that we want to realloc, we need the address of the true start, + # so we jump the pointer back PADDING places. cdef TokenC* data_start = self.data - PADDING data_start = self.mem.realloc(data_start, n * sizeof(TokenC)) self.data = data_start + PADDING cdef int i for i in range(self.length, self.max_length + PADDING): - self.data[i] = EMPTY_TOKEN + self.data[i].lex = &EMPTY_LEXEME @cython.freelist(64) From 8f2f319c57799f5bd9cad34c6dd4ee29c56e2713 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 7 Dec 2014 22:08:04 +1100 Subject: [PATCH 35/56] * Add a couple more contractions tests --- tests/test_contractions.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_contractions.py b/tests/test_contractions.py index b7347a617..8334a74a9 100644 --- a/tests/test_contractions.py +++ b/tests/test_contractions.py @@ -39,3 +39,10 @@ def test_capitalized(): tokens = EN.tokenize("Ain't") assert len(tokens) == 2 assert tokens[0].string == "Are" + + +def test_punct(): + tokens = EN.tokenize("We've") + assert len(tokens) == 2 + tokens = EN.tokenize("``We've") + assert len(tokens) == 3 From 327383e38afb810f6afc3cd185444225f0368074 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 7 Dec 2014 22:14:51 +1100 Subject: [PATCH 36/56] * Remove unused code in tagger.pyx --- spacy/tagger.pyx | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 0ae66a844..04ffef550 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -15,9 +15,6 @@ import cython from thinc.features cimport Feature, count_feats -NULL_TAG = 0 - - def setup_model_dir(tag_type, tag_names, tag_counts, templates, model_dir): if path.exists(model_dir): shutil.rmtree(model_dir) @@ -55,33 +52,6 @@ def train(train_sents, model_dir, nr_iter=10): tagger.model.dump(path.join(model_dir, 'model')) -cdef object _get_gold_pos(i, golds): - if golds[i] == 0: - return None - else: - return [golds[i]] - - -cdef object _get_gold_ner(i, golds, int* ner): - if golds[i] == 0: - return None - else: - return [golds[i]] - - -def evaluate(tagger, sents): - n_corr = 0 - total = 0 - for tokens, golds in sents: - for i, gold in enumerate(golds): - guess = tagger.predict(i, tokens) - tokens.set_tag(i, tagger.tag_type, guess) - if gold != NULL_TAG: - total += 1 - n_corr 
+= guess == gold - return n_corr / total - - cdef class Tagger: """Assign part-of-speech, named entity or supersense tags, using greedy decoding. The tagger reads its model and configuration from disk. From ef4398b204ab60e4ff3c9bf476a059ad07be282d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 7 Dec 2014 23:52:41 +1100 Subject: [PATCH 37/56] * Rearrange POS stuff, so that language-specific stuff can live in language-specific modules --- setup.py | 1 - spacy/context.pxd | 63 ------------------------------------------ spacy/en.pxd | 70 +++++++++++++++++++++++++++++++++++++++++++++-- spacy/en.pyx | 51 ++++++++++++++++++++++++++++++---- spacy/lang.pxd | 2 +- spacy/lang.pyx | 11 ++++---- spacy/lexeme.pxd | 13 --------- spacy/lexeme.pyx | 7 +---- spacy/tagger.pxd | 10 +------ spacy/tagger.pyx | 49 ++------------------------------- spacy/tokens.pyx | 4 +-- 11 files changed, 127 insertions(+), 154 deletions(-) diff --git a/setup.py b/setup.py index 1199ba613..6ff1f5d62 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,6 @@ exts = [ Extension("spacy.utf8string", ["spacy/utf8string.pyx"], language="c++", include_dirs=includes), Extension("spacy.index", ["spacy/index.pyx"], language="c++", include_dirs=includes), Extension("spacy.tagger", ["spacy/tagger.pyx"], language="c++", include_dirs=includes), - Extension("spacy.context", ["spacy/context.pyx"], language="c++", include_dirs=includes), #Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes), #Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes), #Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes), diff --git a/spacy/context.pxd b/spacy/context.pxd index 3c7764846..8b1378917 100644 --- a/spacy/context.pxd +++ b/spacy/context.pxd @@ -1,64 +1 @@ -from thinc.typedefs cimport atom_t -from .tokens cimport TokenC - -cpdef enum: - P2_sic - P2_cluster - P2_shape - P2_prefix - P2_suffix - P2_pos - P2_sense - - P1_sic - P1_cluster - P1_shape - P1_prefix - P1_suffix - P1_pos - P1_sense - - W_sic - W_cluster - W_shape - W_prefix - W_suffix - W_pos - W_sense - - N1_sic - N1_cluster - N1_shape - N1_prefix - N1_suffix - N1_pos - N1_sense - - N2_sic - N2_cluster - N2_shape - N2_prefix - N2_suffix - N2_pos - N2_sense - - N_FIELDS - - -cdef inline void fill_context(atom_t* context, const int i, const TokenC* tokens) nogil: - _fill_from_token(&context[P2_sic], &tokens[i-2]) - _fill_from_token(&context[P1_sic], &tokens[i-1]) - _fill_from_token(&context[W_sic], &tokens[i]) - _fill_from_token(&context[N1_sic], &tokens[i+1]) - _fill_from_token(&context[N2_sic], &tokens[i+2]) - - -cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: - context[0] = t.lex.sic - context[1] = t.lex.cluster - context[2] = t.lex.shape - context[3] = t.lex.prefix - context[4] = t.lex.suffix - context[5] = t.pos - context[6] = t.sense diff --git a/spacy/en.pxd b/spacy/en.pxd index cccfb60a8..8ce023106 100644 --- a/spacy/en.pxd +++ b/spacy/en.pxd @@ -1,5 +1,9 @@ -from spacy.lang cimport Language -from spacy.tokens cimport Tokens +from thinc.typedefs cimport atom_t + +from .lang cimport Language +from .tokens cimport Tokens +from .tokens cimport TokenC + # Flags cpdef enum FlagID: @@ -28,5 +32,67 @@ cpdef enum FlagID: IN_NAMES +cpdef enum: + P2_sic + P2_cluster + P2_shape + P2_prefix + P2_suffix + P2_pos + P2_sense + + P1_sic + P1_cluster + P1_shape + P1_prefix + P1_suffix + P1_pos + P1_sense + + W_sic + W_cluster + W_shape + 
W_prefix + W_suffix + W_pos + W_sense + + N1_sic + N1_cluster + N1_shape + N1_prefix + N1_suffix + N1_pos + N1_sense + + N2_sic + N2_cluster + N2_shape + N2_prefix + N2_suffix + N2_pos + N2_sense + + N_CONTEXT_FIELDS + + +cdef inline void fill_pos_context(atom_t* context, const int i, const TokenC* tokens) nogil: + _fill_from_token(&context[P2_sic], &tokens[i-2]) + _fill_from_token(&context[P1_sic], &tokens[i-1]) + _fill_from_token(&context[W_sic], &tokens[i]) + _fill_from_token(&context[N1_sic], &tokens[i+1]) + _fill_from_token(&context[N2_sic], &tokens[i+2]) + + +cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: + context[0] = t.lex.sic + context[1] = t.lex.cluster + context[2] = t.lex.shape + context[3] = t.lex.prefix + context[4] = t.lex.suffix + context[5] = t.pos + context[6] = t.sense + + cdef class English(Language): pass diff --git a/spacy/en.pyx b/spacy/en.pyx index 92be97aad..c0eb0368b 100644 --- a/spacy/en.pyx +++ b/spacy/en.pyx @@ -30,11 +30,6 @@ same scheme. Tokenization problems are a major cause of poor performance for NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module provides a fully Penn Treebank 3-compliant tokenizer. ''' -# TODO -#The script translate_treebank_tokenization can be used to transform a treebank's -#annotation to use one of the spacy tokenization schemes. - - from __future__ import unicode_literals cimport lang @@ -42,6 +37,32 @@ from .typedefs cimport flags_t import orth +POS_TEMPLATES = ( + (W_sic,), + (P1_sic,), + (N1_sic,), + (N2_sic,), + (P2_sic,), + + (W_suffix,), + (W_prefix,), + + (P1_pos,), + (P2_pos,), + (P1_pos, P2_pos), + (P1_pos, W_sic), + (P1_suffix,), + (N1_suffix,), + + (W_shape,), + (W_cluster,), + (N1_cluster,), + (N2_cluster,), + (P1_cluster,), + (P2_cluster,), +) + + cdef class English(Language): """English tokenizer, tightly coupled to lexicon. @@ -49,6 +70,9 @@ cdef class English(Language): name (unicode): The two letter code used by Wikipedia for the language. lexicon (Lexicon): The lexicon. Exposes the lookup method. 
""" + def get_props(self, unicode string): + return {'flags': self.set_flags(string), 'dense': orth.word_shape(string)} + def set_flags(self, unicode string): cdef flags_t flags = 0 flags |= orth.is_alpha(string) << IS_ALPHA @@ -64,5 +88,22 @@ cdef class English(Language): flags |= orth.like_number(string) << LIKE_NUMBER return flags + def set_pos(self, Tokens tokens): + cdef int i + cdef atom_t[N_CONTEXT_FIELDS] context + for i in range(tokens.length): + fill_pos_context(context, i, tokens.data) + tokens.data[i].pos = self.pos_tagger.predict(context) + + def train_pos(self, Tokens tokens, golds): + cdef int i + cdef atom_t[N_CONTEXT_FIELDS] context + c = 0 + for i in range(tokens.length): + fill_pos_context(context, i, tokens.data) + tokens.data[i].pos = self.pos_tagger.predict(context, [golds[i]]) + c += tokens.data[i].pos == golds[i] + return c + EN = English('en') diff --git a/spacy/lang.pxd b/spacy/lang.pxd index 54f317ce8..20374f40d 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -13,7 +13,7 @@ from .utf8string cimport StringStore, UniStr cdef class Lexicon: - cpdef public set_flags + cpdef public get_lex_props cdef Pool mem cpdef readonly size_t size cpdef readonly StringStore strings diff --git a/spacy/lang.pyx b/spacy/lang.pyx index d0f5e6944..496c6742c 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -37,7 +37,7 @@ cdef class Language: self._prefix_re = re.compile(prefix) self._suffix_re = re.compile(suffix) self._infix_re = re.compile(infix) - self.lexicon = Lexicon(self.set_flags) + self.lexicon = Lexicon(self.get_props) self._load_special_tokenization(rules) self.pos_tagger = None @@ -249,13 +249,13 @@ cdef class Lexicon: Also interns UTF-8 strings, and maps them to consecutive integer IDs. ''' - def __init__(self, object set_flags=None): + def __init__(self, object get_props): self.mem = Pool() self._map = PreshMap(2 ** 20) self.strings = StringStore() self.lexemes.push_back(&EMPTY_LEXEME) self.size = 2 - self.set_flags = set_flags + self.get_lex_props = get_props cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL: '''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme @@ -267,9 +267,10 @@ cdef class Lexicon: return lex if string.n < 3: mem = self.mem + cdef unicode py_string = string.chars[:string.n] lex = mem.alloc(sizeof(Lexeme), 1) - lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key, - self.strings, {'flags': self.set_flags(string.chars[:string.n])}) + lex[0] = lexeme_init(self.size, py_string, string.key, self.strings, + self.get_lex_props(py_string)) if mem is self.mem: self._map.set(string.key, lex) while self.lexemes.size() < (lex.id + 1): diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index e35bde61e..f524188ed 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -72,17 +72,14 @@ cpdef enum attr_id_t: ID SIC - STEM DENSE SHAPE - ASCIIED PREFIX SUFFIX LENGTH CLUSTER POS_TYPE - SENSE_TYPE cdef struct Lexeme: @@ -90,20 +87,16 @@ cdef struct Lexeme: attr_t id attr_t sic - attr_t stem attr_t dense attr_t shape - attr_t asciied attr_t prefix attr_t suffix attr_t length attr_t cluster attr_t pos_type - attr_t sense_type float prob - float lower_pc float sentiment @@ -127,12 +120,8 @@ cdef inline attr_t get_attr(const Lexeme* lex, attr_id_t feat_name) nogil: return lex.sic elif feat_name == DENSE: return lex.dense - elif feat_name == STEM: - return lex.stem elif feat_name == SHAPE: return lex.shape - elif feat_name == ASCIIED: - return lex.asciied elif feat_name == PREFIX: return lex.prefix elif feat_name == SUFFIX: 
@@ -143,7 +132,5 @@ cdef inline attr_t get_attr(const Lexeme* lex, attr_id_t feat_name) nogil: return lex.cluster elif feat_name == POS_TYPE: return lex.pos_type - elif feat_name == SENSE_TYPE: - return lex.sense_type else: return 0 diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index cd92c4845..f1974cbc9 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -19,17 +19,12 @@ cpdef Lexeme init(id_t i, unicode string, hash_t hashed, lex.cluster = props.get('cluster', 0) lex.pos_type = props.get('pos_type', 0) - lex.sense_type = props.get('sense_type', 0) lex.prob = props.get('prob', 0) - lex.lower_pc = props.get('lower_pc', 0.0) - lex.prefix = string_store[string[:1]] lex.suffix = string_store[string[-3:]] lex.shape = string_store[orth.word_shape(string)] - lex.dense = lex.sic if lex.prob >= -10 else lex.shape - lex.stem = string_store[props.get('stem', string)] - lex.asciied = string_store[orth.asciied(string)] + lex.dense = string_store[props['dense']] lex.flags = props.get('flags', 0) return lex diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd index 772086926..39ba7ed41 100644 --- a/spacy/tagger.pxd +++ b/spacy/tagger.pxd @@ -3,25 +3,17 @@ from cymem.cymem cimport Pool from thinc.learner cimport LinearModel from thinc.features cimport Extractor from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t -from preshed.maps cimport PreshMap from .typedefs cimport hash_t from .tokens cimport Tokens -cpdef enum TagType: - POS - SENSE - - cdef class Tagger: - cpdef int set_tags(self, Tokens tokens) except -1 - cpdef class_t predict(self, int i, Tokens tokens, object golds=*) except * + cdef class_t predict(self, atom_t* context, object golds=*) except * cpdef readonly Pool mem cpdef readonly Extractor extractor cpdef readonly LinearModel model - cpdef readonly TagType tag_type cpdef readonly list tag_names cdef dict tagdict diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 04ffef550..e0cd0bf3b 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -2,9 +2,6 @@ from __future__ import unicode_literals from __future__ import division -from .context cimport fill_context -from .context cimport N_FIELDS - from os import path import os import shutil @@ -15,12 +12,11 @@ import cython from thinc.features cimport Feature, count_feats -def setup_model_dir(tag_type, tag_names, tag_counts, templates, model_dir): +def setup_model_dir(tag_names, tag_counts, templates, model_dir): if path.exists(model_dir): shutil.rmtree(model_dir) os.mkdir(model_dir) config = { - 'tag_type': tag_type, 'templates': templates, 'tag_names': tag_names, 'tag_counts': tag_counts, @@ -29,29 +25,6 @@ def setup_model_dir(tag_type, tag_names, tag_counts, templates, model_dir): json.dump(config, file_) -def train(train_sents, model_dir, nr_iter=10): - cdef Tokens tokens - cdef Tagger tagger = Tagger(model_dir) - cdef int i - cdef class_t guess = 0 - cdef class_t gold - for _ in range(nr_iter): - n_corr = 0 - total = 0 - for tokens, golds in train_sents: - assert len(tokens) == len(golds), [t.string for t in tokens] - for i in range(tokens.length): - gold = golds[i] - guess = tagger.predict(i, tokens, [gold]) - tokens.set_tag(i, tagger.tag_type, guess) - total += 1 - n_corr += guess == gold - print('%.4f' % ((n_corr / total) * 100)) - random.shuffle(train_sents) - tagger.model.end_training() - tagger.model.dump(path.join(model_dir, 'model')) - - cdef class Tagger: """Assign part-of-speech, named entity or supersense tags, using greedy decoding. The tagger reads its model and configuration from disk. 
@@ -61,26 +34,13 @@ cdef class Tagger: cfg = json.load(open(path.join(model_dir, 'config.json'))) templates = cfg['templates'] self.tag_names = cfg['tag_names'] - self.tag_type = cfg['tag_type'] self.tagdict = _make_tag_dict(cfg['tag_counts']) self.extractor = Extractor(templates) self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2) if path.exists(path.join(model_dir, 'model')): self.model.load(path.join(model_dir, 'model')) - cpdef int set_tags(self, Tokens tokens) except -1: - """Assign tags to a Tokens object. - - >>> tokens = EN.tokenize(u'An example sentence.') - >>> assert tokens[0].pos == 'NO_TAG' - >>> EN.pos_tagger.set_tags(tokens) - >>> assert tokens[0].pos == 'DT' - """ - cdef int i - for i in range(tokens.length): - tokens.set_tag(i, self.tag_type, self.predict(i, tokens)) - - cpdef class_t predict(self, int i, Tokens tokens, object golds=None) except *: + cdef class_t predict(self, atom_t* context, object golds=None) except *: """Predict the tag of tokens[i]. The tagger remembers the features and prediction, in case you later call tell_answer. @@ -88,11 +48,6 @@ cdef class Tagger: >>> tag = EN.pos_tagger.predict(0, tokens) >>> assert tag == EN.pos_tagger.tag_id('DT') == 5 """ - cdef atom_t sic = tokens.data[i].lex.sic - if sic in self.tagdict: - return self.tagdict[sic] - cdef atom_t[N_FIELDS] context - fill_context(context, i, tokens.data) cdef int n_feats cdef Feature* feats = self.extractor.get_feats(context, &n_feats) cdef weight_t* scores = self.model.get_scores(feats, n_feats) diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 407ffcb8b..33f265eef 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -140,11 +140,11 @@ cdef class Token: self.cluster = lex['cluster'] self.length = lex['length'] self.postype = lex['pos_type'] - self.sensetype = lex['sense_type'] + self.sensetype = 0 self.sic = lex['sic'] self.norm = lex['dense'] self.shape = lex['shape'] - self.suffix = lex['asciied'] + self.suffix = lex['suffix'] self.prefix = lex['prefix'] self.prob = lex['prob'] From b031c7c4306ed348dc371ea28f3e4c2759845fcb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 7 Dec 2014 23:53:01 +1100 Subject: [PATCH 38/56] * Remove language-general context module --- spacy/context.pxd | 1 - spacy/context.pyx | 1 - 2 files changed, 2 deletions(-) delete mode 100644 spacy/context.pxd delete mode 100644 spacy/context.pyx diff --git a/spacy/context.pxd b/spacy/context.pxd deleted file mode 100644 index 8b1378917..000000000 --- a/spacy/context.pxd +++ /dev/null @@ -1 +0,0 @@ - diff --git a/spacy/context.pyx b/spacy/context.pyx deleted file mode 100644 index 8b1378917..000000000 --- a/spacy/context.pyx +++ /dev/null @@ -1 +0,0 @@ - From c20dd79748e928384722df113473d207b26a7893 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 8 Dec 2014 00:03:55 +1100 Subject: [PATCH 39/56] * Fiddle with const correctness and comments --- spacy/tagger.pxd | 2 +- spacy/tagger.pyx | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd index 39ba7ed41..f91bbeb0a 100644 --- a/spacy/tagger.pxd +++ b/spacy/tagger.pxd @@ -9,7 +9,7 @@ from .tokens cimport Tokens cdef class Tagger: - cdef class_t predict(self, atom_t* context, object golds=*) except * + cdef class_t predict(self, const atom_t* context, object golds=*) except * cpdef readonly Pool mem cpdef readonly Extractor extractor diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index e0cd0bf3b..22ec3896a 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -26,8 
+26,8 @@ def setup_model_dir(tag_names, tag_counts, templates, model_dir): cdef class Tagger: - """Assign part-of-speech, named entity or supersense tags, using greedy - decoding. The tagger reads its model and configuration from disk. + """Predict some type of tag, using greedy decoding. The tagger reads its + model and configuration from disk. """ def __init__(self, model_dir): self.mem = Pool() @@ -40,7 +40,7 @@ cdef class Tagger: if path.exists(path.join(model_dir, 'model')): self.model.load(path.join(model_dir, 'model')) - cdef class_t predict(self, atom_t* context, object golds=None) except *: + cdef class_t predict(self, const atom_t* context, object golds=None) except *: """Predict the tag of tokens[i]. The tagger remembers the features and prediction, in case you later call tell_answer. From 7b68f911cf882d5f2694eb7ea26eddf37b9c9070 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 8 Dec 2014 01:39:13 +1100 Subject: [PATCH 40/56] * Add WordNet lemmatizer --- spacy/lemmatizer.py | 87 +++++++++++++++++++++++++ tests/{test_ner.py => depr_test_ner.py} | 0 tests/test_lemmatizer.py | 34 ++++++++++ 3 files changed, 121 insertions(+) create mode 100644 spacy/lemmatizer.py rename tests/{test_ner.py => depr_test_ner.py} (100%) create mode 100644 tests/test_lemmatizer.py diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py new file mode 100644 index 000000000..a42a5daee --- /dev/null +++ b/spacy/lemmatizer.py @@ -0,0 +1,87 @@ +from os import path + + +NOUN_RULES = ( + ('s', ''), + ('ses', 's'), + ('ves', 'f'), + ('xes', 'x'), + ('zes', 'z'), + ('ches', 'ch'), + ('shes', 'sh'), + ('men', 'man'), + ('ies', 'y') +) + + +VERB_RULES = ( + ("s", ""), + ("ies", "y"), + ("es", "e"), + ("es", ""), + ("ed", "e"), + ("ed", ""), + ("ing", "e"), + ("ing", "") +) + + +ADJ_RULES = ( + ("er", ""), + ("est", ""), + ("er", "e"), + ("est", "e") +) + + +class Lemmatizer(object): + def __init__(self, wn_dict_dir): + self.index = {} + self.exc = {} + for pos in ['adj', 'adv', 'noun', 'verb']: + self.index[pos] = read_index(path.join(wn_dict_dir, 'index.%s' % pos)) + self.exc[pos] = read_exc(path.join(wn_dict_dir, '%s.exc' % pos)) + + def noun(self, string): + return lemmatize(string, self.index['noun'], self.exc['noun'], NOUN_RULES) + + def verb(self, string): + return lemmatize(string, self.index['verb'], self.exc['verb'], VERB_RULES) + + def adj(self, string): + return lemmatize(string, self.index['adj'], self.exc['adj'], ADJ_RULES) + + +def lemmatize(string, index, exceptions, rules): + forms = [] + if string in index: + forms.append(string) + forms.extend(exceptions.get(string, [])) + for old, new in rules: + if string.endswith(old): + form = string[:len(string) - len(old)] + new + if form in index: + forms.append(form) + return set(forms) + + +def read_index(loc): + index = set() + for line in open(loc): + if line.startswith(' '): + continue + pieces = line.split() + word = pieces[0] + if word.count('_') == 0: + index.add(word) + return index + + +def read_exc(loc): + exceptions = {} + for line in open(loc): + if line.startswith(' '): + continue + pieces = line.split() + exceptions[pieces[0]] = tuple(pieces[1:]) + return exceptions diff --git a/tests/test_ner.py b/tests/depr_test_ner.py similarity index 100% rename from tests/test_ner.py rename to tests/depr_test_ner.py diff --git a/tests/test_lemmatizer.py b/tests/test_lemmatizer.py new file mode 100644 index 000000000..2047e4d2c --- /dev/null +++ b/tests/test_lemmatizer.py @@ -0,0 +1,34 @@ +from spacy.lemmatizer import Lemmatizer, read_index, 
read_exc +from spacy.util import DATA_DIR +from os import path + +import pytest + + +def test_read_index(): + wn = path.join(DATA_DIR, 'wordnet') + index = read_index(path.join(wn, 'index.noun')) + assert 'man' in index + assert 'plantes' not in index + assert 'plant' in index + + +def test_read_exc(): + wn = path.join(DATA_DIR, 'wordnet') + exc = read_exc(path.join(wn, 'verb.exc')) + assert exc['was'] == ('be',) + + +@pytest.fixture +def lemmatizer(): + return Lemmatizer(path.join(DATA_DIR, 'wordnet')) + + +def test_noun_lemmas(lemmatizer): + do = lemmatizer.noun + + assert do('aardwolves') == set(['aardwolf']) + assert do('aardwolf') == set(['aardwolf']) + assert do('planets') == set(['planet']) + assert do('ring') == set(['ring']) + assert do('axes') == set(['axis', 'axe', 'ax']) From 99bbbb6febf689250df0143394a82eb6177a5be2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 8 Dec 2014 21:12:15 +1100 Subject: [PATCH 41/56] * Work on morphological processing --- spacy/en.pxd | 51 +++++++++++++++++++++++++++++++ spacy/en.pyx | 73 +++++++++++++++++++++++++++++++++++++++++---- spacy/lang.pxd | 8 +++-- spacy/lang.pyx | 39 +++++++++++++++++++++--- spacy/lemmatizer.py | 3 ++ spacy/pos_util.py | 3 +- spacy/tagger.pxd | 32 +++++++++++++++++++- spacy/tagger.pyx | 42 +++++++++++++++++++++++--- spacy/tokens.pxd | 17 ++++++++++- spacy/tokens.pyx | 14 +++++++-- 10 files changed, 261 insertions(+), 21 deletions(-) diff --git a/spacy/en.pxd b/spacy/en.pxd index 8ce023106..6887dbc08 100644 --- a/spacy/en.pxd +++ b/spacy/en.pxd @@ -5,6 +5,57 @@ from .tokens cimport Tokens from .tokens cimport TokenC +cpdef enum en_person_t: + NO_PERSON + FIRST + SECOND + THIRD + + +cpdef enum en_number_t: + NO_NUMBER + SINGULAR + PLURAL + MASS + CARDINAL + ORDINAL + + +cpdef enum en_gender_t: + NO_GENDER + MASCULINE + FEMININE + + +cpdef enum en_tenspect_t: + NO_TENSE + BASE_VERB + PRESENT + PAST + PASSIVE + ING + MODAL + + +cpdef enum en_case_t: + NO_CASE + NOMINATIVE + ACCUSATIVE + GENITIVE + DEMONYM + + +cpdef enum misc_t: + NO_MISC + COMPARATIVE + SUPERLATIVE + RELATIVE + NAME + URL + EMAIL + EMOTICON + + # Flags cpdef enum FlagID: IS_ALPHA diff --git a/spacy/en.pyx b/spacy/en.pyx index c0eb0368b..fa59ef933 100644 --- a/spacy/en.pyx +++ b/spacy/en.pyx @@ -35,6 +35,63 @@ from __future__ import unicode_literals cimport lang from .typedefs cimport flags_t import orth +from .tagger cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB +from .tagger cimport X, PUNCT, EOL + + +POS_TAGS = { + 'NULL': (NO_TAG, {}), + 'EOL': (EOL, {}), + 'CC': (CONJ, {}), + 'CD': (NUM, {}), + 'DT': (DET, {}), + 'EX': (DET, {}), + 'FW': (X, {}), + 'IN': (ADP, {}), + 'JJ': (ADJ, {}), + 'JJR': (ADJ, {'misc': COMPARATIVE}), + 'JJS': (ADJ, {'misc': SUPERLATIVE}), + 'LS': (X, {}), + 'MD': (VERB, {'tenspect': MODAL}), + 'NN': (NOUN, {}), + 'NNS': (NOUN, {'number': PLURAL}), + 'NNP': (NOUN, {'misc': NAME}), + 'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}), + 'PDT': (DET, {}), + 'POS': (PRT, {'case': GENITIVE}), + 'PRP': (NOUN, {}), + 'PRP$': (NOUN, {'case': GENITIVE}), + 'RB': (ADV, {}), + 'RBR': (ADV, {'misc': COMPARATIVE}), + 'RBS': (ADV, {'misc': SUPERLATIVE}), + 'RP': (PRT, {}), + 'SYM': (X, {}), + 'TO': (PRT, {}), + 'UH': (X, {}), + 'VB': (VERB, {}), + 'VBD': (VERB, {'tenspect': PAST}), + 'VBG': (VERB, {'tenspect': ING}), + 'VBN': (VERB, {'tenspect': PASSIVE}), + 'VBP': (VERB, {'tenspect': PRESENT}), + 'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}), + 'WDT': (DET, {'misc': RELATIVE}), + 'WP': (PRON, {'misc': 
RELATIVE}), + 'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}), + 'WRB': (ADV, {'misc': RELATIVE}), + '!': (PUNCT, {}), + '#': (PUNCT, {}), + '$': (PUNCT, {}), + "''": (PUNCT, {}), + "(": (PUNCT, {}), + ")": (PUNCT, {}), + "-LRB-": (PUNCT, {}), + "-RRB-": (PUNCT, {}), + ".": (PUNCT, {}), + ",": (PUNCT, {}), + "``": (PUNCT, {}), + ":": (PUNCT, {}), + "?": (PUNCT, {}), +} POS_TEMPLATES = ( @@ -91,19 +148,25 @@ cdef class English(Language): def set_pos(self, Tokens tokens): cdef int i cdef atom_t[N_CONTEXT_FIELDS] context + cdef TokenC* t = tokens.data for i in range(tokens.length): - fill_pos_context(context, i, tokens.data) - tokens.data[i].pos = self.pos_tagger.predict(context) + fill_pos_context(context, i, t) + t[i].pos = self.pos_tagger.predict(context) + #self.morphalyser.set_token(&t[i]) def train_pos(self, Tokens tokens, golds): cdef int i cdef atom_t[N_CONTEXT_FIELDS] context c = 0 + cdef TokenC* t = tokens.data for i in range(tokens.length): - fill_pos_context(context, i, tokens.data) - tokens.data[i].pos = self.pos_tagger.predict(context, [golds[i]]) - c += tokens.data[i].pos == golds[i] + fill_pos_context(context, i, t) + t[i].pos = self.pos_tagger.predict(context, [golds[i]]) + t[i].morph = self.pos_tagger.tags[t[i].pos].morph + #self.analyse_morph(&t[i].lemma, &t[i].morph, t[i].pos, t[i].lex) + c += t[i].pos == golds[i] return c + EN = English('en') diff --git a/spacy/lang.pxd b/spacy/lang.pxd index 20374f40d..124281a6b 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -2,20 +2,20 @@ from libcpp.vector cimport vector from cpython cimport Py_UNICODE_ISSPACE, Py_UNICODE_ISALPHA, Py_UNICODE_ISUPPER -from preshed.maps cimport PreshMap +from preshed.maps cimport PreshMap, PreshMapArray from cymem.cymem cimport Pool from .typedefs cimport hash_t from .tokens cimport Tokens, TokenC from .lexeme cimport Lexeme from .tagger cimport Tagger +from .tagger cimport PosTag from .utf8string cimport StringStore, UniStr cdef class Lexicon: cpdef public get_lex_props cdef Pool mem - cpdef readonly size_t size cpdef readonly StringStore strings cdef vector[Lexeme*] lexemes @@ -29,13 +29,17 @@ cdef class Language: cdef readonly unicode name cdef PreshMap _cache cdef PreshMap _specials + cdef PreshMapArray _lemmas cpdef readonly Lexicon lexicon cpdef readonly Tagger pos_tagger + cpdef readonly object lemmatizer cdef object _prefix_re cdef object _suffix_re cdef object _infix_re + cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1 + cpdef Tokens tokens_from_list(self, list strings) cpdef Tokens tokenize(self, unicode text) diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 496c6742c..fdeb7df66 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -14,6 +14,7 @@ from libc.stdio cimport fopen, fclose, fread, fwrite, FILE from cymem.cymem cimport Pool from murmurhash.mrmr cimport hash64 from preshed.maps cimport PreshMap +from .lemmatizer import Lemmatizer from .lexeme cimport Lexeme from .lexeme cimport EMPTY_LEXEME @@ -26,6 +27,8 @@ from . 
import util from .util import read_lang_data from .tokens import Tokens +from .tagger cimport NOUN, VERB, ADJ, N_UNIV_TAGS + cdef class Language: def __init__(self, name): @@ -39,14 +42,40 @@ cdef class Language: self._infix_re = re.compile(infix) self.lexicon = Lexicon(self.get_props) self._load_special_tokenization(rules) + self._lemmas = PreshMapArray(N_UNIV_TAGS) self.pos_tagger = None + self.lemmatizer = None def load(self): + self.lemmatizer = Lemmatizer(path.join(util.DATA_DIR, 'wordnet')) self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes')) self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings')) if path.exists(path.join(util.DATA_DIR, self.name, 'pos')): self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos')) + cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1: + if self.lemmatizer is None: + return lex.sic + if pos.pos != NOUN and pos.pos != VERB and pos.pos != ADJ: + return lex.sic + cdef int lemma = self._lemmas.get(pos.pos, lex.sic) + if lemma != 0: + return lemma + cdef bytes py_string = self.lexicon.strings[lex.sic] + cdef set lemma_strings + cdef bytes lemma_string + if pos.pos == NOUN: + lemma_strings = self.lemmatizer.noun(py_string) + elif pos.pos == VERB: + lemma_strings = self.lemmatizer.verb(py_string) + else: + assert pos.pos == ADJ + lemma_strings = self.lemmatizer.adj(py_string) + lemma_string = sorted(lemma_strings)[0] + lemma = self.lexicon.strings.intern(lemma_string, len(lemma_string)).i + self._lemmas.set(pos.pos, lex.sic, lemma) + return lemma + cpdef Tokens tokens_from_list(self, list strings): cdef int length = sum([len(s) for s in strings]) cdef Tokens tokens = Tokens(self.lexicon.strings, length) @@ -254,9 +283,11 @@ cdef class Lexicon: self._map = PreshMap(2 ** 20) self.strings = StringStore() self.lexemes.push_back(&EMPTY_LEXEME) - self.size = 2 self.get_lex_props = get_props + def __len__(self): + return self.lexemes.size() + cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL: '''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme if necessary, using memory acquired from the given pool. If the pool @@ -269,14 +300,13 @@ cdef class Lexicon: mem = self.mem cdef unicode py_string = string.chars[:string.n] lex = mem.alloc(sizeof(Lexeme), 1) - lex[0] = lexeme_init(self.size, py_string, string.key, self.strings, + lex[0] = lexeme_init(self.lexemes.size(), py_string, string.key, self.strings, self.get_lex_props(py_string)) if mem is self.mem: self._map.set(string.key, lex) while self.lexemes.size() < (lex.id + 1): self.lexemes.push_back(&EMPTY_LEXEME) self.lexemes[lex.id] = lex - self.size += 1 else: lex[0].id = 1 return lex @@ -302,6 +332,8 @@ cdef class Lexicon: a dict if the operator is called from Python. 
''' if type(id_or_string) == int: + if id_or_string >= self.lexemes.size(): + raise IndexError return self.lexemes.at(id_or_string)[0] cdef UniStr string slice_unicode(&string, id_or_string, 0, len(id_or_string)) @@ -359,5 +391,4 @@ cdef class Lexicon: self.lexemes.push_back(&EMPTY_LEXEME) self.lexemes[lexeme.id] = lexeme i += 1 - self.size += 1 fclose(fp) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index a42a5daee..ce9bbefdc 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -53,6 +53,7 @@ class Lemmatizer(object): def lemmatize(string, index, exceptions, rules): + string = string.lower() forms = [] if string in index: forms.append(string) @@ -62,6 +63,8 @@ def lemmatize(string, index, exceptions, rules): form = string[:len(string) - len(old)] + new if form in index: forms.append(form) + if not forms: + forms.append(string) return set(forms) diff --git a/spacy/pos_util.py b/spacy/pos_util.py index e5716665e..489f03dde 100644 --- a/spacy/pos_util.py +++ b/spacy/pos_util.py @@ -147,6 +147,7 @@ Y PRT Z NOUN ^ NOUN ~ X -`` .""".strip().split('\n')) +`` . +EOL EOL""".strip().split('\n')) return mapping[tag] diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd index f91bbeb0a..11880bf13 100644 --- a/spacy/tagger.pxd +++ b/spacy/tagger.pxd @@ -1,11 +1,40 @@ +from libc.stdint cimport uint8_t + from cymem.cymem cimport Pool from thinc.learner cimport LinearModel from thinc.features cimport Extractor from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t +from preshed.maps cimport PreshMapArray + from .typedefs cimport hash_t -from .tokens cimport Tokens +from .tokens cimport Tokens, Morphology + + +# Google universal tag set +cdef enum univ_tag_t: + NO_TAG + ADJ + ADV + ADP + CONJ + DET + NOUN + NUM + PRON + PRT + VERB + X + PUNCT + EOL + N_UNIV_TAGS + + +cdef struct PosTag: + Morphology morph + int id + univ_tag_t pos cdef class Tagger: @@ -16,4 +45,5 @@ cdef class Tagger: cpdef readonly LinearModel model cpdef readonly list tag_names + cdef PosTag* tags cdef dict tagdict diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 22ec3896a..db7974d91 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -12,13 +12,14 @@ import cython from thinc.features cimport Feature, count_feats -def setup_model_dir(tag_names, tag_counts, templates, model_dir): +def setup_model_dir(tag_names, tag_map, tag_counts, templates, model_dir): if path.exists(model_dir): shutil.rmtree(model_dir) os.mkdir(model_dir) config = { 'templates': templates, 'tag_names': tag_names, + 'tag_map': tag_map, 'tag_counts': tag_counts, } with open(path.join(model_dir, 'config.json'), 'w') as file_: @@ -33,16 +34,31 @@ cdef class Tagger: self.mem = Pool() cfg = json.load(open(path.join(model_dir, 'config.json'))) templates = cfg['templates'] + tag_map = cfg['tag_map'] + univ_counts = {} + cdef unicode tag + cdef unicode univ_tag self.tag_names = cfg['tag_names'] + self.tags = self.mem.alloc(len(self.tag_names), sizeof(PosTag)) + for i, tag in enumerate(self.tag_names): + pos, props = tag_map[tag] + self.tags[i].id = i + self.tags[i].pos = pos + self.tags[i].morph.number = props.get('number', 0) + self.tags[i].morph.tenspect = props.get('tenspect', 0) + self.tags[i].morph.mood = props.get('mood', 0) + self.tags[i].morph.gender = props.get('gender', 0) + self.tags[i].morph.person = props.get('person', 0) + self.tags[i].morph.case = props.get('case', 0) + self.tags[i].morph.misc = props.get('misc', 0) self.tagdict = _make_tag_dict(cfg['tag_counts']) self.extractor = Extractor(templates) self.model = 
LinearModel(len(self.tag_names), self.extractor.n_templ+2) if path.exists(path.join(model_dir, 'model')): self.model.load(path.join(model_dir, 'model')) - cdef class_t predict(self, const atom_t* context, object golds=None) except *: - """Predict the tag of tokens[i]. The tagger remembers the features and - prediction, in case you later call tell_answer. + cdef class_t predict(self, atom_t* context, object golds=None) except *: + """Predict the tag of tokens[i]. >>> tokens = EN.tokenize(u'An example sentence.') >>> tag = EN.pos_tagger.predict(0, tokens) @@ -69,6 +85,24 @@ cdef class Tagger: return tag_id +UNIV_TAGS = { + 'NULL': NO_TAG, + 'ADJ': ADJ, + 'ADV': ADV, + 'ADP': ADP, + 'CONJ': CONJ, + 'DET': DET, + 'NOUN': NOUN, + 'NUM': NUM, + 'PRON': PRON, + 'PRT': PRT, + 'VERB': VERB, + 'X': X, + '.': PUNCT, + 'EOL': EOL +} + + def _make_tag_dict(counts): freq_thresh = 50 ambiguity_thresh = 0.98 diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index e6bc0a46a..6f4691716 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -5,14 +5,29 @@ from cymem.cymem cimport Pool from thinc.typedefs cimport atom_t from .lexeme cimport Lexeme + from .typedefs cimport flags_t from .utf8string cimport StringStore +from libc.stdint cimport uint8_t, uint16_t + + +cdef struct Morphology: + uint8_t number + uint8_t tenspect # Tense/aspect/voice + uint8_t mood + uint8_t gender + uint8_t person + uint8_t case + uint8_t misc + cdef struct TokenC: const Lexeme* lex + Morphology morph int idx int pos + int lemma int sense @@ -37,7 +52,7 @@ cdef class Token: cdef public int i cdef public int idx cdef public int pos - cdef public int ner + cdef int lemma cdef public atom_t id cdef public atom_t cluster diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 33f265eef..004d0578c 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -51,7 +51,7 @@ cdef class Tokens: def __getitem__(self, i): bounds_check(i, self.length, PADDING) return Token(self._string_store, i, self.data[i].idx, self.data[i].pos, - self.data[i].sense, self.data[i].lex[0]) + self.data[i].lemma, self.data[i].lex[0]) def __iter__(self): for i in range(self.length): @@ -128,14 +128,15 @@ cdef class Tokens: @cython.freelist(64) cdef class Token: - def __init__(self, StringStore string_store, int i, int idx, int pos, int ner, + def __init__(self, StringStore string_store, int i, int idx, int pos, int lemma, dict lex): self._string_store = string_store self.idx = idx self.pos = pos - self.ner = ner self.i = i self.id = lex['id'] + + self.lemma = lemma self.cluster = lex['cluster'] self.length = lex['length'] @@ -156,3 +157,10 @@ cdef class Token: return '' cdef bytes utf8string = self._string_store[self.sic] return utf8string.decode('utf8') + + property lemma: + def __get__(self): + if self.lemma == 0: + return self.string + cdef bytes utf8string = self._string_store[self.lemma] + return utf8string.decode('utf8') From cda9ea9a4af9d85fa590a7272649f6524a62df77 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 8 Dec 2014 21:12:51 +1100 Subject: [PATCH 42/56] * Add test to make sure iterating over the lexicon isnt broken --- tests/test_iter_lexicon.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 tests/test_iter_lexicon.py diff --git a/tests/test_iter_lexicon.py b/tests/test_iter_lexicon.py new file mode 100644 index 000000000..379ebd3bb --- /dev/null +++ b/tests/test_iter_lexicon.py @@ -0,0 +1,15 @@ +import pytest + +from spacy.en import EN + +def test_range_iter(): + EN.load() + for i in range(len(EN.lexicon)): + lex = 
EN.lexicon[i] + + +def test_iter(): + EN.load() + i = 0 + for lex in EN.lexicon: + i += 1 From 302e09018bc090a32f45170d208bb2b6898ac185 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Dec 2014 14:48:01 +1100 Subject: [PATCH 43/56] * Work on fixing special-cases, reading them in as JSON objects so that they can specify lemmas --- spacy/en.pxd | 24 ++++---- spacy/en.pyx | 25 ++++++++- spacy/lang.pxd | 5 +- spacy/lang.pyx | 112 +++++++++++++++++++++++-------------- spacy/tokens.pxd | 1 - spacy/util.py | 10 +++- tests/test_contractions.py | 12 ++-- tests/test_tokenizer.py | 17 ++++-- 8 files changed, 136 insertions(+), 70 deletions(-) diff --git a/spacy/en.pxd b/spacy/en.pxd index 6887dbc08..cee754d9c 100644 --- a/spacy/en.pxd +++ b/spacy/en.pxd @@ -10,6 +10,7 @@ cpdef enum en_person_t: FIRST SECOND THIRD + NON_THIRD cpdef enum en_number_t: @@ -17,14 +18,22 @@ cpdef enum en_number_t: SINGULAR PLURAL MASS - CARDINAL - ORDINAL cpdef enum en_gender_t: NO_GENDER MASCULINE FEMININE + NEUTER + + +cpdef enum en_case_t: + NO_CASE + NOMINATIVE + GENITIVE + ACCUSATIVE + REFLEXIVE + DEMONYM cpdef enum en_tenspect_t: @@ -37,23 +46,12 @@ cpdef enum en_tenspect_t: MODAL -cpdef enum en_case_t: - NO_CASE - NOMINATIVE - ACCUSATIVE - GENITIVE - DEMONYM - - cpdef enum misc_t: NO_MISC COMPARATIVE SUPERLATIVE RELATIVE NAME - URL - EMAIL - EMOTICON # Flags diff --git a/spacy/en.pyx b/spacy/en.pyx index fa59ef933..0136818f2 100644 --- a/spacy/en.pyx +++ b/spacy/en.pyx @@ -38,6 +38,8 @@ import orth from .tagger cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB from .tagger cimport X, PUNCT, EOL +from .tokens cimport Morphology + POS_TAGS = { 'NULL': (NO_TAG, {}), @@ -152,7 +154,8 @@ cdef class English(Language): for i in range(tokens.length): fill_pos_context(context, i, t) t[i].pos = self.pos_tagger.predict(context) - #self.morphalyser.set_token(&t[i]) + _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph) + t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex) def train_pos(self, Tokens tokens, golds): cdef int i @@ -162,11 +165,27 @@ cdef class English(Language): for i in range(tokens.length): fill_pos_context(context, i, t) t[i].pos = self.pos_tagger.predict(context, [golds[i]]) - t[i].morph = self.pos_tagger.tags[t[i].pos].morph - #self.analyse_morph(&t[i].lemma, &t[i].morph, t[i].pos, t[i].lex) + _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph) + t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex) c += t[i].pos == golds[i] return c +cdef int _merge_morph(Morphology* tok_morph, const Morphology* pos_morph) except -1: + if tok_morph.number == 0: + tok_morph.number = pos_morph.number + if tok_morph.tenspect == 0: + tok_morph.tenspect = pos_morph.tenspect + if tok_morph.mood == 0: + tok_morph.mood = pos_morph.mood + if tok_morph.gender == 0: + tok_morph.gender = pos_morph.gender + if tok_morph.person == 0: + tok_morph.person = pos_morph.person + if tok_morph.case == 0: + tok_morph.case = pos_morph.case + if tok_morph.misc == 0: + tok_morph.misc = pos_morph.misc + EN = English('en') diff --git a/spacy/lang.pxd b/spacy/lang.pxd index 124281a6b..0307e12fe 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -9,7 +9,7 @@ from .typedefs cimport hash_t from .tokens cimport Tokens, TokenC from .lexeme cimport Lexeme from .tagger cimport Tagger -from .tagger cimport PosTag +from .tagger cimport univ_tag_t from .utf8string cimport StringStore, UniStr @@ -38,11 +38,12 @@ cdef class Language: cdef object _suffix_re cdef object 
_infix_re - cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1 + cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1 cpdef Tokens tokens_from_list(self, list strings) cpdef Tokens tokenize(self, unicode text) + cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1 cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1 cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except NULL diff --git a/spacy/lang.pyx b/spacy/lang.pyx index fdeb7df66..cdae8644a 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -28,6 +28,7 @@ from .util import read_lang_data from .tokens import Tokens from .tagger cimport NOUN, VERB, ADJ, N_UNIV_TAGS +from .tokens cimport Morphology cdef class Language: @@ -53,27 +54,27 @@ cdef class Language: if path.exists(path.join(util.DATA_DIR, self.name, 'pos')): self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos')) - cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1: + cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1: if self.lemmatizer is None: return lex.sic - if pos.pos != NOUN and pos.pos != VERB and pos.pos != ADJ: + if pos != NOUN and pos != VERB and pos != ADJ: return lex.sic - cdef int lemma = self._lemmas.get(pos.pos, lex.sic) + cdef int lemma = self._lemmas.get(pos, lex.sic) if lemma != 0: return lemma cdef bytes py_string = self.lexicon.strings[lex.sic] cdef set lemma_strings cdef bytes lemma_string - if pos.pos == NOUN: + if pos == NOUN: lemma_strings = self.lemmatizer.noun(py_string) - elif pos.pos == VERB: + elif pos == VERB: lemma_strings = self.lemmatizer.verb(py_string) else: - assert pos.pos == ADJ + assert pos == ADJ lemma_strings = self.lemmatizer.adj(py_string) lemma_string = sorted(lemma_strings)[0] lemma = self.lexicon.strings.intern(lemma_string, len(lemma_string)).i - self._lemmas.set(pos.pos, lex.sic, lemma) + self._lemmas.set(pos, lex.sic, lemma) return lemma cpdef Tokens tokens_from_list(self, list strings): @@ -111,6 +112,7 @@ cdef class Language: return tokens cdef int i = 0 cdef int start = 0 + cdef bint cache_hit cdef Py_UNICODE* chars = string cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0]) cdef UniStr span @@ -118,10 +120,8 @@ cdef class Language: if Py_UNICODE_ISSPACE(chars[i]) != in_ws: if start < i: slice_unicode(&span, chars, start, i) - lexemes = self._cache.get(span.key) - if lexemes != NULL: - tokens.extend(start, lexemes, 0) - else: + cache_hit = self._try_cache(start, span.key, tokens) + if not cache_hit: self._tokenize(tokens, &span, start, i) in_ws = not in_ws start = i @@ -130,13 +130,32 @@ cdef class Language: i += 1 if start < i: slice_unicode(&span, chars, start, i) - lexemes = self._cache.get(span.key) - if lexemes != NULL: - tokens.extend(start, lexemes, 0) - else: + cache_hit = self._try_cache(start, span.key, tokens) + if not cache_hit: self._tokenize(tokens, &span, start, i) return tokens + cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1: + cdef int i + specials = self._specials.get(key) + if specials != NULL: + i = 0 + while specials[i].lex != NULL: + tokens.push_back(idx, specials[i].lex) + tokens.data[tokens.length - 1].pos = specials[i].pos + tokens.data[tokens.length - 1].morph = specials[i].morph + tokens.data[tokens.length - 1].lemma = specials[i].lemma + tokens.data[tokens.length - 1].sense = specials[i].sense + i += 1 + return True + else: + cached = self._cache.get(key) + if cached 
!= NULL: + tokens.extend(i, cached, 0) + return True + else: + return False + cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1: cdef vector[Lexeme*] prefixes cdef vector[Lexeme*] suffixes @@ -190,10 +209,10 @@ cdef class Language: break return string - cdef int _attach_tokens(self, Tokens tokens, - int idx, UniStr* string, + cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string, vector[const Lexeme*] *prefixes, vector[const Lexeme*] *suffixes) except -1: + cdef bint cache_hit cdef int split cdef const Lexeme* const* lexemes cdef Lexeme* lexeme @@ -201,10 +220,9 @@ cdef class Language: if prefixes.size(): idx = tokens.extend(idx, prefixes.data(), prefixes.size()) if string.n != 0: - - lexemes = self._cache.get(string.key) - if lexemes != NULL: - idx = tokens.extend(idx, lexemes, 0) + cache_hit = self._try_cache(idx, string.key, tokens) + if cache_hit: + idx = tokens.data[tokens.length - 1].idx + 1 else: split = self._find_infix(string.chars, string.n) if split == 0 or split == -1: @@ -247,30 +265,42 @@ cdef class Language: match = self._suffix_re.search(string) return (match.end() - match.start()) if match is not None else 0 - def _load_special_tokenization(self, token_rules): - '''Load special-case tokenization rules. - - Loads special-case tokenization rules into the Language._cache cache, - read from data//tokenization . The special cases are loaded before - any language data is tokenized, giving these priority. For instance, - the English tokenization rules map "ain't" to ["are", "not"]. - - Args: - token_rules (list): A list of (chunk, tokens) pairs, where chunk is - a string and tokens is a list of strings. + def _load_special_tokenization(self, object rules): + '''Add a special-case tokenization rule. 
''' + cdef int i + cdef unicode chunk + cdef list substrings + cdef unicode form + cdef unicode lemma + cdef dict props cdef Lexeme** lexemes cdef hash_t hashed cdef UniStr string - for uni_string, substrings in token_rules: - lexemes = self.mem.alloc(len(substrings) + 1, sizeof(Lexeme*)) - for i, substring in enumerate(substrings): - slice_unicode(&string, substring, 0, len(substring)) - lexemes[i] = self.lexicon.get(self.lexicon.mem, &string) - lexemes[i + 1] = NULL - slice_unicode(&string, uni_string, 0, len(uni_string)) - self._specials.set(string.key, lexemes) - self._cache.set(string.key, lexemes) + for chunk, substrings in sorted(rules.items()): + tokens = self.mem.alloc(len(substrings) + 1, sizeof(TokenC)) + for i, props in enumerate(substrings): + form = props['F'] + lemma = props.get("L", None) + slice_unicode(&string, form, 0, len(form)) + tokens[i].lex = self.lexicon.get(self.lexicon.mem, &string) + if lemma: + tokens[i].lemma = self.lexicon.strings[lemma] + set_morph_from_dict(&tokens[i].morph, props) + # Null-terminated array + tokens[i+1].lex = NULL + slice_unicode(&string, chunk, 0, len(chunk)) + self._specials.set(string.key, tokens) + + +cdef int set_morph_from_dict(Morphology* morph, dict props) except -1: + morph.number = props.get('number', 0) + morph.tenspect = props.get('tenspect', 0) + morph.mood = props.get('mood', 0) + morph.gender = props.get('gender', 0) + morph.person = props.get('person', 0) + morph.case = props.get('case', 0) + morph.misc = props.get('misc', 0) cdef class Lexicon: diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index 6f4691716..f3d6011ec 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -21,7 +21,6 @@ cdef struct Morphology: uint8_t misc - cdef struct TokenC: const Lexeme* lex Morphology morph diff --git a/spacy/util.py b/spacy/util.py index 5062ca6db..ff03760a5 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -13,7 +13,8 @@ def utf8open(loc, mode='r'): def read_lang_data(name): data_dir = path.join(DATA_DIR, name) - tokenization = read_tokenization(name) + with open(path.join(data_dir, 'specials.json')) as file_: + tokenization = ujson.load(file_) prefix = read_prefix(data_dir) suffix = read_suffix(data_dir) infix = read_infix(data_dir) @@ -26,12 +27,17 @@ def read_prefix(data_dir): expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()]) return expression + def read_suffix(data_dir): - with utf8open(path.join(data_dir, 'suffix')) as file_: + with utf8open(path.join(data_dir, 'suffix')) as file_: entries = file_.read().split('\n') expression = '|'.join([re.escape(piece) + '$' for piece in entries if piece.strip()]) + # TODO: Fix this hack! 
+ expression += r'|(?<=[a-z0-9])\.$' + expression += r'|(?<=[0-9])km$' return expression + def read_infix(data_dir): with utf8open(path.join(data_dir, 'infix')) as file_: entries = file_.read().split('\n') diff --git a/tests/test_contractions.py b/tests/test_contractions.py index 8334a74a9..1e697afd2 100644 --- a/tests/test_contractions.py +++ b/tests/test_contractions.py @@ -20,15 +20,18 @@ def test_apostrophe(): def test_LL(): tokens = EN.tokenize("we'll") assert len(tokens) == 2 - assert tokens[1].string == "will" + assert tokens[1].string == "'ll" + assert tokens[1].lemma == "will" assert tokens[0].string == "we" def test_aint(): tokens = EN.tokenize("ain't") assert len(tokens) == 2 - assert tokens[0].string == "are" - assert tokens[1].string == "not" + assert tokens[0].string == "ai" + assert tokens[0].lemma == "be" + assert tokens[1].string == "n't" + assert tokens[1].lemma == "not" def test_capitalized(): @@ -38,7 +41,8 @@ def test_capitalized(): assert len(tokens) == 2 tokens = EN.tokenize("Ain't") assert len(tokens) == 2 - assert tokens[0].string == "Are" + assert tokens[0].string == "Ai" + assert tokens[0].lemma == "be" def test_punct(): diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index fb5f78ed7..21d115b9b 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -34,7 +34,7 @@ def test_digits(): def test_contraction(): tokens = EN.tokenize("don't giggle") assert len(tokens) == 3 - assert tokens[1].sic == EN.lexicon["not"]['sic'] + assert tokens[1].sic == EN.lexicon["n't"]['sic'] tokens = EN.tokenize("i said don't!") assert len(tokens) == 5 assert tokens[4].sic == EN.lexicon['!']['sic'] @@ -71,30 +71,39 @@ def test_cnts1(): tokens = EN.tokenize(text) assert len(tokens) == 8 + def test_cnts2(): text = u"""U.N. regulations are not a part of their concern.""" tokens = EN.tokenize(text) assert len(tokens) == 10 + def test_cnts3(): text = u"“Isn't it?”" tokens = EN.tokenize(text) - assert len(tokens) == 6 + words = [t.string for t in tokens] + assert len(words) == 6 + def test_cnts4(): text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """ tokens = EN.tokenize(text) - assert len(tokens) == 15 + words = [t.string for t in tokens] + assert len(words) == 15 + def test_cnts5(): text = """'Me too!', Mr. P. Delaware cried. """ tokens = EN.tokenize(text) assert len(tokens) == 11 + def test_cnts6(): text = u'They ran about 10km.' tokens = EN.tokenize(text) - assert len(tokens) == 6 + words = [t.string for t in tokens] + assert len(words) == 6 + #def test_cnts7(): # text = 'But then the 6,000-year ice age came...' 
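The special cases are now read from a specials.json file (see the read_lang_data change above) as a mapping from each chunk to a list of per-token property dicts. A minimal sketch of the structure _load_special_tokenization consumes after ujson.load; the 'F' (form) and 'L' (lemma) keys come from the diff, while the concrete entry and the variable name are illustrative rather than a copy of the shipped data file:

    # Illustrative sketch only, not the shipped specials.json contents.
    # 'F' is the surface form of the token; 'L' optionally overrides its lemma.
    # Any further keys ('number', 'tenspect', 'mood', 'gender', 'person',
    # 'case', 'misc') are read by set_morph_from_dict.
    SPECIAL_CASES = {
        "ain't": [
            {"F": "ai", "L": "be"},
            {"F": "n't", "L": "not"},
        ],
    }

The entry mirrors the updated contraction tests, which expect "ain't" to split into "ai" / "n't" with lemmas "be" and "not".
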
From 2a6bd2818f581b385da3f9871d13663454507781 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Dec 2014 15:18:43 +1100 Subject: [PATCH 44/56] * Load the lexicon before we check flag values --- tests/test_lexeme_flags.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_lexeme_flags.py b/tests/test_lexeme_flags.py index 10276d8ea..c1fe2d847 100644 --- a/tests/test_lexeme_flags.py +++ b/tests/test_lexeme_flags.py @@ -7,6 +7,7 @@ from spacy.lexeme import * def test_is_alpha(): + EN.load() the = EN.lexicon['the'] assert the['flags'] & (1 << IS_ALPHA) year = EN.lexicon['1999'] @@ -16,6 +17,7 @@ def test_is_alpha(): def test_is_digit(): + EN.load() the = EN.lexicon['the'] assert not the['flags'] & (1 << IS_DIGIT) year = EN.lexicon['1999'] From 1ccabc806ef8c4575e12439b05f2d7cdb6d4b7e5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Dec 2014 16:06:18 +1100 Subject: [PATCH 45/56] * Work on lemmatization --- data/en/suffix | 1 - data/en/tokenization | 93 -------------------------------------------- 2 files changed, 94 deletions(-) diff --git a/data/en/suffix b/data/en/suffix index 77400d0fd..a6ff32849 100644 --- a/data/en/suffix +++ b/data/en/suffix @@ -16,7 +16,6 @@ $ '' 's 'S -. .. ... .... diff --git a/data/en/tokenization b/data/en/tokenization index e2b78dd28..382b7e383 100644 --- a/data/en/tokenization +++ b/data/en/tokenization @@ -4,99 +4,6 @@ #*---* --- #*'s 's -'s 's -'S 'S -ain't ai n't -aren't are n't -can't ca n't -cannot can not -could've could 've -couldn't could n't -couldn't've could n't 've -didn't did n't -doesn't does n't -don't do n't -hadn't had n't -hadn't've had n't 've -hasn't has n't -haven't have n't -he'd he 'd -he'd've he 'd 've -he'll he 'll -he's he 's -how'd he 'd -how'll he 'll -how's how 's -I'd I 'd -I'd've I 'd 've -I'll I 'll -I'm I 'm -I'ma I 'ma -I've I 've -isn't is n't -it'd it 'd -it'd've it 'd 've -it'll it 'll -it's it 's -let's let 's -mightn't might n't -mightn't've might n't 've -might've might 've -mustn't must n't -must've must 've -needn't need n't -not've not 've -shan't sha n't -she'd she 'd -she'd've she 'd 've -she'll she will -she's she 's -should've should 've -shouldn't should n't -shouldn't've should n't 've -that's that 's -there'd there 'd -there'd've there 'd 've -there's there 's -they'd there 'd -they'd've they 'd 've -they'll they 'll -they're they 're -they've they 've -wasn't was n't -we'd we 'd -we'd've we 'd 've -we'll we 'll -we're we 're -we've we 've -weren't were n't -what'll what 'll -what're what 're -what's what 's -what've what 've -when's when 's -where'd where 'd -where's where 's -where've where 've -who'd who 'd -who'll who 'll -who're who 're -who's who 's -who've who 've -why'll why 'll -why're why 're -why's why 's -won't wo n't -would've would 've -wouldn't would n't -wouldn't've would n't 've -you'd you 'd -you'd've you 'd 've -you'll you 'll -you're you 're -you've you 've -'em 'em -'ol 'ol 10km 10 km U.S. U.S. U.K. U.K. From f15deaad5b5c506797772ded36a3102babfd4435 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Dec 2014 16:08:01 +1100 Subject: [PATCH 46/56] * Upd docs --- docs/source/index.rst | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 808455fd0..fb738aa32 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -10,14 +10,27 @@ spaCy NLP Tokenizer and Lexicon spaCy is a library for industrial-strength NLP in Python and Cython. 
spaCy's take on NLP is that it's mostly about feature extraction --- that's the part that's specific to NLP, so that's what an NLP library should focus on. -It should tell you what the current best-practice is, and help you do exactly -that, quickly and efficiently. -Best-practice is to **use lots of large lexicons**. Let's say you hit the word -*belieber* in production. What will your system know about this word? A bad -system will only know things about the words in its training corpus, which -probably consists of texts written before Justin Bieber was even born. -It doesn't have to be like that. +spaCy also believes that for NLP, **efficiency is critical**. If you're +running batch jobs, you probably have an enormous amount of data; if you're +serving requests one-by-one, you want lower latency and fewer servers. Even if +you're doing exploratory research on relatively small samples, you should still +value efficiency, because it means you can run more experiments. + +Depending on the task, spaCy is between 10 and 200 times faster than NLTK, +often with much better accuracy. See Benchmarks for details, and +Why is spaCy so fast? for a discussion of the algorithms and implementation +that makes this possible. + ++---------+----------+-------------+----------+ +| System | Tokenize | --> Counts | --> Stem | ++---------+----------+-------------+----------+ +| spaCy | 1m42s | 1m59s | 1m59s | ++---------+----------+-------------+----------+ +| NLTK | 20m2s | 28m24s | 52m28 | ++---------+----------+-------------+----------+ + +Times for 100m words of text. Unique Lexicon-centric design @@ -25,15 +38,14 @@ Unique Lexicon-centric design spaCy helps you build models that generalise better, by making it easy to use more robust features. Instead of a list of strings, the tokenizer returns -references to rich lexical types. Its tokenizer returns sequence of references -to rich lexical types. Features which ask about the word's Brown cluster, its -typical part-of-speech tag, how it's usually cased etc require no extra effort: +references to rich lexical types. Features which ask about the word's Brown cluster, +its typical part-of-speech tag, how it's usually cased etc require no extra effort: >>> from spacy.en import EN >>> from spacy.feature_names import * >>> feats = ( SIC, # ID of the original word form - NORM, # ID of the normalized word form + STEM, # ID of the stemmed word form CLUSTER, # ID of the word's Brown cluster IS_TITLE, # Was the word title-cased? POS_TYPE # A cluster ID describing what POS tags the word is usually assigned @@ -113,14 +125,6 @@ all to the special tokenization rules. spaCy's tokenizer is also incredibly efficient: -+--------+---------------+--------------+ -| System | Tokens/second | Speed Factor | -+--------+---------------+--------------+ -| NLTK | 89 000 | 1.00 | -+--------+---------------+--------------+ -| spaCy | 3 093 000 | 38.30 | -+--------+---------------+--------------+ - spaCy can create an inverted index of the 1.8 billion word Gigaword corpus, in under half an hour --- on a Macbook Air. See the `inverted index tutorial`_. 
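
As a rough usage sketch of the feature-ID design described in the docs updated by PATCH 46 -- not part of the patch series itself, and assuming the 2014-era `EN.load()` / `EN.tokenize()` / `Tokens.get_array()` calls and `spacy.feature_names` constants that appear elsewhere in these diffs -- counting word forms by integer ID rather than by string looks roughly like this:

    # Hedged sketch: bag-of-IDs counting over the integer attribute matrix.
    # Assumes the API shown in these patches; module and attribute names may differ.
    from collections import Counter

    from spacy.en import EN
    from spacy.feature_names import SIC, CLUSTER

    EN.load()                               # load lexemes, strings and models
    tokens = EN.tokenize(u"They ran about 10km. They ran back.")
    arr = tokens.get_array([SIC, CLUSTER])  # one row per token, one column per attribute ID

    form_counts = Counter(int(row[0]) for row in arr)   # keyed by word-form ID, not by string
    print(form_counts.most_common(3))       # the IDs for u'They', u'ran' and u'.' each appear twice
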
From 6369835306a31b896a28b19f7620506b6d33dd34 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Dec 2014 16:08:17 +1100 Subject: [PATCH 47/56] * Add false positive test for emoticons --- tests/test_emoticons.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_emoticons.py b/tests/test_emoticons.py index 6bb58e661..143be607d 100644 --- a/tests/test_emoticons.py +++ b/tests/test_emoticons.py @@ -27,3 +27,9 @@ def test_tweebo_challenge(): assert tokens[19].string == '")' assert tokens[20].string == ':>' assert tokens[21].string == '....' + + +def test_false_positive(): + text = "example:)" + tokens = EN.tokenize(text) + assert len(tokens) == 3 From 516f0f1e144af000bb2afb19b4745a46c6c3b4cb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Dec 2014 16:08:45 +1100 Subject: [PATCH 48/56] * Remove test for loading ad hoc rules format --- tests/test_rules.py | 11 ----------- 1 file changed, 11 deletions(-) delete mode 100644 tests/test_rules.py diff --git a/tests/test_rules.py b/tests/test_rules.py deleted file mode 100644 index b19a1c3f1..000000000 --- a/tests/test_rules.py +++ /dev/null @@ -1,11 +0,0 @@ -from spacy import util - - -def test_load_en(): - rules = util.read_tokenization('en') - assert len(rules) != 0 - aint = [rule for rule in rules if rule[0] == "ain't"][0] - chunk, pieces = aint - assert chunk == "ain't" - assert pieces[0] == "are" - assert pieces[1] == "not" From 495e1c7366d30f79ef3332d05c94dbf14abd909d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Dec 2014 16:50:01 +1100 Subject: [PATCH 49/56] * Use fused type in Tokens.push_back, simplifying the use of the cache --- spacy/lang.pxd | 11 +++++++++++ spacy/lang.pyx | 39 +++++++++++++++++++++------------------ spacy/tokens.pxd | 10 +++++++++- spacy/tokens.pyx | 12 ++++++------ 4 files changed, 47 insertions(+), 25 deletions(-) diff --git a/spacy/lang.pxd b/spacy/lang.pxd index 0307e12fe..8a6aa5f97 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -13,6 +13,17 @@ from .tagger cimport univ_tag_t from .utf8string cimport StringStore, UniStr +cdef union LexemesOrTokens: + const Lexeme* const* lexemes + TokenC* tokens + + +cdef struct Cached: + LexemesOrTokens data + bint is_lex + int length + + cdef class Lexicon: cpdef public get_lex_props cdef Pool mem diff --git a/spacy/lang.pyx b/spacy/lang.pyx index cdae8644a..044bfb7bc 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -137,21 +137,19 @@ cdef class Language: cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1: cdef int i - specials = self._specials.get(key) - if specials != NULL: - i = 0 - while specials[i].lex != NULL: - tokens.push_back(idx, specials[i].lex) - tokens.data[tokens.length - 1].pos = specials[i].pos - tokens.data[tokens.length - 1].morph = specials[i].morph - tokens.data[tokens.length - 1].lemma = specials[i].lemma - tokens.data[tokens.length - 1].sense = specials[i].sense - i += 1 + cdef TokenC* token + cached = self._specials.get(key) + if cached != NULL: + assert not cached.is_lex + for i in range(cached.length): + token = &cached.data.tokens[i] + idx = tokens.push_back(idx, token) return True else: - cached = self._cache.get(key) + cached = self._cache.get(key) if cached != NULL: - tokens.extend(i, cached, 0) + assert cached.is_lex == True + tokens.extend(i, cached.data.lexemes, cached.length) return True else: return False @@ -244,11 +242,14 @@ cdef class Language: for i in range(n): if tokens[i].lex.id == 1: return 0 - lexemes = self.mem.alloc(n + 1, sizeof(Lexeme**)) + cached = 
self.mem.alloc(1, sizeof(Cached)) + cached.length = n + cached.is_lex = True + lexemes = self.mem.alloc(n, sizeof(Lexeme**)) for i in range(n): lexemes[i] = tokens[i].lex - lexemes[i + 1] = NULL - self._cache.set(key, lexemes) + cached.data.lexemes = lexemes + self._cache.set(key, cached) cdef int _find_infix(self, Py_UNICODE* chars, size_t length) except -1: cdef unicode string = chars[:length] @@ -287,10 +288,12 @@ cdef class Language: if lemma: tokens[i].lemma = self.lexicon.strings[lemma] set_morph_from_dict(&tokens[i].morph, props) - # Null-terminated array - tokens[i+1].lex = NULL + cached = self.mem.alloc(1, sizeof(Cached)) + cached.length = len(substrings) + cached.is_lex = False + cached.data.tokens = tokens slice_unicode(&string, chunk, 0, len(chunk)) - self._specials.set(string.key, tokens) + self._specials.set(string.key, cached) cdef int set_morph_from_dict(Morphology* morph, dict props) except -1: diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index f3d6011ec..01bec6815 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -30,6 +30,14 @@ cdef struct TokenC: int sense +ctypedef const Lexeme* const_Lexeme_ptr +ctypedef TokenC* TokenC_ptr + +ctypedef fused LexemeOrToken: + const_Lexeme_ptr + TokenC_ptr + + cdef class Tokens: cdef Pool mem cdef StringStore _string_store @@ -40,7 +48,7 @@ cdef class Tokens: cdef int max_length cdef int extend(self, int i, const Lexeme* const* lexemes, int n) except -1 - cdef int push_back(self, int i, const Lexeme* lexeme) except -1 + cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1 cpdef int set_tag(self, int i, int tag_type, int tag) except -1 cpdef np.ndarray[long, ndim=2] get_array(self, list features) diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 004d0578c..4075e64d7 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -60,16 +60,16 @@ cdef class Tokens: def __len__(self): return self.length - cdef int push_back(self, int idx, const Lexeme* lexeme) except -1: + cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1: if self.length == self.max_length: self._realloc(self.length * 2) cdef TokenC* t = &self.data[self.length] - t.lex = lexeme - t.idx = idx - t.pos = 0 - t.sense = 0 + if LexemeOrToken is TokenC_ptr: + t[0] = lex_or_tok[0] + else: + t.lex = lex_or_tok self.length += 1 - return idx + lexeme.length + return idx + t.lex.length cdef int extend(self, int idx, const Lexeme* const* lexemes, int n) except -1: cdef int i From accdbe989b7110cea692ee4e5673b6739a65010a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Dec 2014 17:09:23 +1100 Subject: [PATCH 50/56] * Remove Tokens.extend method --- spacy/lang.pyx | 30 ++++++++++++++---------------- spacy/tokens.pxd | 1 - spacy/tokens.pyx | 14 -------------- 3 files changed, 14 insertions(+), 31 deletions(-) diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 044bfb7bc..14a83522b 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -87,7 +87,7 @@ cdef class Language: cdef int idx = 0 for i, py_string in enumerate(strings): slice_unicode(&string_struct, py_string, 0, len(py_string)) - tokens.push_back(idx, self.lexicon.get(tokens.mem, &string_struct)) + tokens.push_back(idx, self.lexicon.get(tokens.mem, &string_struct)) idx += len(py_string) + 1 return tokens @@ -136,23 +136,19 @@ cdef class Language: return tokens cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1: - cdef int i - cdef TokenC* token cached = self._specials.get(key) - if cached != NULL: - assert not cached.is_lex - for i in range(cached.length): - 
token = &cached.data.tokens[i] - idx = tokens.push_back(idx, token) - return True - else: + if cached == NULL: cached = self._cache.get(key) - if cached != NULL: - assert cached.is_lex == True - tokens.extend(i, cached.data.lexemes, cached.length) - return True - else: + if cached == NULL: return False + cdef int i + if cached.is_lex: + for i in range(cached.length): + idx = tokens.push_back(idx, cached.data.lexemes[i]) + else: + for i in range(cached.length): + idx = tokens.push_back(idx, &cached.data.tokens[i]) + return True cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1: cdef vector[Lexeme*] prefixes @@ -215,8 +211,10 @@ cdef class Language: cdef const Lexeme* const* lexemes cdef Lexeme* lexeme cdef UniStr span + cdef int i if prefixes.size(): - idx = tokens.extend(idx, prefixes.data(), prefixes.size()) + for i in range(prefixes.size()): + idx = tokens.push_back(idx, prefixes[0][i]) if string.n != 0: cache_hit = self._try_cache(idx, string.key, tokens) if cache_hit: diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index 01bec6815..cc9e8a05d 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -47,7 +47,6 @@ cdef class Tokens: cdef int length cdef int max_length - cdef int extend(self, int i, const Lexeme* const* lexemes, int n) except -1 cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1 cpdef int set_tag(self, int i, int tag_type, int tag) except -1 diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 4075e64d7..0b94d81d4 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -71,20 +71,6 @@ cdef class Tokens: self.length += 1 return idx + t.lex.length - cdef int extend(self, int idx, const Lexeme* const* lexemes, int n) except -1: - cdef int i - if lexemes == NULL: - return idx - elif n == 0: - i = 0 - while lexemes[i] != NULL: - idx = self.push_back(idx, lexemes[i]) - i += 1 - else: - for i in range(n): - idx = self.push_back(idx, lexemes[i]) - return idx - cpdef int set_tag(self, int i, int tag_type, int tag) except -1: self.data[i].pos = tag From b962fe73d7401f9255d4bc72acab9aa3e49b586f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Dec 2014 19:04:27 +1100 Subject: [PATCH 51/56] * Make suffixes file use full-power regex, so that we can handle periods properly --- data/en/suffix | 24 +++++++++++++----------- spacy/util.py | 5 +---- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/data/en/suffix b/data/en/suffix index a6ff32849..8ba48296d 100644 --- a/data/en/suffix +++ b/data/en/suffix @@ -1,13 +1,13 @@ , -" -) -] -} -* -! -? +\" +\) +\] +\} +\* +\! +\? % -$ +\$ > : ; @@ -16,6 +16,8 @@ $ '' 's 'S -.. -... -.... +\.\. +\.\.\. +\.\.\.\. +(?<=[a-z0-9])\. +(?<=[0-9])km diff --git a/spacy/util.py b/spacy/util.py index ff03760a5..1c25aeaf2 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -31,10 +31,7 @@ def read_prefix(data_dir): def read_suffix(data_dir): with utf8open(path.join(data_dir, 'suffix')) as file_: entries = file_.read().split('\n') - expression = '|'.join([re.escape(piece) + '$' for piece in entries if piece.strip()]) - # TODO: Fix this hack! 
- expression += r'|(?<=[a-z0-9])\.$' - expression += r'|(?<=[0-9])km$' + expression = '|'.join([piece + '$' for piece in entries if piece.strip()]) return expression From 6b34a2f34ba4cc61654404f30b84d554fbc2ee10 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Dec 2014 21:16:17 +1100 Subject: [PATCH 52/56] * Move morphological analysis into its own module, morphology.pyx --- spacy/en.pyx | 12 +++---- spacy/lang.pxd | 9 ++--- spacy/lang.pyx | 32 ++--------------- spacy/morphology.pxd | 42 +++++++++++++++++++++++ spacy/morphology.pyx | 81 ++++++++++++++++++++++++++++++++++++++++++++ spacy/tagger.pxd | 26 -------------- spacy/tagger.pyx | 30 ---------------- 7 files changed, 135 insertions(+), 97 deletions(-) create mode 100644 spacy/morphology.pxd create mode 100644 spacy/morphology.pyx diff --git a/spacy/en.pyx b/spacy/en.pyx index 0136818f2..9cd2546cb 100644 --- a/spacy/en.pyx +++ b/spacy/en.pyx @@ -35,8 +35,8 @@ from __future__ import unicode_literals cimport lang from .typedefs cimport flags_t import orth -from .tagger cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB -from .tagger cimport X, PUNCT, EOL +from .morphology cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB +from .morphology cimport X, PUNCT, EOL from .tokens cimport Morphology @@ -154,8 +154,8 @@ cdef class English(Language): for i in range(tokens.length): fill_pos_context(context, i, t) t[i].pos = self.pos_tagger.predict(context) - _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph) - t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex) + if self.morphologizer: + self.morphologizer.set_morph(i, t) def train_pos(self, Tokens tokens, golds): cdef int i @@ -165,8 +165,8 @@ cdef class English(Language): for i in range(tokens.length): fill_pos_context(context, i, t) t[i].pos = self.pos_tagger.predict(context, [golds[i]]) - _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph) - t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex) + if self.morphologizer: + self.morphologizer.set_morph(i, t) c += t[i].pos == golds[i] return c diff --git a/spacy/lang.pxd b/spacy/lang.pxd index 8a6aa5f97..20986f134 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -2,15 +2,15 @@ from libcpp.vector cimport vector from cpython cimport Py_UNICODE_ISSPACE, Py_UNICODE_ISALPHA, Py_UNICODE_ISUPPER -from preshed.maps cimport PreshMap, PreshMapArray +from preshed.maps cimport PreshMap from cymem.cymem cimport Pool from .typedefs cimport hash_t from .tokens cimport Tokens, TokenC from .lexeme cimport Lexeme from .tagger cimport Tagger -from .tagger cimport univ_tag_t from .utf8string cimport StringStore, UniStr +from .morphology cimport Morphologizer cdef union LexemesOrTokens: @@ -40,17 +40,14 @@ cdef class Language: cdef readonly unicode name cdef PreshMap _cache cdef PreshMap _specials - cdef PreshMapArray _lemmas cpdef readonly Lexicon lexicon cpdef readonly Tagger pos_tagger - cpdef readonly object lemmatizer + cpdef readonly Morphologizer morphologizer cdef object _prefix_re cdef object _suffix_re cdef object _infix_re - cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1 - cpdef Tokens tokens_from_list(self, list strings) cpdef Tokens tokenize(self, unicode text) diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 14a83522b..6c018b2ce 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -14,7 +14,6 @@ from libc.stdio cimport fopen, fclose, fread, fwrite, FILE from cymem.cymem cimport Pool from murmurhash.mrmr 
cimport hash64 from preshed.maps cimport PreshMap -from .lemmatizer import Lemmatizer from .lexeme cimport Lexeme from .lexeme cimport EMPTY_LEXEME @@ -26,8 +25,6 @@ from .utf8string cimport slice_unicode from . import util from .util import read_lang_data from .tokens import Tokens - -from .tagger cimport NOUN, VERB, ADJ, N_UNIV_TAGS from .tokens cimport Morphology @@ -43,39 +40,16 @@ cdef class Language: self._infix_re = re.compile(infix) self.lexicon = Lexicon(self.get_props) self._load_special_tokenization(rules) - self._lemmas = PreshMapArray(N_UNIV_TAGS) self.pos_tagger = None - self.lemmatizer = None + self.morphologizer = None def load(self): - self.lemmatizer = Lemmatizer(path.join(util.DATA_DIR, 'wordnet')) self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes')) self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings')) if path.exists(path.join(util.DATA_DIR, self.name, 'pos')): self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos')) - - cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1: - if self.lemmatizer is None: - return lex.sic - if pos != NOUN and pos != VERB and pos != ADJ: - return lex.sic - cdef int lemma = self._lemmas.get(pos, lex.sic) - if lemma != 0: - return lemma - cdef bytes py_string = self.lexicon.strings[lex.sic] - cdef set lemma_strings - cdef bytes lemma_string - if pos == NOUN: - lemma_strings = self.lemmatizer.noun(py_string) - elif pos == VERB: - lemma_strings = self.lemmatizer.verb(py_string) - else: - assert pos == ADJ - lemma_strings = self.lemmatizer.adj(py_string) - lemma_string = sorted(lemma_strings)[0] - lemma = self.lexicon.strings.intern(lemma_string, len(lemma_string)).i - self._lemmas.set(pos, lex.sic, lemma) - return lemma + self.morphologizer = Morphologizer(self.lexicon.strings, + path.join(util.DATA_DIR, self.name)) cpdef Tokens tokens_from_list(self, list strings): cdef int length = sum([len(s) for s in strings]) diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd new file mode 100644 index 000000000..084cbbbe6 --- /dev/null +++ b/spacy/morphology.pxd @@ -0,0 +1,42 @@ +from .tokens cimport TokenC, Morphology +from .lexeme cimport Lexeme +from .utf8string cimport StringStore + +from preshed.maps cimport PreshMapArray +from cymem.cymem cimport Pool + +# Google universal tag set +cpdef enum univ_tag_t: + NO_TAG + ADJ + ADV + ADP + CONJ + DET + NOUN + NUM + PRON + PRT + VERB + X + PUNCT + EOL + N_UNIV_TAGS + + +cdef struct PosTag: + Morphology morph + int id + univ_tag_t pos + + +cdef class Morphologizer: + cdef Pool mem + cdef StringStore strings + cdef object lemmatizer + cdef PosTag* tags + + cdef PreshMapArray _morph + cdef PreshMapArray _lemmas + cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1 + cdef int set_morph(self, const int i, TokenC* tokens) except -1 diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx new file mode 100644 index 000000000..63c5ff827 --- /dev/null +++ b/spacy/morphology.pyx @@ -0,0 +1,81 @@ +from os import path +import json + +from .lemmatizer import Lemmatizer + + +UNIV_TAGS = { + 'NULL': NO_TAG, + 'ADJ': ADJ, + 'ADV': ADV, + 'ADP': ADP, + 'CONJ': CONJ, + 'DET': DET, + 'NOUN': NOUN, + 'NUM': NUM, + 'PRON': PRON, + 'PRT': PRT, + 'VERB': VERB, + 'X': X, + '.': PUNCT, + 'EOL': EOL +} + + +cdef class Morphologizer: + """Given a POS tag and a Lexeme, find its lemma and morphological analysis. 
+ """ + def __init__(self, StringStore strings, data_dir): + self.mem = Pool() + self.strings = strings + cfg = json.load(open(path.join(data_dir, 'pos', 'config.json'))) + tag_map = cfg['tag_map'] + tag_names = cfg['tag_names'] + self.lemmatizer = Lemmatizer(path.join(data_dir, '..', 'wordnet')) + self._lemmas = PreshMapArray(N_UNIV_TAGS) + self._morph = PreshMapArray(len(tag_names)) + self.tags = self.mem.alloc(len(tag_names), sizeof(PosTag)) + for i, tag in enumerate(tag_names): + pos, props = tag_map[tag] + self.tags[i].id = i + self.tags[i].pos = pos + self.tags[i].morph.number = props.get('number', 0) + self.tags[i].morph.tenspect = props.get('tenspect', 0) + self.tags[i].morph.mood = props.get('mood', 0) + self.tags[i].morph.gender = props.get('gender', 0) + self.tags[i].morph.person = props.get('person', 0) + self.tags[i].morph.case = props.get('case', 0) + self.tags[i].morph.misc = props.get('misc', 0) + + cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1: + if self.lemmatizer is None: + return lex.sic + if pos != NOUN and pos != VERB and pos != ADJ: + return lex.sic + cdef int lemma = self._lemmas.get(pos, lex.sic) + if lemma != 0: + return lemma + cdef bytes py_string = self.strings[lex.sic] + cdef set lemma_strings + cdef bytes lemma_string + if pos == NOUN: + lemma_strings = self.lemmatizer.noun(py_string) + elif pos == VERB: + lemma_strings = self.lemmatizer.verb(py_string) + else: + assert pos == ADJ + lemma_strings = self.lemmatizer.adj(py_string) + lemma_string = sorted(lemma_strings)[0] + lemma = self.strings.intern(lemma_string, len(lemma_string)).i + self._lemmas.set(pos, lex.sic, lemma) + return lemma + + cdef int set_morph(self, const int i, TokenC* tokens) except -1: + cdef const PosTag* tag = &self.tags[tokens[i].pos] + tokens[i].lemma = self.lemmatize(tag.pos, tokens[i].lex) + morph = self._morph.get(tag.id, tokens[i].lemma) + if morph is NULL: + self._morph.set(tag.id, tokens[i].lemma, &tag.morph) + tokens[i].morph = tag.morph + else: + tokens[i].morph = morph[0] diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd index 11880bf13..9abe25209 100644 --- a/spacy/tagger.pxd +++ b/spacy/tagger.pxd @@ -12,31 +12,6 @@ from .typedefs cimport hash_t from .tokens cimport Tokens, Morphology -# Google universal tag set -cdef enum univ_tag_t: - NO_TAG - ADJ - ADV - ADP - CONJ - DET - NOUN - NUM - PRON - PRT - VERB - X - PUNCT - EOL - N_UNIV_TAGS - - -cdef struct PosTag: - Morphology morph - int id - univ_tag_t pos - - cdef class Tagger: cdef class_t predict(self, const atom_t* context, object golds=*) except * @@ -45,5 +20,4 @@ cdef class Tagger: cpdef readonly LinearModel model cpdef readonly list tag_names - cdef PosTag* tags cdef dict tagdict diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index db7974d91..a1e51c5b5 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -34,23 +34,10 @@ cdef class Tagger: self.mem = Pool() cfg = json.load(open(path.join(model_dir, 'config.json'))) templates = cfg['templates'] - tag_map = cfg['tag_map'] univ_counts = {} cdef unicode tag cdef unicode univ_tag self.tag_names = cfg['tag_names'] - self.tags = self.mem.alloc(len(self.tag_names), sizeof(PosTag)) - for i, tag in enumerate(self.tag_names): - pos, props = tag_map[tag] - self.tags[i].id = i - self.tags[i].pos = pos - self.tags[i].morph.number = props.get('number', 0) - self.tags[i].morph.tenspect = props.get('tenspect', 0) - self.tags[i].morph.mood = props.get('mood', 0) - self.tags[i].morph.gender = props.get('gender', 0) - self.tags[i].morph.person = 
props.get('person', 0) - self.tags[i].morph.case = props.get('case', 0) - self.tags[i].morph.misc = props.get('misc', 0) self.tagdict = _make_tag_dict(cfg['tag_counts']) self.extractor = Extractor(templates) self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2) @@ -85,23 +72,6 @@ cdef class Tagger: return tag_id -UNIV_TAGS = { - 'NULL': NO_TAG, - 'ADJ': ADJ, - 'ADV': ADV, - 'ADP': ADP, - 'CONJ': CONJ, - 'DET': DET, - 'NOUN': NOUN, - 'NUM': NUM, - 'PRON': PRON, - 'PRT': PRT, - 'VERB': VERB, - 'X': X, - '.': PUNCT, - 'EOL': EOL -} - def _make_tag_dict(counts): freq_thresh = 50 From 42973c4b370b1eb68eef45403356e267e25b5513 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Dec 2014 01:02:04 +1100 Subject: [PATCH 53/56] * Improve efficiency of tagger, and improve morphological processing --- spacy/en.pxd | 18 ------------ spacy/en.pyx | 44 +++++++++++++++------------- spacy/morphology.pxd | 4 +-- spacy/morphology.pyx | 70 +++++++++++++++++++++++++++++++++----------- spacy/orth.py | 1 + spacy/tagger.pxd | 2 +- spacy/tagger.pyx | 5 ++-- 7 files changed, 83 insertions(+), 61 deletions(-) diff --git a/spacy/en.pxd b/spacy/en.pxd index cee754d9c..4ac8a126d 100644 --- a/spacy/en.pxd +++ b/spacy/en.pxd @@ -125,23 +125,5 @@ cpdef enum: N_CONTEXT_FIELDS -cdef inline void fill_pos_context(atom_t* context, const int i, const TokenC* tokens) nogil: - _fill_from_token(&context[P2_sic], &tokens[i-2]) - _fill_from_token(&context[P1_sic], &tokens[i-1]) - _fill_from_token(&context[W_sic], &tokens[i]) - _fill_from_token(&context[N1_sic], &tokens[i+1]) - _fill_from_token(&context[N2_sic], &tokens[i+2]) - - -cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: - context[0] = t.lex.sic - context[1] = t.lex.cluster - context[2] = t.lex.shape - context[3] = t.lex.prefix - context[4] = t.lex.suffix - context[5] = t.pos - context[6] = t.sense - - cdef class English(Language): pass diff --git a/spacy/en.pyx b/spacy/en.pyx index 9cd2546cb..10773e0e2 100644 --- a/spacy/en.pyx +++ b/spacy/en.pyx @@ -151,10 +151,14 @@ cdef class English(Language): cdef int i cdef atom_t[N_CONTEXT_FIELDS] context cdef TokenC* t = tokens.data + assert self.morphologizer is not None + cdef dict tagdict = self.pos_tagger.tagdict for i in range(tokens.length): - fill_pos_context(context, i, t) - t[i].pos = self.pos_tagger.predict(context) - if self.morphologizer: + if t[i].lex.sic in tagdict: + t[i].pos = tagdict[t[i].lex.sic] + else: + fill_pos_context(context, i, t) + t[i].pos = self.pos_tagger.predict(context) self.morphologizer.set_morph(i, t) def train_pos(self, Tokens tokens, golds): @@ -165,27 +169,27 @@ cdef class English(Language): for i in range(tokens.length): fill_pos_context(context, i, t) t[i].pos = self.pos_tagger.predict(context, [golds[i]]) - if self.morphologizer: - self.morphologizer.set_morph(i, t) + self.morphologizer.set_morph(i, t) c += t[i].pos == golds[i] return c -cdef int _merge_morph(Morphology* tok_morph, const Morphology* pos_morph) except -1: - if tok_morph.number == 0: - tok_morph.number = pos_morph.number - if tok_morph.tenspect == 0: - tok_morph.tenspect = pos_morph.tenspect - if tok_morph.mood == 0: - tok_morph.mood = pos_morph.mood - if tok_morph.gender == 0: - tok_morph.gender = pos_morph.gender - if tok_morph.person == 0: - tok_morph.person = pos_morph.person - if tok_morph.case == 0: - tok_morph.case = pos_morph.case - if tok_morph.misc == 0: - tok_morph.misc = pos_morph.misc +cdef int fill_pos_context(atom_t* context, const int i, const TokenC* tokens) 
except -1: + _fill_from_token(&context[P2_sic], &tokens[i-2]) + _fill_from_token(&context[P1_sic], &tokens[i-1]) + _fill_from_token(&context[W_sic], &tokens[i]) + _fill_from_token(&context[N1_sic], &tokens[i+1]) + _fill_from_token(&context[N2_sic], &tokens[i+2]) + + +cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: + context[0] = t.lex.sic + context[1] = t.lex.cluster + context[2] = t.lex.shape + context[3] = t.lex.prefix + context[4] = t.lex.suffix + context[5] = t.pos + context[6] = t.sense EN = English('en') diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 084cbbbe6..31cb08855 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -35,8 +35,8 @@ cdef class Morphologizer: cdef StringStore strings cdef object lemmatizer cdef PosTag* tags + cdef readonly list tag_names - cdef PreshMapArray _morph - cdef PreshMapArray _lemmas + cdef PreshMapArray _cache cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1 cdef int set_morph(self, const int i, TokenC* tokens) except -1 diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 63c5ff827..b21a3ced4 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -1,8 +1,10 @@ +# cython: profile=True +# cython: embedsignature=True from os import path import json from .lemmatizer import Lemmatizer - +from .typedefs cimport id_t UNIV_TAGS = { 'NULL': NO_TAG, @@ -22,6 +24,11 @@ UNIV_TAGS = { } +cdef struct _Cached: + Morphology morph + int lemma + + cdef class Morphologizer: """Given a POS tag and a Lexeme, find its lemma and morphological analysis. """ @@ -30,12 +37,11 @@ cdef class Morphologizer: self.strings = strings cfg = json.load(open(path.join(data_dir, 'pos', 'config.json'))) tag_map = cfg['tag_map'] - tag_names = cfg['tag_names'] + self.tag_names = cfg['tag_names'] self.lemmatizer = Lemmatizer(path.join(data_dir, '..', 'wordnet')) - self._lemmas = PreshMapArray(N_UNIV_TAGS) - self._morph = PreshMapArray(len(tag_names)) - self.tags = self.mem.alloc(len(tag_names), sizeof(PosTag)) - for i, tag in enumerate(tag_names): + self._cache = PreshMapArray(len(self.tag_names)) + self.tags = self.mem.alloc(len(self.tag_names), sizeof(PosTag)) + for i, tag in enumerate(self.tag_names): pos, props = tag_map[tag] self.tags[i].id = i self.tags[i].pos = pos @@ -46,15 +52,15 @@ cdef class Morphologizer: self.tags[i].morph.person = props.get('person', 0) self.tags[i].morph.case = props.get('case', 0) self.tags[i].morph.misc = props.get('misc', 0) + if path.exists(path.join(data_dir, 'morph.json')): + with open(path.join(data_dir, 'morph.json')) as file_: + self.load_exceptions(json.loads(file_)) cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1: if self.lemmatizer is None: return lex.sic if pos != NOUN and pos != VERB and pos != ADJ: return lex.sic - cdef int lemma = self._lemmas.get(pos, lex.sic) - if lemma != 0: - return lemma cdef bytes py_string = self.strings[lex.sic] cdef set lemma_strings cdef bytes lemma_string @@ -67,15 +73,45 @@ cdef class Morphologizer: lemma_strings = self.lemmatizer.adj(py_string) lemma_string = sorted(lemma_strings)[0] lemma = self.strings.intern(lemma_string, len(lemma_string)).i - self._lemmas.set(pos, lex.sic, lemma) return lemma cdef int set_morph(self, const int i, TokenC* tokens) except -1: cdef const PosTag* tag = &self.tags[tokens[i].pos] - tokens[i].lemma = self.lemmatize(tag.pos, tokens[i].lex) - morph = self._morph.get(tag.id, tokens[i].lemma) - if morph is NULL: - self._morph.set(tag.id, tokens[i].lemma, 
&tag.morph) - tokens[i].morph = tag.morph - else: - tokens[i].morph = morph[0] + cached = <_Cached*>self._cache.get(tag.id, tokens[i].lex.sic) + if cached is NULL: + cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) + cached.lemma = self.lemmatize(tag.pos, tokens[i].lex) + cached.morph = tag.morph + self._cache.set(tag.id, tokens[i].lex.sic, cached) + + tokens[i].lemma = cached.lemma + tokens[i].morph = cached.morph + + def load_exceptions(self, dict exc): + cdef unicode pos_str + cdef unicode form_str + cdef unicode lemma_str + cdef dict entries + cdef dict props + cdef int lemma + cdef id_t sic + cdef univ_tag_t pos + for pos_str, entries in exc.items(): + pos = self.tag_names.index(pos_str) + for form_str, props in entries.items(): + lemma_str = props.get('L', form_str) + sic = self.strings[form_str] + cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) + cached.lemma = self.strings[lemma_str] + set_morph_from_dict(&cached.morph, props) + self._cache.set(pos, sic, cached) + + +cdef int set_morph_from_dict(Morphology* morph, dict props) except -1: + morph.number = props.get('number', 0) + morph.tenspect = props.get('tenspect', 0) + morph.mood = props.get('mood', 0) + morph.gender = props.get('gender', 0) + morph.person = props.get('person', 0) + morph.case = props.get('case', 0) + morph.misc = props.get('misc', 0) diff --git a/spacy/orth.py b/spacy/orth.py index 0462d15df..2400b38a6 100644 --- a/spacy/orth.py +++ b/spacy/orth.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import unicodedata from unidecode import unidecode +import re import math diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd index 9abe25209..a896742ad 100644 --- a/spacy/tagger.pxd +++ b/spacy/tagger.pxd @@ -8,7 +8,7 @@ from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t from preshed.maps cimport PreshMapArray -from .typedefs cimport hash_t +from .typedefs cimport hash_t, id_t from .tokens cimport Tokens, Morphology diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index a1e51c5b5..9890e95e1 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -72,10 +72,9 @@ cdef class Tagger: return tag_id - def _make_tag_dict(counts): - freq_thresh = 50 - ambiguity_thresh = 0.98 + freq_thresh = 20 + ambiguity_thresh = 0.97 tagdict = {} cdef atom_t word cdef atom_t tag From df3be149871da40b5f15c49d8870220f7fc36b5d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Dec 2014 08:08:55 +1100 Subject: [PATCH 54/56] * Add pos_type features to POS tagger --- spacy/en.pxd | 15 ++++++++++----- spacy/en.pyx | 14 ++++++++++---- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/spacy/en.pxd b/spacy/en.pxd index 4ac8a126d..2ca081e47 100644 --- a/spacy/en.pxd +++ b/spacy/en.pxd @@ -88,7 +88,8 @@ cpdef enum: P2_prefix P2_suffix P2_pos - P2_sense + P2_lemma + P2_pos_type P1_sic P1_cluster @@ -96,7 +97,8 @@ cpdef enum: P1_prefix P1_suffix P1_pos - P1_sense + P1_lemma + P1_pos_type W_sic W_cluster @@ -104,7 +106,8 @@ cpdef enum: W_prefix W_suffix W_pos - W_sense + W_lemma + W_pos_type N1_sic N1_cluster @@ -112,7 +115,8 @@ cpdef enum: N1_prefix N1_suffix N1_pos - N1_sense + N1_lemma + N1_pos_type N2_sic N2_cluster @@ -120,7 +124,8 @@ cpdef enum: N2_prefix N2_suffix N2_pos - N2_sense + N2_lemma + N2_pos_type N_CONTEXT_FIELDS diff --git a/spacy/en.pyx b/spacy/en.pyx index 10773e0e2..3ed0eaaa9 100644 --- a/spacy/en.pyx +++ b/spacy/en.pyx @@ -98,10 +98,10 @@ POS_TAGS = { POS_TEMPLATES = ( (W_sic,), - (P1_sic,), + (P1_lemma, P1_pos), + (P2_lemma, P2_pos), (N1_sic,), (N2_sic,), - (P2_sic,), (W_suffix,), 
(W_prefix,), @@ -119,6 +119,11 @@ POS_TEMPLATES = ( (N2_cluster,), (P1_cluster,), (P2_cluster,), + + (W_pos_type,), + (N1_pos_type,), + (N1_pos_type,), + (P1_pos, W_pos_type, N1_pos_type), ) @@ -159,7 +164,7 @@ cdef class English(Language): else: fill_pos_context(context, i, t) t[i].pos = self.pos_tagger.predict(context) - self.morphologizer.set_morph(i, t) + self.morphologizer.set_morph(i, t) def train_pos(self, Tokens tokens, golds): cdef int i @@ -189,7 +194,8 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: context[3] = t.lex.prefix context[4] = t.lex.suffix context[5] = t.pos - context[6] = t.sense + context[6] = t.lemma + context[7] = t.lex.pos_type EN = English('en') From 7831b066106620d3efa2acea3839daefd2035d98 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Dec 2014 08:09:13 +1100 Subject: [PATCH 55/56] * Compile morphology.pyx file --- setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.py b/setup.py index 6ff1f5d62..827d44fc6 100644 --- a/setup.py +++ b/setup.py @@ -55,6 +55,8 @@ exts = [ Extension("spacy.utf8string", ["spacy/utf8string.pyx"], language="c++", include_dirs=includes), Extension("spacy.index", ["spacy/index.pyx"], language="c++", include_dirs=includes), Extension("spacy.tagger", ["spacy/tagger.pyx"], language="c++", include_dirs=includes), + Extension("spacy.morphology", ["spacy/morphology.pyx"], language="c++", + include_dirs=includes), #Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes), #Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes), #Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes), From 9959a64f7bd36f6c62aa53f4bf8f30b0d4d81ee0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Dec 2014 08:09:32 +1100 Subject: [PATCH 56/56] * Working morphology and lemmatisation. POS tagging quite fast. --- spacy/lang.pyx | 12 ++++++------ spacy/lexeme.pxd | 1 + spacy/morphology.pxd | 5 ++++- spacy/morphology.pyx | 6 +++--- spacy/tagger.pxd | 2 +- spacy/tokens.pxd | 21 ++++++--------------- spacy/tokens.pyx | 28 ++++++++++++++++------------ spacy/typedefs.pxd | 11 +++++++++++ 8 files changed, 48 insertions(+), 38 deletions(-) diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 6c018b2ce..4617c3853 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -53,7 +53,7 @@ cdef class Language: cpdef Tokens tokens_from_list(self, list strings): cdef int length = sum([len(s) for s in strings]) - cdef Tokens tokens = Tokens(self.lexicon.strings, length) + cdef Tokens tokens = Tokens(self, length) if length == 0: return tokens cdef UniStr string_struct @@ -81,7 +81,7 @@ cdef class Language: tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes. 
""" cdef int length = len(string) - cdef Tokens tokens = Tokens(self.lexicon.strings, length) + cdef Tokens tokens = Tokens(self, length) if length == 0: return tokens cdef int i = 0 @@ -110,11 +110,10 @@ cdef class Language: return tokens cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1: - cached = self._specials.get(key) + #cached = self._specials.get(key) + cached = self._cache.get(key) if cached == NULL: - cached = self._cache.get(key) - if cached == NULL: - return False + return False cdef int i if cached.is_lex: for i in range(cached.length): @@ -266,6 +265,7 @@ cdef class Language: cached.data.tokens = tokens slice_unicode(&string, chunk, 0, len(chunk)) self._specials.set(string.key, cached) + self._cache.set(string.key, cached) cdef int set_morph_from_dict(Morphology* morph, dict props) except -1: diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index f524188ed..a6f20906b 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -80,6 +80,7 @@ cpdef enum attr_id_t: LENGTH CLUSTER POS_TYPE + LEMMA cdef struct Lexeme: diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 31cb08855..9c5d342e9 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -1,10 +1,13 @@ -from .tokens cimport TokenC, Morphology + +from .tokens cimport TokenC from .lexeme cimport Lexeme from .utf8string cimport StringStore +from .typedefs cimport id_t, Morphology from preshed.maps cimport PreshMapArray from cymem.cymem cimport Pool + # Google universal tag set cpdef enum univ_tag_t: NO_TAG diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index b21a3ced4..346c778a9 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -52,9 +52,9 @@ cdef class Morphologizer: self.tags[i].morph.person = props.get('person', 0) self.tags[i].morph.case = props.get('case', 0) self.tags[i].morph.misc = props.get('misc', 0) - if path.exists(path.join(data_dir, 'morph.json')): - with open(path.join(data_dir, 'morph.json')) as file_: - self.load_exceptions(json.loads(file_)) + if path.exists(path.join(data_dir, 'morphs.json')): + with open(path.join(data_dir, 'morphs.json')) as file_: + self.load_exceptions(json.load(file_)) cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1: if self.lemmatizer is None: diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd index a896742ad..33732f987 100644 --- a/spacy/tagger.pxd +++ b/spacy/tagger.pxd @@ -9,7 +9,7 @@ from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t from preshed.maps cimport PreshMapArray from .typedefs cimport hash_t, id_t -from .tokens cimport Tokens, Morphology +from .tokens cimport Tokens cdef class Tagger: diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index cc9e8a05d..43aa7b442 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -7,19 +7,10 @@ from thinc.typedefs cimport atom_t from .lexeme cimport Lexeme from .typedefs cimport flags_t -from .utf8string cimport StringStore -from libc.stdint cimport uint8_t, uint16_t +from .typedefs cimport Morphology +from .lang cimport Language -cdef struct Morphology: - uint8_t number - uint8_t tenspect # Tense/aspect/voice - uint8_t mood - uint8_t gender - uint8_t person - uint8_t case - uint8_t misc - cdef struct TokenC: const Lexeme* lex @@ -40,7 +31,8 @@ ctypedef fused LexemeOrToken: cdef class Tokens: cdef Pool mem - cdef StringStore _string_store + cdef Language lang + cdef list tag_names cdef TokenC* data @@ -48,16 +40,15 @@ cdef class Tokens: cdef int max_length cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1 - cpdef int 
set_tag(self, int i, int tag_type, int tag) except -1 cpdef np.ndarray[long, ndim=2] get_array(self, list features) cdef class Token: - cdef StringStore _string_store + cdef public Language lang cdef public int i cdef public int idx - cdef public int pos + cdef int pos cdef int lemma cdef public atom_t id diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 0b94d81d4..617feb269 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -30,8 +30,8 @@ cdef class Tokens: >>> from spacy.en import EN >>> tokens = EN.tokenize('An example sentence.') """ - def __init__(self, StringStore string_store, string_length=0): - self._string_store = string_store + def __init__(self, Language lang, string_length=0): + self.lang = lang if string_length >= 3: size = int(string_length / 3.0) else: @@ -50,7 +50,7 @@ cdef class Tokens: def __getitem__(self, i): bounds_check(i, self.length, PADDING) - return Token(self._string_store, i, self.data[i].idx, self.data[i].pos, + return Token(self.lang, i, self.data[i].idx, self.data[i].pos, self.data[i].lemma, self.data[i].lex[0]) def __iter__(self): @@ -71,9 +71,6 @@ cdef class Tokens: self.length += 1 return idx + t.lex.length - cpdef int set_tag(self, int i, int tag_type, int tag) except -1: - self.data[i].pos = tag - @cython.boundscheck(False) cpdef np.ndarray[long, ndim=2] get_array(self, list attr_ids): cdef int i, j @@ -92,7 +89,10 @@ cdef class Tokens: cdef PreshCounter counts = PreshCounter(2 ** 8) for i in range(self.length): - attr = get_attr(self.data[i].lex, attr_id) + if attr_id == LEMMA: + attr = self.data[i].lemma + else: + attr = get_attr(self.data[i].lex, attr_id) counts.inc(attr, 1) return dict(counts) @@ -114,9 +114,9 @@ cdef class Tokens: @cython.freelist(64) cdef class Token: - def __init__(self, StringStore string_store, int i, int idx, int pos, int lemma, - dict lex): - self._string_store = string_store + def __init__(self, Language lang, int i, int idx, + int pos, int lemma, dict lex): + self.lang = lang self.idx = idx self.pos = pos self.i = i @@ -141,12 +141,16 @@ cdef class Token: def __get__(self): if self.sic == 0: return '' - cdef bytes utf8string = self._string_store[self.sic] + cdef bytes utf8string = self.lang.lexicon.strings[self.sic] return utf8string.decode('utf8') property lemma: def __get__(self): if self.lemma == 0: return self.string - cdef bytes utf8string = self._string_store[self.lemma] + cdef bytes utf8string = self.lang.lexicon.strings[self.lemma] return utf8string.decode('utf8') + + property pos: + def __get__(self): + return self.lang.pos_tagger.tag_names[self.pos] diff --git a/spacy/typedefs.pxd b/spacy/typedefs.pxd index 893865133..02d327b72 100644 --- a/spacy/typedefs.pxd +++ b/spacy/typedefs.pxd @@ -1,4 +1,5 @@ from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t +from libc.stdint cimport uint8_t ctypedef uint64_t hash_t ctypedef char* utf8_t @@ -7,3 +8,13 @@ ctypedef uint64_t flags_t ctypedef uint32_t id_t ctypedef uint16_t len_t ctypedef uint16_t tag_t + + +cdef struct Morphology: + uint8_t number + uint8_t tenspect # Tense/aspect/voice + uint8_t mood + uint8_t gender + uint8_t person + uint8_t case + uint8_t misc
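
A closing note on the tagger changes in PATCH 53: much of the speed-up comes from the tag dictionary. In the English tagging loop, tokens whose `sic` id is in `pos_tagger.tagdict` take their tag straight from the dict and skip both feature extraction and the linear model. The body of `_make_tag_dict` is only partly visible above (essentially the tightened `freq_thresh`/`ambiguity_thresh` values), so the plain-Python sketch below of such a thresholded dictionary is an approximation rather than the exact implementation:

    # Sketch of a frequency/ambiguity-thresholded tag dictionary, in the spirit
    # of _make_tag_dict. `counts` maps word -> {tag: count}; the thresholds
    # follow the values in the diff (freq_thresh=20, ambiguity_thresh=0.97).
    def make_tag_dict(counts, freq_thresh=20, ambiguity_thresh=0.97):
        tagdict = {}
        for word, tag_counts in counts.items():
            total = sum(tag_counts.values())
            if total < freq_thresh:
                continue
            best_tag, best_count = max(tag_counts.items(), key=lambda kv: kv[1])
            if float(best_count) / total >= ambiguity_thresh:
                tagdict[word] = best_tag   # unambiguous enough: tag without the model
        return tagdict

    print(make_tag_dict({'the': {'DT': 1000, 'NN': 2}, 'run': {'VB': 60, 'NN': 55}}))
    # {'the': 'DT'} -- 'run' is too ambiguous to shortcut
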