diff --git a/setup.py b/setup.py
index eadfade84..50a8dd271 100644
--- a/setup.py
+++ b/setup.py
@@ -48,6 +48,7 @@ exts = [
     Extension("spacy.en_ptb", ["spacy/en_ptb.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes),
+    Extension("spacy._hashing", ["spacy/_hashing.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.string_tools", ["spacy/string_tools.pyx"], language="c++", include_dirs=includes),
diff --git a/spacy/__init__.py b/spacy/__init__.py
index d2b763c42..9f7c7932c 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -1,5 +1,6 @@
 from .lexeme import lex_of
 from .lexeme import sic_of
+from .lexeme import length_of
 
 from .tokens import Tokens
 
@@ -10,28 +11,6 @@
 LEX = 1
 NORM = 2
 SHAPE = 3
 LAST3 = 4
+LENGTH = 5
-__all__ = [Tokens, lex_of, sic_of, SIC, LEX, NORM, SHAPE, LAST3]
-
-
-"""
-from .tokens import ids_from_string
-from .tokens import group_by
-
-from .lex import sic_of
-from .lex import lex_of
-from .lex import normed_of
-from .lex import first_of
-from .lex import last_three_of
-
-from .lex import cluster_of
-from .lex import prob_of
-
-from .lex import is_oft_upper
-from .lex import is_oft_title
-
-from .lex import can_noun
-from .lex import can_verb
-from .lex import can_adj
-from .lex import can_adv
-"""
+__all__ = [Tokens, lex_of, sic_of, length_of, SIC, LEX, NORM, SHAPE, LAST3, LENGTH]
diff --git a/spacy/_hashing.pyx b/spacy/_hashing.pyx
index 72a324673..99c8e7406 100644
--- a/spacy/_hashing.pyx
+++ b/spacy/_hashing.pyx
@@ -51,5 +51,3 @@ cdef class FixedTable:
 @cython.cdivision
 cdef inline size_t _find(uint64_t key, size_t size) nogil:
     return key % size
-
-
diff --git a/spacy/spacy.pxd b/spacy/spacy.pxd
index fdb43df74..f5316a618 100644
--- a/spacy/spacy.pxd
+++ b/spacy/spacy.pxd
@@ -2,6 +2,7 @@ from libcpp.vector cimport vector
 from libc.stdint cimport uint64_t
 
 from sparsehash.dense_hash_map cimport dense_hash_map
+from _hashing cimport FixedTable
 
 # Circular import problems here
 ctypedef size_t Lexeme_addr
@@ -24,6 +25,7 @@ from spacy.lexeme cimport Orthography
 
 cdef class Language:
     cdef object name
+    cdef FixedTable happax
     cdef Vocab* vocab
     cdef Vocab* distri
     cdef Vocab* ortho
@@ -39,3 +41,5 @@ cdef class Language:
     cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
                              int split, size_t length)
     cdef Orthography* init_orth(self, StringHash hashed, unicode lex)
+
+    cdef int _happax_to_vocab(self, StringHash hashed, Lexeme_addr addr)
diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx
index d896b922b..1e31ecdb2 100644
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@@ -50,15 +50,18 @@ def get_word_shape(lex, length):
     return shape
 
 
-
 def set_orth_flags(lex, length):
     return 0
 
 
+DEF MAX_HAPPAX = 1048576
+
+
 cdef class Language:
     def __cinit__(self, name):
         self.name = name
         self.bacov = {}
+        self.happax = FixedTable(MAX_HAPPAX)
         self.vocab = new Vocab()
         self.ortho = new Vocab()
         self.distri = new Vocab()
@@ -81,6 +84,7 @@ cdef class Language:
             length = len(token_string)
             hashed = self.hash_string(token_string, length)
             word.tail = self._add(hashed, lex, 0, len(lex))
+            self._happax_to_vocab(hashed, <Lexeme_addr>word.tail)
             word = word.tail
 
     def load_clusters(self):
@@ -122,14 +126,27 @@ cdef class Language:
         # First, check words seen 2+ times
         cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
         if word_ptr == NULL:
-            start = self.find_split(string, length) if start == -1 else start
-            word_ptr = self._add(hashed, string, start, length)
+            # Now check words seen exactly once
+            word_ptr = <Lexeme*>self.happax.get(hashed)
+            if word_ptr == NULL:
+                start = self.find_split(string, length) if start == -1 else start
+                word_ptr = self._add(hashed, string, start, length)
+            else:
+                # Second time word seen, move to vocab
+                self._happax_to_vocab(hashed, <Lexeme_addr>word_ptr)
         return <Lexeme_addr>word_ptr
+
+    cdef int _happax_to_vocab(self, StringHash hashed, Lexeme_addr word_ptr):
+        self.vocab[0][hashed] = word_ptr
+        self.happax.erase(hashed)
 
     cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
         cdef size_t i
         word = self.init_lexeme(string, hashed, split, length)
-        self.vocab[0][hashed] = <size_t>word
+        if self.happax.keys[hashed % self.happax.size] != 0:
+            self._happax_to_vocab(self.happax.keys[hashed % self.happax.size],
+                                  self.happax.values[hashed % self.happax.size])
+        self.happax.insert(hashed, <size_t>word)
         self.bacov[hashed] = string
         return word
@@ -194,6 +211,7 @@ cdef class Language:
         # Now recurse, and deal with the tail
        if tail_string:
            word.tail = <Lexeme*>self.lookup(-1, tail_string, len(tail_string))
+            self._happax_to_vocab(word.tail.sic, <Lexeme_addr>word.tail)
         return word
 
     cdef Orthography* init_orth(self, StringHash hashed, unicode lex):
diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx
index 3e26b1cea..1b0d42981 100644
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -3,7 +3,7 @@ from cython.operator cimport preincrement as inc
 
 from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport attr_of, norm_of, shape_of
+from spacy.lexeme cimport attr_of, lex_of, norm_of, shape_of
 from spacy.spacy cimport StringHash
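
Note on the scheme this patch implements: newly seen words land in `happax`, a fixed-size, direct-mapped table of hapax legomena (words seen exactly once; the patch spells it "happax"). An entry is promoted into the permanent `vocab` either on its second sighting or when a colliding hash claims its slot, so one-off tokens never bloat the dense hash map and no lexeme is ever dropped. The sketch below is a minimal pure-Python rendering of that promotion logic, assuming `FixedTable`'s direct-mapped behavior (`key % size`, key 0 reserved for "empty") from spacy/_hashing.pyx; the dict-backed `vocab` and the `make_lexeme` callback are illustrative stand-ins, not spaCy's API.

```python
MAX_HAPPAX = 1048576  # table size used by the patch (DEF MAX_HAPPAX)


class FixedTable:
    """Direct-mapped table: each key owns exactly one slot; key 0 means empty."""
    def __init__(self, size):
        self.size = size
        self.keys = [0] * size
        self.values = [None] * size

    def get(self, key):
        i = key % self.size
        return self.values[i] if self.keys[i] == key else None

    def insert(self, key, value):
        i = key % self.size
        self.keys[i] = key
        self.values[i] = value

    def erase(self, key):
        i = key % self.size
        if self.keys[i] == key:
            self.keys[i] = 0
            self.values[i] = None


class Language:
    def __init__(self):
        self.vocab = {}                       # words seen 2+ times
        self.happax = FixedTable(MAX_HAPPAX)  # words seen exactly once

    def lookup(self, hashed, make_lexeme):
        # First, check words seen 2+ times
        word = self.vocab.get(hashed)
        if word is None:
            # Now check words seen exactly once
            word = self.happax.get(hashed)
            if word is None:
                word = self._add(hashed, make_lexeme)
            else:
                # Second time word seen, move to vocab
                self._happax_to_vocab(hashed, word)
        return word

    def _happax_to_vocab(self, hashed, word):
        self.vocab[hashed] = word
        self.happax.erase(hashed)

    def _add(self, hashed, make_lexeme):
        # If another hash already holds this slot, promote the resident
        # entry instead of dropping it, then claim the slot.
        slot = hashed % self.happax.size
        if self.happax.keys[slot] != 0:
            self._happax_to_vocab(self.happax.keys[slot],
                                  self.happax.values[slot])
        word = make_lexeme()
        self.happax.insert(hashed, word)
        return word


lang = Language()
lang.lookup(12345, lambda: "lexeme-A")  # first sighting: stored in happax
lang.lookup(12345, lambda: "lexeme-A")  # second sighting: promoted
assert 12345 in lang.vocab
```

The eviction path is the notable design choice: on a slot collision the resident hapax graduates to `vocab` as if it had been seen twice. That occasionally promotes a true one-off early, but it guarantees the fixed-size table never silently loses a lexeme.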