* Push to master

Matthew Honnibal 2014-07-23 17:39:02 +01:00
commit d2a151ec75
50 changed files with 928453 additions and 0 deletions

10
.gitignore vendored

@@ -1,3 +1,13 @@
# Vim
*.swp
spacy/*.cpp
ext/murmurhash.cpp
ext/sparsehash.cpp
_build/
.env/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

146129
data/en/case Normal file

File diff suppressed because it is too large.

316709
data/en/clusters Normal file

File diff suppressed because it is too large.

93
data/en/tokenization Normal file

@@ -0,0 +1,93 @@
# https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
# 21:09, 25 June 2014
#*--* --
#*---* ---
#*'s 's
ain't are not
aren't are not
can't can not
could've could have
couldn't could not
couldn't've could not have
didn't did not
doesn't does not
don't do not
hadn't had not
hadn't've had not have
hasn't has not
haven't have not
he'd he would
he'd've he would have
he'll he will
he's he 's
how'd how would
how'll how will
how's how 's
I'd I would
I'd've I would have
I'll I will
I'm I am
I've I have
isn't is not
it'd it would
it'd've it would have
it'll it will
it's it 's
let's let 's
mightn't might not
mightn't've might not have
might've might have
mustn't must not
must've must have
needn't need not
not've not have
shan't shall not
she'd she would
she'd've she would have
she'll she will
she's she 's
should've should have
shouldn't should not
shouldn't've should not have
that's that 's
there'd there would
there'd've there would have
there's there is
they'd they would
they'd've they would have
they'll they will
they're they are
they've they have
wasn't was not
we'd we would
we'd've we would have
we'll we will
we're we are
we've we have
weren't were not
what'll what will
what're what are
what's what 's
what've what have
when's when 's
where'd where would
where's where 's
where've where have
who'd who would
who'll who will
who're who are
who's who 's
who've who have
why'll why will
why're why are
why's why 's
won't will not
would've would have
wouldn't would not
wouldn't've would not have
you'd you would
you'd've you would have
you'll you will
you're you are
you've you have
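
Each non-comment line above is a whitespace-delimited rule: the first field is the raw chunk as it appears in text, and the remaining fields are the tokens it should be split into (read_tokenization in spacy/util.py, later in this commit, additionally treats the first expansion token as the chunk's lex form, links the rest as tail tokens, and auto-generates a title-cased variant of each lower-case rule). A minimal sketch of how a rule line is interpreted, for illustration only:

def parse_rule(line):
    # first field = raw chunk, remaining fields = tokens it expands to
    pieces = line.split()
    return pieces[0], pieces[1:]

for line in ["ain't are not", "won't will not", "he's he 's"]:
    chunk, tokens = parse_rule(line)
    print(chunk, '->', tokens)
# ain't -> ['are', 'not']
# won't -> ['will', 'not']
# he's -> ['he', "'s"]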

146129
data/en_ptb/case Normal file

File diff suppressed because it is too large.

316709
data/en_ptb/clusters Normal file

File diff suppressed because it is too large.

104
data/en_ptb/tokenization Normal file

@@ -0,0 +1,104 @@
# https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
# 21:09, 25 June 2014
#*--* --
#*---* ---
#*'s 's
cannot can not
d'ye d' ye
gimme gim me
gonna gon na
lemme lem me
more'n more 'n
'tis 't is
'twas 't was
wanna wan na
whaddya wha dd ya
whatcha wha t cha
ain't ai n't
aren't are n't
can't can n't
could've could 've
couldn't could n't
couldn't've could n't 've
didn't did n't
doesn't does n't
don't do n't
hadn't had n't
hadn't've had n't 've
hasn't has n't
haven't have n't
he'd he 'd
he'd've he 'd 've
he'll he 'll
he's he 's
how'd how 'd
how'll how 'll
how's how 's
I'd I 'd
I'd've I 'd 've
I'll I 'll
I'm I 'm
I've I 've
isn't is n't
it'd it 'd
it'd've it 'd 've
it'll it 'll
it's it 's
let's let 's
mightn't might n't
mightn't've might n't 've
might've might 've
mustn't must n't
must've must 've
needn't need n't
not've not 've
shan't sha n't
she'd she 'd
she'd've she 'd 've
she'll she 'll
she's she 's
should've should 've
shouldn't should n't
shouldn't've should n't 've
that's that 's
there'd there 'd
there'd've there 'd 've
there's there 's
they'd they 'd
they'd've they 'd 've
they'll they 'll
they're they 're
they've they 've
wasn't was n't
we'd we 'd
we'd've we 'd 've
we'll we 'll
we're we 're
we've we 've
weren't were n't
what'll what 'll
what're what 're
what's what 's
what've what 've
when's when 's
where'd where 'd
where's where 's
where've where 've
who'd who 'd
who'll who 'll
who're who 're
who's who 's
who've who 've
why'll why 'll
why're why 're
why's why 's
won't will n't
would've would 've
wouldn't would n't
wouldn't've would n't 've
you'd you 'd
you'd've you 'd 've
you'll you 'll
you're you 're
you've you 've

523
ext/MurmurHash2.cpp Normal file

@@ -0,0 +1,523 @@
//-----------------------------------------------------------------------------
// MurmurHash2 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.
// Note - This code makes a few assumptions about how your machine behaves -
// 1. We can read a 4-byte value from any address without crashing
// 2. sizeof(int) == 4
// And it has a few limitations -
// 1. It will not work incrementally.
// 2. It will not produce the same results on little-endian and big-endian
// machines.
#include "MurmurHash2.h"
//-----------------------------------------------------------------------------
// Platform-specific functions and macros
// Microsoft Visual Studio
#if defined(_MSC_VER)
#define BIG_CONSTANT(x) (x)
// Other compilers
#else // defined(_MSC_VER)
#define BIG_CONSTANT(x) (x##LLU)
#endif // !defined(_MSC_VER)
//-----------------------------------------------------------------------------
uint32_t MurmurHash2 ( const void * key, int len, uint32_t seed )
{
// 'm' and 'r' are mixing constants generated offline.
// They're not really 'magic', they just happen to work well.
const uint32_t m = 0x5bd1e995;
const int r = 24;
// Initialize the hash to a 'random' value
uint32_t h = seed ^ len;
// Mix 4 bytes at a time into the hash
const unsigned char * data = (const unsigned char *)key;
while(len >= 4)
{
uint32_t k = *(uint32_t*)data;
k *= m;
k ^= k >> r;
k *= m;
h *= m;
h ^= k;
data += 4;
len -= 4;
}
// Handle the last few bytes of the input array
switch(len)
{
case 3: h ^= data[2] << 16;
case 2: h ^= data[1] << 8;
case 1: h ^= data[0];
h *= m;
};
// Do a few final mixes of the hash to ensure the last few
// bytes are well-incorporated.
h ^= h >> 13;
h *= m;
h ^= h >> 15;
return h;
}
//-----------------------------------------------------------------------------
// MurmurHash2, 64-bit versions, by Austin Appleby
// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment
// and endian-ness issues if used across multiple platforms.
// 64-bit hash for 64-bit platforms
uint64_t MurmurHash64A ( const void * key, int len, uint64_t seed )
{
const uint64_t m = BIG_CONSTANT(0xc6a4a7935bd1e995);
const int r = 47;
uint64_t h = seed ^ (len * m);
const uint64_t * data = (const uint64_t *)key;
const uint64_t * end = data + (len/8);
while(data != end)
{
uint64_t k = *data++;
k *= m;
k ^= k >> r;
k *= m;
h ^= k;
h *= m;
}
const unsigned char * data2 = (const unsigned char*)data;
switch(len & 7)
{
case 7: h ^= uint64_t(data2[6]) << 48;
case 6: h ^= uint64_t(data2[5]) << 40;
case 5: h ^= uint64_t(data2[4]) << 32;
case 4: h ^= uint64_t(data2[3]) << 24;
case 3: h ^= uint64_t(data2[2]) << 16;
case 2: h ^= uint64_t(data2[1]) << 8;
case 1: h ^= uint64_t(data2[0]);
h *= m;
};
h ^= h >> r;
h *= m;
h ^= h >> r;
return h;
}
// 64-bit hash for 32-bit platforms
uint64_t MurmurHash64B ( const void * key, int len, uint64_t seed )
{
const uint32_t m = 0x5bd1e995;
const int r = 24;
uint32_t h1 = uint32_t(seed) ^ len;
uint32_t h2 = uint32_t(seed >> 32);
const uint32_t * data = (const uint32_t *)key;
while(len >= 8)
{
uint32_t k1 = *data++;
k1 *= m; k1 ^= k1 >> r; k1 *= m;
h1 *= m; h1 ^= k1;
len -= 4;
uint32_t k2 = *data++;
k2 *= m; k2 ^= k2 >> r; k2 *= m;
h2 *= m; h2 ^= k2;
len -= 4;
}
if(len >= 4)
{
uint32_t k1 = *data++;
k1 *= m; k1 ^= k1 >> r; k1 *= m;
h1 *= m; h1 ^= k1;
len -= 4;
}
switch(len)
{
case 3: h2 ^= ((unsigned char*)data)[2] << 16;
case 2: h2 ^= ((unsigned char*)data)[1] << 8;
case 1: h2 ^= ((unsigned char*)data)[0];
h2 *= m;
};
h1 ^= h2 >> 18; h1 *= m;
h2 ^= h1 >> 22; h2 *= m;
h1 ^= h2 >> 17; h1 *= m;
h2 ^= h1 >> 19; h2 *= m;
uint64_t h = h1;
h = (h << 32) | h2;
return h;
}
//-----------------------------------------------------------------------------
// MurmurHash2A, by Austin Appleby
// This is a variant of MurmurHash2 modified to use the Merkle-Damgard
// construction. Bulk speed should be identical to Murmur2, small-key speed
// will be 10%-20% slower due to the added overhead at the end of the hash.
// This variant fixes a minor issue where null keys were more likely to
// collide with each other than expected, and also makes the function
// more amenable to incremental implementations.
#define mmix(h,k) { k *= m; k ^= k >> r; k *= m; h *= m; h ^= k; }
uint32_t MurmurHash2A ( const void * key, int len, uint32_t seed )
{
const uint32_t m = 0x5bd1e995;
const int r = 24;
uint32_t l = len;
const unsigned char * data = (const unsigned char *)key;
uint32_t h = seed;
while(len >= 4)
{
uint32_t k = *(uint32_t*)data;
mmix(h,k);
data += 4;
len -= 4;
}
uint32_t t = 0;
switch(len)
{
case 3: t ^= data[2] << 16;
case 2: t ^= data[1] << 8;
case 1: t ^= data[0];
};
mmix(h,t);
mmix(h,l);
h ^= h >> 13;
h *= m;
h ^= h >> 15;
return h;
}
//-----------------------------------------------------------------------------
// CMurmurHash2A, by Austin Appleby
// This is a sample implementation of MurmurHash2A designed to work
// incrementally.
// Usage -
// CMurmurHash2A hasher
// hasher.Begin(seed);
// hasher.Add(data1,size1);
// hasher.Add(data2,size2);
// ...
// hasher.Add(dataN,sizeN);
// uint32_t hash = hasher.End()
class CMurmurHash2A
{
public:
void Begin ( uint32_t seed = 0 )
{
m_hash = seed;
m_tail = 0;
m_count = 0;
m_size = 0;
}
void Add ( const unsigned char * data, int len )
{
m_size += len;
MixTail(data,len);
while(len >= 4)
{
uint32_t k = *(uint32_t*)data;
mmix(m_hash,k);
data += 4;
len -= 4;
}
MixTail(data,len);
}
uint32_t End ( void )
{
mmix(m_hash,m_tail);
mmix(m_hash,m_size);
m_hash ^= m_hash >> 13;
m_hash *= m;
m_hash ^= m_hash >> 15;
return m_hash;
}
private:
static const uint32_t m = 0x5bd1e995;
static const int r = 24;
void MixTail ( const unsigned char * & data, int & len )
{
while( len && ((len<4) || m_count) )
{
m_tail |= (*data++) << (m_count * 8);
m_count++;
len--;
if(m_count == 4)
{
mmix(m_hash,m_tail);
m_tail = 0;
m_count = 0;
}
}
}
uint32_t m_hash;
uint32_t m_tail;
uint32_t m_count;
uint32_t m_size;
};
//-----------------------------------------------------------------------------
// MurmurHashNeutral2, by Austin Appleby
// Same as MurmurHash2, but endian- and alignment-neutral.
// Half the speed though, alas.
uint32_t MurmurHashNeutral2 ( const void * key, int len, uint32_t seed )
{
const uint32_t m = 0x5bd1e995;
const int r = 24;
uint32_t h = seed ^ len;
const unsigned char * data = (const unsigned char *)key;
while(len >= 4)
{
uint32_t k;
k = data[0];
k |= data[1] << 8;
k |= data[2] << 16;
k |= data[3] << 24;
k *= m;
k ^= k >> r;
k *= m;
h *= m;
h ^= k;
data += 4;
len -= 4;
}
switch(len)
{
case 3: h ^= data[2] << 16;
case 2: h ^= data[1] << 8;
case 1: h ^= data[0];
h *= m;
};
h ^= h >> 13;
h *= m;
h ^= h >> 15;
return h;
}
//-----------------------------------------------------------------------------
// MurmurHashAligned2, by Austin Appleby
// Same algorithm as MurmurHash2, but only does aligned reads - should be safer
// on certain platforms.
// Performance will be lower than MurmurHash2
#define MIX(h,k,m) { k *= m; k ^= k >> r; k *= m; h *= m; h ^= k; }
uint32_t MurmurHashAligned2 ( const void * key, int len, uint32_t seed )
{
const uint32_t m = 0x5bd1e995;
const int r = 24;
const unsigned char * data = (const unsigned char *)key;
uint32_t h = seed ^ len;
int align = (uint64_t)data & 3;
if(align && (len >= 4))
{
// Pre-load the temp registers
uint32_t t = 0, d = 0;
switch(align)
{
case 1: t |= data[2] << 16;
case 2: t |= data[1] << 8;
case 3: t |= data[0];
}
t <<= (8 * align);
data += 4-align;
len -= 4-align;
int sl = 8 * (4-align);
int sr = 8 * align;
// Mix
while(len >= 4)
{
d = *(uint32_t *)data;
t = (t >> sr) | (d << sl);
uint32_t k = t;
MIX(h,k,m);
t = d;
data += 4;
len -= 4;
}
// Handle leftover data in temp registers
d = 0;
if(len >= align)
{
switch(align)
{
case 3: d |= data[2] << 16;
case 2: d |= data[1] << 8;
case 1: d |= data[0];
}
uint32_t k = (t >> sr) | (d << sl);
MIX(h,k,m);
data += align;
len -= align;
//----------
// Handle tail bytes
switch(len)
{
case 3: h ^= data[2] << 16;
case 2: h ^= data[1] << 8;
case 1: h ^= data[0];
h *= m;
};
}
else
{
switch(len)
{
case 3: d |= data[2] << 16;
case 2: d |= data[1] << 8;
case 1: d |= data[0];
case 0: h ^= (t >> sr) | (d << sl);
h *= m;
}
}
h ^= h >> 13;
h *= m;
h ^= h >> 15;
return h;
}
else
{
while(len >= 4)
{
uint32_t k = *(uint32_t *)data;
MIX(h,k,m);
data += 4;
len -= 4;
}
//----------
// Handle tail bytes
switch(len)
{
case 3: h ^= data[2] << 16;
case 2: h ^= data[1] << 8;
case 1: h ^= data[0];
h *= m;
};
h ^= h >> 13;
h *= m;
h ^= h >> 15;
return h;
}
}
//-----------------------------------------------------------------------------
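
The 64-bit variant above (MurmurHash64A) is the one spacy.pyx later in this commit uses to hash strings. As a rough illustration of the mixing steps, here is a pure-Python transcription; it is a sketch only, assumes a little-endian host (see the caveat at the top of the file), and masks to 64 bits where the C code relies on unsigned overflow.

def murmurhash64a(data: bytes, seed: int = 0) -> int:
    """Pure-Python sketch of MurmurHash64A; masking emulates uint64_t wraparound."""
    m = 0xc6a4a7935bd1e995
    r = 47
    mask = (1 << 64) - 1
    h = (seed ^ ((len(data) * m) & mask)) & mask
    nblocks = len(data) // 8
    for i in range(nblocks):
        k = int.from_bytes(data[i * 8:(i + 1) * 8], 'little')
        k = (k * m) & mask
        k ^= k >> r
        k = (k * m) & mask
        h ^= k
        h = (h * m) & mask
    tail = data[nblocks * 8:]
    if tail:
        # the C switch ORs the remaining bytes in, lowest byte first
        h ^= int.from_bytes(tail, 'little')
        h = (h * m) & mask
    h ^= h >> r
    h = (h * m) & mask
    h ^= h >> r
    return h

print(hex(murmurhash64a(b"hello world")))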

39
ext/MurmurHash2.h Normal file

@@ -0,0 +1,39 @@
//-----------------------------------------------------------------------------
// MurmurHash2 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.
#ifndef _MURMURHASH2_H_
#define _MURMURHASH2_H_
//-----------------------------------------------------------------------------
// Platform-specific functions and macros
// Microsoft Visual Studio
#if defined(_MSC_VER)
typedef unsigned char uint8_t;
typedef unsigned long uint32_t;
typedef unsigned __int64 uint64_t;
// Other compilers
#else // defined(_MSC_VER)
#include <stdint.h>
#endif // !defined(_MSC_VER)
//-----------------------------------------------------------------------------
uint32_t MurmurHash2 ( const void * key, int len, uint32_t seed );
uint64_t MurmurHash64A ( const void * key, int len, uint64_t seed );
uint64_t MurmurHash64B ( const void * key, int len, uint64_t seed );
uint32_t MurmurHash2A ( const void * key, int len, uint32_t seed );
uint32_t MurmurHashNeutral2 ( const void * key, int len, uint32_t seed );
uint32_t MurmurHashAligned2 ( const void * key, int len, uint32_t seed );
//-----------------------------------------------------------------------------
#endif // _MURMURHASH2_H_

346
ext/MurmurHash3.cpp Normal file

@@ -0,0 +1,346 @@
//-----------------------------------------------------------------------------
// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.
// Note - The x86 and x64 versions do _not_ produce the same results, as the
// algorithms are optimized for their respective platforms. You can still
// compile and run any of them on any platform, but your performance with the
// non-native version will be less than optimal.
#include "MurmurHash3.h"
//-----------------------------------------------------------------------------
// Platform-specific functions and macros
// Microsoft Visual Studio
#if defined(_MSC_VER)
#define FORCE_INLINE __forceinline
#include <stdlib.h>
#define ROTL32(x,y) _rotl(x,y)
#define ROTL64(x,y) _rotl64(x,y)
#define BIG_CONSTANT(x) (x)
// Other compilers
#else // defined(_MSC_VER)
#if defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4))
/* gcc version >= 4.4. 4.1 = RHEL 5, 4.4 = RHEL 6.
 * Don't inline for RHEL 5 gcc which is 4.1 */
#define FORCE_INLINE __attribute__((always_inline))
#else
#define FORCE_INLINE
#endif
inline uint32_t rotl32 ( uint32_t x, int8_t r )
{
return (x << r) | (x >> (32 - r));
}
inline uint64_t rotl64 ( uint64_t x, int8_t r )
{
return (x << r) | (x >> (64 - r));
}
#define ROTL32(x,y) rotl32(x,y)
#define ROTL64(x,y) rotl64(x,y)
#define BIG_CONSTANT(x) (x##LLU)
#endif // !defined(_MSC_VER)
//-----------------------------------------------------------------------------
// Block read - if your platform needs to do endian-swapping or can only
// handle aligned reads, do the conversion here
FORCE_INLINE uint32_t getblock ( const uint32_t * p, int i )
{
return p[i];
}
FORCE_INLINE uint64_t getblock ( const uint64_t * p, int i )
{
return p[i];
}
//-----------------------------------------------------------------------------
// Finalization mix - force all bits of a hash block to avalanche
FORCE_INLINE uint32_t fmix ( uint32_t h )
{
h ^= h >> 16;
h *= 0x85ebca6b;
h ^= h >> 13;
h *= 0xc2b2ae35;
h ^= h >> 16;
return h;
}
//----------
FORCE_INLINE uint64_t fmix ( uint64_t k )
{
k ^= k >> 33;
k *= BIG_CONSTANT(0xff51afd7ed558ccd);
k ^= k >> 33;
k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53);
k ^= k >> 33;
return k;
}
//-----------------------------------------------------------------------------
void MurmurHash3_x86_32 ( const void * key, int len,
uint32_t seed, void * out )
{
const uint8_t * data = (const uint8_t*)key;
const int nblocks = len / 4;
uint32_t h1 = seed;
uint32_t c1 = 0xcc9e2d51;
uint32_t c2 = 0x1b873593;
//----------
// body
const uint32_t * blocks = (const uint32_t *)(data + nblocks*4);
for(int i = -nblocks; i; i++)
{
uint32_t k1 = getblock(blocks,i);
k1 *= c1;
k1 = ROTL32(k1,15);
k1 *= c2;
h1 ^= k1;
h1 = ROTL32(h1,13);
h1 = h1*5+0xe6546b64;
}
//----------
// tail
const uint8_t * tail = (const uint8_t*)(data + nblocks*4);
uint32_t k1 = 0;
switch(len & 3)
{
case 3: k1 ^= tail[2] << 16;
case 2: k1 ^= tail[1] << 8;
case 1: k1 ^= tail[0];
k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
};
//----------
// finalization
h1 ^= len;
h1 = fmix(h1);
*(uint32_t*)out = h1;
}
//-----------------------------------------------------------------------------
void MurmurHash3_x86_128 ( const void * key, const int len,
uint32_t seed, void * out )
{
const uint8_t * data = (const uint8_t*)key;
const int nblocks = len / 16;
uint32_t h1 = seed;
uint32_t h2 = seed;
uint32_t h3 = seed;
uint32_t h4 = seed;
uint32_t c1 = 0x239b961b;
uint32_t c2 = 0xab0e9789;
uint32_t c3 = 0x38b34ae5;
uint32_t c4 = 0xa1e38b93;
//----------
// body
const uint32_t * blocks = (const uint32_t *)(data + nblocks*16);
for(int i = -nblocks; i; i++)
{
uint32_t k1 = getblock(blocks,i*4+0);
uint32_t k2 = getblock(blocks,i*4+1);
uint32_t k3 = getblock(blocks,i*4+2);
uint32_t k4 = getblock(blocks,i*4+3);
k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b;
k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747;
k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35;
k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17;
}
//----------
// tail
const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
uint32_t k1 = 0;
uint32_t k2 = 0;
uint32_t k3 = 0;
uint32_t k4 = 0;
switch(len & 15)
{
case 15: k4 ^= tail[14] << 16;
case 14: k4 ^= tail[13] << 8;
case 13: k4 ^= tail[12] << 0;
k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
case 12: k3 ^= tail[11] << 24;
case 11: k3 ^= tail[10] << 16;
case 10: k3 ^= tail[ 9] << 8;
case 9: k3 ^= tail[ 8] << 0;
k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
case 8: k2 ^= tail[ 7] << 24;
case 7: k2 ^= tail[ 6] << 16;
case 6: k2 ^= tail[ 5] << 8;
case 5: k2 ^= tail[ 4] << 0;
k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
case 4: k1 ^= tail[ 3] << 24;
case 3: k1 ^= tail[ 2] << 16;
case 2: k1 ^= tail[ 1] << 8;
case 1: k1 ^= tail[ 0] << 0;
k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
};
//----------
// finalization
h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len;
h1 += h2; h1 += h3; h1 += h4;
h2 += h1; h3 += h1; h4 += h1;
h1 = fmix(h1);
h2 = fmix(h2);
h3 = fmix(h3);
h4 = fmix(h4);
h1 += h2; h1 += h3; h1 += h4;
h2 += h1; h3 += h1; h4 += h1;
((uint32_t*)out)[0] = h1;
((uint32_t*)out)[1] = h2;
((uint32_t*)out)[2] = h3;
((uint32_t*)out)[3] = h4;
}
//-----------------------------------------------------------------------------
void MurmurHash3_x64_128 ( const void * key, const int len,
const uint32_t seed, void * out )
{
const uint8_t * data = (const uint8_t*)key;
const int nblocks = len / 16;
uint64_t h1 = seed;
uint64_t h2 = seed;
uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);
//----------
// body
const uint64_t * blocks = (const uint64_t *)(data);
for(int i = 0; i < nblocks; i++)
{
uint64_t k1 = getblock(blocks,i*2+0);
uint64_t k2 = getblock(blocks,i*2+1);
k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729;
k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
}
//----------
// tail
const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
uint64_t k1 = 0;
uint64_t k2 = 0;
switch(len & 15)
{
case 15: k2 ^= uint64_t(tail[14]) << 48;
case 14: k2 ^= uint64_t(tail[13]) << 40;
case 13: k2 ^= uint64_t(tail[12]) << 32;
case 12: k2 ^= uint64_t(tail[11]) << 24;
case 11: k2 ^= uint64_t(tail[10]) << 16;
case 10: k2 ^= uint64_t(tail[ 9]) << 8;
case 9: k2 ^= uint64_t(tail[ 8]) << 0;
k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
case 8: k1 ^= uint64_t(tail[ 7]) << 56;
case 7: k1 ^= uint64_t(tail[ 6]) << 48;
case 6: k1 ^= uint64_t(tail[ 5]) << 40;
case 5: k1 ^= uint64_t(tail[ 4]) << 32;
case 4: k1 ^= uint64_t(tail[ 3]) << 24;
case 3: k1 ^= uint64_t(tail[ 2]) << 16;
case 2: k1 ^= uint64_t(tail[ 1]) << 8;
case 1: k1 ^= uint64_t(tail[ 0]) << 0;
k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
};
//----------
// finalization
h1 ^= len; h2 ^= len;
h1 += h2;
h2 += h1;
h1 = fmix(h1);
h2 = fmix(h2);
h1 += h2;
h2 += h1;
((uint64_t*)out)[0] = h1;
((uint64_t*)out)[1] = h2;
}
//-----------------------------------------------------------------------------

45
ext/MurmurHash3.h Normal file

@@ -0,0 +1,45 @@
//-----------------------------------------------------------------------------
// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.
#ifndef _MURMURHASH3_H_
#define _MURMURHASH3_H_
//-----------------------------------------------------------------------------
// Platform-specific functions and macros
// Microsoft Visual Studio
#if defined(_MSC_VER)
typedef unsigned char uint8_t;
typedef unsigned long uint32_t;
typedef unsigned __int64 uint64_t;
// Other compilers
#else // defined(_MSC_VER)
#include <stdint.h>
#endif // !defined(_MSC_VER)
//-----------------------------------------------------------------------------
#ifdef __cplusplus
extern "C" {
#endif
void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out );
void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out );
void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out );
#ifdef __cplusplus
}
#endif
//-----------------------------------------------------------------------------
#endif // _MURMURHASH3_H_

0
ext/__init__.py Normal file

13
ext/murmurhash.pxd Normal file

@@ -0,0 +1,13 @@
# cython: profile=True
from libc.stdint cimport uint64_t, int64_t
cdef extern from "../include/MurmurHash3.h":
void MurmurHash3_x86_32(void * key, uint64_t len, uint64_t seed, void* out) nogil
void MurmurHash3_x86_128(void * key, uint64_t len, uint64_t seed, void* out) nogil
cdef extern from "../include/MurmurHash2.h":
uint64_t MurmurHash64A(void * key, uint64_t len, int64_t seed) nogil
uint64_t MurmurHash64B(void * key, uint64_t len, int64_t seed) nogil

1
ext/murmurhash.pyx Normal file

@@ -0,0 +1 @@
# cython: profile=True

48
ext/sparsehash.pxd Normal file

@@ -0,0 +1,48 @@
from libcpp.utility cimport pair
from libcpp.vector cimport vector
from libc.stdint cimport uint64_t, int64_t
cdef extern from "sparsehash/dense_hash_map" namespace "google":
cdef cppclass dense_hash_map[K, D]:
K& key_type
D& data_type
pair[K, D]& value_type
uint64_t size_type
cppclass iterator:
pair[K, D]& operator*() nogil
iterator operator++() nogil
iterator operator--() nogil
bint operator==(iterator) nogil
bint operator!=(iterator) nogil
iterator begin()
iterator end()
uint64_t size()
uint64_t max_size()
bint empty()
uint64_t bucket_count()
uint64_t bucket_size(uint64_t i)
uint64_t bucket(K& key)
double max_load_factor()
void max_load_factor(double new_grow)
double min_load_factor()
double min_load_factor(double new_grow)
void set_resizing_parameters(double shrink, double grow)
void resize(uint64_t n)
void rehash(uint64_t n)
dense_hash_map()
dense_hash_map(uint64_t n)
void swap(dense_hash_map&)
pair[iterator, bint] insert(pair[K, D]) nogil
void set_empty_key(K&)
void set_deleted_key(K& key)
void clear_deleted_key()
void erase(iterator pos)
uint64_t erase(K& k)
void erase(iterator first, iterator last)
void clear()
void clear_no_resize()
pair[iterator, iterator] equal_range(K& k)
D& operator[](K&) nogil
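
dense_hash_map is an open-addressing table, which is why callers must reserve a sentinel value with set_empty_key before inserting anything (Language.__cinit__ in spacy.pyx calls set_empty_key(0), so hash value 0 can never be a real key). A toy Python linear-probing map, purely illustrative and not the real API, shows why the sentinel exists:

class DenseMap:
    """Toy open-addressing map illustrating the empty-key rule; no resizing, demo only."""
    def __init__(self, empty_key, size=8):
        self.empty_key = empty_key                # sentinel marking unused slots
        self.slots = [(empty_key, None)] * size

    def _probe(self, key):
        i = hash(key) % len(self.slots)
        while self.slots[i][0] not in (key, self.empty_key):
            i = (i + 1) % len(self.slots)         # linear probing
        return i

    def __setitem__(self, key, value):
        assert key != self.empty_key, "the empty key can never be stored"
        self.slots[self._probe(key)] = (key, value)

    def __getitem__(self, key):
        k, v = self.slots[self._probe(key)]
        return v if k == key else None            # missing keys read back as "null"

vocab = DenseMap(empty_key=0)
vocab[1234] = "lexeme-address"
print(vocab[1234], vocab[999])   # lexeme-address None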

1
ext/sparsehash.pyx Normal file

@@ -0,0 +1 @@
# cython: profile=True

10
fabfile.py vendored Normal file

@@ -0,0 +1,10 @@
from fabric.api import local, run, lcd, cd, env
def make():
local('python setup.py build_ext --inplace')
def clean():
local('python setup.py clean --all')
def test():
local('py.test -x')
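
The fabfile simply wraps three shell commands behind Fabric tasks (run as fab make, fab clean, fab test). For completeness, a dependency-free sketch of the same steps using only the standard library:

import subprocess
import sys

def make():
    subprocess.check_call([sys.executable, "setup.py", "build_ext", "--inplace"])

def clean():
    subprocess.check_call([sys.executable, "setup.py", "clean", "--all"])

def test():
    subprocess.check_call(["py.test", "-x"])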

39
include/MurmurHash2.h Normal file

@@ -0,0 +1,39 @@
//-----------------------------------------------------------------------------
// MurmurHash2 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.
#ifndef _MURMURHASH2_H_
#define _MURMURHASH2_H_
//-----------------------------------------------------------------------------
// Platform-specific functions and macros
// Microsoft Visual Studio
#if defined(_MSC_VER)
typedef unsigned char uint8_t;
typedef unsigned long uint32_t;
typedef unsigned __int64 uint64_t;
// Other compilers
#else // defined(_MSC_VER)
#include <stdint.h>
#endif // !defined(_MSC_VER)
//-----------------------------------------------------------------------------
uint32_t MurmurHash2 ( const void * key, int len, uint32_t seed );
uint64_t MurmurHash64A ( const void * key, int len, uint64_t seed );
uint64_t MurmurHash64B ( const void * key, int len, uint64_t seed );
uint32_t MurmurHash2A ( const void * key, int len, uint32_t seed );
uint32_t MurmurHashNeutral2 ( const void * key, int len, uint32_t seed );
uint32_t MurmurHashAligned2 ( const void * key, int len, uint32_t seed );
//-----------------------------------------------------------------------------
#endif // _MURMURHASH2_H_

45
include/MurmurHash3.h Normal file

@@ -0,0 +1,45 @@
//-----------------------------------------------------------------------------
// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.
#ifndef _MURMURHASH3_H_
#define _MURMURHASH3_H_
//-----------------------------------------------------------------------------
// Platform-specific functions and macros
// Microsoft Visual Studio
#if defined(_MSC_VER)
typedef unsigned char uint8_t;
typedef unsigned long uint32_t;
typedef unsigned __int64 uint64_t;
// Other compilers
#else // defined(_MSC_VER)
#include <stdint.h>
#endif // !defined(_MSC_VER)
//-----------------------------------------------------------------------------
#ifdef __cplusplus
extern "C" {
#endif
void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out );
void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out );
void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out );
#ifdef __cplusplus
}
#endif
//-----------------------------------------------------------------------------
#endif // _MURMURHASH3_H_

1
requirements.txt Normal file

@@ -0,0 +1 @@
cython

81
setup.py Normal file

@@ -0,0 +1,81 @@
#!/usr/bin/env python
import Cython.Distutils
from distutils.extension import Extension
import distutils.core
import sys
import os
import os.path
from os import path
def clean(ext):
for pyx in ext.sources:
if pyx.endswith('.pyx'):
c = pyx[:-4] + '.c'
cpp = pyx[:-4] + '.cpp'
so = pyx[:-4] + '.so'
html = pyx[:-4] + '.html'
if os.path.exists(so):
os.unlink(so)
if os.path.exists(c):
os.unlink(c)
elif os.path.exists(cpp):
os.unlink(cpp)
if os.path.exists(html):
os.unlink(html)
HERE = os.path.dirname(__file__)
virtual_env = os.environ.get('VIRTUAL_ENV', '')
compile_args = []
link_args = []
libs = []
includes = []
exts = [
Extension("ext.sparsehash", ["ext/sparsehash.pyx"], language="c++"),
Extension('ext.murmurhash',
["ext/murmurhash.pyx", "ext/MurmurHash2.cpp",
"ext/MurmurHash3.cpp"], language="c++",
include_dirs=[path.join(HERE, 'ext')]),
Extension("spacy.en",
["spacy/en.pyx", "ext/MurmurHash3.cpp", "ext/MurmurHash2.cpp"],
language="c++",
include_dirs=[path.join(HERE, 'ext')]),
Extension("spacy.en_ptb",
["spacy/en_ptb.pyx", "ext/MurmurHash3.cpp", "ext/MurmurHash2.cpp"],
language="c++",
include_dirs=[path.join(HERE, 'ext')]),
Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
Extension("spacy.spacy",
["spacy/spacy.pyx", "ext/MurmurHash3.cpp", "ext/MurmurHash2.cpp"],
language="c++", include_dirs=includes),
Extension("spacy.tokens",
["spacy/tokens.pyx", "ext/MurmurHash3.cpp", "ext/MurmurHash2.cpp"],
language="c++", include_dirs=includes),
Extension("spacy.string_tools",
["spacy/string_tools.pyx", "ext/MurmurHash3.cpp", "ext/MurmurHash2.cpp"],
language="c++", include_dirs=includes),
]
if sys.argv[1] == 'clean':
print >> sys.stderr, "cleaning .c, .c++ and .so files matching sources"
map(clean, exts)
distutils.core.setup(
name='Sparse linear models with Cython',
packages=['thinc'],
author='Matthew Honnibal',
author_email='honnibal@gmail.com',
version='1.0',
cmdclass={'build_ext': Cython.Distutils.build_ext},
ext_modules=exts,
)

37
spacy/__init__.py Normal file

@@ -0,0 +1,37 @@
from .lexeme import lex_of
from .lexeme import sic_of
from .tokens import Tokens
# Don't know how to make the enum visible from Python :(
SIC = 0
LEX = 1
NORM = 2
SHAPE = 3
LAST3 = 4
__all__ = [Tokens, lex_of, sic_of, SIC, LEX, NORM, SHAPE, LAST3]
"""
from .tokens import ids_from_string
from .tokens import group_by
from .lex import sic_of
from .lex import lex_of
from .lex import normed_of
from .lex import first_of
from .lex import last_three_of
from .lex import cluster_of
from .lex import prob_of
from .lex import is_oft_upper
from .lex import is_oft_title
from .lex import can_noun
from .lex import can_verb
from .lex import can_adj
from .lex import can_adv
"""

18
spacy/en.pxd Normal file

@@ -0,0 +1,18 @@
from libcpp.vector cimport vector
from spacy.spacy cimport StringHash
from spacy.spacy cimport Lexeme
from spacy.spacy cimport Lexeme_addr
from spacy.spacy cimport Language
from spacy.tokens cimport Tokens
cdef class English(spacy.Language):
cdef int find_split(self, unicode word, size_t length)
cdef English EN
cpdef Lexeme_addr lookup(unicode word) except 0
cpdef Tokens tokenize(unicode string)
cpdef unicode unhash(StringHash hash_value)

64
spacy/en.pyx Normal file

@@ -0,0 +1,64 @@
# cython: profile=True
'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
so that strings can be retrieved from hashes. Use 64-bit hash values and
boldly assume no collisions.
'''
from __future__ import unicode_literals
from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector
from spacy.string_tools cimport substr
from . import util
cimport spacy
cdef class English(spacy.Language):
cdef int find_split(self, unicode word, size_t length):
cdef int i = 0
# Contractions
if word.endswith("'s"):
return length - 2
# Leading punctuation
if is_punct(word, 0, length):
return 1
elif length >= 1:
# Split off all trailing punctuation characters
i = 0
while i < length and not is_punct(word, i, length):
i += 1
return i
cdef bint is_punct(unicode word, size_t i, size_t length):
# Don't count apostrophes as punct if the next char is a letter
if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
# ...Unless we're at 0
return i == 0
if word[i] == "-" and i < (length - 1) and word[i+1] == '-':
return False
# Don't count commas as punct if the next char is a number
if word[i] == "," and i < (length - 1) and word[i+1].isdigit():
return False
# Don't count periods as punct if the next char is not whitespace
if word[i] == "." and i < (length - 1) and not word[i+1].isspace():
return False
return not word[i].isalnum()
EN = English('en')
cpdef Tokens tokenize(unicode string):
return EN.tokenize(string)
cpdef Lexeme_addr lookup(unicode string) except 0:
return EN.lookup(-1, string, len(string))
cpdef unicode unhash(StringHash hash_value):
return EN.unhash(hash_value)
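
find_split decides where a whitespace-delimited chunk should be cut: a trailing 's clitic, a single leading punctuation character, or otherwise the span of leading non-punctuation characters. A pure-Python paraphrase of the two functions above (a sketch, not the compiled Cython) makes the behaviour easy to poke at; chunks with no internal split point, such as "don't", are handled by the rule table in data/en/tokenization instead.

def is_punct(word, i, length):
    # apostrophe before a letter only counts as punctuation at position 0
    if word[i] == "'" and i < length - 1 and word[i + 1].isalpha():
        return i == 0
    if word[i] == "-" and i < length - 1 and word[i + 1] == "-":
        return False
    # keep "1,000" and "U.S." style sequences together
    if word[i] == "," and i < length - 1 and word[i + 1].isdigit():
        return False
    if word[i] == "." and i < length - 1 and not word[i + 1].isspace():
        return False
    return not word[i].isalnum()

def find_split(word):
    length = len(word)
    if word.endswith("'s"):
        return length - 2
    if is_punct(word, 0, length):
        return 1
    i = 0
    while i < length and not is_punct(word, i, length):
        i += 1
    return i

for w in ("Mike's", "(hello", "world!", "don't"):
    print(w, find_split(w))
# Mike's 4   (hello 1   world! 5   don't 5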

18
spacy/en_ptb.pxd Normal file

@@ -0,0 +1,18 @@
from libcpp.vector cimport vector
from spacy.spacy cimport StringHash
from spacy.spacy cimport Language
from spacy.spacy cimport Lexeme
from spacy.spacy cimport Lexeme_addr
from spacy.tokens cimport Tokens
cdef class EnglishPTB(Language):
cdef int find_split(self, unicode word, size_t length)
cdef EnglishPTB EN_PTB
cpdef Lexeme_addr lookup(unicode word) except 0
cpdef Tokens tokenize(unicode string)
cpdef unicode unhash(StringHash hash_value)

60
spacy/en_ptb.pyx Normal file

@@ -0,0 +1,60 @@
'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
so that strings can be retrieved from hashes. Use 64-bit hash values and
boldly assume no collisions.
'''
from __future__ import unicode_literals
from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector
from spacy.string_tools cimport substr
from spacy.spacy cimport Language
from . import util
cimport spacy
cdef class EnglishPTB(Language):
cdef int find_split(self, unicode word, size_t length):
cdef int i = 0
# Contractions
if word.endswith("'s"):
return length - 2
# Leading punctuation
if is_punct(word, 0, length):
return 1
elif length >= 1:
# Split off all trailing punctuation characters
i = 0
while i < length and not is_punct(word, i, length):
i += 1
return i
cdef bint is_punct(unicode word, size_t i, size_t length):
is_final = i == (length - 1)
if word[i] == '.':
return False
if not is_final and word[i] == '-' and word[i+1] == '-':
return True
# Don't count apostrophes as punct if the next char is a letter
if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
return False
punct_chars = set(',;:' + '@#$%&' + '!?' + '[({' + '})]')
return word[i] in punct_chars
cdef EnglishPTB EN_PTB = EnglishPTB('en_ptb')
cpdef Tokens tokenize(unicode string):
return EN_PTB.tokenize(string)
cpdef Lexeme_addr lookup(unicode string) except 0:
return EN_PTB.lookup(-1, string, len(string))
cpdef unicode unhash(StringHash hash_value):
return EN_PTB.unhash(hash_value)

56
spacy/lexeme.pxd Normal file

@@ -0,0 +1,56 @@
from libc.stdint cimport uint64_t
# Put these above import to avoid circular import problem
ctypedef int ClusterID
ctypedef uint64_t StringHash
ctypedef size_t Lexeme_addr
ctypedef char Bits8
ctypedef uint64_t Bits64
from spacy.spacy cimport Language
cdef struct Orthography:
StringHash last3
StringHash shape
StringHash norm
Py_UNICODE first
Bits8 flags
cdef struct Distribution:
double prob
ClusterID cluster
Bits64 tagdict
Bits8 flags
cdef struct Lexeme:
StringHash sic # Hash of the original string
StringHash lex # Hash of the word, with punctuation and clitics split off
Distribution* dist # Distribution info, lazy loaded
Orthography* orth # Extra orthographic views
Lexeme* tail # Lexemes are linked lists, to deal with sub-tokens
cdef Lexeme BLANK_WORD = Lexeme(0, 0, NULL, NULL, NULL)
cdef enum StringAttr:
SIC
LEX
NORM
SHAPE
LAST3
cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0
cpdef StringHash sic_of(size_t lex_id) except 0
cpdef StringHash lex_of(size_t lex_id) except 0
cpdef StringHash norm_of(size_t lex_id) except 0
cpdef StringHash shape_of(size_t lex_id) except 0
cpdef StringHash last3_of(size_t lex_id) except 0
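
Lexeme is a plain C struct: sic hashes the raw whitespace-delimited chunk, lex hashes the cleaned word, orth and dist lazily point at extra orthographic views and distributional statistics, and tail chains any sub-tokens split off the same chunk. A Python mirror, purely illustrative and not the real type:

from dataclasses import dataclass
from typing import Optional

@dataclass
class PyLexeme:
    """Illustrative Python stand-in for the C Lexeme struct."""
    sic: int                            # hash of the raw chunk
    lex: int                            # hash of the word with punctuation/clitics split off
    dist: Optional[dict] = None         # stands in for Distribution* (prob, cluster, flags)
    orth: Optional[dict] = None         # stands in for Orthography* (last3, shape, norm, first)
    tail: Optional["PyLexeme"] = None   # next sub-token split from the same chunk

# "Mike's" becomes two chained lexemes: "Mike" -> "'s"
clitic = PyLexeme(sic=hash("'s"), lex=hash("'s"))
head = PyLexeme(sic=hash("Mike's"), lex=hash("Mike"), tail=clitic)
node = head
while node is not None:
    print(node.lex)
    node = node.tail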

170
spacy/lexeme.pyx Normal file

@@ -0,0 +1,170 @@
# cython: profile=True
'''Accessors for Lexeme properties, given a lex_id, which is cast to a Lexeme*.
Mostly useful from Python-space. From Cython-space, you can just cast to
Lexeme* yourself.
'''
from __future__ import unicode_literals
from spacy.string_tools cimport substr
from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector
from spacy.spacy cimport StringHash
# Reiterate the enum, for python
#SIC = StringAttr.sic
#LEX = StringAttr.lex
#NORM = StringAttr.norm
#SHAPE = StringAttr.shape
#LAST3 = StringAttr.last3
cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
if attr == SIC:
return sic_of(lex_id)
elif attr == LEX:
return lex_of(lex_id)
elif attr == NORM:
return norm_of(lex_id)
elif attr == SHAPE:
return shape_of(lex_id)
elif attr == LAST3:
return last3_of(lex_id)
else:
raise StandardError
cpdef StringHash sic_of(size_t lex_id) except 0:
'''Access the `sic' field of the Lexeme pointed to by lex_id.
The sic field stores the hash of the whitespace-delimited string-chunk used to
construct the Lexeme.
>>> [unhash(sic_of(lex_id)) for lex_id in from_string(u'Hi! world')]
[u'Hi!', u'', u'world']
'''
return (<Lexeme*>lex_id).sic
cpdef StringHash lex_of(size_t lex_id) except 0:
'''Access the `lex' field of the Lexeme pointed to by lex_id.
The lex field is the hash of the string you would expect to get back from
a standard tokenizer, i.e. the word with punctuation and other non-whitespace
delimited tokens split off. The other fields refer to properties of the
string that the lex field stores a hash of, except sic and tail.
>>> [unhash(lex_of(lex_id)) for lex_id in from_string(u'Hi! world')]
[u'Hi', u'!', u'world']
'''
return (<Lexeme*>lex_id).lex
cpdef StringHash norm_of(size_t lex_id) except 0:
'''Access the `norm' field of the Lexeme pointed to by lex_id.
The norm field stores the hash of a normalized view of the word: lower-case
alphabetic words are kept as-is, and anything else falls back to the word
shape produced by get_normalized in spacy.pyx.
'''
return (<Lexeme*>lex_id).orth.norm
cpdef StringHash shape_of(size_t lex_id) except 0:
return (<Lexeme*>lex_id).orth.shape
cpdef StringHash last3_of(size_t lex_id) except 0:
'''Access the `last3' field of the Lexeme pointed to by lex_id, which stores
the hash of the last three characters of the word:
>>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
>>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
[u'llo', u'!']
'''
return (<Lexeme*>lex_id).orth.last3
cpdef ClusterID cluster_of(size_t lex_id):
'''Access the `cluster' field of the Lexeme pointed to by lex_id, which
gives an integer representation of the cluster ID of the word,
which should be understood as a binary address:
>>> strings = (u'pineapple', u'apple', u'dapple', u'scalable')
>>> token_ids = [lookup(s) for s in strings]
>>> clusters = [cluster_of(t) for t in token_ids]
>>> print ["{0:b"} % cluster_of(t) for t in token_ids]
["100111110110", "100111100100", "01010111011001", "100111110110"]
The clusterings are unideal, but often slightly useful.
"pineapple" and "apple" share a long prefix, indicating a similar meaning,
while "dapple" is totally different. On the other hand, "scalable" receives
the same cluster ID as "pineapple", which is not what we'd like.
'''
return (<Lexeme*>lex_id).dist.cluster
cpdef Py_UNICODE first_of(size_t lex_id):
'''Access the `first' field of the Lexeme pointed to by lex_id, which
stores the first character of the lex string of the word.
>>> lex_id = lookup(u'Hello')
>>> unhash(first_of(lex_id))
u'H'
'''
return (<Lexeme*>lex_id).orth.first
cpdef double prob_of(size_t lex_id):
'''Access the `prob' field of the Lexeme pointed to by lex_id, which stores
the smoothed unigram log probability of the word, as estimated from a large
text corpus. By default, probabilities are based on counts from Gigaword,
smoothed using Kneser-Ney; but any probabilities file can be supplied to
load_probs.
>>> prob_of(lookup(u'world'))
-20.10340371976182
'''
return (<Lexeme*>lex_id).dist.prob
cpdef bint is_oft_upper(size_t lex_id):
'''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
stores whether the lowered version of the string hashed by `lex' is found
in all-upper case frequently in a large sample of text. Users are free
to load different data, by default we use a sample from Wikipedia, with
a threshold of 0.95, picked to maximize mutual information for POS tagging.
>>> is_oft_upper(lookup(u'abc'))
True
>>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
True
'''
return False
#cdef Lexeme* w = <Lexeme*>lex_id
#return w.orth.last3 if w.orth != NULL else 0
#return (<Lexeme*>lex_id).oft_upper
cpdef bint is_oft_title(size_t lex_id):
'''Access the `oft_title' field of the Lexeme pointed to by lex_id, which
stores whether the lowered version of the string hashed by `lex' is found
title-cased frequently in a large sample of text. Users are free
to load different data, by default we use a sample from Wikipedia, with
a threshold of 0.3, picked to maximize mutual information for POS tagging.
>>> is_oft_title(lookup(u'marcus'))
True
>>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
True
'''
return False
#return (<Lexeme*>lex_id).oft_title
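
The cluster_of docstring above reads Brown cluster IDs as binary addresses in which shared prefixes indicate distributional similarity. A small helper, assuming cluster path strings like those in data/en/clusters, shows the prefix comparison the docstring describes:

def shared_prefix(cluster_a: str, cluster_b: str) -> int:
    """Length of the common bit-prefix of two Brown cluster path strings."""
    n = 0
    for a, b in zip(cluster_a, cluster_b):
        if a != b:
            break
        n += 1
    return n

# Values taken from the docstring example; treat them as illustrative only.
print(shared_prefix("100111110110", "100111100100"))    # pineapple vs apple -> 6
print(shared_prefix("100111110110", "01010111011001"))  # pineapple vs dapple -> 0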

44
spacy/spacy.pxd Normal file

@@ -0,0 +1,44 @@
from libcpp.vector cimport vector
from libc.stdint cimport uint64_t
from ext.sparsehash cimport dense_hash_map
# Circular import problems here
ctypedef size_t Lexeme_addr
ctypedef uint64_t StringHash
ctypedef dense_hash_map[StringHash, size_t] Vocab
from spacy.lexeme cimport Lexeme
from spacy.tokens cimport Tokens
# Put these above import to avoid circular import problem
ctypedef char Bits8
ctypedef uint64_t Bits64
ctypedef int ClusterID
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport Distribution
from spacy.lexeme cimport Orthography
cdef class Language:
cdef object name
cdef Vocab* vocab
cdef Vocab* distri
cdef Vocab* ortho
cdef dict bacov
cdef int find_split(self, unicode word, size_t length)
cdef Lexeme_addr lookup(self, int split, Py_UNICODE* string, size_t length) except 0
cdef StringHash hash_string(self, Py_UNICODE* string, size_t length) except 0
cdef unicode unhash(self, StringHash hashed)
cpdef Tokens tokenize(self, unicode text)
cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length)
cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
int split, size_t length)
cdef Orthography* init_orth(self, StringHash hashed, unicode lex)
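
Language keeps three dense_hash_map tables keyed by 64-bit string hashes (vocab: hash to Lexeme*, with ortho and distri caching the orthography and distribution blocks), plus bacov, a Python dict mapping hashes back to strings ("vocab" reversed) so unhash() can recover text. A dict-based sketch of that two-way flow, assuming any 64-bit string hash as a stand-in for MurmurHash64A:

def hash_string(s: str) -> int:
    return hash(s) & 0xFFFFFFFFFFFFFFFF   # stand-in for MurmurHash64A

vocab = {}   # hash -> lexeme record (a Lexeme* in the Cython code)
bacov = {}   # hash -> original string, so hashes can be "unhashed"

def lookup(word: str):
    h = hash_string(word)
    if h not in vocab:
        vocab[h] = {"lex": h}   # the real code builds a full Lexeme here
        bacov[h] = word
    return vocab[h]

def unhash(h: int) -> str:
    return bacov[h]

lex = lookup("world")
print(unhash(lex["lex"]))   # world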

235
spacy/spacy.pyx Normal file

@@ -0,0 +1,235 @@
# cython: profile=True
from __future__ import unicode_literals
from libc.stdlib cimport calloc, free
from ext.murmurhash cimport MurmurHash64A
from ext.murmurhash cimport MurmurHash64B
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport BLANK_WORD
from spacy.string_tools cimport substr
from . import util
from os import path
cimport cython
def get_normalized(unicode lex, size_t length):
if lex.isalpha() and lex.islower():
return lex
else:
return get_word_shape(lex, length)
def get_word_shape(lex, length):
shape = ""
last = ""
shape_char = ""
seq = 0
for c in lex:
if c.isalpha():
if c.isupper():
shape_char = "X"
else:
shape_char = "x"
elif c.isdigit():
shape_char = "d"
else:
shape_char = c
if shape_char == last:
seq += 1
else:
seq = 0
last = shape_char
if seq < 3:
shape += shape_char
assert shape
return shape
def set_orth_flags(lex, length):
return 0
cdef class Language:
def __cinit__(self, name):
self.name = name
self.bacov = {}
self.vocab = new Vocab()
self.ortho = new Vocab()
self.distri = new Vocab()
self.vocab[0].set_empty_key(0)
self.distri[0].set_empty_key(0)
self.ortho[0].set_empty_key(0)
self.load_tokenization(util.read_tokenization(name))
def load_tokenization(self, token_rules=None):
cdef Lexeme* word
cdef StringHash hashed
for chunk, lex, tokens in token_rules:
hashed = self.hash_string(chunk, len(chunk))
word = self._add(hashed, lex, len(lex), len(lex))
for i, lex in enumerate(tokens):
token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
length = len(token_string)
hashed = self.hash_string(token_string, length)
word.tail = self._add(hashed, lex, 0, len(lex))
word = word.tail
def load_clusters(self):
cdef Lexeme* w
data_dir = path.join(path.dirname(__file__), '..', 'data', 'en')
case_stats = util.load_case_stats(data_dir)
brown_loc = path.join(data_dir, 'clusters')
cdef size_t start
cdef int end
with util.utf8open(brown_loc) as browns_file:
for i, line in enumerate(browns_file):
cluster_str, token_string, freq_str = line.split()
# Decode as a little-endian string, so that we can do & 15 to get
# the first 4 bits. See redshift._parse_features.pyx
cluster = int(cluster_str[::-1], 2)
upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0))
hashed = self.hash_string(token_string, len(token_string))
word = self._add(hashed, token_string,
len(token_string), len(token_string))
cdef StringHash hash_string(self, Py_UNICODE* s, size_t length) except 0:
'''Hash unicode with MurmurHash64A'''
return MurmurHash64A(<Py_UNICODE*>s, length * sizeof(Py_UNICODE), 0)
cdef unicode unhash(self, StringHash hash_value):
'''Fetch a string from the reverse index, given its hash value.'''
return self.bacov[hash_value]
cdef Lexeme_addr lookup(self, int start, Py_UNICODE* string, size_t length) except 0:
'''Fetch a Lexeme representing a word string. If the word has not been seen,
construct one, splitting off any attached punctuation or clitics. A
reference to BLANK_WORD is returned for the empty string.
To specify the boundaries of the word if it has not been seen, use lookup_chunk.
'''
if length == 0:
return <Lexeme_addr>&BLANK_WORD
cdef StringHash hashed = self.hash_string(string, length)
cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
if word_ptr == NULL:
start = self.find_split(string, length) if start == -1 else start
word_ptr = self._add(hashed, string, start, length)
return <Lexeme_addr>word_ptr
cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
word = self.init_lexeme(string, hashed, split, length)
self.vocab[0][hashed] = <Lexeme_addr>word
self.bacov[hashed] = string
return word
cpdef Tokens tokenize(self, unicode string):
cdef size_t length = len(string)
cdef Py_UNICODE* characters = <Py_UNICODE*>string
cdef size_t i
cdef Py_UNICODE c
cdef Tokens tokens = Tokens(self)
cdef Py_UNICODE* current = <Py_UNICODE*>calloc(len(string), sizeof(Py_UNICODE))
cdef size_t word_len = 0
cdef Lexeme* token
for i in range(length):
c = characters[i]
if _is_whitespace(c):
if word_len != 0:
token = <Lexeme*>self.lookup(-1, current, word_len)
while token != NULL:
tokens.append(<Lexeme_addr>token)
token = token.tail
for j in range(word_len+1):
current[j] = 0
word_len = 0
else:
current[word_len] = c
word_len += 1
if word_len != 0:
token = <Lexeme*>self.lookup(-1, current, word_len)
while token != NULL:
tokens.append(<Lexeme_addr>token)
token = token.tail
free(current)
return tokens
cdef int find_split(self, unicode word, size_t length):
return -1
cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
int split, size_t length):
cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
word.sic = hashed
cdef unicode tail_string
cdef unicode lex
if split != 0 and split < length:
lex = substr(string, 0, split, length)
tail_string = substr(string, split, length, length)
else:
lex = string
tail_string = ''
word.lex = self.hash_string(lex, len(lex))
self.bacov[word.lex] = lex
word.orth = <Orthography*>self.ortho[0][word.lex]
if word.orth == NULL:
word.orth = self.init_orth(word.lex, lex)
word.dist = <Distribution*>self.distri[0][word.lex]
# Now recurse, and deal with the tail
if tail_string:
word.tail = <Lexeme*>self.lookup(-1, tail_string, len(tail_string))
return word
cdef Orthography* init_orth(self, StringHash hashed, unicode lex):
cdef Orthography* orth = <Orthography*>calloc(1, sizeof(Orthography))
orth.first = <Py_UNICODE>lex[0]
cdef int length = len(lex)
orth.flags = set_orth_flags(lex, length)
cdef unicode last3 = substr(lex, length - 3, length, length)
cdef unicode norm = get_normalized(lex, length)
cdef unicode shape = get_word_shape(lex, length)
orth.last3 = self.hash_string(last3, len(last3))
orth.shape = self.hash_string(shape, len(shape))
orth.norm = self.hash_string(norm, len(norm))
self.bacov[orth.last3] = last3
self.bacov[orth.shape] = shape
self.bacov[orth.norm] = norm
self.ortho[0][hashed] = <size_t>orth
return orth
cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
if c == ' ':
return True
elif c == '\n':
return True
elif c == '\t':
return True
else:
return False
cpdef vector[size_t] expand_chunk(size_t addr) except *:
cdef vector[size_t] tokens = vector[size_t]()
word = <Lexeme*>addr
while word != NULL:
tokens.push_back(<size_t>word)
word = word.tail
return tokens
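
get_word_shape compresses a word into a character-class signature (X for upper-case, x for lower-case, d for digits, other characters kept as-is), truncating runs of the same class after three characters, and get_normalized keeps plain lower-case words but falls back to the shape for everything else. The same logic in plain Python, as a sketch:

def word_shape(lex: str) -> str:
    shape = ""
    last = ""
    seq = 0
    for c in lex:
        if c.isalpha():
            shape_char = "X" if c.isupper() else "x"
        elif c.isdigit():
            shape_char = "d"
        else:
            shape_char = c
        seq = seq + 1 if shape_char == last else 0
        last = shape_char
        if seq < 3:            # cap repeated classes at three characters
            shape += shape_char
    return shape

def normalized(lex: str) -> str:
    return lex if lex.isalpha() and lex.islower() else word_shape(lex)

print(word_shape("Hello"), word_shape("C++14"), normalized("the"), normalized("1999"))
# Xxxx X++dd the ddd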

3
spacy/string_tools.pxd Normal file

@@ -0,0 +1,3 @@
cpdef unicode substr(unicode string, int start, int end, size_t length)
cdef bint is_whitespace(Py_UNICODE c)

28
spacy/string_tools.pyx Normal file

@@ -0,0 +1,28 @@
# cython: profile=True
cpdef unicode substr(unicode string, int start, int end, size_t length):
if end >= length:
end = -1
if start >= length:
start = 0
if start <= 0 and end < 0:
return string
elif start < 0:
start = 0
elif end < 0:
end = length
return string[start:end]
cdef bint is_whitespace(Py_UNICODE c):
# TODO: Support other unicode spaces
# https://www.cs.tut.fi/~jkorpela/chars/spaces.html
if c == u' ':
return True
elif c == u'\n':
return True
elif c == u'\t':
return True
else:
return False

18
spacy/tokens.pxd Normal file

@@ -0,0 +1,18 @@
from libcpp.vector cimport vector
from spacy.spacy cimport Lexeme_addr
from cython.operator cimport dereference as deref
from spacy.spacy cimport Language
from spacy.lexeme cimport StringAttr
cdef class Tokens:
cdef Language lang
cdef vector[Lexeme_addr]* vctr
cdef size_t length
cpdef int append(self, Lexeme_addr token)
cpdef int extend(self, Tokens other) except -1
cpdef object group_by(self, StringAttr attr)
cpdef dict count_by(self, StringAttr attr)

89
spacy/tokens.pyx Normal file

@@ -0,0 +1,89 @@
from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as inc
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport attr_of, norm_of, shape_of
from spacy.spacy cimport StringHash
cdef class Tokens:
def __cinit__(self, Language lang):
self.lang = lang
self.vctr = new vector[Lexeme_addr]()
self.length = 0
def __dealloc__(self):
del self.vctr
def __iter__(self):
cdef vector[Lexeme_addr].iterator it = self.vctr[0].begin()
while it != self.vctr[0].end():
yield deref(it)
inc(it)
def __getitem__(self, size_t idx):
return self.vctr[0].at(idx)
def __len__(self):
return self.length
cpdef int append(self, Lexeme_addr token):
self.vctr[0].push_back(token)
self.length += 1
cpdef int extend(self, Tokens other) except -1:
cdef Lexeme_addr el
for el in other:
self.append(el)
cpdef object group_by(self, StringAttr attr):
'''Group tokens that share the property attr into Tokens instances, and
return a list of them. Returns a tuple of three lists:
(string names, hashes, tokens)
The lists are aligned, so the ith entry in string names is the string
that the ith entry in hashes unhashes to, which the Tokens instance
is grouped by.
You can then use count_by or group_by on the Tokens
for further processing. Calling group_by and then asking the length
of the Tokens objects is equivalent to count_by, but somewhat slower.
'''
# Implementation here is working around some of the constraints in
# Cython about what type of thing can go in what type of container.
# Long story short, it's pretty hard to get a Python object like
# Tokens into a vector or array. If we really need this to run faster,
# we can be tricky and get the Python list access out of the loop. What
# we'd do is store pointers to the underlying vectors.
# So far, speed isn't mattering here.
cdef dict indices = {}
cdef list groups = []
cdef list names = []
cdef list hashes = []
cdef StringHash key
cdef Lexeme_addr t
for t in self.vctr[0]:
key = attr_of(t, attr)
if key in indices:
groups[indices[key]].append(t)
else:
indices[key] = len(groups)
groups.append(Tokens(self.lang))
names.append(self.lang.unhash(key))
hashes.append(key)
groups[-1].append(t)
return names, hashes, groups
cpdef dict count_by(self, StringAttr attr):
counts = {}
cdef Lexeme_addr t
cdef StringHash key
for t in self.vctr[0]:
key = attr_of(t, attr)
if key not in counts:
counts[key] = 0
counts[key] += 1
return counts
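
group_by buckets token addresses by one of the hashed string views and returns aligned (names, hashes, groups) lists; count_by is the histogram version. The same idea over plain strings, as an illustrative sketch rather than the Cython class:

def group_by(items, key):
    """Return aligned (names, keys, groups) lists, mirroring Tokens.group_by."""
    indices, names, keys, groups = {}, [], [], []
    for item in items:
        k = key(item)
        if k not in indices:
            indices[k] = len(groups)
            names.append(str(k))
            keys.append(k)
            groups.append([])
        groups[indices[k]].append(item)
    return names, keys, groups

def count_by(items, key):
    counts = {}
    for item in items:
        counts[key(item)] = counts.get(key(item), 0) + 1
    return counts

words = "I like the red one and I like the blue one".split()
names, keys, groups = group_by(words, key=lambda w: w)
print(names[0], len(groups[0]))                 # I 2
print(count_by(words, key=lambda w: w)["like"])  # 2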

44
spacy/util.py Normal file

@@ -0,0 +1,44 @@
import os
from os import path
import codecs
DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
def utf8open(loc, mode='r'):
return codecs.open(loc, mode, 'utf8')
def load_case_stats(data_dir):
case_loc = path.join(data_dir, 'case')
case_stats = {}
with utf8open(case_loc) as cases_file:
for line in cases_file:
word, upper, title = line.split()
case_stats[word] = (float(upper), float(title))
return case_stats
def read_tokenization(lang):
loc = path.join(DATA_DIR, lang, 'tokenization')
entries = []
seen = set()
with utf8open(loc) as file_:
for line in file_:
line = line.strip()
if line.startswith('#'):
continue
if not line:
continue
pieces = line.split()
chunk = pieces.pop(0)
lex = pieces.pop(0)
assert chunk not in seen, chunk
seen.add(chunk)
entries.append((chunk, lex, pieces))
if chunk[0].isalpha() and chunk[0].islower():
chunk = chunk[0].title() + chunk[1:]
lex = lex[0].title() + lex[1:]
seen.add(chunk)
entries.append((chunk, lex, pieces))
return entries

0
tests/my_test.py Normal file

4
tests/sun.tokens Normal file

@@ -0,0 +1,4 @@
The Sun is the star at the center of the Solar System. It is almost perfectly spherical and consists of hot plasma interwoven with magnetic fields. [ 12 ] [ 13 ] It has a diameter of about 1 , 392 , 684 km ( 865 , 374 mi ) , [ 5 ] around 109 times that of Earth , and its mass ( 1.989×1030 kilograms , approximately 330 , 000 times the mass of Earth ) accounts for about 99.86 % of the total mass of the Solar System. [ 14 ] Chemically , about three quarters of the Sun 's mass consists of hydrogen , while the rest is mostly helium. The remaining 1.69 % ( equal to 5 , 600 times the mass of Earth ) consists of heavier elements , including oxygen , carbon , neon and iron , among others. [ 15 ]
The Sun formed about 4.567 billion [ a ] [ 16 ] years ago from the gravitational collapse of a region within a large molecular cloud. Most of the matter gathered in the center , while the rest flattened into an orbiting disk that would become the Solar System. The central mass became increasingly hot and dense , eventually initiating thermonuclear fusion in its core. It is thought that almost all stars form by this process. The Sun is a G-type main-sequence star ( G2V ) based on spectral class and it is informally designated as a yellow dwarf because its visible radiation is most intense in the yellow-green portion of the spectrum , and although it is actually white in color , from the surface of the Earth it may appear yellow because of atmospheric scattering of blue light. [ 17 ] In the spectral class label , G2 indicates its surface temperature , of approximately 5778 K ( 5505 °C ) , and V indicates that the Sun , like most stars , is a main-sequence star , and thus generates its energy by nuclear fusion of hydrogen nuclei into helium. In its core , the Sun fuses about 620 million metric tons of hydrogen each second. [ 18 ] [ 19 ]
Once regarded by astronomers as a small and relatively insignificant star , the Sun is now thought to be brighter than about 85 % of the stars in the Milky Way , most of which are red dwarfs. [ 20 ] [ 21 ] The absolute magnitude of the Sun is +4.83 ; however , as the star closest to Earth , the Sun is by far the brightest object in the sky with an apparent magnitude of 26.74. [ 22 ] [ 23 ] This is about 13 billion times brighter than the next brightest star , Sirius , with an apparent magnitude of 1.46. The Sun 's hot corona continuously expands in space creating the solar wind , a stream of charged particles that extends to the heliopause at roughly 100 astronomical units. The bubble in the interstellar medium formed by the solar wind , the heliosphere , is the largest continuous structure in the Solar System. [ 24 ] [ 25 ]

4
tests/sun.txt Normal file

@@ -0,0 +1,4 @@
The Sun is the star at the center of the Solar System. It is almost perfectly spherical and consists of hot plasma interwoven with magnetic fields.[12][13] It has a diameter of about 1,392,684 km (865,374 mi),[5] around 109 times that of Earth, and its mass (1.989×1030 kilograms, approximately 330,000 times the mass of Earth) accounts for about 99.86% of the total mass of the Solar System.[14] Chemically, about three quarters of the Sun's mass consists of hydrogen, while the rest is mostly helium. The remaining 1.69% (equal to 5,600 times the mass of Earth) consists of heavier elements, including oxygen, carbon, neon and iron, among others.[15]
The Sun formed about 4.567 billion[a][16] years ago from the gravitational collapse of a region within a large molecular cloud. Most of the matter gathered in the center, while the rest flattened into an orbiting disk that would become the Solar System. The central mass became increasingly hot and dense, eventually initiating thermonuclear fusion in its core. It is thought that almost all stars form by this process. The Sun is a G-type main-sequence star (G2V) based on spectral class and it is informally designated as a yellow dwarf because its visible radiation is most intense in the yellow-green portion of the spectrum, and although it is actually white in color, from the surface of the Earth it may appear yellow because of atmospheric scattering of blue light.[17] In the spectral class label, G2 indicates its surface temperature, of approximately 5778 K (5505 °C), and V indicates that the Sun, like most stars, is a main-sequence star, and thus generates its energy by nuclear fusion of hydrogen nuclei into helium. In its core, the Sun fuses about 620 million metric tons of hydrogen each second.[18][19]
Once regarded by astronomers as a small and relatively insignificant star, the Sun is now thought to be brighter than about 85% of the stars in the Milky Way, most of which are red dwarfs.[20][21] The absolute magnitude of the Sun is +4.83; however, as the star closest to Earth, the Sun is by far the brightest object in the sky with an apparent magnitude of 26.74.[22][23] This is about 13 billion times brighter than the next brightest star, Sirius, with an apparent magnitude of 1.46. The Sun's hot corona continuously expands in space creating the solar wind, a stream of charged particles that extends to the heliopause at roughly 100 astronomical units. The bubble in the interstellar medium formed by the solar wind, the heliosphere, is the largest continuous structure in the Solar System.[24][25]

View File

@ -0,0 +1,44 @@
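# Contraction-expansion tests: each case looks up a contracted chunk and runs
# it through expand_chunk, then checks the expanded pieces, e.g. "ain't" ->
# "are" + "not" and "we'll" -> "we" + "will".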
from __future__ import unicode_literals

from spacy.spacy import expand_chunk
from spacy.en import lookup, unhash
from spacy import lex_of


def test_possess():
    tokens = expand_chunk(lookup("Mike's"))
    assert len(tokens) == 2
    assert unhash(lex_of(tokens[0])) == "Mike"
    assert unhash(lex_of(tokens[1])) == "'s"


def test_apostrophe():
    tokens = expand_chunk(lookup("schools'"))
    assert len(tokens) == 2
    assert unhash(lex_of(tokens[1])) == "'"
    assert unhash(lex_of(tokens[0])) == "schools"


def test_LL():
    tokens = expand_chunk(lookup("we'll"))
    assert len(tokens) == 2
    assert unhash(lex_of(tokens[1])) == "will"
    assert unhash(lex_of(tokens[0])) == "we"


def test_aint():
    tokens = expand_chunk(lookup("ain't"))
    assert len(tokens) == 2
    assert unhash(lex_of(tokens[0])) == "are"
    assert unhash(lex_of(tokens[1])) == "not"


def test_capitalized():
    tokens = expand_chunk(lookup("can't"))
    assert len(tokens) == 2
    tokens = expand_chunk(lookup("Can't"))
    assert len(tokens) == 2
    tokens = expand_chunk(lookup("Ain't"))
    assert len(tokens) == 2
    assert unhash(lex_of(tokens[0])) == "Are"

35
tests/test_group_by.py Normal file
View File

@ -0,0 +1,35 @@
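# Tests for Tokens.group_by: as used here it returns parallel (names, hashes,
# groups) sequences, one entry per distinct value of the requested attribute
# (LEX groups repeated word forms; LAST3 appears to group tokens sharing their
# last three characters).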
from __future__ import unicode_literals

import pytest

from spacy import en
from spacy.lexeme import lex_of
from spacy import SIC, LEX, NORM, SHAPE, LAST3


def test_group_by_lex():
    tokens = en.tokenize("I like the red one and I like the blue one")
    names, hashes, groups = tokens.group_by(LEX)
    assert len(groups[0]) == 2
    assert en.unhash(lex_of(groups[0][0])) == 'I'
    assert names[0] == 'I'
    assert len(groups[1]) == 2
    assert en.unhash(lex_of(groups[1][0])) == 'like'
    assert names[1] == "like"
    assert len(groups[2]) == 2
    assert len(groups[3]) == 1


def test_group_by_last3():
    tokens = en.tokenize("I the blithe swarthy mate ate on the filthy deck")
    names, hashes, groups = tokens.group_by(LAST3)
    assert len(groups[0]) == 1
    assert en.unhash(lex_of(groups[0][0])) == 'I'
    assert len(groups[1]) == 3
    assert en.unhash(lex_of(groups[1][0])) == 'the'
    assert len(groups[2]) == 2
    assert len(groups[3]) == 2
    assert len(groups[4]) == 1

16
tests/test_orth.py Normal file
View File

@ -0,0 +1,16 @@
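# Orthographic-feature test: shape_of reduces a string to its shape code, so
# the letter/digit mix "C3P0" comes back as "XdXd" (uppercase letter -> X,
# digit -> d).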
from __future__ import unicode_literals

import pytest

from spacy.en import lookup, unhash
from spacy.lexeme import sic_of, lex_of, norm_of, shape_of, first_of


@pytest.fixture
def C3P0():
    return lookup("C3P0")


def test_shape(C3P0):
    assert unhash(shape_of(C3P0)) == "XdXd"

47
tests/test_post_punct.py Normal file
View File

@ -0,0 +1,47 @@
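# Trailing-punctuation tests: chunks such as "Hello)" or "Hello)))" should be
# expanded into the word followed by the closing punctuation tokens.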
from __future__ import unicode_literals

import pytest

from spacy import lex_of
from spacy.spacy import expand_chunk
from spacy.en import lookup
from spacy.en import unhash


@pytest.fixture
def close_puncts():
    return [')', ']', '}', '*']


def test_close(close_puncts):
    word_str = 'Hello'
    for p in close_puncts:
        string = word_str + p
        token = lookup(string)
        tokens = expand_chunk(token)
        assert len(tokens) == 2
        assert unhash(lex_of(tokens[1])) == p
        assert unhash(lex_of(tokens[0])) == word_str


def test_two_different_close(close_puncts):
    word_str = 'Hello'
    for p in close_puncts:
        string = word_str + p + "'"
        token = lookup(string)
        assert unhash(lex_of(token)) == word_str
        tokens = expand_chunk(token)
        assert len(tokens) == 3
        assert unhash(lex_of(tokens[0])) == word_str
        assert unhash(lex_of(tokens[1])) == p
        assert unhash(lex_of(tokens[2])) == "'"


def test_three_same_close(close_puncts):
    word_str = 'Hello'
    for p in close_puncts:
        string = word_str + p + p + p
        tokens = expand_chunk(lookup(string))
        assert len(tokens) == 4
        assert unhash(lex_of(tokens[0])) == word_str
        assert unhash(lex_of(tokens[1])) == p

57
tests/test_pre_punct.py Normal file
View File

@ -0,0 +1,57 @@
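# Leading-punctuation tests: chunks such as "(Hello" or "'The" should be
# expanded into the opening punctuation tokens followed by the word.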
from __future__ import unicode_literals

import pytest

from spacy import lex_of
from spacy.spacy import expand_chunk
from spacy.en import lookup
from spacy.en import unhash


@pytest.fixture
def open_puncts():
    return ['(', '[', '{', '*']


def test_open(open_puncts):
    word_str = 'Hello'
    for p in open_puncts:
        string = p + word_str
        token = lookup(string)
        assert unhash(lex_of(token)) == p
        tokens = expand_chunk(token)
        assert len(tokens) == 2
        assert unhash(lex_of(tokens[0])) == p
        assert unhash(lex_of(tokens[1])) == word_str


def test_two_different_open(open_puncts):
    word_str = 'Hello'
    for p in open_puncts:
        string = p + "`" + word_str
        token = lookup(string)
        assert unhash(lex_of(token)) == p
        tokens = expand_chunk(token)
        assert len(tokens) == 3
        assert unhash(lex_of(tokens[0])) == p
        assert unhash(lex_of(tokens[1])) == "`"
        assert unhash(lex_of(tokens[2])) == word_str


def test_three_same_open(open_puncts):
    word_str = 'Hello'
    for p in open_puncts:
        string = p + p + p + word_str
        token = lookup(string)
        assert unhash(lex_of(token)) == p
        tokens = expand_chunk(token)
        assert len(tokens) == 4
        assert unhash(lex_of(tokens[0])) == p
        assert unhash(lex_of(tokens[3])) == word_str


def test_open_appostrophe():
    string = "'The"
    tokens = expand_chunk(lookup(string))
    assert len(tokens) == 2
    assert unhash(lex_of(tokens[0])) == "'"

View File

@ -0,0 +1,46 @@
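# PTB comparison test: tokenize tests/sun.txt with spacy.en_ptb and check the
# result token-by-token against tests/sun.tokens, which is presumably the
# reference output of the Penn Treebank sed script (tests/tokenizer.sed).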
from __future__ import unicode_literals

import os
from os import path

import pytest

from spacy import lex_of
from spacy.util import utf8open
from spacy.en_ptb import tokenize, lookup, unhash


HERE = path.dirname(__file__)


@pytest.fixture
def sun_txt():
    loc = path.join(HERE, 'sun.txt')
    return utf8open(loc).read()


@pytest.fixture
def my_tokens(sun_txt):
    assert len(sun_txt) != 0
    tokens = tokenize(sun_txt)
    return [unhash(lex_of(t)) for t in tokens]


@pytest.fixture
def sed_tokens():
    loc = path.join(HERE, 'sun.tokens')
    return utf8open(loc).read().split()


def test_compare_tokens(my_tokens, sed_tokens):
    me = my_tokens
    sed = sed_tokens
    i = 0
    while i < len(me) and i < len(sed):
        assert me[i] == sed[i]
        i += 1
    assert len(me) == len(sed)

11
tests/test_rules.py Normal file
View File

@ -0,0 +1,11 @@
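# Rule-loading test: util.read_tokenization('en') yields one (chunk, lex,
# pieces) tuple per special-case rule, e.g. "ain't" maps to lex "are" with the
# remaining piece "not".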
from spacy import util


def test_load_en():
    rules = util.read_tokenization('en')
    assert len(rules) != 0
    aint = [rule for rule in rules if rule[0] == "ain't"][0]
    chunk, lex, pieces = aint
    assert chunk == "ain't"
    assert lex == "are"
    assert pieces == ["not"]

View File

@ -0,0 +1,39 @@
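# Surrounding-punctuation tests: chunks such as "(Hello)" expand into opening
# punct + word + closing punct, while sic_of on the first token still recovers
# the original, unsplit chunk string.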
from __future__ import unicode_literals

import pytest

from spacy import lex_of, sic_of
from spacy.spacy import expand_chunk
from spacy.en import lookup
from spacy.en import unhash


@pytest.fixture
def paired_puncts():
    return [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]


def test_token(paired_puncts):
    word_str = 'Hello'
    for open_, close_ in paired_puncts:
        string = open_ + word_str + close_
        tokens = expand_chunk(lookup(string))
        assert len(tokens) == 3
        assert unhash(lex_of(tokens[0])) == open_
        assert unhash(lex_of(tokens[1])) == word_str
        assert unhash(lex_of(tokens[2])) == close_
        assert unhash(sic_of(tokens[0])) == string


def test_two_different(paired_puncts):
    word_str = 'Hello'
    for open_, close_ in paired_puncts:
        string = "`" + open_ + word_str + close_ + "'"
        tokens = expand_chunk(lookup(string))
        assert len(tokens) == 5
        assert unhash(lex_of(tokens[0])) == "`"
        assert unhash(lex_of(tokens[1])) == open_
        assert unhash(lex_of(tokens[2])) == word_str
        assert unhash(lex_of(tokens[3])) == close_
        assert unhash(lex_of(tokens[4])) == "'"

47
tests/test_tokenizer.py Normal file
View File

@ -0,0 +1,47 @@
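# Tokenizer tests for spacy.en.tokenize: whitespace and punctuation splitting,
# digits, and contraction expansion. tokenize returns lexeme ids; the id for
# the chunk "hello," differs from that of "hello", but their lex_of values
# match.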
from __future__ import unicode_literals

from spacy.en import tokenize
from spacy.en import lookup
from spacy.lexeme import lex_of


def test_single_word():
    lex_ids = tokenize(u'hello')
    assert lex_ids[0] == lookup(u'hello')


def test_two_words():
    lex_ids = tokenize(u'hello possums')
    assert len(lex_ids) == 2
    assert lex_ids[0] == lookup(u'hello')
    assert lex_ids[0] != lex_ids[1]


def test_punct():
    lex_ids = tokenize('hello, possums.')
    assert len(lex_ids) == 4
    assert lex_ids[0] != lookup('hello')
    assert lex_of(lex_ids[0]) == lex_of(lookup('hello'))
    assert lex_ids[2] == lookup('possums.')
    assert lex_of(lex_ids[2]) == lex_of(lookup('possums.'))
    assert lex_of(lex_ids[2]) == lex_of(lookup('possums'))
    assert lex_of(lex_ids[1]) != lex_of(lookup('hello'))
    assert lex_ids[0] != lookup('hello.')


def test_digits():
    lex_ids = tokenize('The year: 1984.')
    assert len(lex_ids) == 5
    assert lex_of(lex_ids[0]) == lex_of(lookup('The'))
    assert lex_of(lex_ids[3]) == lex_of(lookup('1984'))
    assert lex_of(lex_ids[4]) == lex_of(lookup('.'))


def test_contraction():
    lex_ids = tokenize("don't giggle")
    assert len(lex_ids) == 3
    assert lex_of(lex_ids[1]) == lex_of(lookup("not"))
    lex_ids = tokenize("i said don't!")
    assert len(lex_ids) == 4
    assert lex_of(lex_ids[3]) == lex_of(lookup('!'))

37
tests/test_vocab.py Normal file
View File

@ -0,0 +1,37 @@
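# Vocabulary tests: lookup returns a stable, case-sensitive address per chunk
# ("Hello", "hello" and "Hello," all differ), and unhash(lex_of(...)) round-
# trips back to the original string.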
from __future__ import unicode_literals

from spacy import lex_of
from spacy.en import lookup
from spacy.en import unhash


def test_neq():
    addr = lookup('Hello')
    assert lookup('bye') != addr


def test_eq():
    addr = lookup('Hello')
    assert lookup('Hello') == addr


def test_round_trip():
    hello = lookup('Hello')
    assert unhash(lex_of(hello)) == 'Hello'


def test_case_neq():
    addr = lookup('Hello')
    assert lookup('hello') != addr


def test_punct_neq():
    addr = lookup('Hello')
    assert lookup('Hello,') != addr


def test_short():
    addr = lookup('I')
    assert unhash(lex_of(addr)) == 'I'
    addr = lookup('not')
    assert unhash(lex_of(addr)) == 'not'

25
tests/test_wiki_sun.py Normal file
View File

@ -0,0 +1,25 @@
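# Smoke test: spacy.en.tokenize should run over the full Wikipedia "Sun"
# article in tests/sun.txt without raising; the token output itself is not
# inspected here.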
from __future__ import unicode_literals

import os
from os import path

import pytest

from spacy.en import unhash
from spacy import lex_of
from spacy import en
from spacy.util import utf8open


HERE = path.dirname(__file__)


@pytest.fixture
def sun_txt():
    loc = path.join(HERE, 'sun.txt')
    return utf8open(loc).read()


def test_tokenize(sun_txt):
    assert len(sun_txt) != 0
    tokens = en.tokenize(sun_txt)
    assert True

82
tests/tokenizer.sed Normal file
View File

@ -0,0 +1,82 @@
#!/bin/sed -f
# Sed script to produce Penn Treebank tokenization on arbitrary raw text.
# Yeah, sure.
# expected input: raw text with ONE SENTENCE TOKEN PER LINE
# by Robert MacIntyre, University of Pennsylvania, late 1995.
# If this wasn't such a trivial program, I'd include all that stuff about
# no warrantee, free use, etc. from the GNU General Public License. If you
# want to be picky, assume that all of its terms apply. Okay?
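# (Assumed usage, not stated in the commit: split the text into one sentence
#  per line, then run `sed -f tokenizer.sed input.txt`; tests/sun.tokens was
#  presumably generated from tests/sun.txt this way.)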
# attempt to get correct directional quotes
s=^"=`` =g
s=\([ ([{<]\)"=\1 `` =g
# close quotes handled at end
s=\.\.\.= ... =g
s=[,;:@#$%&]= & =g
# Assume sentence tokenization has been done first, so split FINAL periods
# only.
s=\([^.]\)\([.]\)\([])}>"']*\)[ ]*$=\1 \2\3 =g
# however, we may as well split ALL question marks and exclamation points,
# since they shouldn't have the abbrev.-marker ambiguity problem
s=[?!]= & =g
# parentheses, brackets, etc.
s=[][(){}<>]= & =g
# Some taggers, such as Adwait Ratnaparkhi's MXPOST, use the parsed-file
# version of these symbols.
# UNCOMMENT THE FOLLOWING 6 LINES if you're using MXPOST.
# s/(/-LRB-/g
# s/)/-RRB-/g
# s/\[/-LSB-/g
# s/\]/-RSB-/g
# s/{/-LCB-/g
# s/}/-RCB-/g
s=--= -- =g
# NOTE THAT SPLIT WORDS ARE NOT MARKED. Obviously this isn't great, since
# you might someday want to know how the words originally fit together --
# but it's too late to make a better system now, given the millions of
# words we've already done "wrong".
# First off, add a space to the beginning and end of each line, to reduce
# necessary number of regexps.
s=$= =
s=^= =
s="= '' =g
# possessive or close-single-quote
s=\([^']\)' =\1 ' =g
# as in it's, I'm, we'd
s='\([sSmMdD]\) = '\1 =g
s='ll = 'll =g
s='re = 're =g
s='ve = 've =g
s=n't = n't =g
s='LL = 'LL =g
s='RE = 'RE =g
s='VE = 'VE =g
s=N'T = N'T =g
s= \([Cc]\)annot = \1an not =g
s= \([Dd]\)'ye = \1' ye =g
s= \([Gg]\)imme = \1im me =g
s= \([Gg]\)onna = \1on na =g
s= \([Gg]\)otta = \1ot ta =g
s= \([Ll]\)emme = \1em me =g
s= \([Mm]\)ore'n = \1ore 'n =g
s= '\([Tt]\)is = '\1 is =g
s= '\([Tt]\)was = '\1 was =g
s= \([Ww]\)anna = \1an na =g
# s= \([Ww]\)haddya = \1ha dd ya =g
# s= \([Ww]\)hatcha = \1ha t cha =g
# clean out extra spaces
s= *= =g
s=^ *==g