diff --git a/spacy/en.pyx b/spacy/en.pyx
index f90af1549..4357addfe 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -64,3 +64,33 @@ cpdef Lexeme_addr lookup(unicode string) except 0:
 
 cpdef unicode unhash(StringHash hash_value):
     return EN.unhash(hash_value)
+
+
+cpdef bint is_oft_upper(size_t lex_id):
+    '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
+    stores whether the lowered version of the string hashed by `lex' is found
+    in all-upper case frequently in a large sample of text. Users are free
+    to load different data; by default we use a sample from Wikipedia, with
+    a threshold of 0.95, picked to maximize mutual information for POS tagging.
+
+    >>> is_oft_upper(lookup(u'abc'))
+    True
+    >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
+    True
+    '''
+    return (<Lexeme*>lex_id).dist.flags & (1 << OFT_UPPER)
+
+
+cpdef bint is_oft_title(size_t lex_id):
+    '''Access the `oft_title' field of the Lexeme pointed to by lex_id, which
+    stores whether the lowered version of the string hashed by `lex' is found
+    title-cased frequently in a large sample of text. Users are free
+    to load different data; by default we use a sample from Wikipedia, with
+    a threshold of 0.3, picked to maximize mutual information for POS tagging.
+
+    >>> is_oft_title(lookup(u'marcus'))
+    True
+    >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
+    True
+    '''
+    return (<Lexeme*>lex_id).dist.flags & (1 << OFT_TITLE)
diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 90d06587e..6175ec3a8 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -9,13 +9,35 @@ ctypedef char Bits8
 ctypedef uint64_t Bits64
 
 
+cdef enum OrthFlag:
+    IS_ALPHA
+    IS_DIGIT
+    IS_PUNCT
+    IS_WHITE
+    IS_LOWER
+    IS_UPPER
+    IS_TITLE
+    IS_ASCII
+
+
+cdef enum DistFlag:
+    OFT_UPPER
+    OFT_TITLE
+    DIST_FLAG3
+    DIST_FLAG4
+    DIST_FLAG5
+    DIST_FLAG6
+    DIST_FLAG7
+    DIST_FLAG8
+
+
 cdef struct Orthography:
     StringHash last3
     StringHash shape
     StringHash norm
 
     size_t length
-    unsigned char first
+    Py_UNICODE first
     Bits8 flags
 
 
@@ -27,15 +49,12 @@ cdef struct Distribution:
 
 
 cdef struct Lexeme:
-    StringHash sic      # Hash of the original string
-    StringHash lex      # Hash of the word, with punctuation and clitics split off
-
-    Distribution* dist  # Distribution info, lazy loaded
+    StringHash lex      # Hash of the word
     Orthography* orth   # Extra orthographic views
-    #Lexeme* tail       # Lexemes are linked lists, to deal with sub-tokens
+    Distribution* dist  # Distribution info
 
 
-cdef Lexeme BLANK_WORD = Lexeme(0, 0, NULL, NULL, NULL)
+cdef Lexeme BLANK_WORD = Lexeme(0, NULL, NULL)
 
 
 cdef enum StringAttr:
@@ -49,7 +68,16 @@ cdef enum StringAttr:
 
 cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0
 cpdef StringHash lex_of(size_t lex_id) except 0
+
 cpdef StringHash norm_of(size_t lex_id) except 0
 cpdef StringHash shape_of(size_t lex_id) except 0
 cpdef StringHash last3_of(size_t lex_id) except 0
-cpdef StringHash length_of(size_t lex_id)
+
+cpdef size_t length_of(size_t lex_id) except *
+cpdef Py_UNICODE first_of(size_t lex_id) except *
+
+cpdef double prob_of(size_t lex_id) except 0
+cpdef ClusterID cluster_of(size_t lex_id) except 0
+
+cpdef bint check_orth_flag(size_t lex_id, OrthFlag flag) except *
+cpdef bint check_dist_flag(size_t lex_id, DistFlag flag) except *
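Note on the flag enums above: the OrthFlag and DistFlag members are bit positions into the Bits8/Bits64 `flags` fields, not masks, so a flag is tested with `flags & (1 << flag)`. A minimal pure-Python sketch of the scheme (the explicit values and helper names here are illustrative, not part of the patch):

    from enum import IntEnum

    class DistFlag(IntEnum):
        # Bit positions, mirroring the cdef enum: OFT_UPPER == 0, OFT_TITLE == 1
        OFT_UPPER = 0
        OFT_TITLE = 1

    def set_flag(flags, flag):
        # Turn on the bit at position `flag`.
        return flags | (1 << flag)

    def check_flag(flags, flag):
        # Test the bit at position `flag`; same logic as check_dist_flag below.
        return bool(flags & (1 << flag))

    flags = set_flag(0, DistFlag.OFT_TITLE)
    assert check_flag(flags, DistFlag.OFT_TITLE)
    assert not check_flag(flags, DistFlag.OFT_UPPER)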
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 42c93ec60..27ca13bd7 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -13,13 +13,6 @@ from libcpp.vector cimport vector
 from spacy.spacy cimport StringHash
 
 
-# Reiterate the enum, for python
-#SIC = StringAttr.sic
-#LEX = StringAttr.lex
-#NORM = StringAttr.norm
-#SHAPE = StringAttr.shape
-#LAST3 = StringAttr.last3
-
 
 cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
     if attr == LEX:
@@ -133,37 +126,9 @@ cpdef double prob_of(size_t lex_id):
     return (<Lexeme*>lex_id).dist.prob
 
 
-cpdef bint is_oft_upper(size_t lex_id):
-    '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
-    stores whether the lowered version of the string hashed by `lex' is found
-    in all-upper case frequently in a large sample of text. Users are free
-    to load different data, by default we use a sample from Wikipedia, with
-    a threshold of 0.95, picked to maximize mutual information for POS tagging.
-
-    >>> is_oft_upper(lookup(u'abc'))
-    True
-    >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
-    True
-    '''
-    return False
-    #cdef Lexeme* w = <Lexeme*>lex_id
-    #return w.orth.last3 if w.orth != NULL else 0
+cpdef bint check_orth_flag(size_t lex_id, OrthFlag flag) except *:
+    return (<Lexeme*>lex_id).orth.flags & (1 << flag)
 
-    #return (<Lexeme*>lex_id).oft_upper
-
-
-cpdef bint is_oft_title(size_t lex_id):
-    '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
-    stores whether the lowered version of the string hashed by `lex' is found
-    title-cased frequently in a large sample of text. Users are free
-    to load different data, by default we use a sample from Wikipedia, with
-    a threshold of 0.3, picked to maximize mutual information for POS tagging.
-
-    >>> is_oft_title(lookup(u'marcus'))
-    True
-    >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
-    True
-    '''
-    return False
-    #return (<Lexeme*>lex_id).oft_title
+
+cpdef bint check_dist_flag(size_t lex_id, DistFlag flag) except *:
+    return (<Lexeme*>lex_id).dist.flags & (1 << flag)
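With the generic checkers in place, the per-flag accessors moved into en.pyx reduce to one-liners over check_dist_flag. A hypothetical pure-Python rendering of that equivalence, operating on raw flag words rather than lexeme addresses and reusing check_flag and DistFlag from the sketch above:

    def is_oft_upper(flags):
        # Equivalent to the hand-written bit test added to en.pyx.
        return check_flag(flags, DistFlag.OFT_UPPER)

    def is_oft_title(flags):
        return check_flag(flags, DistFlag.OFT_TITLE)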
diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx
index befa82cc7..e4c5cf240 100644
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@@ -60,6 +60,7 @@ cdef class Language:
         self.chunks.set_empty_key(0)
         self.vocab.set_empty_key(0)
         self.load_tokenization(util.read_tokenization(name))
+        self.load_dist_info(util.read_dist_info(name))
 
     cdef Tokens tokenize(self, unicode string):
         cdef Lexeme** chunk
@@ -108,7 +109,8 @@ cdef class Language:
         word.lex = hash(string)
         self.bacov[word.lex] = string
         word.orth = self.new_orth(string)
-        word.dist = self.new_dist(string)
+
+        word.dist = <Distribution*>calloc(1, sizeof(Distribution))
         self.vocab[word.lex] = word
         return word
 
@@ -135,13 +137,8 @@ cdef class Language:
             self.bacov[orth.last3] = last3
             self.bacov[orth.norm] = norm
             self.bacov[orth.shape] = shape
-
         return orth
 
-    cdef Distribution* new_dist(self, unicode lex) except NULL:
-        dist = <Distribution*>calloc(1, sizeof(Distribution))
-        return dist
-
     cdef unicode unhash(self, StringHash hash_value):
         '''Fetch a string from the reverse index, given its hash value.'''
         return self.bacov[hash_value]
@@ -164,21 +161,18 @@ cdef class Language:
         for chunk, tokens in token_rules:
             self.new_chunk(chunk, tokens)
 
-    def load_clusters(self):
+    def load_dist_info(self, dist_info):
+        cdef unicode string
+        cdef dict word_dist
         cdef Lexeme* w
-        data_dir = path.join(path.dirname(__file__), '..', 'data', 'en')
-        case_stats = util.load_case_stats(data_dir)
-        brown_loc = path.join(data_dir, 'clusters')
-        cdef size_t start
-        cdef int end
-        with util.utf8open(brown_loc) as browns_file:
-            for i, line in enumerate(browns_file):
-                cluster_str, token_string, freq_str = line.split()
-                # Decode as a little-endian string, so that we can do & 15 to get
-                # the first 4 bits. See redshift._parse_features.pyx
-                cluster = int(cluster_str[::-1], 2)
-                upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0))
-                self.new_lexeme(token_string)
+        for string, word_dist in dist_info.items():
+            w = <Lexeme*>self.lookup(string)
+            w.dist.prob = word_dist['prob']
+            w.dist.cluster = word_dist['cluster']
+            for flag in word_dist['flags']:
+                w.dist.flags |= lexeme.DIST_FLAGS[flag]
+            for tag in word_dist['tagdict']:
+                w.dist.tagdict |= lexeme.TAGS[tag]
 
 
 cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
diff --git a/spacy/util.py b/spacy/util.py
index 4d12014ca..c9bce0171 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -19,6 +19,12 @@ def load_case_stats(data_dir):
     return case_stats
 
 
+def read_dist_info(lang):
+    with utf8open(path.join(DATA_DIR, lang, 'distribution_info.json')) as file_:
+        dist_info = json.load(file_)
+    return dist_info
+
+
 def read_tokenization(lang):
     loc = path.join(DATA_DIR, lang, 'tokenization')
     entries = []
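The new read_dist_info/load_dist_info path implies that distribution_info.json maps each word form to its precomputed distributional statistics. The exact schema isn't shown in this patch, but the keys load_dist_info reads suggest entries like the following (all values here are fabricated for illustration):

    # Hypothetical distribution_info.json contents, as a Python literal:
    dist_info = {
        u'marcus': {
            'prob': -9.5,            # unigram log probability
            'cluster': 42,           # Brown cluster id
            'flags': ['OFT_TITLE'],  # names resolved via lexeme.DIST_FLAGS
            'tagdict': ['NNP'],      # names resolved via lexeme.TAGS
        },
    }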