diff --git a/spacy/en.pxd b/spacy/en.pxd
index a7c643eba..cccfb60a8 100644
--- a/spacy/en.pxd
+++ b/spacy/en.pxd
@@ -1,6 +1,32 @@
 from spacy.lang cimport Language
 from spacy.tokens cimport Tokens
 
+# Flags
+cpdef enum FlagID:
+    IS_ALPHA
+    IS_ASCII
+    IS_DIGIT
+    IS_LOWER
+    IS_PUNCT
+    IS_SPACE
+    IS_TITLE
+    IS_UPPER
+
+    LIKE_URL
+    LIKE_NUMBER
+
+    OFT_LOWER
+    OFT_TITLE
+    OFT_UPPER
+
+    IN_MALES
+    IN_FEMALES
+    IN_SURNAMES
+    IN_PLACES
+    IN_GAMES
+    IN_CELEBS
+    IN_NAMES
+
 
 cdef class English(Language):
     pass
diff --git a/spacy/en.pyx b/spacy/en.pyx
index 95c1cbd94..92be97aad 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -38,6 +38,8 @@ provides a fully Penn Treebank 3-compliant tokenizer.
 from __future__ import unicode_literals
 
 cimport lang
+from .typedefs cimport flags_t
+import orth
 
 
 cdef class English(Language):
@@ -47,7 +49,20 @@ cdef class English(Language):
     name (unicode): The two letter code used by Wikipedia for the language.
     lexicon (Lexicon): The lexicon. Exposes the lookup method.
     """
-    pass
+    def set_flags(self, unicode string):
+        cdef flags_t flags = 0
+        flags |= orth.is_alpha(string) << IS_ALPHA
+        flags |= orth.is_ascii(string) << IS_ASCII
+        flags |= orth.is_digit(string) << IS_DIGIT
+        flags |= orth.is_lower(string) << IS_LOWER
+        flags |= orth.is_punct(string) << IS_PUNCT
+        flags |= orth.is_space(string) << IS_SPACE
+        flags |= orth.is_title(string) << IS_TITLE
+        flags |= orth.is_upper(string) << IS_UPPER
+
+        flags |= orth.like_url(string) << LIKE_URL
+        flags |= orth.like_number(string) << LIKE_NUMBER
+        return flags
 
 
 EN = English('en')
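
Note: English.set_flags packs one boolean predicate per bit of a 64-bit integer, using the FlagID enum values (IS_ALPHA = 0, IS_ASCII = 1, ...) as bit positions, so a word's whole orthographic profile travels as a single machine word and any flag reads back with one shift-and-mask. A minimal pure-Python sketch of the same packing; the str methods below are stand-ins for the spacy.orth predicates, which are not shown in this patch:

    # Pure-Python sketch of the bit-packing in English.set_flags.
    IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_TITLE, IS_UPPER = range(6)

    def set_flags(string):
        flags = 0
        flags |= int(string.isalpha()) << IS_ALPHA
        flags |= int(all(ord(c) < 128 for c in string)) << IS_ASCII
        flags |= int(string.isdigit()) << IS_DIGIT
        flags |= int(string.islower()) << IS_LOWER
        flags |= int(string.istitle()) << IS_TITLE
        flags |= int(string.isupper()) << IS_UPPER
        return flags

    def check_flag(flags, flag_id):
        # The same single-bit test as check_flag in lexeme.pxd.
        return bool(flags & (1 << flag_id))

    assert check_flag(set_flags(u'Apple'), IS_TITLE)
    assert not check_flag(set_flags(u'1999'), IS_ALPHA)
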
diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index dc3262771..9e4bc7b5d 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -8,23 +8,17 @@ from cymem.cymem cimport Pool
 from .typedefs cimport hash_t
 from .tokens cimport Tokens
 from .lexeme cimport Lexeme
-from .tagger cimport Tagger
-from .utf8string cimport StringStore
-
-
-cdef struct String:
-    Py_UNICODE* chars
-    size_t n
-    hash_t key
+from .utf8string cimport StringStore, UniStr
 
 
 cdef class Lexicon:
+    cpdef public set_flags
     cdef Pool mem
     cpdef readonly size_t size
     cpdef readonly StringStore strings
     cdef vector[Lexeme*] lexemes
 
-    cdef Lexeme* get(self, String* s) except NULL
+    cdef Lexeme* get(self, UniStr* s) except NULL
 
     cdef PreshMap _map
 
@@ -43,10 +37,10 @@ cdef class Language:
     cpdef Tokens tokens_from_list(self, list strings)
     cpdef Tokens tokenize(self, unicode text)
 
-    cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1
-    cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
+    cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
+    cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
                                 vector[Lexeme*] *suffixes) except NULL
-    cdef int _attach_tokens(self, Tokens tokens, int idx, String* string,
+    cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
                             vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except -1
     cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
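
Note: the String struct deleted here, a character pointer plus a length and a 64-bit hash key, is not dropped but moved into utf8string under the name UniStr, so Language and Lexicon share one definition. A rough Python analogue of what slice_unicode fills in for a span; Python's built-in hash stands in for the MurmurHash-based hash64 used by the real helper:

    # Rough Python analogue of the UniStr struct filled by slice_unicode:
    # a view on the parent string plus a precomputed key for cache lookups.
    class UniStr(object):
        def __init__(self, chars, start, end):
            self.chars = chars[start:end]  # the C struct keeps a pointer, not a copy
            self.n = end - start
            self.key = hash(self.chars)    # hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)

    span = UniStr(u"isn't", 0, 3)
    assert (span.chars, span.n) == (u'isn', 3)
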
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index df9cf3166..2a284b9df 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -19,6 +19,8 @@ from .lexeme cimport Lexeme
 from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport init as lexeme_init
 
+from .utf8string cimport slice_unicode
+
 from . import util
 from .util import read_lang_data
 from .tokens import Tokens
@@ -34,7 +36,7 @@ cdef class Language:
         self._prefix_re = re.compile(prefix)
         self._suffix_re = re.compile(suffix)
         self._infix_re = re.compile(infix)
-        self.lexicon = Lexicon()
+        self.lexicon = Lexicon(self.set_flags)
         if path.exists(path.join(util.DATA_DIR, name, 'lexemes')):
             self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes'))
             self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))
@@ -45,11 +47,11 @@ cdef class Language:
         cdef Tokens tokens = Tokens(self.lexicon.strings, length)
         if length == 0:
             return tokens
-        cdef String string_struct
+        cdef UniStr string_struct
         cdef unicode py_string
         cdef int idx = 0
         for i, py_string in enumerate(strings):
-            string_from_unicode(&string_struct, py_string)
+            slice_unicode(&string_struct, py_string, 0, len(py_string))
             tokens.push_back(idx, self.lexicon.get(&string_struct))
             idx += len(py_string) + 1
         return tokens
@@ -77,11 +79,11 @@ cdef class Language:
         cdef int start = 0
         cdef Py_UNICODE* chars = string
         cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
-        cdef String span
+        cdef UniStr span
         for i in range(1, length):
             if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
                 if start < i:
-                    string_slice(&span, chars, start, i)
+                    slice_unicode(&span, chars, start, i)
                     lexemes = self._cache.get(span.key)
                     if lexemes != NULL:
                         tokens.extend(start, lexemes, 0)
@@ -93,7 +95,7 @@ cdef class Language:
                     start += 1
         i += 1
         if start < i:
-            string_slice(&span, chars, start, i)
+            slice_unicode(&span, chars, start, i)
             lexemes = self._cache.get(span.key)
             if lexemes != NULL:
                 tokens.extend(start, lexemes, 0)
@@ -101,7 +103,7 @@
                 self._tokenize(tokens, &span, start, i)
         return tokens
 
-    cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1:
+    cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
         cdef vector[Lexeme*] prefixes
         cdef vector[Lexeme*] suffixes
         cdef hash_t orig_key
@@ -112,20 +114,20 @@
         self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
         self._save_cached(&tokens.lex[orig_size], orig_key, tokens.length - orig_size)
 
-    cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
+    cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
                                 vector[Lexeme*] *suffixes) except NULL:
         cdef size_t i
-        cdef String prefix
-        cdef String suffix
-        cdef String minus_pre
-        cdef String minus_suf
+        cdef UniStr prefix
+        cdef UniStr suffix
+        cdef UniStr minus_pre
+        cdef UniStr minus_suf
         cdef size_t last_size = 0
         while string.n != 0 and string.n != last_size:
             last_size = string.n
             pre_len = self._find_prefix(string.chars, string.n)
             if pre_len != 0:
-                string_slice(&prefix, string.chars, 0, pre_len)
-                string_slice(&minus_pre, string.chars, pre_len, string.n)
+                slice_unicode(&prefix, string.chars, 0, pre_len)
+                slice_unicode(&minus_pre, string.chars, pre_len, string.n)
                 # Check whether we've hit a special-case
                 if minus_pre.n >= 1 and self._specials.get(minus_pre.key) != NULL:
                     string[0] = minus_pre
@@ -133,15 +135,15 @@
                     break
             suf_len = self._find_suffix(string.chars, string.n)
             if suf_len != 0:
-                string_slice(&suffix, string.chars, string.n - suf_len, string.n)
-                string_slice(&minus_suf, string.chars, 0, string.n - suf_len)
+                slice_unicode(&suffix, string.chars, string.n - suf_len, string.n)
+                slice_unicode(&minus_suf, string.chars, 0, string.n - suf_len)
                 # Check whether we've hit a special-case
                 if minus_suf.n >= 1 and self._specials.get(minus_suf.key) != NULL:
                     string[0] = minus_suf
                     suffixes.push_back(self.lexicon.get(&suffix))
                     break
             if pre_len and suf_len and (pre_len + suf_len) <= string.n:
-                string_slice(string, string.chars, pre_len, string.n - suf_len)
+                slice_unicode(string, string.chars, pre_len, string.n - suf_len)
                 prefixes.push_back(self.lexicon.get(&prefix))
                 suffixes.push_back(self.lexicon.get(&suffix))
             elif pre_len:
@@ -155,13 +157,13 @@
         return string
 
     cdef int _attach_tokens(self, Tokens tokens,
-                            int idx, String* string,
+                            int idx, UniStr* string,
                             vector[Lexeme*] *prefixes,
                             vector[Lexeme*] *suffixes) except -1:
         cdef int split
         cdef Lexeme** lexemes
         cdef Lexeme* lexeme
-        cdef String span
+        cdef UniStr span
         if prefixes.size():
             idx = tokens.extend(idx, prefixes.data(), prefixes.size())
         if string.n != 0:
@@ -174,11 +176,11 @@
             if split == 0 or split == -1:
                 idx = tokens.push_back(idx, self.lexicon.get(string))
             else:
-                string_slice(&span, string.chars, 0, split)
+                slice_unicode(&span, string.chars, 0, split)
                 idx = tokens.push_back(idx, self.lexicon.get(&span))
-                string_slice(&span, string.chars, split, split+1)
+                slice_unicode(&span, string.chars, split, split+1)
                 idx = tokens.push_back(idx, self.lexicon.get(&span))
-                string_slice(&span, string.chars, split + 1, string.n)
+                slice_unicode(&span, string.chars, split + 1, string.n)
                 idx = tokens.push_back(idx, self.lexicon.get(&span))
         cdef vector[Lexeme*].reverse_iterator it = suffixes.rbegin()
         while it != suffixes.rend():
@@ -222,14 +224,14 @@
         '''
         cdef Lexeme** lexemes
         cdef hash_t hashed
-        cdef String string
+        cdef UniStr string
         for uni_string, substrings in token_rules:
             lexemes = self.mem.alloc(len(substrings) + 1, sizeof(Lexeme*))
             for i, substring in enumerate(substrings):
-                string_from_unicode(&string, substring)
+                slice_unicode(&string, substring, 0, len(substring))
                 lexemes[i] = self.lexicon.get(&string)
                 lexemes[i + 1] = NULL
-            string_from_unicode(&string, uni_string)
+            slice_unicode(&string, uni_string, 0, len(uni_string))
             self._specials.set(string.key, lexemes)
             self._cache.set(string.key, lexemes)
@@ -239,21 +241,23 @@ cdef class Lexicon:
     Also interns UTF-8 strings, and maps them to consecutive integer IDs.
     '''
-    def __init__(self):
+    def __init__(self, object set_flags=None):
         self.mem = Pool()
         self._map = PreshMap(2 ** 20)
         self.strings = StringStore()
         self.lexemes.push_back(&EMPTY_LEXEME)
         self.size = 1
+        self.set_flags = set_flags
 
-    cdef Lexeme* get(self, String* string) except NULL:
+    cdef Lexeme* get(self, UniStr* string) except NULL:
         '''Retrieve a pointer to a Lexeme from the lexicon.'''
         cdef Lexeme* lex
         lex = self._map.get(string.key)
         if lex != NULL:
             return lex
         lex = self.mem.alloc(sizeof(Lexeme), 1)
-        lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key, self.strings, {})
+        lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key,
+                             self.strings, {'flags': self.set_flags(string.chars[:string.n])})
         self._map.set(string.key, lex)
         while self.lexemes.size() < (lex.id + 1):
             self.lexemes.push_back(&EMPTY_LEXEME)
@@ -283,14 +287,14 @@
         '''
         if type(id_or_string) == int:
             return self.lexemes.at(id_or_string)[0]
-        cdef String string
-        string_from_unicode(&string, id_or_string)
+        cdef UniStr string
+        slice_unicode(&string, id_or_string, 0, len(id_or_string))
         cdef Lexeme* lexeme = self.get(&string)
         return lexeme[0]
 
     def __setitem__(self, unicode uni_string, dict props):
-        cdef String s
-        string_from_unicode(&s, uni_string)
+        cdef UniStr s
+        slice_unicode(&s, uni_string, 0, len(uni_string))
         cdef Lexeme* lex = self.get(&s)
         lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props)
@@ -338,14 +342,3 @@
             i += 1
             self.size += 1
         fclose(fp)
-
-
-cdef void string_from_unicode(String* s, unicode uni):
-    cdef Py_UNICODE* c_uni = uni
-    string_slice(s, c_uni, 0, len(uni))
-
-
-cdef inline void string_slice(String* s, Py_UNICODE* chars, int start, int end) nogil:
-    s.chars = &chars[start]
-    s.n = end - start
-    s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
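
Note: the functional change in this file is one line, Lexicon(self.set_flags): the language-specific flag builder now travels with the lexicon, and Lexicon.get invokes it only on a cache miss, so the orthographic predicates run once per distinct word type rather than once per token. A toy dict-backed model of that once-per-type behaviour; the miss counter is demo instrumentation, not part of the real class:

    # Toy model of Lexicon.get: flags are computed on the first sighting
    # of a string, then served from the hash-keyed cache ever after.
    class Lexicon(object):
        def __init__(self, set_flags=None):
            self._map = {}
            self.set_flags = set_flags
            self.misses = 0    # demo instrumentation only

        def get(self, string):
            lex = self._map.get(string)
            if lex is None:
                self.misses += 1
                flags = self.set_flags(string) if self.set_flags else 0
                lex = {'id': len(self._map) + 1, 'flags': flags}
                self._map[string] = lex
            return lex

    lexicon = Lexicon(set_flags=lambda s: int(s.isalpha()))
    for word in (u'the', u'the', u'cat'):
        lexicon.get(word)
    assert lexicon.misses == 2    # 'the' was only analysed once
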
diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 0d7d206e5..9d5dddd6d 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -1,61 +1,119 @@
-from .typedefs cimport hash_t, utf8_t, flag_t, id_t, len_t, tag_t
+from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t
 from .utf8string cimport StringStore
-from libc.stdint cimport uint16_t
 
-cpdef flag_t OOV_DIST_FLAGS
 
-# Flags
-cpdef enum:
-    IS_ALPHA
-    IS_ASCII
-    IS_DIGIT
-    IS_LOWER
-    IS_PUNCT
-    IS_SPACE
-    IS_TITLE
-    IS_UPPER
+# Reserve 64 values for flag features
+cpdef enum attr_id_t:
+    FLAG0
+    FLAG1
+    FLAG2
+    FLAG3
+    FLAG4
+    FLAG5
+    FLAG6
+    FLAG7
+    FLAG8
+    FLAG9
+    FLAG10
+    FLAG11
+    FLAG12
+    FLAG13
+    FLAG14
+    FLAG15
+    FLAG16
+    FLAG17
+    FLAG18
+    FLAG19
+    FLAG20
+    FLAG21
+    FLAG22
+    FLAG23
+    FLAG24
+    FLAG25
+    FLAG26
+    FLAG27
+    FLAG28
+    FLAG29
+    FLAG30
+    FLAG31
+    FLAG32
+    FLAG33
+    FLAG34
+    FLAG35
+    FLAG36
+    FLAG37
+    FLAG38
+    FLAG39
+    FLAG40
+    FLAG41
+    FLAG42
+    FLAG43
+    FLAG44
+    FLAG45
+    FLAG46
+    FLAG47
+    FLAG48
+    FLAG49
+    FLAG50
+    FLAG51
+    FLAG52
+    FLAG53
+    FLAG54
+    FLAG55
+    FLAG56
+    FLAG57
+    FLAG58
+    FLAG59
+    FLAG60
+    FLAG61
+    FLAG62
+    FLAG63
 
-    LIKE_URL
-    LIKE_NUMBER
+    ID
+    SIC
+    NORM
+    SHAPE
+    ASCIIED
+    PREFIX
+    SUFFIX
 
-    OFT_LOWER
-    OFT_TITLE
-    OFT_UPPER
-
-    IN_MALES
-    IN_FEMALES
-    IN_SURNAMES
-    IN_PLACES
-    IN_GAMES
-    IN_CELEBS
-    IN_NAMES
+    LENGTH
+    CLUSTER
+    POS_TYPE
+    SENSE_TYPE
 
 
 cdef struct Lexeme:
-    flag_t flags
+    flags_t flags
 
-    id_t id
-    id_t sic
-    id_t norm
-    id_t shape
-    id_t asciied
-    id_t prefix
-    id_t suffix
+    attr_t id
+    attr_t sic
+    attr_t norm
+    attr_t shape
+    attr_t asciied
+    attr_t prefix
+    attr_t suffix
+
+    attr_t length
+    attr_t cluster
+    attr_t pos_type
+    attr_t sense_type
 
     float prob
-
-    len_t length
-    tag_t cluster
-    tag_t postype
-    tag_t supersense
+    float upper_pc
+    float title_pc
 
 
 cdef Lexeme EMPTY_LEXEME
 
-cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
-                  StringStore store, dict props) except *
+
+cpdef Lexeme init(id_t i, unicode string, hash_t hashed, StringStore store,
+                  dict props) except *
 
-cdef inline bint check_flag(Lexeme* lexeme, size_t flag_id) nogil:
+cdef inline bint check_flag(Lexeme* lexeme, attr_id_t flag_id) nogil:
     return lexeme.flags & (1 << flag_id)
+
+
+cdef attr_t get_attr(Lexeme* lex, attr_id_t attr_id)
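
Note: attr_id_t deliberately spends its first 64 values on FLAG0 through FLAG63, one per bit of the 64-bit flags_t, so an attribute ID below 64 is literally a bit position and can be answered by check_flag alone; the content attributes then begin at ID = 64. The layout arithmetic, spelled out in Python:

    # Layout of attr_id_t: 64 reserved flag IDs, then the content attributes.
    FLAG_BITS = 64    # sizeof(flags_t) * 8 in the Cython code
    names = ['FLAG%d' % i for i in range(FLAG_BITS)] + [
        'ID', 'SIC', 'NORM', 'SHAPE', 'ASCIIED', 'PREFIX', 'SUFFIX',
        'LENGTH', 'CLUSTER', 'POS_TYPE', 'SENSE_TYPE']
    attr_ids = dict((name, i) for i, name in enumerate(names))

    def is_flag(attr_id):
        # The dispatch test used by get_attr in lexeme.pyx.
        return attr_id < FLAG_BITS

    assert attr_ids['ID'] == 64      # the first non-flag attribute
    assert is_flag(attr_ids['FLAG63']) and not is_flag(attr_ids['ID'])
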
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 64eb699a6..888edc07b 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -6,67 +6,59 @@ from libc.string cimport memset
 import orth
 
-from .utf8string cimport Utf8Str
-
-OOV_DIST_FLAGS = 0
 
 memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))
 
-def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc):
-    cdef flag_t flags = 0
-    flags |= orth.is_alpha(string) << IS_ALPHA
-    flags |= orth.is_ascii(string) << IS_ASCII
-    flags |= orth.is_digit(string) << IS_DIGIT
-    flags |= orth.is_lower(string) << IS_LOWER
-    flags |= orth.is_punct(string) << IS_PUNCT
-    flags |= orth.is_space(string) << IS_SPACE
-    flags |= orth.is_title(string) << IS_TITLE
-    flags |= orth.is_upper(string) << IS_UPPER
-
-    flags |= orth.like_url(string) << LIKE_URL
-    flags |= orth.like_number(string) << LIKE_NUMBER
-    return flags
-
-
 cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
-                  StringStore store, dict props) except *:
+                  StringStore string_store, dict props) except *:
     cdef Lexeme lex
     lex.id = i
     lex.length = len(string)
-    lex.sic = get_string_id(string, store)
+    lex.sic = string_store[string]
 
     lex.cluster = props.get('cluster', 0)
-    lex.postype = props.get('postype', 0)
-    lex.supersense = props.get('supersense', 0)
+    lex.pos_type = props.get('pos_type', 0)
+    lex.sense_type = props.get('sense_type', 0)
     lex.prob = props.get('prob', 0)
 
-    cdef float upper_pc = props.get('upper_pc', 0.0)
-    cdef float lower_pc = props.get('lower_pc', 0.0)
-    cdef float title_pc = props.get('title_pc', 0.0)
+    lex.upper_pc = props.get('upper_pc', 0.0)
+    lex.title_pc = props.get('title_pc', 0.0)
 
-    lex.prefix = get_string_id(string[0], store)
-    lex.suffix = get_string_id(string[-3:], store)
-    if upper_pc or lower_pc or title_pc:
-        canon_cased = orth.canon_case(string, upper_pc, title_pc, lower_pc)
-        lex.norm = get_string_id(canon_cased, store)
-    else:
-        lex.norm = lex.sic
-    lex.shape = get_string_id(orth.word_shape(string), store)
-    lex.asciied = get_string_id(orth.asciied(string), store)
-    lex.flags = get_flags(string, upper_pc, title_pc, lower_pc)
-
-    lex.flags |= props.get('in_males', 0) << IN_MALES
-    lex.flags |= props.get('in_females', 0) << IN_FEMALES
-    lex.flags |= props.get('in_surnames', 0) << IN_SURNAMES
-    lex.flags |= props.get('in_places', 0) << IN_PLACES
-    lex.flags |= props.get('in_celebs', 0) << IN_CELEBS
-    lex.flags |= props.get('in_games', 0) << IN_GAMES
-    lex.flags |= props.get('in_names', 0) << IN_NAMES
+    lex.prefix = string_store[string[:1]]
+    lex.suffix = string_store[string[-3:]]
+    lex.norm = lex.sic  # TODO
+    lex.shape = string_store[orth.word_shape(string)]
+    lex.asciied = string_store[orth.asciied(string)]
+
+    lex.flags = props.get('flags', 0)
     return lex
 
-cdef id_t get_string_id(unicode string, StringStore store) except 0:
-    cdef bytes byte_string = string.encode('utf8')
-    cdef Utf8Str* orig_str = store.intern(byte_string, len(byte_string))
-    return orig_str.i
+cdef attr_t get_attr(Lexeme* lex, attr_id_t feat_name):
+    if feat_name < (sizeof(flags_t) * 8):
+        return check_flag(lex, feat_name)
+    elif feat_name == ID:
+        return lex.id
+    elif feat_name == SIC:
+        return lex.sic
+    elif feat_name == NORM:
+        return lex.norm
+    elif feat_name == SHAPE:
+        return lex.shape
+    elif feat_name == ASCIIED:
+        return lex.asciied
+    elif feat_name == PREFIX:
+        return lex.prefix
+    elif feat_name == SUFFIX:
+        return lex.suffix
+    elif feat_name == LENGTH:
+        return lex.length
+    elif feat_name == CLUSTER:
+        return lex.cluster
+    elif feat_name == POS_TYPE:
+        return lex.pos_type
+    elif feat_name == SENSE_TYPE:
+        return lex.sense_type
+    else:
+        raise StandardError('Feature ID: %d not found' % feat_name)
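
Note: in the rewritten init above, every string-valued field now goes through StringStore's __getitem__, which interns the string and returns a stable integer ID, keeping Lexeme a flat struct of integers. A toy store showing the interning contract the new code relies on; this is a sketch, not the real utf8string class:

    # Toy StringStore: __getitem__ interns a string and returns a stable
    # integer ID. ID 0 stays reserved, as with EMPTY_LEXEME in the lexicon.
    class StringStore(object):
        def __init__(self):
            self._map = {}

        def __getitem__(self, string):
            if string not in self._map:
                self._map[string] = len(self._map) + 1
            return self._map[string]

    store = StringStore()
    word = u'tokenizer'
    sic = store[word]
    prefix = store[word[:1]]      # cf. lex.prefix = string_store[string[:1]]
    suffix = store[word[-3:]]     # cf. lex.suffix = string_store[string[-3:]]
    assert store[word] == sic     # repeated lookups return the same ID
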