* Large refactor, particularly to Python API

Matthew Honnibal 2014-10-24 00:59:17 +11:00
parent 168b2b8cb2
commit 08ce602243
21 changed files with 327 additions and 958 deletions

View File

@@ -1,42 +0,0 @@
from spacy.spacy cimport Language
from spacy.word cimport Lexeme
cimport cython
cpdef size_t ALPHA
cpdef size_t DIGIT
cpdef size_t PUNCT
cpdef size_t SPACE
cpdef size_t LOWER
cpdef size_t UPPER
cpdef size_t TITLE
cpdef size_t ASCII
cpdef size_t OFT_LOWER
cpdef size_t OFT_TITLE
cpdef size_t OFT_UPPER
cpdef size_t PUNCT
cpdef size_t CONJ
cpdef size_t NUM
cpdef size_t N
cpdef size_t DET
cpdef size_t ADP
cpdef size_t ADJ
cpdef size_t ADV
cpdef size_t VERB
cpdef size_t NOUN
cpdef size_t PDT
cpdef size_t POS
cpdef size_t PRON
cpdef size_t PRT
cdef class English(spacy.Language):
cdef int find_split(self, unicode word)
cdef English EN
cpdef Word lookup(unicode word)
cpdef list tokenize(unicode string)

View File

@@ -1,126 +0,0 @@
# cython: profile=True
# cython: embedsignature=True
'''Tokenize German text, using a scheme based on the Negra corpus.
Tokenization is generally similar to English text, and the same set of orthographic
flags is used.
An abbreviation list is used to handle common abbreviations. Hyphenated words
are not split, following the Treebank usage.
'''
from __future__ import unicode_literals
from libc.stdint cimport uint64_t
cimport spacy
from spacy.orth import is_alpha, is_digit, is_punct, is_space, is_lower, is_upper, is_ascii
from spacy.orth import canonicalize_case, get_string_shape, asciify, get_non_sparse
from spacy.common cimport check_punct
# Python-readable flag constants --- can't read an enum from Python
# Don't want to manually assign these numbers, or we'll insert one and have to
# change them all.
# Don't use "i", as we don't want it in the global scope!
cdef size_t __i = 0
ALPHA = __i; __i += 1
DIGIT = __i; __i += 1
PUNCT = __i; __i += 1
SPACE = __i; __i += 1
LOWER = __i; __i += 1
UPPER = __i; __i += 1
TITLE = __i; __i += 1
ASCII = __i; __i += 1
OFT_LOWER = __i; __i += 1
OFT_UPPER = __i; __i += 1
OFT_TITLE = __i; __i += 1
PUNCT = __i; __i += 1
CONJ = __i; __i += 1
NUM = __i; __i += 1
X = __i; __i += 1
DET = __i; __i += 1
ADP = __i; __i += 1
ADJ = __i; __i += 1
ADV = __i; __i += 1
VERB = __i; __i += 1
NOUN = __i; __i += 1
PDT = __i; __i += 1
POS = __i; __i += 1
PRON = __i; __i += 1
PRT = __i; __i += 1
# These are for the string views
__i = 0
SIC = __i; __i += 1
CANON_CASED = __i; __i += 1
NON_SPARSE = __i; __i += 1
SHAPE = __i; __i += 1
NR_STRING_VIEWS = __i
def get_string_views(unicode string, lexeme):
views = ['' for _ in range(NR_STRING_VIEWS)]
views[SIC] = string
views[CANON_CASED] = canonicalize_case(string, lexeme)
views[SHAPE] = get_string_shape(string)
views[ASCIIFIED] = get_asciified(string)
views[FIXED_VOCAB] = get_non_sparse(string, views[ASCIIFIED], views[CANON_CASED],
views[SHAPE], lexeme)
return views
def set_orth_flags(unicode string, flags_t flags):
setters = [
(ALPHA, is_alpha),
(DIGIT, is_digit),
(PUNCT, is_punct),
(SPACE, is_space),
(LOWER, is_lower),
(UPPER, is_upper),
(SPACE, is_space)
]
for bit, setter in setters:
if setter(string):
flags |= 1 << bit
return flags
cdef class German(spacy.Language):
cdef Lexeme new_lexeme(self, unicode string, cluster=0, case_stats=None,
tag_freqs=None):
return Lexeme(s, length, views, prob=prob, cluster=cluster,
                      flags=self.get_flags(string))
cdef int find_split(self, unicode word):
cdef size_t length = len(word)
cdef int i = 0
if word.startswith("'s") or word.startswith("'S"):
return 2
# Contractions
if word.endswith("'s") and length >= 3:
return length - 2
# Leading punctuation
if check_punct(word, 0, length):
return 1
elif length >= 1:
# Split off all trailing punctuation characters
i = 0
while i < length and not check_punct(word, i, length):
i += 1
return i
DE = German('de')
lookup = DE.lookup
tokenize = DE.tokenize
load_clusters = DE.load_clusters
load_unigram_probs = DE.load_unigram_probs
load_case_stats = DE.load_case_stats
load_tag_stats = DE.load_tag_stats
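
For orientation, a minimal stand-alone sketch (not part of the diff) of the bit-flag scheme the auto-numbered constants and set_orth_flags above rely on: each property owns a bit position, predicates set bits with flags |= 1 << bit, and callers test them with a mask. The lambdas stand in for the spacy.orth predicates.

# Illustrative sketch only; string methods stand in for spacy.orth functions.
ALPHA, DIGIT, PUNCT, SPACE, LOWER, UPPER, TITLE = range(7)

def set_orth_flags(string):
    setters = [
        (ALPHA, lambda s: s.isalpha()),
        (DIGIT, lambda s: s.isdigit()),
        (SPACE, lambda s: s.isspace()),
        (LOWER, lambda s: s.islower()),
        (UPPER, lambda s: s.isupper()),
        (TITLE, lambda s: s.istitle()),
    ]
    flags = 0
    for bit, setter in setters:
        if setter(string):
            flags |= 1 << bit
    return flags

def check_flag(flags, bit):
    return bool(flags & (1 << bit))

assert check_flag(set_orth_flags(u'Hello'), ALPHA)
assert not check_flag(set_orth_flags(u'1999'), ALPHA)
assert check_flag(set_orth_flags(u'1999'), DIGIT)
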

View File

@@ -1,5 +1,4 @@
from spacy.lang cimport Language
from spacy.word cimport Lexeme
from spacy.tokens cimport Tokens

View File

@@ -1,14 +1,12 @@
from libc.stdint cimport uint32_t
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector
from libc.stdint cimport uint64_t, int64_t
from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool
from .word cimport Lexeme
from .typedefs cimport hash_t
from .tokens cimport Tokens
from .lexeme cimport LexemeC
from .lexeme cimport Lexeme
from .utf8string cimport StringStore
cdef extern from "Python.h":
@@ -21,23 +19,25 @@ cdef extern from "Python.h":
cdef struct String:
Py_UNICODE* chars
size_t n
uint64_t key
hash_t key
cdef class Lexicon:
cdef Pool mem
cpdef readonly size_t size
cpdef readonly StringStore strings
cdef vector[LexemeC*] lexemes
cdef vector[Lexeme*] lexemes
cpdef Lexeme lookup(self, unicode string)
cdef LexemeC* get(self, String* s) except NULL
cdef Lexeme* get(self, String* s) except NULL
cdef PreshMap _dict
cdef list _string_features
cdef list _flag_features
cdef class Language:
cdef Pool _mem
cdef unicode name
@@ -52,12 +52,12 @@ cdef class Language:
cpdef Tokens tokenize(self, unicode text)
cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1
cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes) except NULL
cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
vector[Lexeme*] *suffixes) except NULL
cdef int _attach_tokens(self, Tokens tokens, int idx, String* string,
vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except -1
cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1
cdef int _save_cached(self, LexemeC** tokens, uint64_t key, int n) except -1
cdef int _save_cached(self, Lexeme** tokens, hash_t key, int n) except -1
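
As a rough picture of what the _split_affixes / _find_prefix / _find_suffix declarations above are for (a sketch, not the commit's code): recognised punctuation is peeled off both ends of a whitespace-delimited span, the prefixes and suffixes are kept aside, and only the stripped remainder goes to the lexicon. The two regexes below are simplified stand-ins for the real affix tables.

import re

PREFIX = re.compile(r'^[\(\["\']')          # assumed, simplified prefix set
SUFFIX = re.compile(r'[\)\]"\'\.,;!?]$')    # assumed, simplified suffix set

def split_affixes(span):
    prefixes, suffixes = [], []
    while span:
        m = PREFIX.search(span)
        if m:
            prefixes.append(m.group())
            span = span[m.end():]
            continue
        m = SUFFIX.search(span)
        if m:
            suffixes.insert(0, m.group())   # the real code stores these reversed
            span = span[:m.start()]
            continue
        break
    return prefixes, span, suffixes

assert split_affixes(u'("Hello!")') == (['(', '"'], u'Hello', ['!', '"', ')'])
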

View File

@@ -13,22 +13,21 @@ import random
from os import path
import re
from .util import read_lang_data
from .tokens import Tokens
from .lexeme cimport LexemeC, get_lexeme_dict, lexeme_pack, lexeme_unpack
from .lexeme cimport LexStr_orig
from murmurhash.mrmr cimport hash64
from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool
from cython.operator cimport preincrement as preinc
from cython.operator cimport dereference as deref
from murmurhash.mrmr cimport hash64
from preshed.maps cimport PreshMap
from spacy import orth
from spacy import util
from .lexeme cimport Lexeme
from .lexeme cimport from_dict as lexeme_from_dict
from .lexeme cimport from_string as lexeme_from_string
from . import orth
from . import util
from .util import read_lang_data
from .tokens import Tokens
cdef class Language:
@@ -64,7 +63,7 @@ cdef class Language:
tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes.
"""
cdef int length = len(string)
cdef Tokens tokens = Tokens(length)
cdef Tokens tokens = Tokens(self.lexicon.strings, length)
if length == 0:
return tokens
cdef int i = 0
@@ -76,7 +75,7 @@ cdef class Language:
if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
if start < i:
string_slice(&span, chars, start, i)
lexemes = <LexemeC**>self.cache.get(span.key)
lexemes = <Lexeme**>self.cache.get(span.key)
if lexemes != NULL:
tokens.extend(start, lexemes, 0)
else:
@@ -88,7 +87,7 @@ cdef class Language:
i += 1
if start < i:
string_slice(&span, chars, start, i)
lexemes = <LexemeC**>self.cache.get(span.key)
lexemes = <Lexeme**>self.cache.get(span.key)
if lexemes != NULL:
tokens.extend(start, lexemes, 0)
else:
@@ -96,9 +95,9 @@ cdef class Language:
return tokens
cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1:
cdef vector[LexemeC*] prefixes
cdef vector[LexemeC*] suffixes
cdef uint64_t orig_key
cdef vector[Lexeme*] prefixes
cdef vector[Lexeme*] suffixes
cdef hash_t orig_key
cdef int orig_size
orig_key = span.key
orig_size = tokens.length
@@ -106,8 +105,8 @@ cdef class Language:
self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
self._save_cached(&tokens.lex[orig_size], orig_key, tokens.length - orig_size)
cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes) except NULL:
cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
vector[Lexeme*] *suffixes) except NULL:
cdef size_t i
cdef String prefix
cdef String suffix
@@ -150,15 +149,15 @@ cdef class Language:
cdef int _attach_tokens(self, Tokens tokens,
int idx, String* string,
vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes) except -1:
vector[Lexeme*] *prefixes,
vector[Lexeme*] *suffixes) except -1:
cdef int split
cdef LexemeC** lexemes
cdef LexemeC* lexeme
cdef Lexeme** lexemes
cdef Lexeme* lexeme
cdef String span
idx = tokens.extend(idx, prefixes.data(), prefixes.size())
if string.n != 0:
lexemes = <LexemeC**>self.cache.get(string.key)
lexemes = <Lexeme**>self.cache.get(string.key)
if lexemes != NULL:
idx = tokens.extend(idx, lexemes, 0)
else:
@@ -172,13 +171,13 @@ cdef class Language:
idx = tokens.push_back(idx, self.lexicon.get(&span))
string_slice(&span, string.chars, split + 1, string.n)
idx = tokens.push_back(idx, self.lexicon.get(&span))
cdef vector[LexemeC*].reverse_iterator it = suffixes.rbegin()
cdef vector[Lexeme*].reverse_iterator it = suffixes.rbegin()
while it != suffixes.rend():
idx = tokens.push_back(idx, deref(it))
preinc(it)
cdef int _save_cached(self, LexemeC** tokens, uint64_t key, int n) except -1:
lexemes = <LexemeC**>self._mem.alloc(n + 1, sizeof(LexemeC**))
cdef int _save_cached(self, Lexeme** tokens, hash_t key, int n) except -1:
lexemes = <Lexeme**>self._mem.alloc(n + 1, sizeof(Lexeme**))
cdef int i
for i in range(n):
lexemes[i] = tokens[i]
@@ -212,14 +211,14 @@ cdef class Language:
token_rules (list): A list of (chunk, tokens) pairs, where chunk is
a string and tokens is a list of strings.
'''
cdef LexemeC** lexemes
cdef uint64_t hashed
cdef Lexeme** lexemes
cdef hash_t hashed
cdef String string
for uni_string, substrings in token_rules:
lexemes = <LexemeC**>self._mem.alloc(len(substrings) + 1, sizeof(LexemeC*))
lexemes = <Lexeme**>self._mem.alloc(len(substrings) + 1, sizeof(Lexeme*))
for i, substring in enumerate(substrings):
string_from_unicode(&string, substring)
lexemes[i] = <LexemeC*>self.lexicon.get(&string)
lexemes[i] = <Lexeme*>self.lexicon.get(&string)
lexemes[i + 1] = NULL
string_from_unicode(&string, uni_string)
self.specials.set(string.key, lexemes)
@@ -227,33 +226,29 @@ cdef class Language:
cdef class Lexicon:
def __cinit__(self, lexemes):
def __init__(self, lexemes):
self.mem = Pool()
self._dict = PreshMap(2 ** 20)
self.strings = StringStore()
self.size = 0
cdef String string
cdef dict lexeme_dict
cdef LexemeC* lexeme
for py_string, lexeme_dict in lexemes.iteritems():
string_from_unicode(&string, py_string)
lexeme = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
lexeme_unpack(lexeme, lexeme_dict)
self._dict.set(string.key, lexeme)
self.lexemes.push_back(lexeme)
self.size += 1
cdef Lexeme* lexeme
#for py_string, lexeme_dict in lexemes.iteritems():
# string_from_unicode(&string, py_string)
# lexeme = <Lexeme*>self.mem.alloc(1, sizeof(Lexeme))
# lexeme_from_dict(lexeme, lexeme_dict, self.strings)
# self._dict.set(string.key, lexeme)
# self.lexemes.push_back(lexeme)
# self.size += 1
def __getitem__(self, size_t i):
return Lexeme(<size_t>self.lexemes.at(i))
cdef LexemeC* get(self, String* string) except NULL:
cdef LexemeC* lex
lex = <LexemeC*>self._dict.get(string.key)
cdef Lexeme* get(self, String* string) except NULL:
cdef Lexeme* lex
lex = <Lexeme*>self._dict.get(string.key)
if lex != NULL:
return lex
lex = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
cdef unicode unicode_string = string.chars[:string.n]
lexeme_unpack(lex, get_lexeme_dict(self.size, unicode_string))
lex = <Lexeme*>self.mem.alloc(1, sizeof(Lexeme))
lexeme_from_string(lex, string.chars[:string.n], self.strings)
self._dict.set(string.key, lex)
self.lexemes.push_back(lex)
self.size += 1
@@ -270,8 +265,8 @@ cdef class Lexicon:
"""
cdef String string
string_from_unicode(&string, uni_string)
cdef LexemeC* lexeme = self.get(&string)
return Lexeme(<size_t>lexeme)
cdef Lexeme* lexeme = self.get(&string)
return lexeme[0]
cdef void string_from_unicode(String* s, unicode uni):
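
The tokenize loop above walks the string once, flips state at whitespace boundaries, and serves each non-space span from a cache before falling back to the affix-splitting path. Below is a simplified pure-Python sketch of that control flow only; the dict cache and the tokenize_span callback stand in for the PreshMap keyed by hash64 and for _tokenize, and whitespace spans are simply skipped here.

def tokenize(text, cache, tokenize_span):
    # tokenize_span(span) -> list of tokens; results are memoised per span.
    tokens = []
    if not text:
        return tokens
    start = 0
    in_ws = text[0].isspace()
    for i, ch in enumerate(text):
        if ch.isspace() != in_ws:
            if start < i and not in_ws:
                span = text[start:i]
                if span not in cache:
                    cache[span] = tokenize_span(span)
                tokens.extend(cache[span])
            in_ws = not in_ws
            start = i
    if start < len(text) and not in_ws:
        span = text[start:]
        if span not in cache:
            cache[span] = tokenize_span(span)
        tokens.extend(cache[span])
    return tokens

cache = {}
print(tokenize(u'hello, possums.', cache, lambda s: [s]))   # identity splitter
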

View File

@@ -1,94 +1,55 @@
from .typedefs cimport hash_t, utf8_t, flag_t, id_t
from cymem.cymem cimport Pool
from thinc.typedefs cimport atom_t
from .utf8string cimport StringStore
cpdef flag_t OOV_DIST_FLAGS
# Flags
cpdef enum:
IS_ALPHA
IS_ASCII
IS_DIGIT
IS_LOWER
IS_PUNCT
IS_SPACE
IS_TITLE
IS_UPPER
cpdef enum LexInts:
LexInt_id
LexInt_length
LexInt_cluster
LexInt_pos
LexInt_supersense
LexInt_N
OFT_LOWER
OFT_TITLE
OFT_UPPER
cpdef enum LexFloats:
LexFloat_prob
LexFloat_sentiment
LexFloat_N
cdef struct Lexeme:
atom_t id
atom_t length
atom_t norm
atom_t shape
atom_t vocab10k
atom_t asciied
atom_t prefix
atom_t suffix
atom_t cluster
atom_t pos
atom_t supersense
float prob
flag_t flags
cpdef enum LexStrs:
LexStr_orig
LexStr_norm
LexStr_shape
LexStr_unsparse
LexStr_asciied
LexStr_pre
LexStr_suff
LexStr_N
cdef Lexeme EMPTY_LEXEME
cpdef enum LexOrthFlags:
LexOrth_alpha
LexOrth_ascii
LexOrth_digit
LexOrth_lower
LexOrth_punct
LexOrth_space
LexOrth_title
LexOrth_upper
LexOrth_N
cdef int from_string(Lexeme* lex, unicode string, StringStore store) except -1
cpdef enum LexDistFlags:
LexDist_adj
LexDist_adp
LexDist_adv
LexDist_conj
LexDist_det
LexDist_noun
LexDist_num
LexDist_pdt
LexDist_pos
LexDist_pron
LexDist_prt
LexDist_punct
LexDist_verb
LexDist_lower
LexDist_title
LexDist_upper
LexDist_N
cdef int from_dict(Lexeme* lex, dict props, StringStore store) except -1
cdef struct LexemeC:
int[<int>LexInt_N] ints
float[<int>LexFloat_N] floats
utf8_t[<int>LexStr_N] strings
flag_t orth_flags
flag_t dist_flags
cdef LexemeC EMPTY_LEXEME
cpdef dict get_lexeme_dict(size_t i, unicode string)
cdef char* intern_and_encode(unicode string, size_t* length) except NULL
cdef int lexeme_get_int(LexemeC* lexeme, size_t i) except *
cdef float lexeme_get_float(LexemeC* lexeme, size_t i) except *
cdef unicode lexeme_get_string(LexemeC* lexeme, size_t i)
cdef bint lexeme_check_orth_flag(LexemeC* lexeme, size_t flag_id) except *
cdef bint lexeme_check_dist_flag(LexemeC* lexeme, size_t flag_id) except *
cdef dict lexeme_pack(LexemeC* lexeme)
cdef int lexeme_unpack(LexemeC* lexeme, dict p) except -1
cdef inline bint check_flag(Lexeme* lexeme, size_t flag_id) nogil:
return lexeme.flags & (1 << flag_id)
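
For comparison with the deleted array-based LexemeC, the new header above keeps one flat struct plus a single flags word tested by check_flag. A rough Python analogue of that layout, with field names copied from the declaration and purely illustrative values:

class PyLexeme(object):
    # Flat mirror of the Lexeme struct declared above; string-valued fields
    # hold StringStore indices rather than the strings themselves.
    def __init__(self, **fields):
        defaults = dict(id=0, length=0, norm=0, shape=0, vocab10k=0, asciied=0,
                        prefix=0, suffix=0, cluster=0, pos=0, supersense=0,
                        prob=0.0, flags=0)
        defaults.update(fields)
        for name, value in defaults.items():
            setattr(self, name, value)

IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE, IS_TITLE, IS_UPPER = range(8)

def check_flag(lex, flag_id):
    return bool(lex.flags & (1 << flag_id))

lex = PyLexeme(flags=(1 << IS_ALPHA) | (1 << IS_LOWER))
assert check_flag(lex, IS_ALPHA) and not check_flag(lex, IS_UPPER)
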

View File

@@ -5,106 +5,40 @@ from libc.string cimport memset
import orth
from .utf8string cimport Utf8Str
OOV_DIST_FLAGS = 0
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))
cpdef dict get_lexeme_dict(size_t i, unicode string):
ints = [None for _ in range(LexInt_N)]
ints[<int>LexInt_id] = i
ints[<int>LexInt_length] = len(string)
ints[<int>LexInt_cluster] = 0
ints[<int>LexInt_pos] = 0
ints[<int>LexInt_supersense] = 0
floats = [None for _ in range(LexFloat_N)]
floats[<int>LexFloat_prob] = 0
floats[<int>LexFloat_sentiment] = 0
strings = [None for _ in range(LexStr_N)]
strings[<int>LexStr_orig] = string
strings[<int>LexStr_norm] = strings[<int>LexStr_orig]
strings[<int>LexStr_shape] = orth.word_shape(string)
strings[<int>LexStr_unsparse] = strings[<int>LexStr_shape]
strings[<int>LexStr_asciied] = orth.asciied(string)
strings[<int>LexStr_pre] = string[0]
strings[<int>LexStr_suff] = string[-3:]
orth_flags = get_orth_flags(string)
dist_flags = OOV_DIST_FLAGS
return {'ints': ints, 'floats': floats, 'strings': strings,
'orth_flags': orth_flags, 'dist_flags': dist_flags}
def get_orth_flags(unicode string):
def get_flags(unicode string):
cdef flag_t flags = 0
flags |= orth.is_ascii(string) << LexOrth_ascii
flags |= orth.is_alpha(string) << LexOrth_alpha
flags |= orth.is_digit(string) << LexOrth_digit
flags |= orth.is_lower(string) << LexOrth_lower
flags |= orth.is_punct(string) << LexOrth_punct
flags |= orth.is_space(string) << LexOrth_space
flags |= orth.is_title(string) << LexOrth_title
flags |= orth.is_upper(string) << LexOrth_upper
flags |= orth.is_alpha(string) << IS_ALPHA
flags |= orth.is_ascii(string) << IS_ASCII
flags |= orth.is_digit(string) << IS_DIGIT
flags |= orth.is_lower(string) << IS_LOWER
flags |= orth.is_punct(string) << IS_PUNCT
flags |= orth.is_space(string) << IS_SPACE
flags |= orth.is_title(string) << IS_TITLE
flags |= orth.is_upper(string) << IS_UPPER
return flags
def get_dist_flags(unicode string):
return 0
cdef char* intern_and_encode(unicode string, size_t* length) except NULL:
cdef int from_string(Lexeme* lex, unicode string, StringStore store) except -1:
cdef bytes byte_string = string.encode('utf8')
cdef bytes utf8_string = intern(byte_string)
Py_INCREF(utf8_string)
length[0] = len(utf8_string)
return <char*>utf8_string
cdef Utf8Str* orig_str = store.intern(<char*>byte_string, len(byte_string))
lex.id = orig_str.i
lex.cluster = 0
lex.length = len(string)
lex.flags = get_flags(string)
# TODO: Hook this up
#lex.norm = norm_str.i
#lex.shape = norm_str.i
#lex.asciied = asciied_str.i
#lex.prefix = prefix_str.i
#lex.suffix = suffix_str.i
cdef int lexeme_get_int(LexemeC* lexeme, size_t i) except *:
return lexeme.ints[i]
cdef float lexeme_get_float(LexemeC* lexeme, size_t i) except *:
return lexeme.floats[i]
cdef unicode lexeme_get_string(LexemeC* lexeme, size_t i):
cdef bytes byte_string = lexeme.strings[i]
return byte_string.decode('utf8')
cdef bint lexeme_check_orth_flag(LexemeC* lexeme, size_t flag_id) except *:
return lexeme.orth_flags & (1 << flag_id)
cdef bint lexeme_check_dist_flag(LexemeC* lexeme, size_t flag_id) except *:
return lexeme.dist_flags & (1 << flag_id)
cdef dict lexeme_pack(LexemeC* lex):
cdef dict packed = {}
packed['ints'] = [lex.ints[i] for i in range(LexInt_N)]
packed['floats'] = [lex.floats[i] for i in range(LexFloat_N)]
packed['strings'] = [lex.strings[i].decode('utf8') for i in range(LexStr_N)]
packed['orth_flags'] = lex.orth_flags
packed['dist_flags'] = lex.dist_flags
return packed
cdef int lexeme_unpack(LexemeC* lex, dict p) except -1:
cdef size_t i
cdef int lex_int
cdef float lex_float
cdef unicode string
for i, lex_int in enumerate(p['ints']):
lex.ints[i] = lex_int
for i, lex_float in enumerate(p['floats']):
lex.floats[i] = lex_float
cdef size_t _
for i in range(LexStr_N):
lex_string = p['strings'][i]
lex.strings[i] = intern_and_encode(lex_string, &_)
lex.orth_flags = p['orth_flags']
lex.dist_flags = p['dist_flags']
cdef int from_dict(Lexeme* lex, dict props, StringStore store) except -1:
pass
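
A hedged sketch of the interning idea behind from_string above: the string store maps each UTF-8 byte string to a stable integer index, and the lexeme records only that index (id) plus its length and flags. The toy store below is a plain dict/list pair, not the real StringStore API.

from types import SimpleNamespace

class ToyStringStore(object):
    # Maps each distinct UTF-8 byte string to a stable integer index and back.
    def __init__(self):
        self._indices = {}
        self._strings = []

    def intern(self, utf8_bytes):
        if utf8_bytes not in self._indices:
            self._indices[utf8_bytes] = len(self._strings)
            self._strings.append(utf8_bytes)
        return self._indices[utf8_bytes]

    def __getitem__(self, i):
        return self._strings[i]

def from_string(lex, string, store):
    lex.id = store.intern(string.encode('utf8'))
    lex.length = len(string)
    lex.flags = 0   # get_flags(string) in the real code

store = ToyStringStore()
lex = SimpleNamespace(id=0, length=0, flags=0)
from_string(lex, u'Hello', store)
assert store[lex.id].decode('utf8') == u'Hello'
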

View File

@@ -113,8 +113,8 @@ cpdef enum:
CONTEXT_SIZE
cdef int get_atoms(atom_t* atoms, LexemeC* p2, LexemeC* p1, LexemeC* n0, LexemeC* n1,
LexemeC* n2, class_t prev_tag, class_t prev_prev_tag) except -1:
cdef int get_atoms(atom_t* atoms, Lexeme* p2, Lexeme* p1, Lexeme* n0, Lexeme* n1,
Lexeme* n2, class_t prev_tag, class_t prev_prev_tag) except -1:
_fill_token(&atoms[P2i], p2)
_fill_token(&atoms[P1i], p1)
_fill_token(&atoms[N0i], n0)
@@ -124,16 +124,16 @@ cdef int get_atoms(atom_t* atoms, LexemeC* p2, LexemeC* p1, LexemeC* n0, LexemeC
atoms[P2t] = prev_prev_tag
cdef inline void _fill_token(atom_t* atoms, LexemeC* lex) nogil:
atoms[0] = lex.ints[<int>LexInt_id]
atoms[1] = lex.ints[<int>LexInt_cluster]
atoms[2] = <atom_t>lex.strings[<int>LexStr_norm]
atoms[3] = <atom_t>lex.strings[<int>LexStr_shape]
atoms[4] = <atom_t>lex.strings[<int>LexStr_pre]
atoms[5] = <atom_t>lex.strings[<int>LexStr_suff]
cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil:
atoms[0] = lex.id
atoms[1] = lex.cluster
atoms[2] = lex.norm
atoms[3] = lex.shape
atoms[4] = lex.prefix
atoms[5] = lex.suffix
atoms[6] = lex.dist_flags & (1 << LexDist_title)
atoms[7] = lex.dist_flags & (1 << LexDist_upper)
atoms[6] = lex.flags & (1 << OFT_TITLE)
atoms[7] = lex.flags & (1 << OFT_UPPER)
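
Reading the new _fill_token above: the tagger's feature atoms are now plain struct fields plus two flag tests, instead of indexing into the old ints/strings arrays. A small sketch with a dict standing in for the lexeme; the OFT_* bit positions are assumed here for illustration only.

OFT_TITLE, OFT_UPPER = 9, 10    # assumed bit positions, for illustration only

def fill_token(atoms, offset, lex):
    atoms[offset + 0] = lex['id']
    atoms[offset + 1] = lex['cluster']
    atoms[offset + 2] = lex['norm']
    atoms[offset + 3] = lex['shape']
    atoms[offset + 4] = lex['prefix']
    atoms[offset + 5] = lex['suffix']
    atoms[offset + 6] = lex['flags'] & (1 << OFT_TITLE)
    atoms[offset + 7] = lex['flags'] & (1 << OFT_UPPER)

atoms = [0] * 8
fill_token(atoms, 0, dict(id=1, cluster=2, norm=3, shape=4, prefix=5, suffix=6,
                          flags=1 << OFT_TITLE))
assert atoms[6] and not atoms[7]
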
TEMPLATES = (

View File

@@ -20,6 +20,8 @@ def realign_tagged(token_rules, tagged_line, sep='/'):
def read_tagged(detoken_rules, file_, sep='/'):
sentences = []
for line in file_:
if not line.strip():
continue
line = realign_tagged(detoken_rules, line, sep=sep)
tokens, tags = _parse_line(line, sep)
assert len(tokens) == len(tags)
@@ -39,7 +41,7 @@ def _parse_line(line, sep):
subtags.append('NULL')
assert len(subtags) == len(subtokens), [t.string for t in subtokens]
words.append(word)
tags.extend([Tagger.encode_pos(pos) for pos in subtags])
tags.extend([Tagger.encode_pos(ptb_to_univ(pos)) for pos in subtags])
return EN.tokenize(' '.join(words)), tags
@@ -53,3 +55,86 @@ def get_tagdict(train_sents):
tagdict.setdefault(word, {}).setdefault(tag, 0)
tagdict[word][tag] += 1
return tagdict
def ptb_to_univ(tag):
mapping = dict(tuple(line.split()) for line in """
NULL NULL
HYPH .
ADD X
NFP .
AFX X
XX X
BES VERB
HVS VERB
GW X
! .
# .
$ .
'' .
( .
) .
, .
-LRB- .
-RRB- .
. .
: .
? .
CC CONJ
CD NUM
CD|RB X
DT DET
EX DET
FW X
IN ADP
IN|RP ADP
JJ ADJ
JJR ADJ
JJRJR ADJ
JJS ADJ
JJ|RB ADJ
JJ|VBG ADJ
LS X
MD VERB
NN NOUN
NNP NOUN
NNPS NOUN
NNS NOUN
NN|NNS NOUN
NN|SYM NOUN
NN|VBG NOUN
NP NOUN
PDT DET
POS PRT
PRP PRON
PRP$ PRON
PRP|VBP PRON
PRT PRT
RB ADV
RBR ADV
RBS ADV
RB|RP ADV
RB|VBG ADV
RN X
RP PRT
SYM X
TO PRT
UH X
VB VERB
VBD VERB
VBD|VBN VERB
VBG VERB
VBG|NN VERB
VBN VERB
VBP VERB
VBP|TO VERB
VBZ VERB
VP VERB
WDT DET
WH X
WP PRON
WP$ PRON
WRB ADV
`` .""".strip().split('\n'))
return mapping[tag]
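
The mapping above collapses fine-grained PTB tags to the coarse universal set before the tags are encoded in read_tagged. The same one-line parsing trick on a small excerpt (values copied from the table) shows what the lookups return:

# Spot check of the whitespace-separated two-column construction used above.
mapping = dict(tuple(line.split()) for line in """
NNS NOUN
VBD VERB
JJR ADJ
-LRB- .""".strip().split('\n'))

assert mapping['NNS'] == 'NOUN'
assert mapping['VBD'] == 'VERB'
assert mapping['-LRB-'] == '.'
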

View File

@@ -1,5 +0,0 @@
from spacy.lang cimport Language
cdef class PennTreebank3(Language):
cdef list _split(self, unicode split)

View File

@@ -1,161 +0,0 @@
'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
so that strings can be retrieved from hashes. Use 64-bit hash values and
boldly assume no collisions.
'''
from __future__ import unicode_literals
from libc.stdint cimport uint64_t
cimport spacy
import re
from spacy import orth
TAG_THRESH = 0.5
UPPER_THRESH = 0.2
LOWER_THRESH = 0.5
TITLE_THRESH = 0.7
NR_FLAGS = 0
OFT_UPPER = NR_FLAGS; NR_FLAGS += 1
OFT_LOWER = NR_FLAGS; NR_FLAGS += 1
OFT_TITLE = NR_FLAGS; NR_FLAGS += 1
IS_ALPHA = NR_FLAGS; NR_FLAGS += 1
IS_DIGIT = NR_FLAGS; NR_FLAGS += 1
IS_PUNCT = NR_FLAGS; NR_FLAGS += 1
IS_SPACE = NR_FLAGS; NR_FLAGS += 1
IS_ASCII = NR_FLAGS; NR_FLAGS += 1
IS_TITLE = NR_FLAGS; NR_FLAGS += 1
IS_LOWER = NR_FLAGS; NR_FLAGS += 1
IS_UPPER = NR_FLAGS; NR_FLAGS += 1
CAN_PUNCT = NR_FLAGS; NR_FLAGS += 1
CAN_CONJ = NR_FLAGS; NR_FLAGS += 1
CAN_NUM = NR_FLAGS; NR_FLAGS += 1
CAN_DET = NR_FLAGS; NR_FLAGS += 1
CAN_ADP = NR_FLAGS; NR_FLAGS += 1
CAN_ADJ = NR_FLAGS; NR_FLAGS += 1
CAN_ADV = NR_FLAGS; NR_FLAGS += 1
CAN_VERB = NR_FLAGS; NR_FLAGS += 1
CAN_NOUN = NR_FLAGS; NR_FLAGS += 1
CAN_PDT = NR_FLAGS; NR_FLAGS += 1
CAN_POS = NR_FLAGS; NR_FLAGS += 1
CAN_PRON = NR_FLAGS; NR_FLAGS += 1
CAN_PRT = NR_FLAGS; NR_FLAGS += 1
# List of contractions adapted from Robert MacIntyre's tokenizer.
CONTRACTIONS2 = [re.compile(r"(?i)\b(can)(not)\b"),
re.compile(r"(?i)\b(d)('ye)\b"),
re.compile(r"(?i)\b(gim)(me)\b"),
re.compile(r"(?i)\b(gon)(na)\b"),
re.compile(r"(?i)\b(got)(ta)\b"),
re.compile(r"(?i)\b(lem)(me)\b"),
re.compile(r"(?i)\b(mor)('n)\b"),
re.compile(r"(?i)\b(wan)(na) ")]
CONTRACTIONS3 = [re.compile(r"(?i) ('t)(is)\b"),
re.compile(r"(?i) ('t)(was)\b")]
CONTRACTIONS4 = [re.compile(r"(?i)\b(whad)(dd)(ya)\b"),
re.compile(r"(?i)\b(wha)(t)(cha)\b")]
def nltk_regex_tokenize(text):
# Implementation taken from NLTK 3.0, based on tokenizer.sed
#starting quotes
text = re.sub(r'^\"', r'``', text)
text = re.sub(r'(``)', r' \1 ', text)
text = re.sub(r'([ (\[{<])"', r'\1 `` ', text)
#punctuation
text = re.sub(r'([:,])([^\d])', r' \1 \2', text)
text = re.sub(r'\.\.\.', r' ... ', text)
text = re.sub(r'[;@#$%&]', r' \g<0> ', text)
text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)\s*$', r'\1 \2\3 ', text)
text = re.sub(r'[?!]', r' \g<0> ', text)
text = re.sub(r"([^'])' ", r"\1 ' ", text)
#parens, brackets, etc.
text = re.sub(r'[\]\[\(\)\{\}\<\>]', r' \g<0> ', text)
text = re.sub(r'--', r' -- ', text)
#add extra space to make things easier
text = " " + text + " "
#ending quotes
text = re.sub(r'"', " '' ", text)
text = re.sub(r'(\S)(\'\')', r'\1 \2 ', text)
text = re.sub(r"([^' ])('[sS]|'[mM]|'[dD]|') ", r"\1 \2 ", text)
text = re.sub(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 ",
text)
for regexp in CONTRACTIONS2:
text = regexp.sub(r' \1 \2 ', text)
for regexp in CONTRACTIONS3:
text = regexp.sub(r' \1 \2 ', text)
# We are not using CONTRACTIONS4 since
# they are also commented out in the SED scripts
# for regexp in self.CONTRACTIONS4:
# text = regexp.sub(r' \1 \2 \3 ', text)
return text.split()
cdef class PennTreebank3(Language):
"""Fully PTB compatible English tokenizer, tightly coupled to lexicon.
Attributes:
name (unicode): The two letter code used by Wikipedia for the language.
lexicon (Lexicon): The lexicon. Exposes the lookup method.
"""
def __cinit__(self, name):
flag_funcs = [0 for _ in range(NR_FLAGS)]
flag_funcs[OFT_UPPER] = orth.oft_case('upper', UPPER_THRESH)
flag_funcs[OFT_LOWER] = orth.oft_case('lower', LOWER_THRESH)
flag_funcs[OFT_TITLE] = orth.oft_case('title', TITLE_THRESH)
flag_funcs[IS_ALPHA] = orth.is_alpha
flag_funcs[IS_DIGIT] = orth.is_digit
flag_funcs[IS_PUNCT] = orth.is_punct
flag_funcs[IS_SPACE] = orth.is_space
flag_funcs[IS_TITLE] = orth.is_title
flag_funcs[IS_LOWER] = orth.is_lower
flag_funcs[IS_UPPER] = orth.is_upper
flag_funcs[CAN_PUNCT] = orth.can_tag('PUNCT', TAG_THRESH)
flag_funcs[CAN_CONJ] = orth.can_tag('CONJ', TAG_THRESH)
flag_funcs[CAN_NUM] = orth.can_tag('NUM', TAG_THRESH)
flag_funcs[CAN_DET] = orth.can_tag('DET', TAG_THRESH)
flag_funcs[CAN_ADP] = orth.can_tag('ADP', TAG_THRESH)
flag_funcs[CAN_ADJ] = orth.can_tag('ADJ', TAG_THRESH)
flag_funcs[CAN_VERB] = orth.can_tag('VERB', TAG_THRESH)
flag_funcs[CAN_NOUN] = orth.can_tag('NOUN', TAG_THRESH)
flag_funcs[CAN_PDT] = orth.can_tag('PDT', TAG_THRESH)
flag_funcs[CAN_POS] = orth.can_tag('POS', TAG_THRESH)
flag_funcs[CAN_PRT] = orth.can_tag('PRT', TAG_THRESH)
Language.__init__(self, name, flag_funcs)
cdef list _split(self, unicode chunk):
strings = nltk_regex_tokenize(chunk)
if strings[-1] == '.':
strings.pop()
strings[-1] += '.'
assert strings
return strings
PTB3 = PennTreebank3('ptb3')
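
The deleted module above ports NLTK's sed-based PTB tokenizer; the CONTRACTIONS2 patterns split fused forms into two tokens. A quick illustration using one of those patterns verbatim:

import re

# One of the CONTRACTIONS2 patterns above, applied the same way
# nltk_regex_tokenize applies it: the two groups become separate tokens.
cannot = re.compile(r"(?i)\b(can)(not)\b")
text = cannot.sub(r' \1 \2 ', "I cannot go")
assert text.split() == ['I', 'can', 'not', 'go']
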

View File

@@ -1,59 +1,49 @@
from cymem.cymem cimport Pool
from spacy.lexeme cimport LexemeC
from .lexeme cimport Lexeme
from .typedefs cimport flag_t
from .utf8string cimport StringStore
from thinc.typedefs cimport atom_t
cdef class Tokens:
cdef Pool mem
cdef StringStore _string_store
cdef LexemeC** _lex_ptr
cdef Lexeme** _lex_ptr
cdef int* _idx_ptr
cdef int* _pos_ptr
cdef LexemeC** lex
cdef Lexeme** lex
cdef int* idx
cdef int* pos
cdef int length
cdef int max_length
cdef int extend(self, int i, LexemeC** lexemes, int n) except -1
cdef int push_back(self, int i, LexemeC* lexeme) except -1
cdef int extend(self, int i, Lexeme** lexemes, int n) except -1
cdef int push_back(self, int i, Lexeme* lexeme) except -1
cpdef int id(self, size_t i) except -1
cpdef float prob(self, size_t i) except 1
cpdef int cluster(self, size_t i) except *
cpdef bint check_orth_flag(self, size_t i, size_t flag_id) except *
cpdef bint check_dist_flag(self, size_t i, size_t flag_id) except *
cpdef unicode string_view(self, size_t i, size_t view_id)
cpdef unicode string(self, size_t i)
cpdef unicode orig(self, size_t i)
cpdef unicode norm(self, size_t i)
cpdef unicode shape(self, size_t i)
cpdef unicode unsparse(self, size_t i)
cpdef unicode asciied(self, size_t i)
cpdef bint is_alpha(self, size_t i) except *
cpdef bint is_ascii(self, size_t i) except *
cpdef bint is_digit(self, size_t i) except *
cpdef bint is_lower(self, size_t i) except *
cpdef bint is_punct(self, size_t i) except *
cpdef bint is_space(self, size_t i) except *
cpdef bint is_title(self, size_t i) except *
cpdef bint is_upper(self, size_t i) except *
cpdef bint can_adj(self, size_t i) except *
cpdef bint can_adp(self, size_t i) except *
cpdef bint can_adv(self, size_t i) except *
cpdef bint can_conj(self, size_t i) except *
cpdef bint can_det(self, size_t i) except *
cpdef bint can_noun(self, size_t i) except *
cpdef bint can_num(self, size_t i) except *
cpdef bint can_pdt(self, size_t i) except *
cpdef bint can_pos(self, size_t i) except *
cpdef bint can_pron(self, size_t i) except *
cpdef bint can_prt(self, size_t i) except *
cpdef bint can_punct(self, size_t i) except *
cpdef bint can_verb(self, size_t i) except *
cpdef bint oft_lower(self, size_t i) except *
cpdef bint oft_title(self, size_t i) except *
cpdef bint oft_upper(self, size_t i) except *
cdef class Token:
cdef StringStore _string_store
cdef public int i
cdef public int idx
cdef public int pos
cdef public atom_t id
cdef public atom_t cluster
cdef public atom_t length
cdef public atom_t lex_pos
cdef public atom_t lex_supersense
cdef public atom_t norm
cdef public atom_t shape
cdef public atom_t vocab10k
cdef public atom_t asciied
cdef public atom_t prefix
cdef public atom_t suffix
cdef public float prob
cdef public flag_t flags

View File

@@ -1,10 +1,6 @@
# cython: profile=True
from .word cimport Lexeme
from .lexeme cimport *
cimport numpy
cimport cython
import numpy
DEF PADDING = 5
@@ -34,7 +30,8 @@ cdef class Tokens:
>>> tokens.can_noun(1)
True
"""
def __init__(self, string_length=0):
def __init__(self, StringStore string_store, string_length=0):
self._string_store = string_store
if string_length >= 3:
size = int(string_length / 3.0)
else:
@@ -43,7 +40,7 @@ cdef class Tokens:
# Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
# However, we need to remember the true starting places, so that we can
# realloc.
self._lex_ptr = <LexemeC**>self.mem.alloc(size + (PADDING*2), sizeof(LexemeC*))
self._lex_ptr = <Lexeme**>self.mem.alloc(size + (PADDING*2), sizeof(Lexeme*))
self._idx_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
self._pos_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
self.lex = self._lex_ptr
@@ -55,39 +52,26 @@ cdef class Tokens:
self.lex += PADDING
self.idx += PADDING
self.pos += PADDING
self.max_length = size
self.length = 0
def __getitem__(self, i):
bounds_check(i, self.length, PADDING)
return Lexeme(<size_t>self.lex[i])
return Token(self._string_store, i, self.idx[i], self.pos[i], self.lex[i][0])
def __len__(self):
return self.length
cdef int push_back(self, int idx, LexemeC* lexeme) except -1:
cdef int push_back(self, int idx, Lexeme* lexeme) except -1:
if self.length == self.max_length:
self._realloc(self.length * 2)
self.lex[self.length] = lexeme
self.idx[self.length] = idx
self.pos[self.length] = 0
self.length += 1
return idx + lexeme.ints[<int>LexInt_length]
return idx + lexeme.length
def _realloc(self, new_size):
self.max_length = new_size
n = new_size + (PADDING * 2)
self._lex_ptr = <LexemeC**>self.mem.realloc(self._lex_ptr, n * sizeof(LexemeC*))
self._idx_ptr = <int*>self.mem.realloc(self._idx_ptr, n * sizeof(int))
self._pos_ptr = <int*>self.mem.realloc(self._pos_ptr, n * sizeof(int))
self.lex = self._lex_ptr + PADDING
self.idx = self._idx_ptr + PADDING
self.pos = self._pos_ptr + PADDING
for i in range(self.length, self.max_length + PADDING):
self.lex[i] = &EMPTY_LEXEME
cdef int extend(self, int idx, LexemeC** lexemes, int n) except -1:
cdef int extend(self, int idx, Lexeme** lexemes, int n) except -1:
cdef int i
if lexemes == NULL:
return idx
@@ -101,154 +85,43 @@ cdef class Tokens:
idx = self.push_back(idx, lexemes[i])
return idx
cpdef int id(self, size_t i) except -1:
bounds_check(i, self.length, PADDING)
return self.lex[i].ints[<int>LexInt_id]
def _realloc(self, new_size):
self.max_length = new_size
n = new_size + (PADDING * 2)
self._lex_ptr = <Lexeme**>self.mem.realloc(self._lex_ptr, n * sizeof(Lexeme*))
self._idx_ptr = <int*>self.mem.realloc(self._idx_ptr, n * sizeof(int))
self._pos_ptr = <int*>self.mem.realloc(self._pos_ptr, n * sizeof(int))
self.lex = self._lex_ptr + PADDING
self.idx = self._idx_ptr + PADDING
self.pos = self._pos_ptr + PADDING
for i in range(self.length, self.max_length + PADDING):
self.lex[i] = &EMPTY_LEXEME
cpdef float prob(self, size_t i) except 1:
bounds_check(i, self.length, PADDING)
return self.lex[i].floats[<int>LexFloat_prob]
cpdef int cluster(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return self.lex[i].ints[<int>LexInt_cluster]
@cython.freelist(64)
cdef class Token:
def __init__(self, StringStore string_store, int i, int idx, int pos, dict lex):
self._string_store = string_store
self.i = i
self.idx = idx
self.pos = pos
cpdef bint check_orth_flag(self, size_t i, size_t flag_id) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_orth_flag(self.lex[i], flag_id)
self.id = lex['id']
self.cluster = lex['cluster']
self.length = lex['length']
self.lex_pos = lex['pos']
self.lex_supersense = lex['supersense']
self.norm = lex['norm']
self.shape = lex['shape']
self.vocab10k = lex['vocab10k']
self.asciied = lex['asciied']
self.prefix = lex['prefix']
self.suffix = lex['suffix']
cpdef bint check_dist_flag(self, size_t i, size_t flag_id) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], flag_id)
self.prob = lex['prob']
self.flags = lex['flags']
cpdef unicode string_view(self, size_t i, size_t view_id):
bounds_check(i, self.length, PADDING)
return lexeme_get_string(self.lex[i], view_id)
property string:
def __get__(self):
cdef bytes utf8string = self._string_store[self.id]
return utf8string.decode('utf8')
# Provide accessor methods for the features supported by the language.
# Without these, clients have to use the underlying string_view and check_flag
# methods, which requires them to know the IDs.
cpdef unicode string(self, size_t i):
bounds_check(i, self.length, PADDING)
return self.orig(i)
cpdef unicode orig(self, size_t i):
bounds_check(i, self.length, PADDING)
cdef bytes utf8_string = self.lex[i].strings[<int>LexStr_orig]
cdef unicode string = utf8_string.decode('utf8')
return string
cpdef unicode norm(self, size_t i):
bounds_check(i, self.length, PADDING)
cdef bytes utf8_string = self.lex[i].strings[<int>LexStr_norm]
cdef unicode string = utf8_string.decode('utf8')
return string
cpdef unicode shape(self, size_t i):
bounds_check(i, self.length, PADDING)
return lexeme_get_string(self.lex[i], LexStr_shape)
cpdef unicode unsparse(self, size_t i):
bounds_check(i, self.length, PADDING)
return lexeme_get_string(self.lex[i], LexStr_unsparse)
cpdef unicode asciied(self, size_t i):
bounds_check(i, self.length, PADDING)
return lexeme_get_string(self.lex[i], LexStr_asciied)
cpdef bint is_alpha(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_orth_flag(self.lex[i], LexOrth_alpha)
cpdef bint is_ascii(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_orth_flag(self.lex[i], LexOrth_ascii)
cpdef bint is_digit(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_orth_flag(self.lex[i], LexOrth_digit)
cpdef bint is_lower(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_orth_flag(self.lex[i], LexOrth_lower)
cpdef bint is_punct(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_orth_flag(self.lex[i], LexOrth_punct)
cpdef bint is_space(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_orth_flag(self.lex[i], LexOrth_space)
cpdef bint is_title(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_orth_flag(self.lex[i], LexOrth_title)
cpdef bint is_upper(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_orth_flag(self.lex[i], LexOrth_upper)
cpdef bint can_adj(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], LexDist_adj)
cpdef bint can_adp(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], LexDist_adp)
cpdef bint can_adv(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], LexDist_adv)
cpdef bint can_conj(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], LexDist_conj)
cpdef bint can_det(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], LexDist_det)
cpdef bint can_noun(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], LexDist_noun)
cpdef bint can_num(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], LexDist_num)
cpdef bint can_pdt(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], LexDist_pdt)
cpdef bint can_pos(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], LexDist_pos)
cpdef bint can_pron(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], LexDist_pron)
cpdef bint can_prt(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], LexDist_prt)
cpdef bint can_punct(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], LexDist_punct)
cpdef bint can_verb(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], LexDist_verb)
cpdef bint oft_lower(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], LexDist_lower)
cpdef bint oft_title(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], LexDist_title)
cpdef bint oft_upper(self, size_t i) except *:
bounds_check(i, self.length, PADDING)
return lexeme_check_dist_flag(self.lex[i], LexDist_upper)
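
The Tokens container above keeps parallel arrays with PADDING slots of EMPTY_LEXEME on each side, so feature code can read a couple of positions past either end without bounds checks, and __getitem__ now builds a Token from the stored values. A toy pure-Python version of the padding idea only (no realloc, dicts instead of Lexeme structs):

PADDING = 5
EMPTY = {'id': 0, 'length': 0}

class ToyTokens(object):
    # Parallel storage with PADDING empty entries on each side, so callers
    # may peek a few positions past either end without bounds checks.
    def __init__(self, size):
        self._lex = [EMPTY] * (size + 2 * PADDING)
        self.length = 0
        self.offset = PADDING

    def push_back(self, lex):
        self._lex[self.offset + self.length] = lex
        self.length += 1

    def __getitem__(self, i):
        # Out-of-range positions yield the EMPTY sentinel, like &EMPTY_LEXEME above.
        return self._lex[self.offset + i]

toks = ToyTokens(4)
toks.push_back({'id': 1, 'length': 5})
assert toks[0]['id'] == 1
assert toks[-1] is EMPTY and toks[1] is EMPTY
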

View File

@@ -1,12 +0,0 @@
from .typedefs cimport hash_t, utf8_t, flag_t, id_t
from spacy.lexeme cimport LexemeC
DEF MAX_FLAG = 64
cdef class Lexeme:
cdef LexemeC* _c
cpdef bint check_orth_flag(self, size_t flag_id) except *
cpdef bint check_dist_flag(self, size_t flag_id) except *
cpdef unicode string_view(self, size_t view_id)

View File

@@ -1,80 +0,0 @@
# cython: profile=True
# cython: embedsignature=True
from .lexeme cimport lexeme_get_string
from .lexeme cimport lexeme_check_orth_flag, lexeme_check_dist_flag
from .lexeme cimport *
cdef class Lexeme:
"""A lexical type --- a word, punctuation symbol, whitespace sequence, etc
keyed by a case-sensitive unicode string. All tokens with the same string,
e.g. all instances of "dog", ",", "NASA" etc should be mapped to the same
Lexeme.
You should avoid instantiating Lexemes directly, and instead use the
:py:meth:`spacy.lang.Language.tokenize` and :py:meth:`spacy.lang.Language.lookup`
methods on the global object exposed by the language you're working with,
e.g. :py:data:`spacy.en.EN`.
Attributes:
string (unicode):
The unicode string.
Implemented as a property; relatively expensive.
length (size_t):
The number of unicode code-points in the string.
prob (double):
An estimate of the word's unigram log probability.
Probabilities are calculated from a large text corpus, and smoothed using
simple Good-Turing. Estimates are read from data/en/probabilities, and
can be replaced using spacy.en.load_probabilities.
cluster (size_t):
An integer representation of the word's Brown cluster.
A Brown cluster is an address into a binary tree, which gives some (noisy)
information about the word's distributional context.
>>> strings = (u'pineapple', u'apple', u'dapple', u'scalable')
>>> print ["{0:b}".format(lookup(s).cluster) for s in strings]
["100111110110", "100111100100", "01010111011001", "100111110110"]
The clusterings are unideal, but often slightly useful.
"pineapple" and "apple" share a long prefix, indicating a similar meaning,
while "dapple" is totally different. On the other hand, "scalable" receives
the same cluster ID as "pineapple", which is not what we'd like.
"""
def __cinit__(self, size_t lexeme_addr):
self._c = <LexemeC*>lexeme_addr
property string:
def __get__(self):
cdef bytes utf8_string = self._c.strings[<int>LexStr_orig]
cdef unicode string = utf8_string.decode('utf8')
return string
property prob:
def __get__(self):
return self._c.floats[<int>LexFloat_prob]
property cluster:
def __get__(self):
return self._c.ints[<int>LexInt_cluster]
property length:
def __get__(self):
return self._c.ints[<int>LexInt_length]
cpdef bint check_orth_flag(self, size_t flag_id) except *:
return lexeme_check_orth_flag(self._c, flag_id)
cpdef bint check_dist_flag(self, size_t flag_id) except *:
return lexeme_check_dist_flag(self._c, flag_id)
cpdef unicode string_view(self, size_t view_id):
return lexeme_get_string(self._c, view_id)
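
A small check of the Brown-cluster prefix behaviour the docstring above describes, using the example cluster strings quoted there: words whose cluster bit-strings share a long prefix sat close together in the clustering's binary tree.

clusters = {
    u'pineapple': '100111110110',
    u'apple':     '100111100100',
    u'dapple':    '01010111011001',
}

def shared_prefix(a, b):
    n = 0
    while n < min(len(a), len(b)) and a[n] == b[n]:
        n += 1
    return n

assert shared_prefix(clusters[u'pineapple'], clusters[u'apple']) >= 6
assert shared_prefix(clusters[u'pineapple'], clusters[u'dapple']) <= 1
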

View File

@@ -5,8 +5,8 @@ from spacy.en import EN
def test_possess():
tokens = EN.tokenize("Mike's")
assert tokens[0].string == "Mike"
assert tokens[1].string == "'s"
assert EN.lexicon.strings[tokens[0].id] == "Mike"
assert EN.lexicon.strings[tokens[1].id] == "'s"
assert len(tokens) == 2

View File

@@ -8,19 +8,17 @@ from spacy.lexeme import *
def test_is_alpha():
the = EN.lexicon.lookup('the')
assert the.check_orth_flag(LexOrth_alpha)
assert the['flags'] & (1 << IS_ALPHA)
year = EN.lexicon.lookup('1999')
assert not year.check_orth_flag(LexOrth_alpha)
assert not year['flags'] & (1 << IS_ALPHA)
mixed = EN.lexicon.lookup('hello1')
assert not mixed.check_orth_flag(LexOrth_alpha)
assert not mixed['flags'] & (1 << IS_ALPHA)
def test_is_digit():
the = EN.lexicon.lookup('the')
assert not the.check_orth_flag(LexOrth_digit)
assert not the['flags'] & (1 << IS_DIGIT)
year = EN.lexicon.lookup('1999')
assert year.check_orth_flag(LexOrth_digit)
assert year['flags'] & (1 << IS_DIGIT)
mixed = EN.lexicon.lookup('hello1')
assert not mixed.check_orth_flag(LexOrth_digit)
assert not mixed['flags'] & (1 << IS_DIGIT)

View File

@@ -1,27 +0,0 @@
from __future__ import unicode_literals
import pytest
import spacy.word
from spacy.en import EN
from spacy.lexeme import *
@pytest.fixture
def C3P0():
return EN.lexicon.lookup("C3P0")
def test_shape(C3P0):
assert C3P0.string_view(LexStr_shape) == "XdXd"
def test_length():
t = EN.lexicon.lookup('the')
assert t.length == 3
t = EN.lexicon.lookup("n't")
assert t.length == 3
t = EN.lexicon.lookup("'s")
assert t.length == 2
t = EN.lexicon.lookup('Xxxx')
assert t.length == 4

View File

@@ -8,9 +8,9 @@ from spacy.en import EN
def test_one():
tokens = EN.tokenize('Betty Botter bought a pound of butter.')
assert tokens.string(0) == 'Betty'
assert tokens[0].string == 'Betty'
tokens2 = EN.tokenize('Betty also bought a pound of butter.')
assert tokens2.string(0) == 'Betty'
assert tokens2[0].string == 'Betty'

View File

@@ -5,41 +5,39 @@ from spacy.en import EN
def test_single_word():
lex_ids = EN.tokenize(u'hello')
assert lex_ids[0].string == EN.lexicon.lookup(u'hello').string
tokens = EN.tokenize(u'hello')
assert tokens[0].string == 'hello'
def test_two_words():
words = EN.tokenize('hello possums')
assert len(words) == 2
assert words[0].string == EN.lexicon.lookup('hello').string
assert words[0].string != words[1].string
tokens = EN.tokenize('hello possums')
assert len(tokens) == 2
assert tokens[0].string != tokens[1].string
def test_punct():
tokens = EN.tokenize('hello, possums.')
assert len(tokens) == 4
assert tokens[0].string == EN.lexicon.lookup('hello').string
assert tokens[1].string == EN.lexicon.lookup(',').string
assert tokens[2].string == EN.lexicon.lookup('possums').string
assert tokens[1].string != EN.lexicon.lookup('hello').string
assert tokens[0].string == 'hello'
assert tokens[1].string == ','
assert tokens[2].string == 'possums'
assert tokens[1].string != 'hello'
def test_digits():
lex_ids = EN.tokenize('The year: 1984.')
assert lex_ids.orig(3) == "1984"
assert len(lex_ids) == 5
assert lex_ids[0].string == EN.lexicon.lookup('The').string
assert lex_ids[3].string == EN.lexicon.lookup('1984').string
tokens = EN.tokenize('The year: 1984.')
assert len(tokens) == 5
assert tokens[0].id == EN.lexicon.lookup('The')['id']
assert tokens[3].id == EN.lexicon.lookup('1984')['id']
def test_contraction():
lex_ids = EN.tokenize("don't giggle")
assert len(lex_ids) == 3
assert lex_ids[1].string == EN.lexicon.lookup("not").string
lex_ids = EN.tokenize("i said don't!")
assert len(lex_ids) == 5
assert lex_ids[4].string == EN.lexicon.lookup('!').string
tokens = EN.tokenize("don't giggle")
assert len(tokens) == 3
assert tokens[1].id == EN.lexicon.lookup("not")['id']
tokens = EN.tokenize("i said don't!")
assert len(tokens) == 5
assert tokens[4].id == EN.lexicon.lookup('!')['id']
def test_contraction_punct():

View File

@@ -5,30 +5,19 @@ from spacy.en import EN
def test_neq():
addr = EN.lexicon.lookup('Hello')
assert EN.lexicon.lookup('bye').string != addr.string
assert EN.lexicon.lookup('bye')['id'] != addr['id']
def test_eq():
addr = EN.lexicon.lookup('Hello')
assert EN.lexicon.lookup('Hello').string == addr.string
def test_round_trip():
hello = EN.lexicon.lookup('Hello')
assert hello.string == 'Hello'
assert EN.lexicon.lookup('Hello')['id'] == addr['id']
def test_case_neq():
addr = EN.lexicon.lookup('Hello')
assert EN.lexicon.lookup('hello').string != addr.string
assert EN.lexicon.lookup('hello')['id'] != addr['id']
def test_punct_neq():
addr = EN.lexicon.lookup('Hello')
assert EN.lexicon.lookup('Hello,').string != addr.string
def test_short():
addr = EN.lexicon.lookup('I')
assert addr.string == 'I'
assert addr.string != 'not'
assert EN.lexicon.lookup('Hello,')['id'] != addr['id']