* Large refactor, particularly to Python API

Matthew Honnibal 2014-10-24 00:59:17 +11:00
parent 168b2b8cb2
commit 08ce602243
21 changed files with 327 additions and 958 deletions

View File

@@ -1,42 +0,0 @@
from spacy.spacy cimport Language
from spacy.word cimport Lexeme
cimport cython
cpdef size_t ALPHA
cpdef size_t DIGIT
cpdef size_t PUNCT
cpdef size_t SPACE
cpdef size_t LOWER
cpdef size_t UPPER
cpdef size_t TITLE
cpdef size_t ASCII
cpdef size_t OFT_LOWER
cpdef size_t OFT_TITLE
cpdef size_t OFT_UPPER
cpdef size_t PUNCT
cpdef size_t CONJ
cpdef size_t NUM
cpdef size_t N
cpdef size_t DET
cpdef size_t ADP
cpdef size_t ADJ
cpdef size_t ADV
cpdef size_t VERB
cpdef size_t NOUN
cpdef size_t PDT
cpdef size_t POS
cpdef size_t PRON
cpdef size_t PRT
cdef class English(spacy.Language):
cdef int find_split(self, unicode word)
cdef English EN
cpdef Word lookup(unicode word)
cpdef list tokenize(unicode string)

View File

@@ -1,126 +0,0 @@
# cython: profile=True
# cython: embedsignature=True
'''Tokenize German text, using a scheme based on the Negra corpus.
Tokenization is generally similar to English text, and the same set of orthographic
flags are used.
An abbreviation list is used to handle common abbreviations. Hyphenated words
are not split, following the Treebank usage.
'''
from __future__ import unicode_literals
from libc.stdint cimport uint64_t
cimport spacy
from spacy.orth import is_alpha, is_digit, is_punct, is_space, is_lower, is_ascii
from spacy.orth import canonicalize_case, get_string_shape, asciify, get_non_sparse
from spacy.common cimport check_punct
# Python-readable flag constants --- can't read an enum from Python
# Don't want to manually assign these numbers, or we'll insert one and have to
# change them all.
# Don't use "i", as we don't want it in the global scope!
cdef size_t __i = 0
ALPHA = __i; __i += 1
DIGIT = __i; __i += 1
PUNCT = __i; __i += 1
SPACE = __i; __i += 1
LOWER = __i; __i += 1
UPPER = __i; __i += 1
TITLE = __i; __i += 1
ASCII = __i; __i += 1
OFT_LOWER = __i; __i += 1
OFT_UPPER = __i; __i += 1
OFT_TITLE = __i; __i += 1
PUNCT = __i; __i += 1
CONJ = __i; __i += 1
NUM = __i; __i += 1
X = __i; __i += 1
DET = __i; __i += 1
ADP = __i; __i += 1
ADJ = __i; __i += 1
ADV = __i; __i += 1
VERB = __i; __i += 1
NOUN = __i; __i += 1
PDT = __i; __i += 1
POS = __i; __i += 1
PRON = __i; __i += 1
PRT = __i; __i += 1
# These are for the string views
__i = 0
SIC = __i; __i += 1
CANON_CASED = __i; __i += 1
NON_SPARSE = __i; __i += 1
SHAPE = __i; __i += 1
NR_STRING_VIEWS = __i
def get_string_views(unicode string, lexeme):
views = ['' for _ in range(NR_STRING_VIEWS)]
views[SIC] = string
views[CANON_CASED] = canonicalize_case(string, lexeme)
views[SHAPE] = get_string_shape(string)
views[ASCIIFIED] = get_asciified(string)
views[FIXED_VOCAB] = get_non_sparse(string, views[ASCIIFIED], views[CANON_CASED],
views[SHAPE], lexeme)
return views
def set_orth_flags(unicode string, flags_t flags):
setters = [
(ALPHA, is_alpha),
(DIGIT, is_digit),
(PUNCT, is_punct),
(SPACE, is_space),
(LOWER, is_lower),
(UPPER, is_upper),
(SPACE, is_space)
]
for bit, setter in setters:
if setter(string):
flags |= 1 << bit
return flags
cdef class German(spacy.Language):
cdef Lexeme new_lexeme(self, unicode string, cluster=0, case_stats=None,
tag_freqs=None):
return Lexeme(s, length, views, prob=prob, cluster=cluster,
flags=self.get_flags(string)
cdef int find_split(self, unicode word):
cdef size_t length = len(word)
cdef int i = 0
if word.startswith("'s") or word.startswith("'S"):
return 2
# Contractions
if word.endswith("'s") and length >= 3:
return length - 2
# Leading punctuation
if check_punct(word, 0, length):
return 1
elif length >= 1:
# Split off all trailing punctuation characters
i = 0
while i < length and not check_punct(word, i, length):
i += 1
return i
DE = German('de')
lookup = DE.lookup
tokenize = DE.tokenize
load_clusters = DE.load_clusters
load_unigram_probs = DE.load_unigram_probs
load_case_stats = DE.load_case_stats
load_tag_stats = DE.load_tag_stats
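The splitting rule in German.find_split above is easier to read in plain Python. A minimal sketch follows, assuming check_punct simply tests whether a single character is Unicode punctuation (the real spacy.common.check_punct may behave differently):

    import unicodedata

    def check_punct(word, i):
        # Stand-in for spacy.common.check_punct: is character i punctuation?
        return unicodedata.category(word[i]).startswith('P')

    def find_split(word):
        # Plain-Python paraphrase of German.find_split above.
        length = len(word)
        if word.startswith("'s") or word.startswith("'S"):
            return 2
        # Contractions: split off a trailing "'s"
        if word.endswith("'s") and length >= 3:
            return length - 2
        # Leading punctuation comes off one character at a time
        if length >= 1 and check_punct(word, 0):
            return 1
        # Otherwise scan up to the first trailing punctuation character
        i = 0
        while i < length and not check_punct(word, i):
            i += 1
        return i

    print(find_split(u"Haus."))   # 4: "Haus" + "."
    print(find_split(u"(Haus"))   # 1: "(" + "Haus"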

View File

@@ -1,5 +1,4 @@
 from spacy.lang cimport Language
-from spacy.word cimport Lexeme
 from spacy.tokens cimport Tokens

View File

@@ -1,14 +1,12 @@
+from libc.stdint cimport uint32_t
+from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector
-from libc.stdint cimport uint64_t, int64_t
 from preshed.maps cimport PreshMap
 from cymem.cymem cimport Pool
-from .word cimport Lexeme
+from .typedefs cimport hash_t
 from .tokens cimport Tokens
-from .lexeme cimport LexemeC
+from .lexeme cimport Lexeme
+from .utf8string cimport StringStore
 cdef extern from "Python.h":
@@ -21,23 +19,25 @@ cdef extern from "Python.h":
 cdef struct String:
 Py_UNICODE* chars
 size_t n
-uint64_t key
+hash_t key
 cdef class Lexicon:
 cdef Pool mem
 cpdef readonly size_t size
+cpdef readonly StringStore strings
-cdef vector[LexemeC*] lexemes
+cdef vector[Lexeme*] lexemes
 cpdef Lexeme lookup(self, unicode string)
-cdef LexemeC* get(self, String* s) except NULL
+cdef Lexeme* get(self, String* s) except NULL
 cdef PreshMap _dict
 cdef list _string_features
 cdef list _flag_features
 cdef class Language:
 cdef Pool _mem
 cdef unicode name
@@ -52,12 +52,12 @@ cdef class Language:
 cpdef Tokens tokenize(self, unicode text)
 cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1
-cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
-vector[LexemeC*] *suffixes) except NULL
+cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
+vector[Lexeme*] *suffixes) except NULL
 cdef int _attach_tokens(self, Tokens tokens, int idx, String* string,
-vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
+vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except -1
 cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
 cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
 cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1
-cdef int _save_cached(self, LexemeC** tokens, uint64_t key, int n) except -1
+cdef int _save_cached(self, Lexeme** tokens, hash_t key, int n) except -1

View File

@@ -13,22 +13,21 @@ import random
 from os import path
 import re
-from .util import read_lang_data
-from .tokens import Tokens
-from .lexeme cimport LexemeC, get_lexeme_dict, lexeme_pack, lexeme_unpack
-from .lexeme cimport LexStr_orig
-from murmurhash.mrmr cimport hash64
-from cpython.ref cimport Py_INCREF
 from cymem.cymem cimport Pool
 from cython.operator cimport preincrement as preinc
 from cython.operator cimport dereference as deref
+from murmurhash.mrmr cimport hash64
 from preshed.maps cimport PreshMap
-from spacy import orth
-from spacy import util
+from .lexeme cimport Lexeme
+from .lexeme cimport from_dict as lexeme_from_dict
+from .lexeme cimport from_string as lexeme_from_string
+from . import orth
+from . import util
+from .util import read_lang_data
+from .tokens import Tokens
 cdef class Language:
@@ -64,7 +63,7 @@ cdef class Language:
 tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes.
 """
 cdef int length = len(string)
-cdef Tokens tokens = Tokens(length)
+cdef Tokens tokens = Tokens(self.lexicon.strings, length)
 if length == 0:
 return tokens
 cdef int i = 0
@@ -76,7 +75,7 @@
 if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
 if start < i:
 string_slice(&span, chars, start, i)
-lexemes = <LexemeC**>self.cache.get(span.key)
+lexemes = <Lexeme**>self.cache.get(span.key)
 if lexemes != NULL:
 tokens.extend(start, lexemes, 0)
 else:
@@ -88,7 +87,7 @@
 i += 1
 if start < i:
 string_slice(&span, chars, start, i)
-lexemes = <LexemeC**>self.cache.get(span.key)
+lexemes = <Lexeme**>self.cache.get(span.key)
 if lexemes != NULL:
 tokens.extend(start, lexemes, 0)
 else:
@@ -96,9 +95,9 @@
 return tokens
 cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1:
-cdef vector[LexemeC*] prefixes
-cdef vector[LexemeC*] suffixes
-cdef uint64_t orig_key
+cdef vector[Lexeme*] prefixes
+cdef vector[Lexeme*] suffixes
+cdef hash_t orig_key
 cdef int orig_size
 orig_key = span.key
 orig_size = tokens.length
@@ -106,8 +105,8 @@
 self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
 self._save_cached(&tokens.lex[orig_size], orig_key, tokens.length - orig_size)
-cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
-vector[LexemeC*] *suffixes) except NULL:
+cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
+vector[Lexeme*] *suffixes) except NULL:
 cdef size_t i
 cdef String prefix
 cdef String suffix
@@ -150,15 +149,15 @@
 cdef int _attach_tokens(self, Tokens tokens,
 int idx, String* string,
-vector[LexemeC*] *prefixes,
-vector[LexemeC*] *suffixes) except -1:
+vector[Lexeme*] *prefixes,
+vector[Lexeme*] *suffixes) except -1:
 cdef int split
-cdef LexemeC** lexemes
-cdef LexemeC* lexeme
+cdef Lexeme** lexemes
+cdef Lexeme* lexeme
 cdef String span
 idx = tokens.extend(idx, prefixes.data(), prefixes.size())
 if string.n != 0:
-lexemes = <LexemeC**>self.cache.get(string.key)
+lexemes = <Lexeme**>self.cache.get(string.key)
 if lexemes != NULL:
 idx = tokens.extend(idx, lexemes, 0)
 else:
@@ -172,13 +171,13 @@
 idx = tokens.push_back(idx, self.lexicon.get(&span))
 string_slice(&span, string.chars, split + 1, string.n)
 idx = tokens.push_back(idx, self.lexicon.get(&span))
-cdef vector[LexemeC*].reverse_iterator it = suffixes.rbegin()
+cdef vector[Lexeme*].reverse_iterator it = suffixes.rbegin()
 while it != suffixes.rend():
 idx = tokens.push_back(idx, deref(it))
 preinc(it)
-cdef int _save_cached(self, LexemeC** tokens, uint64_t key, int n) except -1:
-lexemes = <LexemeC**>self._mem.alloc(n + 1, sizeof(LexemeC**))
+cdef int _save_cached(self, Lexeme** tokens, hash_t key, int n) except -1:
+lexemes = <Lexeme**>self._mem.alloc(n + 1, sizeof(Lexeme**))
 cdef int i
 for i in range(n):
 lexemes[i] = tokens[i]
@@ -212,14 +211,14 @@
 token_rules (list): A list of (chunk, tokens) pairs, where chunk is
 a string and tokens is a list of strings.
 '''
-cdef LexemeC** lexemes
-cdef uint64_t hashed
+cdef Lexeme** lexemes
+cdef hash_t hashed
 cdef String string
 for uni_string, substrings in token_rules:
-lexemes = <LexemeC**>self._mem.alloc(len(substrings) + 1, sizeof(LexemeC*))
+lexemes = <Lexeme**>self._mem.alloc(len(substrings) + 1, sizeof(Lexeme*))
 for i, substring in enumerate(substrings):
 string_from_unicode(&string, substring)
-lexemes[i] = <LexemeC*>self.lexicon.get(&string)
+lexemes[i] = <Lexeme*>self.lexicon.get(&string)
 lexemes[i + 1] = NULL
 string_from_unicode(&string, uni_string)
 self.specials.set(string.key, lexemes)
@@ -227,33 +226,29 @@
 cdef class Lexicon:
-def __cinit__(self, lexemes):
+def __init__(self, lexemes):
 self.mem = Pool()
 self._dict = PreshMap(2 ** 20)
+self.strings = StringStore()
 self.size = 0
 cdef String string
-cdef dict lexeme_dict
-cdef LexemeC* lexeme
-for py_string, lexeme_dict in lexemes.iteritems():
-string_from_unicode(&string, py_string)
-lexeme = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
-lexeme_unpack(lexeme, lexeme_dict)
-self._dict.set(string.key, lexeme)
-self.lexemes.push_back(lexeme)
-self.size += 1
-def __getitem__(self, size_t i):
-return Lexeme(<size_t>self.lexemes.at(i))
-cdef LexemeC* get(self, String* string) except NULL:
-cdef LexemeC* lex
-lex = <LexemeC*>self._dict.get(string.key)
+cdef Lexeme* lexeme
+#for py_string, lexeme_dict in lexemes.iteritems():
+# string_from_unicode(&string, py_string)
+# lexeme = <Lexeme*>self.mem.alloc(1, sizeof(Lexeme))
+# lexeme_from_dict(lexeme, lexeme_dict, self.strings)
+# self._dict.set(string.key, lexeme)
+# self.lexemes.push_back(lexeme)
+# self.size += 1
+cdef Lexeme* get(self, String* string) except NULL:
+cdef Lexeme* lex
+lex = <Lexeme*>self._dict.get(string.key)
 if lex != NULL:
 return lex
-lex = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
-cdef unicode unicode_string = string.chars[:string.n]
-lexeme_unpack(lex, get_lexeme_dict(self.size, unicode_string))
+lex = <Lexeme*>self.mem.alloc(1, sizeof(Lexeme))
+lexeme_from_string(lex, string.chars[:string.n], self.strings)
 self._dict.set(string.key, lex)
 self.lexemes.push_back(lex)
 self.size += 1
@@ -270,8 +265,8 @@ cdef class Lexicon:
 """
 cdef String string
 string_from_unicode(&string, uni_string)
-cdef LexemeC* lexeme = self.get(&string)
-return Lexeme(<size_t>lexeme)
+cdef Lexeme* lexeme = self.get(&string)
+return lexeme[0]
 cdef void string_from_unicode(String* s, unicode uni):
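The special-case rules described in the _load_special_tokenization docstring ("(chunk, tokens) pairs") feed the same cache that the whitespace-first loop in tokenize consults. A rough pure-Python picture, with hashing, affix splitting and the C structs omitted; the "don't" rule is only an illustration, though the contraction tests later in this commit suggest the same split:

    def load_specials(token_rules):
        # Each rule maps a whole whitespace-delimited chunk to its pre-split tokens.
        return {chunk: list(substrings) for chunk, substrings in token_rules}

    def tokenize(string, specials):
        # Whitespace-first split; consult the special-case table before any
        # prefix/suffix splitting (elided here).
        tokens = []
        for chunk in string.split():
            tokens.extend(specials.get(chunk, [chunk]))
        return tokens

    specials = load_specials([(u"don't", [u"do", u"not"])])
    assert tokenize(u"i said don't", specials) == [u'i', u'said', u'do', u'not']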

View File

@@ -1,94 +1,55 @@
 from .typedefs cimport hash_t, utf8_t, flag_t, id_t
+from cymem.cymem cimport Pool
+from thinc.typedefs cimport atom_t
+from .utf8string cimport StringStore
 cpdef flag_t OOV_DIST_FLAGS
-cpdef enum LexInts:
-LexInt_id
-LexInt_length
-LexInt_cluster
-LexInt_pos
-LexInt_supersense
-LexInt_N
-cpdef enum LexFloats:
-LexFloat_prob
-LexFloat_sentiment
-LexFloat_N
-cpdef enum LexStrs:
-LexStr_orig
-LexStr_norm
-LexStr_shape
-LexStr_unsparse
-LexStr_asciied
-LexStr_pre
-LexStr_suff
-LexStr_N
-cpdef enum LexOrthFlags:
-LexOrth_alpha
-LexOrth_ascii
-LexOrth_digit
-LexOrth_lower
-LexOrth_punct
-LexOrth_space
-LexOrth_title
-LexOrth_upper
-LexOrth_N
-cpdef enum LexDistFlags:
-LexDist_adj
-LexDist_adp
-LexDist_adv
-LexDist_conj
-LexDist_det
-LexDist_noun
-LexDist_num
-LexDist_pdt
-LexDist_pos
-LexDist_pron
-LexDist_prt
-LexDist_punct
-LexDist_verb
-LexDist_lower
-LexDist_title
-LexDist_upper
-LexDist_N
-cdef struct LexemeC:
-int[<int>LexInt_N] ints
-float[<int>LexFloat_N] floats
-utf8_t[<int>LexStr_N] strings
-flag_t orth_flags
-flag_t dist_flags
-cdef LexemeC EMPTY_LEXEME
-cpdef dict get_lexeme_dict(size_t i, unicode string)
-cdef char* intern_and_encode(unicode string, size_t* length) except NULL
-cdef int lexeme_get_int(LexemeC* lexeme, size_t i) except *
-cdef float lexeme_get_float(LexemeC* lexeme, size_t i) except *
-cdef unicode lexeme_get_string(LexemeC* lexeme, size_t i)
-cdef bint lexeme_check_orth_flag(LexemeC* lexeme, size_t flag_id) except *
-cdef bint lexeme_check_dist_flag(LexemeC* lexeme, size_t flag_id) except *
-cdef dict lexeme_pack(LexemeC* lexeme)
-cdef int lexeme_unpack(LexemeC* lexeme, dict p) except -1
+# Flags
+cpdef enum:
+IS_ALPHA
+IS_ASCII
+IS_DIGIT
+IS_LOWER
+IS_PUNCT
+IS_SPACE
+IS_TITLE
+IS_UPPER
+OFT_LOWER
+OFT_TITLE
+OFT_UPPER
+cdef struct Lexeme:
+atom_t id
+atom_t length
+atom_t norm
+atom_t shape
+atom_t vocab10k
+atom_t asciied
+atom_t prefix
+atom_t suffix
+atom_t cluster
+atom_t pos
+atom_t supersense
+float prob
+flag_t flags
+cdef Lexeme EMPTY_LEXEME
+cdef int from_string(Lexeme* lex, unicode string, StringStore store) except -1
+cdef int from_dict(Lexeme* lex, dict props, StringStore store) except -1
+cdef inline bint check_flag(Lexeme* lexeme, size_t flag_id) nogil:
+return lexeme.flags & (1 << flag_id)
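The new check_flag inline above is just a bit test against the cpdef enum; the updated tests later in this commit use the same pattern directly (lex['flags'] & (1 << IS_ALPHA)). A small self-contained illustration, with the flag positions spelled out the way the enum order implies:

    IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER = 0, 1, 2, 3   # illustrative positions

    def set_flag(flags, flag_id):
        return flags | (1 << flag_id)

    def check_flag(flags, flag_id):
        return bool(flags & (1 << flag_id))

    flags = set_flag(set_flag(0, IS_ALPHA), IS_LOWER)
    assert check_flag(flags, IS_ALPHA)
    assert not check_flag(flags, IS_DIGIT)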

View File

@@ -5,106 +5,40 @@ from libc.string cimport memset
 import orth
+from .utf8string cimport Utf8Str
 OOV_DIST_FLAGS = 0
-memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
+memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))
-cpdef dict get_lexeme_dict(size_t i, unicode string):
-ints = [None for _ in range(LexInt_N)]
-ints[<int>LexInt_id] = i
-ints[<int>LexInt_length] = len(string)
-ints[<int>LexInt_cluster] = 0
-ints[<int>LexInt_pos] = 0
-ints[<int>LexInt_supersense] = 0
-floats = [None for _ in range(LexFloat_N)]
-floats[<int>LexFloat_prob] = 0
-floats[<int>LexFloat_sentiment] = 0
-strings = [None for _ in range(LexStr_N)]
-strings[<int>LexStr_orig] = string
-strings[<int>LexStr_norm] = strings[<int>LexStr_orig]
-strings[<int>LexStr_shape] = orth.word_shape(string)
-strings[<int>LexStr_unsparse] = strings[<int>LexStr_shape]
-strings[<int>LexStr_asciied] = orth.asciied(string)
-strings[<int>LexStr_pre] = string[0]
-strings[<int>LexStr_suff] = string[-3:]
-orth_flags = get_orth_flags(string)
-dist_flags = OOV_DIST_FLAGS
-return {'ints': ints, 'floats': floats, 'strings': strings,
-'orth_flags': orth_flags, 'dist_flags': dist_flags}
-def get_orth_flags(unicode string):
+def get_flags(unicode string):
 cdef flag_t flags = 0
-flags |= orth.is_ascii(string) << LexOrth_ascii
-flags |= orth.is_alpha(string) << LexOrth_alpha
-flags |= orth.is_digit(string) << LexOrth_digit
-flags |= orth.is_lower(string) << LexOrth_lower
-flags |= orth.is_punct(string) << LexOrth_punct
-flags |= orth.is_space(string) << LexOrth_space
-flags |= orth.is_title(string) << LexOrth_title
-flags |= orth.is_upper(string) << LexOrth_upper
+flags |= orth.is_alpha(string) << IS_ALPHA
+flags |= orth.is_ascii(string) << IS_ASCII
+flags |= orth.is_digit(string) << IS_DIGIT
+flags |= orth.is_lower(string) << IS_LOWER
+flags |= orth.is_punct(string) << IS_PUNCT
+flags |= orth.is_space(string) << IS_SPACE
+flags |= orth.is_title(string) << IS_TITLE
+flags |= orth.is_upper(string) << IS_UPPER
 return flags
-def get_dist_flags(unicode string):
-return 0
-cdef char* intern_and_encode(unicode string, size_t* length) except NULL:
+cdef int from_string(Lexeme* lex, unicode string, StringStore store) except -1:
 cdef bytes byte_string = string.encode('utf8')
-cdef bytes utf8_string = intern(byte_string)
-Py_INCREF(utf8_string)
-length[0] = len(utf8_string)
-return <char*>utf8_string
+cdef Utf8Str* orig_str = store.intern(<char*>byte_string, len(byte_string))
+lex.id = orig_str.i
+lex.cluster = 0
+lex.length = len(string)
+lex.flags = get_flags(string)
+# TODO: Hook this up
+#lex.norm = norm_str.i
+#lex.shape = norm_str.i
+#lex.asciied = asciied_str.i
+#lex.prefix = prefix_str.i
+#lex.suffix = suffix_str.i
-cdef int lexeme_get_int(LexemeC* lexeme, size_t i) except *:
-return lexeme.ints[i]
-cdef float lexeme_get_float(LexemeC* lexeme, size_t i) except *:
-return lexeme.floats[i]
-cdef unicode lexeme_get_string(LexemeC* lexeme, size_t i):
-cdef bytes byte_string = lexeme.strings[i]
-return byte_string.decode('utf8')
-cdef bint lexeme_check_orth_flag(LexemeC* lexeme, size_t flag_id) except *:
-return lexeme.orth_flags & (1 << flag_id)
-cdef bint lexeme_check_dist_flag(LexemeC* lexeme, size_t flag_id) except *:
-return lexeme.dist_flags & (1 << flag_id)
-cdef dict lexeme_pack(LexemeC* lex):
-cdef dict packed = {}
-packed['ints'] = [lex.ints[i] for i in range(LexInt_N)]
-packed['floats'] = [lex.floats[i] for i in range(LexFloat_N)]
-packed['strings'] = [lex.strings[i].decode('utf8') for i in range(LexStr_N)]
-packed['orth_flags'] = lex.orth_flags
-packed['dist_flags'] = lex.orth_flags
-return packed
-cdef int lexeme_unpack(LexemeC* lex, dict p) except -1:
-cdef size_t i
-cdef int lex_int
-cdef float lex_float
-cdef unicode string
-for i, lex_int in enumerate(p['ints']):
-lex.ints[i] = lex_int
-for i, lex_float in enumerate(p['floats']):
-lex.floats[i] = lex_float
-cdef size_t _
-for i in range(LexStr_N):
-lex_string = p['strings'][i]
-lex.strings[i] = intern_and_encode(lex_string, &_)
-lex.orth_flags = p['orth_flags']
-lex.dist_flags = p['dist_flags']
+cdef int from_dict(Lexeme* lex, dict props, StringStore stroe) except -1:
+pass
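from_string above interns the UTF-8 bytes and records the resulting row id on the lexeme, so strings are stored and compared as integers from here on. A toy stand-in for that id-assigning store (not the real utf8string.StringStore API):

    class ToyStringStore(object):
        # Maps strings to dense integer ids and back, in the spirit of the StringStore.
        def __init__(self):
            self._ids = {}
            self._strings = []

        def intern(self, string):
            if string not in self._ids:
                self._ids[string] = len(self._strings)
                self._strings.append(string)
            return self._ids[string]

        def __getitem__(self, i):
            return self._strings[i]

    store = ToyStringStore()
    i = store.intern(u'hello')
    assert store[i] == u'hello'
    assert store.intern(u'hello') == i   # same string, same id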

View File

@@ -113,8 +113,8 @@ cpdef enum:
 CONTEXT_SIZE
-cdef int get_atoms(atom_t* atoms, LexemeC* p2, LexemeC* p1, LexemeC* n0, LexemeC* n1,
-LexemeC* n2, class_t prev_tag, class_t prev_prev_tag) except -1:
+cdef int get_atoms(atom_t* atoms, Lexeme* p2, Lexeme* p1, Lexeme* n0, Lexeme* n1,
+Lexeme* n2, class_t prev_tag, class_t prev_prev_tag) except -1:
 _fill_token(&atoms[P2i], p2)
 _fill_token(&atoms[P1i], p1)
 _fill_token(&atoms[N0i], n0)
@@ -124,16 +124,16 @@ cdef int get_atoms(atom_t* atoms, LexemeC* p2, LexemeC* p1, LexemeC* n0, LexemeC
 atoms[P2t] = prev_prev_tag
-cdef inline void _fill_token(atom_t* atoms, LexemeC* lex) nogil:
-atoms[0] = lex.ints[<int>LexInt_id]
-atoms[1] = lex.ints[<int>LexInt_cluster]
-atoms[2] = <atom_t>lex.strings[<int>LexStr_norm]
-atoms[3] = <atom_t>lex.strings[<int>LexStr_shape]
-atoms[4] = <atom_t>lex.strings[<int>LexStr_pre]
-atoms[5] = <atom_t>lex.strings[<int>LexStr_suff]
-atoms[6] = lex.dist_flags & (1 << LexDist_title)
-atoms[7] = lex.dist_flags & (1 << LexDist_upper)
+cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil:
+atoms[0] = lex.id
+atoms[1] = lex.cluster
+atoms[2] = lex.norm
+atoms[3] = lex.shape
+atoms[4] = lex.prefix
+atoms[5] = lex.suffix
+atoms[6] = lex.flags & (1 << OFT_TITLE)
+atoms[7] = lex.flags & (1 << OFT_UPPER)
 TEMPLATES = (
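After the change, _fill_token reads struct fields directly instead of indexing the old ints/strings arrays. In spirit (plain Python, a dict in place of the C struct, illustrative bit positions for the OFT_* flags):

    OFT_TITLE, OFT_UPPER = 9, 10   # illustrative positions in the flags enum

    def fill_token(lex):
        # Same field order as _fill_token above.
        return (lex['id'], lex['cluster'], lex['norm'], lex['shape'],
                lex['prefix'], lex['suffix'],
                lex['flags'] & (1 << OFT_TITLE),
                lex['flags'] & (1 << OFT_UPPER))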

View File

@@ -20,6 +20,8 @@ def realign_tagged(token_rules, tagged_line, sep='/'):
 def read_tagged(detoken_rules, file_, sep='/'):
 sentences = []
 for line in file_:
+if not line.strip():
+continue
 line = realign_tagged(detoken_rules, line, sep=sep)
 tokens, tags = _parse_line(line, sep)
 assert len(tokens) == len(tags)
@@ -39,7 +41,7 @@ def _parse_line(line, sep):
 subtags.append('NULL')
 assert len(subtags) == len(subtokens), [t.string for t in subtokens]
 words.append(word)
-tags.extend([Tagger.encode_pos(pos) for pos in subtags])
+tags.extend([Tagger.encode_pos(ptb_to_univ(pos)) for pos in subtags])
 return EN.tokenize(' '.join(words)), tags
@@ -53,3 +55,86 @@ def get_tagdict(train_sents):
 tagdict.setdefault(word, {}).setdefault(tag, 0)
 tagdict[word][tag] += 1
 return tagdict
def ptb_to_univ(tag):
mapping = dict(tuple(line.split()) for line in """
NULL NULL
HYPH .
ADD X
NFP .
AFX X
XX X
BES VERB
HVS VERB
GW X
! .
# .
$ .
'' .
( .
) .
, .
-LRB- .
-RRB- .
. .
: .
? .
CC CONJ
CD NUM
CD|RB X
DT DET
EX DET
FW X
IN ADP
IN|RP ADP
JJ ADJ
JJR ADJ
JJRJR ADJ
JJS ADJ
JJ|RB ADJ
JJ|VBG ADJ
LS X
MD VERB
NN NOUN
NNP NOUN
NNPS NOUN
NNS NOUN
NN|NNS NOUN
NN|SYM NOUN
NN|VBG NOUN
NP NOUN
PDT DET
POS PRT
PRP PRON
PRP$ PRON
PRP|VBP PRON
PRT PRT
RB ADV
RBR ADV
RBS ADV
RB|RP ADV
RB|VBG ADV
RN X
RP PRT
SYM X
TO PRT
UH X
VB VERB
VBD VERB
VBD|VBN VERB
VBG VERB
VBG|NN VERB
VBN VERB
VBP VERB
VBP|TO VERB
VBZ VERB
VP VERB
WDT DET
WH X
WP PRON
WP$ PRON
WRB ADV
`` .""".strip().split('\n'))
return mapping[tag]
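A few spot checks of the new mapping, taken straight from the table above (assuming ptb_to_univ as defined there is in scope):

    assert ptb_to_univ('NNS') == 'NOUN'
    assert ptb_to_univ('JJR') == 'ADJ'
    assert ptb_to_univ('-LRB-') == '.'
    assert ptb_to_univ('XX') == 'X'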

View File

@@ -1,5 +0,0 @@
from spacy.lang cimport Language
cdef class PennTreebank3(Language):
cdef list _split(self, unicode split)

View File

@@ -1,161 +0,0 @@
'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
so that strings can be retrieved from hashes. Use 64-bit hash values and
boldly assume no collisions.
'''
from __future__ import unicode_literals
from libc.stdint cimport uint64_t
cimport spacy
import re
from spacy import orth
TAG_THRESH = 0.5
UPPER_THRESH = 0.2
LOWER_THRESH = 0.5
TITLE_THRESH = 0.7
NR_FLAGS = 0
OFT_UPPER = NR_FLAGS; NR_FLAGS += 1
OFT_LOWER = NR_FLAGS; NR_FLAGS += 1
OFT_TITLE = NR_FLAGS; NR_FLAGS += 1
IS_ALPHA = NR_FLAGS; NR_FLAGS += 1
IS_DIGIT = NR_FLAGS; NR_FLAGS += 1
IS_PUNCT = NR_FLAGS; NR_FLAGS += 1
IS_SPACE = NR_FLAGS; NR_FLAGS += 1
IS_ASCII = NR_FLAGS; NR_FLAGS += 1
IS_TITLE = NR_FLAGS; NR_FLAGS += 1
IS_LOWER = NR_FLAGS; NR_FLAGS += 1
IS_UPPER = NR_FLAGS; NR_FLAGS += 1
CAN_PUNCT = NR_FLAGS; NR_FLAGS += 1
CAN_CONJ = NR_FLAGS; NR_FLAGS += 1
CAN_NUM = NR_FLAGS; NR_FLAGS += 1
CAN_DET = NR_FLAGS; NR_FLAGS += 1
CAN_ADP = NR_FLAGS; NR_FLAGS += 1
CAN_ADJ = NR_FLAGS; NR_FLAGS += 1
CAN_ADV = NR_FLAGS; NR_FLAGS += 1
CAN_VERB = NR_FLAGS; NR_FLAGS += 1
CAN_NOUN = NR_FLAGS; NR_FLAGS += 1
CAN_PDT = NR_FLAGS; NR_FLAGS += 1
CAN_POS = NR_FLAGS; NR_FLAGS += 1
CAN_PRON = NR_FLAGS; NR_FLAGS += 1
CAN_PRT = NR_FLAGS; NR_FLAGS += 1
# List of contractions adapted from Robert MacIntyre's tokenizer.
CONTRACTIONS2 = [re.compile(r"(?i)\b(can)(not)\b"),
re.compile(r"(?i)\b(d)('ye)\b"),
re.compile(r"(?i)\b(gim)(me)\b"),
re.compile(r"(?i)\b(gon)(na)\b"),
re.compile(r"(?i)\b(got)(ta)\b"),
re.compile(r"(?i)\b(lem)(me)\b"),
re.compile(r"(?i)\b(mor)('n)\b"),
re.compile(r"(?i)\b(wan)(na) ")]
CONTRACTIONS3 = [re.compile(r"(?i) ('t)(is)\b"),
re.compile(r"(?i) ('t)(was)\b")]
CONTRACTIONS4 = [re.compile(r"(?i)\b(whad)(dd)(ya)\b"),
re.compile(r"(?i)\b(wha)(t)(cha)\b")]
def nltk_regex_tokenize(text):
# Implementation taken from NLTK 3.0, based on tokenizer.sed
#starting quotes
text = re.sub(r'^\"', r'``', text)
text = re.sub(r'(``)', r' \1 ', text)
text = re.sub(r'([ (\[{<])"', r'\1 `` ', text)
#punctuation
text = re.sub(r'([:,])([^\d])', r' \1 \2', text)
text = re.sub(r'\.\.\.', r' ... ', text)
text = re.sub(r'[;@#$%&]', r' \g<0> ', text)
text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)\s*$', r'\1 \2\3 ', text)
text = re.sub(r'[?!]', r' \g<0> ', text)
text = re.sub(r"([^'])' ", r"\1 ' ", text)
#parens, brackets, etc.
text = re.sub(r'[\]\[\(\)\{\}\<\>]', r' \g<0> ', text)
text = re.sub(r'--', r' -- ', text)
#add extra space to make things easier
text = " " + text + " "
#ending quotes
text = re.sub(r'"', " '' ", text)
text = re.sub(r'(\S)(\'\')', r'\1 \2 ', text)
text = re.sub(r"([^' ])('[sS]|'[mM]|'[dD]|') ", r"\1 \2 ", text)
text = re.sub(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 ",
text)
for regexp in CONTRACTIONS2:
text = regexp.sub(r' \1 \2 ', text)
for regexp in CONTRACTIONS3:
text = regexp.sub(r' \1 \2 ', text)
# We are not using CONTRACTIONS4 since
# they are also commented out in the SED scripts
# for regexp in self.CONTRACTIONS4:
# text = regexp.sub(r' \1 \2 \3 ', text)
return text.split()
cdef class PennTreebank3(Language):
"""Fully PTB compatible English tokenizer, tightly coupled to lexicon.
Attributes:
name (unicode): The two letter code used by Wikipedia for the language.
lexicon (Lexicon): The lexicon. Exposes the lookup method.
"""
def __cinit__(self, name):
flag_funcs = [0 for _ in range(NR_FLAGS)]
flag_funcs[OFT_UPPER] = orth.oft_case('upper', UPPER_THRESH)
flag_funcs[OFT_LOWER] = orth.oft_case('lower', LOWER_THRESH)
flag_funcs[OFT_TITLE] = orth.oft_case('title', TITLE_THRESH)
flag_funcs[IS_ALPHA] = orth.is_alpha
flag_funcs[IS_DIGIT] = orth.is_digit
flag_funcs[IS_PUNCT] = orth.is_punct
flag_funcs[IS_SPACE] = orth.is_space
flag_funcs[IS_TITLE] = orth.is_title
flag_funcs[IS_LOWER] = orth.is_lower
flag_funcs[IS_UPPER] = orth.is_upper
flag_funcs[CAN_PUNCT] = orth.can_tag('PUNCT', TAG_THRESH)
flag_funcs[CAN_CONJ] = orth.can_tag('CONJ', TAG_THRESH)
flag_funcs[CAN_NUM] = orth.can_tag('NUM', TAG_THRESH)
flag_funcs[CAN_DET] = orth.can_tag('DET', TAG_THRESH)
flag_funcs[CAN_ADP] = orth.can_tag('ADP', TAG_THRESH)
flag_funcs[CAN_ADJ] = orth.can_tag('ADJ', TAG_THRESH)
flag_funcs[CAN_VERB] = orth.can_tag('VERB', TAG_THRESH)
flag_funcs[CAN_NOUN] = orth.can_tag('NOUN', TAG_THRESH)
flag_funcs[CAN_PDT] = orth.can_tag('PDT', TAG_THRESH)
flag_funcs[CAN_POS] = orth.can_tag('POS', TAG_THRESH)
flag_funcs[CAN_PRT] = orth.can_tag('PRT', TAG_THRESH)
Language.__init__(self, name, flag_funcs)
cdef list _split(self, unicode chunk):
strings = nltk_regex_tokenize(chunk)
if strings[-1] == '.':
strings.pop()
strings[-1] += '.'
assert strings
return strings
PTB3 = PennTreebank3('ptb3')

View File

@@ -1,59 +1,49 @@
 from cymem.cymem cimport Pool
-from spacy.lexeme cimport LexemeC
+from .lexeme cimport Lexeme
+from .typedefs cimport flag_t
+from .utf8string cimport StringStore
 from thinc.typedefs cimport atom_t
 cdef class Tokens:
 cdef Pool mem
+cdef StringStore _string_store
-cdef LexemeC** _lex_ptr
+cdef Lexeme** _lex_ptr
 cdef int* _idx_ptr
 cdef int* _pos_ptr
-cdef LexemeC** lex
+cdef Lexeme** lex
 cdef int* idx
 cdef int* pos
 cdef int length
 cdef int max_length
-cdef int extend(self, int i, LexemeC** lexemes, int n) except -1
-cdef int push_back(self, int i, LexemeC* lexeme) except -1
+cdef int extend(self, int i, Lexeme** lexemes, int n) except -1
+cdef int push_back(self, int i, Lexeme* lexeme) except -1
-cpdef int id(self, size_t i) except -1
-cpdef float prob(self, size_t i) except 1
-cpdef int cluster(self, size_t i) except *
-cpdef bint check_orth_flag(self, size_t i, size_t flag_id) except *
-cpdef bint check_dist_flag(self, size_t i, size_t flag_id) except *
-cpdef unicode string_view(self, size_t i, size_t view_id)
-cpdef unicode string(self, size_t i)
-cpdef unicode orig(self, size_t i)
-cpdef unicode norm(self, size_t i)
-cpdef unicode shape(self, size_t i)
-cpdef unicode unsparse(self, size_t i)
-cpdef unicode asciied(self, size_t i)
-cpdef bint is_alpha(self, size_t i) except *
-cpdef bint is_ascii(self, size_t i) except *
-cpdef bint is_digit(self, size_t i) except *
-cpdef bint is_lower(self, size_t i) except *
-cpdef bint is_punct(self, size_t i) except *
-cpdef bint is_space(self, size_t i) except *
-cpdef bint is_title(self, size_t i) except *
-cpdef bint is_upper(self, size_t i) except *
-cpdef bint can_adj(self, size_t i) except *
-cpdef bint can_adp(self, size_t i) except *
-cpdef bint can_adv(self, size_t i) except *
-cpdef bint can_conj(self, size_t i) except *
-cpdef bint can_det(self, size_t i) except *
-cpdef bint can_noun(self, size_t i) except *
-cpdef bint can_num(self, size_t i) except *
-cpdef bint can_pdt(self, size_t i) except *
-cpdef bint can_pos(self, size_t i) except *
-cpdef bint can_pron(self, size_t i) except *
-cpdef bint can_prt(self, size_t i) except *
-cpdef bint can_punct(self, size_t i) except *
-cpdef bint can_verb(self, size_t i) except *
-cpdef bint oft_lower(self, size_t i) except *
-cpdef bint oft_title(self, size_t i) except *
-cpdef bint oft_upper(self, size_t i) except *
+cdef class Token:
+cdef StringStore _string_store
+cdef public int i
+cdef public int idx
+cdef public int pos
+cdef public atom_t id
+cdef public atom_t cluster
+cdef public atom_t length
+cdef public atom_t lex_pos
+cdef public atom_t lex_supersense
+cdef public atom_t norm
+cdef public atom_t shape
+cdef public atom_t vocab10k
+cdef public atom_t asciied
+cdef public atom_t prefix
+cdef public atom_t suffix
+cdef public float prob
+cdef public flag_t flags

View File

@@ -1,10 +1,6 @@
 # cython: profile=True
-from .word cimport Lexeme
 from .lexeme cimport *
-cimport numpy
 cimport cython
-import numpy
 DEF PADDING = 5
@@ -34,7 +30,8 @@ cdef class Tokens:
 >>> tokens.can_noun(1)
 True
 """
-def __init__(self, string_length=0):
+def __init__(self, StringStore string_store, string_length=0):
+self._string_store = string_store
 if string_length >= 3:
 size = int(string_length / 3.0)
 else:
@@ -43,7 +40,7 @@
 # Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
 # However, we need to remember the true starting places, so that we can
 # realloc.
-self._lex_ptr = <LexemeC**>self.mem.alloc(size + (PADDING*2), sizeof(LexemeC*))
+self._lex_ptr = <Lexeme**>self.mem.alloc(size + (PADDING*2), sizeof(Lexeme*))
 self._idx_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
 self._pos_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
 self.lex = self._lex_ptr
@@ -55,39 +52,26 @@
 self.lex += PADDING
 self.idx += PADDING
 self.pos += PADDING
 self.max_length = size
 self.length = 0
 def __getitem__(self, i):
 bounds_check(i, self.length, PADDING)
-return Lexeme(<size_t>self.lex[i])
+return Token(self._string_store, i, self.idx[i], self.pos[i], self.lex[i][0])
 def __len__(self):
 return self.length
-cdef int push_back(self, int idx, LexemeC* lexeme) except -1:
+cdef int push_back(self, int idx, Lexeme* lexeme) except -1:
 if self.length == self.max_length:
 self._realloc(self.length * 2)
 self.lex[self.length] = lexeme
 self.idx[self.length] = idx
 self.pos[self.length] = 0
 self.length += 1
-return idx + lexeme.ints[<int>LexInt_length]
+return idx + lexeme.length
-def _realloc(self, new_size):
-self.max_length = new_size
-n = new_size + (PADDING * 2)
-self._lex_ptr = <LexemeC**>self.mem.realloc(self._lex_ptr, n * sizeof(LexemeC*))
-self._idx_ptr = <int*>self.mem.realloc(self._idx_ptr, n * sizeof(int))
-self._pos_ptr = <int*>self.mem.realloc(self._pos_ptr, n * sizeof(int))
-self.lex = self._lex_ptr + PADDING
-self.idx = self._idx_ptr + PADDING
-self.pos = self._pos_ptr + PADDING
-for i in range(self.length, self.max_length + PADDING):
-self.lex[i] = &EMPTY_LEXEME
-cdef int extend(self, int idx, LexemeC** lexemes, int n) except -1:
+cdef int extend(self, int idx, Lexeme** lexemes, int n) except -1:
 cdef int i
 if lexemes == NULL:
 return idx
@@ -101,154 +85,43 @@ cdef class Tokens:
 idx = self.push_back(idx, lexemes[i])
 return idx
-cpdef int id(self, size_t i) except -1:
-bounds_check(i, self.length, PADDING)
-return self.lex[i].ints[<int>LexInt_id]
-cpdef float prob(self, size_t i) except 1:
-bounds_check(i, self.length, PADDING)
-return self.lex[i].floats[<int>LexFloat_prob]
-cpdef int cluster(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return self.lex[i].ints[<int>LexInt_cluster]
-cpdef bint check_orth_flag(self, size_t i, size_t flag_id) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_orth_flag(self.lex[i], flag_id)
-cpdef bint check_dist_flag(self, size_t i, size_t flag_id) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], flag_id)
-cpdef unicode string_view(self, size_t i, size_t view_id):
-bounds_check(i, self.length, PADDING)
-return lexeme_get_string(self.lex[i], view_id)
-# Provide accessor methods for the features supported by the language.
-# Without these, clients have to use the underlying string_view and check_flag
-# methods, which requires them to know the IDs.
-cpdef unicode string(self, size_t i):
-bounds_check(i, self.length, PADDING)
-return self.orig(i)
-cpdef unicode orig(self, size_t i):
-bounds_check(i, self.length, PADDING)
-cdef bytes utf8_string = self.lex[i].strings[<int>LexStr_orig]
-cdef unicode string = utf8_string.decode('utf8')
-return string
-cpdef unicode norm(self, size_t i):
-bounds_check(i, self.length, PADDING)
-cdef bytes utf8_string = self.lex[i].strings[<int>LexStr_norm]
-cdef unicode string = utf8_string.decode('utf8')
-return string
-cpdef unicode shape(self, size_t i):
-bounds_check(i, self.length, PADDING)
-return lexeme_get_string(self.lex[i], LexStr_shape)
-cpdef unicode unsparse(self, size_t i):
-bounds_check(i, self.length, PADDING)
-return lexeme_get_string(self.lex[i], LexStr_unsparse)
-cpdef unicode asciied(self, size_t i):
-bounds_check(i, self.length, PADDING)
-return lexeme_get_string(self.lex[i], LexStr_asciied)
-cpdef bint is_alpha(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_orth_flag(self.lex[i], LexOrth_alpha)
-cpdef bint is_ascii(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_orth_flag(self.lex[i], LexOrth_ascii)
-cpdef bint is_digit(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_orth_flag(self.lex[i], LexOrth_digit)
-cpdef bint is_lower(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_orth_flag(self.lex[i], LexOrth_lower)
-cpdef bint is_punct(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_orth_flag(self.lex[i], LexOrth_punct)
-cpdef bint is_space(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_orth_flag(self.lex[i], LexOrth_space)
-cpdef bint is_title(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_orth_flag(self.lex[i], LexOrth_title)
-cpdef bint is_upper(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_orth_flag(self.lex[i], LexOrth_upper)
-cpdef bint can_adj(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], LexDist_adj)
-cpdef bint can_adp(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], LexDist_adp)
-cpdef bint can_adv(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], LexDist_adv)
-cpdef bint can_conj(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], LexDist_conj)
-cpdef bint can_det(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], LexDist_det)
-cpdef bint can_noun(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], LexDist_noun)
-cpdef bint can_num(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], LexDist_num)
-cpdef bint can_pdt(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], LexDist_pdt)
-cpdef bint can_pos(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], LexDist_pos)
-cpdef bint can_pron(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], LexDist_pron)
-cpdef bint can_prt(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], LexDist_prt)
-cpdef bint can_punct(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], LexDist_punct)
-cpdef bint can_verb(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], LexDist_verb)
-cpdef bint oft_lower(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], LexDist_lower)
-cpdef bint oft_title(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], LexDist_title)
-cpdef bint oft_upper(self, size_t i) except *:
-bounds_check(i, self.length, PADDING)
-return lexeme_check_dist_flag(self.lex[i], LexDist_upper)
+def _realloc(self, new_size):
+self.max_length = new_size
+n = new_size + (PADDING * 2)
+self._lex_ptr = <Lexeme**>self.mem.realloc(self._lex_ptr, n * sizeof(Lexeme*))
+self._idx_ptr = <int*>self.mem.realloc(self._idx_ptr, n * sizeof(int))
+self._pos_ptr = <int*>self.mem.realloc(self._pos_ptr, n * sizeof(int))
+self.lex = self._lex_ptr + PADDING
+self.idx = self._idx_ptr + PADDING
+self.pos = self._pos_ptr + PADDING
+for i in range(self.length, self.max_length + PADDING):
+self.lex[i] = &EMPTY_LEXEME
+@cython.freelist(64)
+cdef class Token:
+def __init__(self, StringStore string_store, int i, int idx, int pos, dict lex):
+self._string_store = string_store
+self.i = i
+self.idx = idx
+self.pos = pos
+self.id = lex['id']
+self.cluster = lex['cluster']
+self.length = lex['length']
+self.lex_pos = lex['pos']
+self.lex_supersense = lex['supersense']
+self.norm = lex['norm']
+self.shape = lex['shape']
+self.vocab10k = lex['vocab10k']
+self.suffix = lex['asciied']
+self.prefix = lex['prefix']
+self.prob = lex['prob']
+self.flags = lex['flags']
+property string:
+def __get__(self):
+cdef bytes utf8string = self._string_store[self.id]
+return utf8string.decode('utf8')
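The net effect of the Tokens/Token rewrite above: indexing a Tokens object now returns a small Token value object that copies the lexeme's fields and resolves .string through the shared store, replacing the old cpdef accessor methods. A toy mirror of that shape (plain Python, a dict standing in for the StringStore, attribute set reduced for brevity):

    class ToyToken(object):
        def __init__(self, string_store, i, idx, pos, lex):
            self._string_store = string_store
            self.i, self.idx, self.pos = i, idx, pos
            self.id = lex['id']
            self.cluster = lex['cluster']
            self.flags = lex['flags']

        @property
        def string(self):
            return self._string_store[self.id]

    store = {0: u'hello'}   # stands in for the StringStore
    tok = ToyToken(store, 0, 0, 0, {'id': 0, 'cluster': 0, 'flags': 0})
    assert tok.string == u'hello'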

View File

@@ -1,12 +0,0 @@
from .typedefs cimport hash_t, utf8_t, flag_t, id_t
from spacy.lexeme cimport LexemeC
DEF MAX_FLAG = 64
cdef class Lexeme:
cdef LexemeC* _c
cpdef bint check_orth_flag(self, size_t flag_id) except *
cpdef bint check_dist_flag(self, size_t flag_id) except *
cpdef unicode string_view(self, size_t view_id)

View File

@@ -1,80 +0,0 @@
# cython: profile=True
# cython: embedsignature=True
from .lexeme cimport lexeme_get_string
from .lexeme cimport lexeme_check_orth_flag, lexeme_check_dist_flag
from .lexeme cimport *
cdef class Lexeme:
"""A lexical type --- a word, punctuation symbol, whitespace sequence, etc
keyed by a case-sensitive unicode string. All tokens with the same string,
e.g. all instances of "dog", ",", "NASA" etc should be mapped to the same
Lexeme.
You should avoid instantiating Lexemes directly, and instead use the
:py:meth:`space.lang.Language.tokenize` and :py:meth:`spacy.lang.Language.lookup`
methods on the global object exposed by the language you're working with,
e.g. :py:data:`spacy.en.EN`.
Attributes:
string (unicode):
The unicode string.
Implemented as a property; relatively expensive.
length (size_t):
The number of unicode code-points in the string.
prob (double):
An estimate of the word's unigram log probability.
Probabilities are calculated from a large text corpus, and smoothed using
simple Good-Turing. Estimates are read from data/en/probabilities, and
can be replaced using spacy.en.load_probabilities.
cluster (size_t):
An integer representation of the word's Brown cluster.
A Brown cluster is an address into a binary tree, which gives some (noisy)
information about the word's distributional context.
>>> strings = (u'pineapple', u'apple', u'dapple', u'scalable')
>>> print ["{0:b}".format(lookup(s).cluster) for s in strings]
["100111110110", "100111100100", "01010111011001", "100111110110"]
The clusterings are unideal, but often slightly useful.
"pineapple" and "apple" share a long prefix, indicating a similar meaning,
while "dapple" is totally different. On the other hand, "scalable" receives
the same cluster ID as "pineapple", which is not what we'd like.
"""
def __cinit__(self, size_t lexeme_addr):
self._c = <LexemeC*>lexeme_addr
property string:
def __get__(self):
cdef bytes utf8_string = self._c.strings[<int>LexStr_orig]
cdef unicode string = utf8_string.decode('utf8')
return string
property prob:
def __get__(self):
return self._c.floats[<int>LexFloat_prob]
property cluster:
def __get__(self):
return self._c.ints[<int>LexInt_cluster]
property length:
def __get__(self):
return self._c.ints[<int>LexInt_length]
cpdef bint check_orth_flag(self, size_t flag_id) except *:
return lexeme_check_orth_flag(self._c, flag_id)
cpdef bint check_dist_flag(self, size_t flag_id) except *:
return lexeme_check_dist_flag(self._c, flag_id)
cpdef unicode string_view(self, size_t view_id):
return lexeme_get_string(self._c, view_id)
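The Brown-cluster remark in the docstring above is about shared bit-string prefixes; using the example values it quotes:

    clusters = {
        u'pineapple': '100111110110',
        u'apple':     '100111100100',
        u'dapple':    '01010111011001',
    }

    def shared_prefix(a, b):
        n = 0
        while n < min(len(a), len(b)) and a[n] == b[n]:
            n += 1
        return n

    assert shared_prefix(clusters[u'pineapple'], clusters[u'apple']) == 7    # similar words
    assert shared_prefix(clusters[u'pineapple'], clusters[u'dapple']) == 0   # unrelated word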

View File

@@ -5,8 +5,8 @@ from spacy.en import EN
 def test_possess():
 tokens = EN.tokenize("Mike's")
-assert tokens[0].string == "Mike"
-assert tokens[1].string == "'s"
+assert EN.lexicon.strings[tokens[0].id] == "Mike"
+assert EN.lexicon.strings[tokens[1].id] == "'s"
 assert len(tokens) == 2

View File

@@ -8,19 +8,17 @@ from spacy.lexeme import *
 def test_is_alpha():
 the = EN.lexicon.lookup('the')
-assert the.check_orth_flag(LexOrth_alpha)
+assert the['flags'] & (1 << IS_ALPHA)
 year = EN.lexicon.lookup('1999')
-assert not year.check_orth_flag(LexOrth_alpha)
+assert not year['flags'] & (1 << IS_ALPHA)
 mixed = EN.lexicon.lookup('hello1')
-assert not mixed.check_orth_flag(LexOrth_alpha)
+assert not mixed['flags'] & (1 << IS_ALPHA)
 def test_is_digit():
 the = EN.lexicon.lookup('the')
-assert not the.check_orth_flag(LexOrth_digit)
+assert not the['flags'] & (1 << IS_DIGIT)
 year = EN.lexicon.lookup('1999')
-assert year.check_orth_flag(LexOrth_digit)
+assert year['flags'] & (1 << IS_DIGIT)
 mixed = EN.lexicon.lookup('hello1')
-assert not mixed.check_orth_flag(LexOrth_digit)
+assert not mixed['flags'] & (1 << IS_DIGIT)

View File

@@ -1,27 +0,0 @@
from __future__ import unicode_literals
import pytest
import spacy.word
from spacy.en import EN
from spacy.lexeme import *
@pytest.fixture
def C3P0():
return EN.lexicon.lookup("C3P0")
def test_shape(C3P0):
assert C3P0.string_view(LexStr_shape) == "XdXd"
def test_length():
t = EN.lexicon.lookup('the')
assert t.length == 3
t = EN.lexicon.lookup("n't")
assert t.length == 3
t = EN.lexicon.lookup("'s")
assert t.length == 2
t = EN.lexicon.lookup('Xxxx')
assert t.length == 4

View File

@@ -8,9 +8,9 @@ from spacy.en import EN
 def test_one():
 tokens = EN.tokenize('Betty Botter bought a pound of butter.')
-assert tokens.string(0) == 'Betty'
+assert tokens[0].string == 'Betty'
 tokens2 = EN.tokenize('Betty also bought a pound of butter.')
-assert tokens2.string(0) == 'Betty'
+assert tokens2[0].string == 'Betty'

View File

@@ -5,41 +5,39 @@ from spacy.en import EN
 def test_single_word():
-lex_ids = EN.tokenize(u'hello')
-assert lex_ids[0].string == EN.lexicon.lookup(u'hello').string
+tokens = EN.tokenize(u'hello')
+assert tokens[0].string == 'hello'
 def test_two_words():
-words = EN.tokenize('hello possums')
-assert len(words) == 2
-assert words[0].string == EN.lexicon.lookup('hello').string
-assert words[0].string != words[1].string
+tokens = EN.tokenize('hello possums')
+assert len(tokens) == 2
+assert tokens[0].string != tokens[1].string
 def test_punct():
 tokens = EN.tokenize('hello, possums.')
 assert len(tokens) == 4
-assert tokens[0].string == EN.lexicon.lookup('hello').string
-assert tokens[1].string == EN.lexicon.lookup(',').string
-assert tokens[2].string == EN.lexicon.lookup('possums').string
-assert tokens[1].string != EN.lexicon.lookup('hello').string
+assert tokens[0].string == 'hello'
+assert tokens[1].string == ','
+assert tokens[2].string == 'possums'
+assert tokens[1].string != 'hello'
 def test_digits():
-lex_ids = EN.tokenize('The year: 1984.')
-assert lex_ids.orig(3) == "1984"
-assert len(lex_ids) == 5
-assert lex_ids[0].string == EN.lexicon.lookup('The').string
-assert lex_ids[3].string == EN.lexicon.lookup('1984').string
+tokens = EN.tokenize('The year: 1984.')
+assert len(tokens) == 5
+assert tokens[0].id == EN.lexicon.lookup('The')['id']
+assert tokens[3].id == EN.lexicon.lookup('1984')['id']
 def test_contraction():
-lex_ids = EN.tokenize("don't giggle")
-assert len(lex_ids) == 3
-assert lex_ids[1].string == EN.lexicon.lookup("not").string
-lex_ids = EN.tokenize("i said don't!")
-assert len(lex_ids) == 5
-assert lex_ids[4].string == EN.lexicon.lookup('!').string
+tokens = EN.tokenize("don't giggle")
+assert len(tokens) == 3
+assert tokens[1].id == EN.lexicon.lookup("not")['id']
+tokens = EN.tokenize("i said don't!")
+assert len(tokens) == 5
+assert tokens[4].id == EN.lexicon.lookup('!')['id']
 def test_contraction_punct():

View File

@@ -5,30 +5,19 @@ from spacy.en import EN
 def test_neq():
 addr = EN.lexicon.lookup('Hello')
-assert EN.lexicon.lookup('bye').string != addr.string
+assert EN.lexicon.lookup('bye')['id'] != addr['id']
 def test_eq():
 addr = EN.lexicon.lookup('Hello')
-assert EN.lexicon.lookup('Hello').string == addr.string
+assert EN.lexicon.lookup('Hello')['id'] == addr['id']
-def test_round_trip():
-hello = EN.lexicon.lookup('Hello')
-assert hello.string == 'Hello'
 def test_case_neq():
 addr = EN.lexicon.lookup('Hello')
-assert EN.lexicon.lookup('hello').string != addr.string
+assert EN.lexicon.lookup('hello')['id'] != addr['id']
 def test_punct_neq():
 addr = EN.lexicon.lookup('Hello')
-assert EN.lexicon.lookup('Hello,').string != addr.string
+assert EN.lexicon.lookup('Hello,')['id'] != addr['id']
-def test_short():
-addr = EN.lexicon.lookup('I')
-assert addr.string == 'I'
-assert addr.string != 'not'