* Add offsets to Tokens class. Some changes to interfaces, and reorganization of spacy.Lang

Matthew Honnibal, 2014-10-14 15:47:06 +11:00
commit 6fb42c4919 (parent 2805068ca8)
11 changed files with 193 additions and 183 deletions
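
The headline change: Tokens now keeps three parallel vectors (lex, idx, pos) instead of a single vector of Token structs, and push_back returns the character offset just past the token so the tokenizer can thread offsets through. A plain-Python sketch of that bookkeeping (illustrative names, not the Cython API in the diffs below):

# Plain-Python sketch of the parallel-array layout this commit introduces.
class TokensSketch:
    def __init__(self):
        self.lex = []   # one lexeme per token (here just the token string)
        self.idx = []   # character offset where each token starts
        self.pos = []   # POS slot, 0 until a tagger fills it in

    def push_back(self, idx, lexeme):
        self.lex.append(lexeme)
        self.idx.append(idx)
        self.pos.append(0)
        return idx + len(lexeme)   # offset just past this token, as in the Cython push_back

# The tokenizer threads the returned offset into the next call:
toks = TokensSketch()
i = 0
for word in ['Hello', ',', 'world']:
    i = toks.push_back(i, word)
assert toks.idx == [0, 5, 6]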

View File

@@ -1,20 +1,21 @@
 from libc.stdint cimport uint32_t
 from libc.stdint cimport uint64_t
-from spacy.word cimport Lexeme
-from spacy.tokens cimport Tokens
-from spacy.lexeme cimport LexemeC
-from preshed.maps cimport PreshMap
-from cymem.cymem cimport Pool
-from libcpp.utility cimport pair
 from libcpp.vector cimport vector
 from libc.stdint cimport uint64_t, int64_t
+from preshed.maps cimport PreshMap
+from cymem.cymem cimport Pool
+from .word cimport Lexeme
+from .tokens cimport Tokens
+from .lexeme cimport LexemeC

 cdef extern from "Python.h":
     cdef bint Py_UNICODE_ISSPACE(Py_UNICODE ch)
     cdef bint Py_UNICODE_ISALNUM(Py_UNICODE ch)
+    cdef bint Py_UNICODE_ISALPHA(Py_UNICODE ch)
+    cdef bint Py_UNICODE_ISUPPER(Py_UNICODE ch)

 cdef struct String:
@@ -24,7 +25,7 @@ cdef struct String:

 cdef class Lexicon:
-    cdef Pool _mem
+    cdef Pool mem
     cpdef readonly size_t size

     cdef vector[LexemeC*] lexemes
@@ -37,7 +38,6 @@ cdef class Lexicon:
     cdef list _string_features
     cdef list _flag_features

 cdef class Language:
     cdef Pool _mem
     cdef unicode name
@@ -47,19 +47,17 @@ cdef class Language:
     cdef object prefix_re
     cdef object suffix_re
+    cdef object infix_re

     cpdef Tokens tokenize(self, unicode text)
-    cpdef Lexeme lookup(self, unicode text)

-    cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1
-    cdef int _split_body_token(self, vector[LexemeC*] *tokens, String* string) except -1
-    cdef int _find_prefix(self, Py_UNICODE* characters, size_t length)
-    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length)
-    cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
-                            vector[LexemeC*] *prefixes,
-                            vector[LexemeC*] *suffixes) except -1
+    cdef int _tokenize(self, Tokens tokens, Py_UNICODE* chars, int start, int end) except -1
+    cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
+                                vector[LexemeC*] *suffixes) except NULL
+    cdef int _attach_tokens(self, Tokens tokens, int idx, String* string,
+                            vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
+    cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
+    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
+    cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1
     cdef int _save_cached(self, vector[LexemeC*] *tokens, uint64_t key, size_t n) except -1
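
The declarations above replace the old _tokenize/_split_body_token pair with an affix-splitting pipeline, and the _find_prefix/_find_suffix/_find_infix helpers are thin regex probes over a span. Roughly, in plain Python (the patterns below are stand-ins, not the real prefix/suffix/infix data):

import re

# Stand-in patterns; the real ones are assembled from the language's prefix,
# suffix and infix data files by read_lang_data().
prefix_re = re.compile(r'^[\(\[]')
suffix_re = re.compile(r'[\)\].,;]$')
infix_re = re.compile(r'[-~]')

def find_prefix(chars):
    m = prefix_re.search(chars)
    return (m.end() - m.start()) if m is not None else 0

def find_suffix(chars):
    m = suffix_re.search(chars)
    return (m.end() - m.start()) if m is not None else 0

def find_infix(chars):
    m = infix_re.search(chars)
    return m.start() if m is not None else 0

assert find_prefix('(Hello') == 1        # one affix character peeled per loop iteration
assert find_suffix('Hello.') == 1
assert find_infix('6,000-year') == 5     # position of the hyphen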

View File

@@ -14,9 +14,9 @@ from os import path
 import re

 from .util import read_lang_data
-from spacy.tokens import Tokens
-from spacy.lexeme cimport LexemeC, get_lexeme_dict, lexeme_pack, lexeme_unpack
-from spacy.lexeme cimport LexStr_orig
+from .tokens import Tokens
+from .lexeme cimport LexemeC, get_lexeme_dict, lexeme_pack, lexeme_unpack
+from .lexeme cimport LexStr_orig

 from murmurhash.mrmr cimport hash64
 from cpython.ref cimport Py_INCREF
@@ -41,23 +41,13 @@ cdef class Language:
         self._mem = Pool()
         self.cache = PreshMap(2 ** 25)
         self.specials = PreshMap(2 ** 16)
-        rules, prefix, suffix, lexemes = util.read_lang_data(name)
+        rules, prefix, suffix, infix, lexemes = util.read_lang_data(name)
         self.prefix_re = re.compile(prefix)
         self.suffix_re = re.compile(suffix)
+        self.infix_re = re.compile(infix)
         self.lexicon = Lexicon(lexemes)
         self._load_special_tokenization(rules)

-    cpdef Lexeme lookup(self, unicode string):
-        """Retrieve (or create, if not found) a Lexeme for a string, and return it.
-
-        Args:
-            string (unicode): The string to be looked up. Must be unicode, not bytes.
-
-        Returns:
-            lexeme (Lexeme): A reference to a lexical type.
-        """
-        return self.lexicon.lookup(string)

     cpdef Tokens tokenize(self, unicode string):
         """Tokenize a string.
@@ -73,37 +63,43 @@ cdef class Language:
         Returns:
             tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes.
         """
-        cdef size_t length = len(string)
+        cdef int length = len(string)
         cdef Tokens tokens = Tokens(length)
         if length == 0:
             return tokens
-        cdef size_t start = 0
-        cdef size_t i = 0
+        cdef int start = 0
+        cdef int i = 0
         cdef Py_UNICODE* chars = string
-        cdef String span
         for i in range(length):
             if Py_UNICODE_ISSPACE(chars[i]) == 1:
                 if start < i:
-                    string_from_slice(&span, chars, start, i)
-                    if not _extend_from_map(tokens.v, &span, self.cache):
-                        self._tokenize(tokens.v, &span)
+                    self._tokenize(tokens, chars, start, i)
                 start = i + 1
         i += 1
         if start < i:
-            string_from_slice(&span, chars, start, i)
-            if not _extend_from_map(tokens.v, &span, self.cache):
-                self._tokenize(tokens.v, &span)
+            self._tokenize(tokens, chars, start, i)
         return tokens

-    cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1:
-        cdef size_t i
-        cdef uint64_t orig_key = string.key
-        cdef size_t orig_size = tokens_v.size()
+    cdef int _tokenize(self, Tokens tokens, Py_UNICODE* chars, int start, int end) except -1:
+        cdef String span
         cdef vector[LexemeC*] prefixes
         cdef vector[LexemeC*] suffixes
+        cdef uint64_t orig_key
+        cdef int orig_size
+        string_slice(&span, chars, start, end)
+        lexemes = <LexemeC**>self.cache.get(span.key)
+        if lexemes != NULL:
+            tokens.extend(start, lexemes, 0)
+        else:
+            orig_key = span.key
+            orig_size = tokens.lex.size()
+            span = self._split_affixes(&span, &prefixes, &suffixes)[0]
+            self._attach_tokens(tokens, start, &span, &prefixes, &suffixes)
+            self._save_cached(&tokens.lex, orig_key, orig_size)
+
+    cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
+                                vector[LexemeC*] *suffixes) except NULL:
+        cdef size_t i
         cdef String prefix
         cdef String suffix
         cdef String minus_pre
@@ -113,8 +109,8 @@ cdef class Language:
             last_size = string.n
             pre_len = self._find_prefix(string.chars, string.n)
             if pre_len != 0:
-                string_from_slice(&prefix, string.chars, 0, pre_len)
-                string_from_slice(&minus_pre, string.chars, pre_len, string.n)
+                string_slice(&prefix, string.chars, 0, pre_len)
+                string_slice(&minus_pre, string.chars, pre_len, string.n)
                 # Check whether we've hit a special-case
                 if minus_pre.n >= 1 and self.specials.get(minus_pre.key) != NULL:
                     string = &minus_pre
@@ -122,16 +118,15 @@ cdef class Language:
                     break
             suf_len = self._find_suffix(string.chars, string.n)
             if suf_len != 0:
-                string_from_slice(&suffix, string.chars, string.n - suf_len, string.n)
-                string_from_slice(&minus_suf, string.chars, 0, string.n - suf_len)
+                string_slice(&suffix, string.chars, string.n - suf_len, string.n)
+                string_slice(&minus_suf, string.chars, 0, string.n - suf_len)
                 # Check whether we've hit a special-case
                 if minus_suf.n >= 1 and self.specials.get(minus_suf.key) != NULL:
                     string = &minus_suf
                     suffixes.push_back(self.lexicon.get(&suffix))
                     break
             if pre_len and suf_len and (pre_len + suf_len) <= string.n:
-                string_from_slice(string, string.chars, pre_len, string.n - suf_len)
+                string_slice(string, string.chars, pre_len, string.n - suf_len)
                 prefixes.push_back(self.lexicon.get(&prefix))
                 suffixes.push_back(self.lexicon.get(&suffix))
             elif pre_len:
@@ -140,26 +135,37 @@ cdef class Language:
             elif suf_len:
                 string = &minus_suf
                 suffixes.push_back(self.lexicon.get(&suffix))
             if self.specials.get(string.key):
                 break
+        return string

-        self._attach_tokens(tokens_v, string, &prefixes, &suffixes)
-        self._save_cached(tokens_v, orig_key, orig_size)
-
-    cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
+    cdef int _attach_tokens(self, Tokens tokens,
+                            int idx, String* string,
                             vector[LexemeC*] *prefixes,
                             vector[LexemeC*] *suffixes) except -1:
-        cdef size_t i
+        cdef int split
         cdef LexemeC** lexemes
         cdef LexemeC* lexeme
-        for lexeme in deref(prefixes):
-            tokens.push_back(lexeme)
-        if not _extend_from_map(tokens, string, self.specials):
-            self._split_body_token(tokens, string)
+        cdef String span
+        idx = tokens.extend(idx, prefixes.data(), prefixes.size())
+        if string.n != 0:
+            lexemes = <LexemeC**>self.cache.get(string.key)
+            if lexemes != NULL:
+                idx = tokens.extend(idx, lexemes, 0)
+            else:
+                split = self._find_infix(string.chars, string.n)
+                if split == 0 or split == -1:
+                    idx = tokens.push_back(idx, self.lexicon.get(string))
+                else:
+                    string_slice(&span, string.chars, 0, split)
+                    idx = tokens.push_back(idx, self.lexicon.get(&span))
+                    string_slice(&span, string.chars, split, split+1)
+                    idx = tokens.push_back(idx, self.lexicon.get(&span))
+                    string_slice(&span, string.chars, split + 1, string.n)
+                    idx = tokens.push_back(idx, self.lexicon.get(&span))
         cdef vector[LexemeC*].reverse_iterator it = suffixes.rbegin()
         while it != suffixes.rend():
-            tokens.push_back(deref(it))
+            idx = tokens.push_back(idx, deref(it))
             preinc(it)

     cdef int _save_cached(self, vector[LexemeC*] *tokens, uint64_t key, size_t n) except -1:
@@ -171,15 +177,17 @@ cdef class Language:
         lexemes[i + 1] = NULL
         self.cache.set(key, lexemes)

-    cdef int _split_body_token(self, vector[LexemeC*] *tokens, String* string) except -1:
-        tokens.push_back(self.lexicon.get(string))
+    cdef int _find_infix(self, Py_UNICODE* chars, size_t length) except -1:
+        cdef unicode string = chars[:length]
+        match = self.infix_re.search(string)
+        return match.start() if match is not None else 0

     cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
         cdef unicode string = chars[:length]
         match = self.prefix_re.search(string)
         return (match.end() - match.start()) if match is not None else 0

-    cdef int _find_suffix(self, Py_UNICODE* chars, size_t length):
+    cdef int _find_suffix(self, Py_UNICODE* chars, size_t length) except -1:
         cdef unicode string = chars[:length]
         match = self.suffix_re.search(string)
         return (match.end() - match.start()) if match is not None else 0
@@ -212,27 +220,30 @@ cdef class Language:
 cdef class Lexicon:
     def __cinit__(self, lexemes):
-        self._mem = Pool()
+        self.mem = Pool()
         self._dict = PreshMap(2 ** 20)
         self.size = 0
         cdef String string
         cdef dict lexeme_dict
         cdef LexemeC* lexeme
-        for lexeme_dict in lexemes:
-            string_from_unicode(&string, lexeme_dict['string'])
-            lexeme = <LexemeC*>self._mem.alloc(1, sizeof(LexemeC))
+        for py_string, lexeme_dict in lexemes.iteritems():
+            string_from_unicode(&string, py_string)
+            lexeme = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
             lexeme_unpack(lexeme, lexeme_dict)
             self._dict.set(string.key, lexeme)
             self.lexemes.push_back(lexeme)
             self.size += 1

+    def __getitem__(self, size_t i):
+        return Lexeme(<size_t>self.lexemes.at(i))
+
     cdef LexemeC* get(self, String* string) except NULL:
         cdef LexemeC* lex
         lex = <LexemeC*>self._dict.get(string.key)
         if lex != NULL:
             return lex
-        lex = <LexemeC*>self._mem.alloc(1, sizeof(LexemeC))
+        lex = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
         cdef unicode unicode_string = string.chars[:string.n]
         lexeme_unpack(lex, get_lexeme_dict(self.size, unicode_string))
         self._dict.set(string.key, lex)
@@ -255,38 +266,12 @@ cdef class Lexicon:
         return Lexeme(<size_t>lexeme)

-cdef int _extend_from_map(vector[LexemeC*] *tokens, String* string, PreshMap map_) except -1:
-    if string.n == 0:
-        return 1
-    lexemes = <LexemeC**>map_.get(string.key)
-    if lexemes == NULL:
-        return 0
-    cdef size_t i = 0
-    while lexemes[i] != NULL:
-        tokens.push_back(lexemes[i])
-        i += 1
-    return 1
-
 cdef void string_from_unicode(String* s, unicode uni):
     cdef Py_UNICODE* c_uni = <Py_UNICODE*>uni
-    string_from_slice(s, c_uni, 0, len(uni))
+    string_slice(s, c_uni, 0, len(uni))

-cdef inline void string_from_slice(String* s, Py_UNICODE* chars, size_t start, size_t end) nogil:
+cdef inline void string_slice(String* s, Py_UNICODE* chars, size_t start, size_t end) nogil:
     s.chars = &chars[start]
     s.n = end - start
     s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
-
-cdef inline void string_slice_prefix(String* s, String* prefix, size_t n) nogil:
-    string_from_slice(prefix, s.chars, 0, n)
-    s.chars += n
-    s.n -= n
-    s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
-
-cdef inline void string_slice_suffix(String* s, String* suffix, size_t n) nogil:
-    string_from_slice(suffix, s.chars, s.n - n, s.n)
-    s.n -= n
-    s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
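
The reorganized _tokenize above hashes each whitespace-delimited span, reuses the cached lexeme sequence when the hash is known, and otherwise runs _split_affixes/_attach_tokens and caches whatever tokens that produced. A rough dict-backed sketch of the flow (pure Python; split_affixes and attach stand in for the Cython methods):

# Dict-backed sketch of the cache-then-split flow in _tokenize.
_cache = {}

def tokenize_span(tokens, span, split_affixes, attach):
    key = hash(span)                    # the Cython code hashes the raw chars with murmurhash
    cached = _cache.get(key)
    if cached is not None:
        tokens.extend(cached)           # cheap path: this span has been seen before
        return
    start = len(tokens)
    prefixes, stem, suffixes = split_affixes(span)
    attach(tokens, prefixes, stem, suffixes)
    _cache[key] = list(tokens[start:])  # remember every sub-token this span produced

Because the cache is keyed on the whole span, common strings pay the affix-splitting cost only the first time they are seen.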

View File

@@ -79,7 +79,7 @@ def canon_case(string, prob, cluster, case_stats, tag_stats):
 def word_shape(string, *args):
     length = len(string)
-    shape = ""
+    shape = []
     last = ""
     shape_char = ""
     seq = 0
@@ -99,8 +99,8 @@ def word_shape(string, *args):
             seq = 0
         last = shape_char
         if seq < 5:
-            shape += shape_char
+            shape.append(shape_char)
-    return shape
+    return ''.join(shape)

 def non_sparse(string, prob, cluster, case_stats, tag_stats):
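
The word_shape tweak replaces repeated string concatenation with list appends and a single join, which avoids quadratic copying in a hot feature function. A minimal self-contained approximation of the feature, for reference (the character-class logic in the middle is paraphrased from the surrounding context, not copied from the hunk):

def word_shape(string):
    # Letters become x/X, digits become d, other characters are kept,
    # and runs longer than 4 are truncated (the `seq < 5` check above).
    shape = []
    last = ""
    seq = 0
    for c in string:
        if c.isalpha():
            shape_char = "X" if c.isupper() else "x"
        elif c.isdigit():
            shape_char = "d"
        else:
            shape_char = c
        if shape_char == last:
            seq += 1
        else:
            seq = 0
        last = shape_char
        if seq < 5:
            shape.append(shape_char)
    return ''.join(shape)

assert word_shape("C3P0") == "XdXd"
assert word_shape("Hello") == "Xxxxx"
assert word_shape("aaaaaaaa") == "xxxxx"   # long runs are collapsed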

View File

@@ -2,14 +2,10 @@ from spacy.lexeme cimport LexemeC
 from libcpp.vector cimport vector

-cdef struct Token:
-    int i
-    int pos
-    LexemeC* lex
-
 cdef class Tokens:
-    cdef vector[Token] v
+    cdef vector[LexemeC*] lex
+    cdef vector[int] idx
+    cdef vector[int] pos

     cdef int extend(self, int i, LexemeC** lexemes, int n) except -1
     cdef int push_back(self, int i, LexemeC* lexeme) except -1
@@ -21,6 +17,7 @@ cdef class Tokens:
     cpdef bint check_dist_flag(self, size_t i, size_t flag_id) except *
     cpdef unicode string_view(self, size_t i, size_t view_id)

+    cpdef unicode string(self, size_t i)
     cpdef unicode orig(self, size_t i)
     cpdef unicode norm(self, size_t i)
     cpdef unicode shape(self, size_t i)

View File

@@ -25,17 +25,20 @@ cdef class Tokens:
     """
     def __cinit__(self, string_length=0):
         size = int(string_length / 3) if string_length >= 3 else 1
-        self.v = vector[Token]()
-        self.v.reserve(size)
+        self.lex.reserve(size)
+        self.idx.reserve(size)
+        self.pos.reserve(size)

     def __getitem__(self, i):
-        return Lexeme(<size_t>self.v.at(i).lex)
+        return Lexeme(<size_t>self.lex.at(i))

     def __len__(self):
-        return self.v.size()
+        return self.lex.size()

     cdef int push_back(self, int idx, LexemeC* lexeme) except -1:
-        self.v.push_back(Token(idx, 0, lexeme))
+        self.lex.push_back(lexeme)
+        self.idx.push_back(idx)
+        self.pos.push_back(0)
         return idx + lexeme.ints[<int>LexInt_length]

     cdef int extend(self, int idx, LexemeC** lexemes, int n) except -1:
@@ -46,120 +49,124 @@ cdef class Tokens:
             i = 0
             while lexemes[i] != NULL:
                 idx = self.push_back(idx, lexemes[i])
+                i += 1
         else:
             for i in range(n):
                 idx = self.push_back(idx, lexemes[i])
         return idx

     cpdef int id(self, size_t i) except -1:
-        return self.v.at(i).lex.ints[<int>LexInt_id]
+        return self.lex.at(i).ints[<int>LexInt_id]

     cpdef float prob(self, size_t i) except 1:
-        return self.v.at(i).lex.floats[<int>LexFloat_prob]
+        return self.lex.at(i).floats[<int>LexFloat_prob]

     cpdef int cluster(self, size_t i) except *:
-        return self.v.at(i).lex.ints[<int>LexInt_cluster]
+        return self.lex.at(i).ints[<int>LexInt_cluster]

     cpdef bint check_orth_flag(self, size_t i, size_t flag_id) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, flag_id)
+        return lexeme_check_orth_flag(self.lex.at(i), flag_id)

     cpdef bint check_dist_flag(self, size_t i, size_t flag_id) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, flag_id)
+        return lexeme_check_dist_flag(self.lex.at(i), flag_id)

     cpdef unicode string_view(self, size_t i, size_t view_id):
-        return lexeme_get_string(self.v.at(i).lex, view_id)
+        return lexeme_get_string(self.lex.at(i), view_id)

     # Provide accessor methods for the features supported by the language.
     # Without these, clients have to use the underlying string_view and check_flag
     # methods, which requires them to know the IDs.
+    cpdef unicode string(self, size_t i):
+        return self.orig(i)
+
     cpdef unicode orig(self, size_t i):
-        cdef bytes utf8_string = self.v.at(i).lex.strings[<int>LexStr_orig]
+        cdef bytes utf8_string = self.lex.at(i).strings[<int>LexStr_orig]
         cdef unicode string = utf8_string.decode('utf8')
         return string

     cpdef unicode norm(self, size_t i):
-        cdef bytes utf8_string = self.v.at(i).lex.strings[<int>LexStr_norm]
+        cdef bytes utf8_string = self.lex.at(i).strings[<int>LexStr_norm]
         cdef unicode string = utf8_string.decode('utf8')
         return string

     cpdef unicode shape(self, size_t i):
-        return lexeme_get_string(self.v.at(i).lex, LexStr_shape)
+        return lexeme_get_string(self.lex.at(i), LexStr_shape)

     cpdef unicode unsparse(self, size_t i):
-        return lexeme_get_string(self.v.at(i).lex, LexStr_unsparse)
+        return lexeme_get_string(self.lex.at(i), LexStr_unsparse)

     cpdef unicode asciied(self, size_t i):
-        return lexeme_get_string(self.v.at(i).lex, LexStr_asciied)
+        return lexeme_get_string(self.lex.at(i), LexStr_asciied)

     cpdef bint is_alpha(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_alpha)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_alpha)

     cpdef bint is_ascii(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_ascii)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_ascii)

     cpdef bint is_digit(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_digit)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_digit)

     cpdef bint is_lower(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_lower)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_lower)

     cpdef bint is_punct(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_punct)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_punct)

     cpdef bint is_space(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_space)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_space)

     cpdef bint is_title(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_title)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_title)

     cpdef bint is_upper(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_upper)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_upper)

     cpdef bint can_adj(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_adj)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_adj)

     cpdef bint can_adp(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_adp)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_adp)

     cpdef bint can_adv(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_adv)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_adv)

     cpdef bint can_conj(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_conj)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_conj)

     cpdef bint can_det(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_det)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_det)

     cpdef bint can_noun(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_noun)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_noun)

     cpdef bint can_num(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_num)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_num)

     cpdef bint can_pdt(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_pdt)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_pdt)

     cpdef bint can_pos(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_pos)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_pos)

     cpdef bint can_pron(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_pron)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_pron)

     cpdef bint can_prt(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_prt)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_prt)

     cpdef bint can_punct(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_punct)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_punct)

     cpdef bint can_verb(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_verb)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_verb)

     cpdef bint oft_lower(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_lower)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_lower)

     cpdef bint oft_title(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_title)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_title)

     cpdef bint oft_upper(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_upper)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_upper)
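
Every accessor above reduces to the same two primitives: index into the lexeme vector, then either test a bit in the lexeme's packed orth/dist flags or fetch one of its string views. A plain-Python sketch of the flag test (the flag IDs below are made up for illustration; the real ones are the LexOrth_*/LexDist_* enum values):

# Illustrative flag IDs only; each lexeme carries a packed word of boolean features.
LexOrth_alpha = 0
LexOrth_digit = 1
LexOrth_upper = 2

def check_orth_flag(orth_flags, flag_id):
    # A feature lookup costs one bit test on the lexeme's flag word.
    return bool(orth_flags & (1 << flag_id))

flags = (1 << LexOrth_alpha) | (1 << LexOrth_upper)   # e.g. an all-caps word
assert check_orth_flag(flags, LexOrth_alpha)
assert not check_orth_flag(flags, LexOrth_digit)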

View File

@@ -4,3 +4,5 @@ ctypedef uint64_t hash_t
 ctypedef char* utf8_t
 ctypedef uint64_t flag_t
 ctypedef uintptr_t id_t

View File

@@ -1,7 +1,7 @@
 import os
 from os import path
 import codecs
-import json
+import ujson
 import re

 DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
@@ -16,28 +16,36 @@ def read_lang_data(name):
     tokenization = read_tokenization(data_dir)
     prefix = read_prefix(data_dir)
     suffix = read_suffix(data_dir)
+    infix = read_infix(data_dir)
     lex_loc = path.join(data_dir, 'lexemes.json')
     if path.exists(lex_loc):
         with open(lex_loc) as file_:
             lexemes = ujson.load(file_)
     else:
-        lexemes = []
-    return tokenization, prefix, suffix, lexemes
+        lexemes = {}
+    return tokenization, prefix, suffix, infix, lexemes

 def read_prefix(data_dir):
     with utf8open(path.join(data_dir, 'prefix')) as file_:
         entries = file_.read().split('\n')
-        expression = '|'.join(['^' + re.escape(piece) for piece in entries])
+        expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
     return expression

 def read_suffix(data_dir):
     with utf8open(path.join(data_dir, 'suffix')) as file_:
         entries = file_.read().split('\n')
-        expression = '|'.join([re.escape(piece) + '$' for piece in entries])
+        expression = '|'.join([re.escape(piece) + '$' for piece in entries if piece.strip()])
     return expression

+def read_infix(data_dir):
+    with utf8open(path.join(data_dir, 'infix')) as file_:
+        entries = file_.read().split('\n')
+        expression = '|'.join([piece for piece in entries if piece.strip()])
+    return expression
+
 def read_tokenization(lang):
     loc = path.join(DATA_DIR, lang, 'tokenization')
     entries = []
@@ -60,3 +68,16 @@ def read_tokenization(lang):
             seen.add(chunk)
             entries.append((chunk, pieces))
     return entries
+
+def align_tokens(ref, indices):
+    start = 0
+    queue = list(indices)
+    for token in ref:
+        end = start + len(token)
+        emit = []
+        while queue and queue[0][1] <= end:
+            emit.append(queue.pop(0))
+        yield token, emit
+        start = end
+    assert not queue
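
The new align_tokens helper is a generator: it walks a reference tokenization and yields, for each reference token, the (start, end) spans that end within it, and the final assert checks that no span is left unconsumed. A small usage sketch with hand-made spans:

ref = ['do', "n't"]              # the tokenizer's split of "don't"
indices = [(0, 2), (2, 5)]       # character spans measured on the same string
assert list(align_tokens(ref, indices)) == [('do', [(0, 2)]), ("n't", [(2, 5)])]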

View File

@@ -7,20 +7,20 @@ from spacy.lexeme import *

 def test_is_alpha():
-    the = EN.lookup('the')
+    the = EN.lexicon.lookup('the')
     assert the.check_orth_flag(LexOrth_alpha)
-    year = EN.lookup('1999')
+    year = EN.lexicon.lookup('1999')
     assert not year.check_orth_flag(LexOrth_alpha)
-    mixed = EN.lookup('hello1')
+    mixed = EN.lexicon.lookup('hello1')
     assert not mixed.check_orth_flag(LexOrth_alpha)

 def test_is_digit():
-    the = EN.lookup('the')
+    the = EN.lexicon.lookup('the')
     assert not the.check_orth_flag(LexOrth_digit)
-    year = EN.lookup('1999')
+    year = EN.lexicon.lookup('1999')
     assert year.check_orth_flag(LexOrth_digit)
-    mixed = EN.lookup('hello1')
+    mixed = EN.lexicon.lookup('hello1')
     assert not mixed.check_orth_flag(LexOrth_digit)

View File

@@ -9,7 +9,7 @@ from spacy.lexeme import *

 @pytest.fixture
 def C3P0():
-    return EN.lookup("C3P0")
+    return EN.lexicon.lookup("C3P0")

 def test_shape(C3P0):
@@ -17,11 +17,11 @@ def test_shape(C3P0):

 def test_length():
-    t = EN.lookup('the')
+    t = EN.lexicon.lookup('the')
     assert t.length == 3
-    t = EN.lookup("n't")
+    t = EN.lexicon.lookup("n't")
     assert t.length == 3
-    t = EN.lookup("'s")
+    t = EN.lexicon.lookup("'s")
     assert t.length == 2
-    t = EN.lookup('Xxxx')
+    t = EN.lexicon.lookup('Xxxx')
     assert t.length == 4
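
These test updates track the interface move in this commit: string lookups now live on the lexicon rather than on the Language object itself. In terms of the 2014-era API exercised by these tests:

from spacy.en import EN

# Before this commit: lex = EN.lookup('the')
lex = EN.lexicon.lookup('the')     # after: lookup goes through Language.lexicon
assert lex.string == 'the'         # .string and .length behave as in the tests above
assert lex.length == 3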

View File

@@ -27,7 +27,7 @@ def test_punct():

 def test_digits():
     lex_ids = EN.tokenize('The year: 1984.')
-    assert lex_ids.string(3) == "1984"
+    assert lex_ids.orig(3) == "1984"
     assert len(lex_ids) == 5
     assert lex_ids[0].string == EN.lexicon.lookup('The').string
     assert lex_ids[3].string == EN.lexicon.lookup('1984').string
@@ -101,4 +101,4 @@ def test_cnts6():

 def test_cnts7():
     text = 'But then the 6,000-year ice age came...'
     tokens = EN.tokenize(text)
-    assert len(tokens) == 8
+    assert len(tokens) == 10
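
The count in test_cnts7 moves from 8 to 10 because the infix pass added in this commit splits the hyphenated compound into three tokens: the span before the infix character, the infix character itself, and the remainder, exactly as _attach_tokens does above. Sketching the effect in plain Python with a stand-in infix pattern:

import re

infix_re = re.compile(r'-')          # stand-in; the real pattern comes from the infix data file

def split_on_infix(token):
    m = infix_re.search(token)
    if m is None or m.start() == 0:
        return [token]
    i = m.start()
    return [token[:i], token[i], token[i + 1:]]

assert split_on_infix('6,000-year') == ['6,000', '-', 'year']
# 'But then the 6,000-year ice age came...' therefore gains two tokens: 8 -> 10.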

View File

@@ -4,31 +4,31 @@ from spacy.en import EN

 def test_neq():
-    addr = EN.lookup('Hello')
-    assert EN.lookup('bye').string != addr.string
+    addr = EN.lexicon.lookup('Hello')
+    assert EN.lexicon.lookup('bye').string != addr.string

 def test_eq():
-    addr = EN.lookup('Hello')
-    assert EN.lookup('Hello').string == addr.string
+    addr = EN.lexicon.lookup('Hello')
+    assert EN.lexicon.lookup('Hello').string == addr.string

 def test_round_trip():
-    hello = EN.lookup('Hello')
+    hello = EN.lexicon.lookup('Hello')
     assert hello.string == 'Hello'

 def test_case_neq():
-    addr = EN.lookup('Hello')
-    assert EN.lookup('hello').string != addr.string
+    addr = EN.lexicon.lookup('Hello')
+    assert EN.lexicon.lookup('hello').string != addr.string

 def test_punct_neq():
-    addr = EN.lookup('Hello')
-    assert EN.lookup('Hello,').string != addr.string
+    addr = EN.lexicon.lookup('Hello')
+    assert EN.lexicon.lookup('Hello,').string != addr.string

 def test_short():
-    addr = EN.lookup('I')
+    addr = EN.lexicon.lookup('I')
     assert addr.string == 'I'
     assert addr.string != 'not'