* Add offsets to Tokens class. Some changes to interfaces, and reorganization of spacy.Lang

This commit is contained in:
Matthew Honnibal 2014-10-14 15:47:06 +11:00
parent 2805068ca8
commit 6fb42c4919
11 changed files with 193 additions and 183 deletions
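The headline change is that Tokens now records a character offset for every token: the single vector of Token structs is replaced by parallel lex/idx/pos vectors, and push_back returns the offset just past the token it appended, so the running position can be threaded through extend(). A minimal pure-Python sketch of that bookkeeping, with illustrative names rather than the real Cython API:

class TokensSketch:
    def __init__(self):
        self.lex = []   # lexemes (plain strings here)
        self.idx = []   # character offset of each token in the input
        self.pos = []   # POS slots, initialised to 0 as in the diff

    def push_back(self, idx, lexeme):
        # Append one token and return the offset just past it.
        self.lex.append(lexeme)
        self.idx.append(idx)
        self.pos.append(0)
        return idx + len(lexeme)

    def extend(self, idx, lexemes):
        # Append several tokens, starting at character offset idx.
        for lexeme in lexemes:
            idx = self.push_back(idx, lexeme)
        return idx

tokens = TokensSketch()
i = tokens.extend(0, ["Hello", ","])   # "Hello," starts at offset 0
tokens.extend(i + 1, ["world"])        # + 1 skips the space
assert tokens.idx == [0, 5, 7]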

View File

@@ -1,20 +1,21 @@
from libc.stdint cimport uint32_t
from libc.stdint cimport uint64_t
from spacy.word cimport Lexeme
from spacy.tokens cimport Tokens
from spacy.lexeme cimport LexemeC
from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool
from libcpp.utility cimport pair
from libcpp.vector cimport vector
from libc.stdint cimport uint64_t, int64_t
from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool
from .word cimport Lexeme
from .tokens cimport Tokens
from .lexeme cimport LexemeC
cdef extern from "Python.h":
cdef bint Py_UNICODE_ISSPACE(Py_UNICODE ch)
cdef bint Py_UNICODE_ISALNUM(Py_UNICODE ch)
cdef bint Py_UNICODE_ISALPHA(Py_UNICODE ch)
cdef bint Py_UNICODE_ISUPPER(Py_UNICODE ch)
cdef struct String:
@@ -24,7 +25,7 @@ cdef struct String:
cdef class Lexicon:
cdef Pool _mem
cdef Pool mem
cpdef readonly size_t size
cdef vector[LexemeC*] lexemes
@@ -37,7 +38,6 @@ cdef class Lexicon:
cdef list _string_features
cdef list _flag_features
cdef class Language:
cdef Pool _mem
cdef unicode name
@@ -47,19 +47,17 @@ cdef class Language:
cdef object prefix_re
cdef object suffix_re
cdef object infix_re
cpdef Tokens tokenize(self, unicode text)
cpdef Lexeme lookup(self, unicode text)
cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1
cdef int _split_body_token(self, vector[LexemeC*] *tokens, String* string) except -1
cdef int _find_prefix(self, Py_UNICODE* characters, size_t length)
cdef int _find_suffix(self, Py_UNICODE* characters, size_t length)
cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes) except -1
cdef int _tokenize(self, Tokens tokens, Py_UNICODE* chars, int start, int end) except -1
cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes) except NULL
cdef int _attach_tokens(self, Tokens tokens, int idx, String* string,
vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1
cdef int _save_cached(self, vector[LexemeC*] *tokens, uint64_t key, size_t n) except -1

View File

@@ -14,9 +14,9 @@ from os import path
import re
from .util import read_lang_data
from spacy.tokens import Tokens
from spacy.lexeme cimport LexemeC, get_lexeme_dict, lexeme_pack, lexeme_unpack
from spacy.lexeme cimport LexStr_orig
from .tokens import Tokens
from .lexeme cimport LexemeC, get_lexeme_dict, lexeme_pack, lexeme_unpack
from .lexeme cimport LexStr_orig
from murmurhash.mrmr cimport hash64
from cpython.ref cimport Py_INCREF
@@ -41,23 +41,13 @@ cdef class Language:
self._mem = Pool()
self.cache = PreshMap(2 ** 25)
self.specials = PreshMap(2 ** 16)
rules, prefix, suffix, lexemes = util.read_lang_data(name)
rules, prefix, suffix, infix, lexemes = util.read_lang_data(name)
self.prefix_re = re.compile(prefix)
self.suffix_re = re.compile(suffix)
self.infix_re = re.compile(infix)
self.lexicon = Lexicon(lexemes)
self._load_special_tokenization(rules)
cpdef Lexeme lookup(self, unicode string):
"""Retrieve (or create, if not found) a Lexeme for a string, and return it.
Args:
string (unicode): The string to be looked up. Must be unicode, not bytes.
Returns:
lexeme (Lexeme): A reference to a lexical type.
"""
return self.lexicon.lookup(string)
cpdef Tokens tokenize(self, unicode string):
"""Tokenize a string.
@@ -73,37 +63,43 @@ cdef class Language:
Returns:
tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes.
"""
cdef size_t length = len(string)
cdef int length = len(string)
cdef Tokens tokens = Tokens(length)
if length == 0:
return tokens
cdef size_t start = 0
cdef size_t i = 0
cdef int start = 0
cdef int i = 0
cdef Py_UNICODE* chars = string
cdef String span
for i in range(length):
if Py_UNICODE_ISSPACE(chars[i]) == 1:
if start < i:
string_from_slice(&span, chars, start, i)
if not _extend_from_map(tokens.v, &span, self.cache):
self._tokenize(tokens.v, &span)
self._tokenize(tokens, chars, start, i)
start = i + 1
i += 1
if start < i:
string_from_slice(&span, chars, start, i)
if not _extend_from_map(tokens.v, &span, self.cache):
self._tokenize(tokens.v, &span)
self._tokenize(tokens, chars, start, i)
return tokens
cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1:
cdef size_t i
cdef uint64_t orig_key = string.key
cdef size_t orig_size = tokens_v.size()
cdef int _tokenize(self, Tokens tokens, Py_UNICODE* chars, int start, int end) except -1:
cdef String span
cdef vector[LexemeC*] prefixes
cdef vector[LexemeC*] suffixes
cdef uint64_t orig_key
cdef int orig_size
string_slice(&span, chars, start, end)
lexemes = <LexemeC**>self.cache.get(span.key)
if lexemes != NULL:
tokens.extend(start, lexemes, 0)
else:
orig_key = span.key
orig_size = tokens.lex.size()
span = self._split_affixes(&span, &prefixes, &suffixes)[0]
self._attach_tokens(tokens, start, &span, &prefixes, &suffixes)
self._save_cached(&tokens.lex, orig_key, orig_size)
cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes) except NULL:
cdef size_t i
cdef String prefix
cdef String suffix
cdef String minus_pre
@@ -113,8 +109,8 @@ cdef class Language:
last_size = string.n
pre_len = self._find_prefix(string.chars, string.n)
if pre_len != 0:
string_from_slice(&prefix, string.chars, 0, pre_len)
string_from_slice(&minus_pre, string.chars, pre_len, string.n)
string_slice(&prefix, string.chars, 0, pre_len)
string_slice(&minus_pre, string.chars, pre_len, string.n)
# Check whether we've hit a special-case
if minus_pre.n >= 1 and self.specials.get(minus_pre.key) != NULL:
string = &minus_pre
@@ -122,16 +118,15 @@ cdef class Language:
break
suf_len = self._find_suffix(string.chars, string.n)
if suf_len != 0:
string_from_slice(&suffix, string.chars, string.n - suf_len, string.n)
string_from_slice(&minus_suf, string.chars, 0, string.n - suf_len)
string_slice(&suffix, string.chars, string.n - suf_len, string.n)
string_slice(&minus_suf, string.chars, 0, string.n - suf_len)
# Check whether we've hit a special-case
if minus_suf.n >= 1 and self.specials.get(minus_suf.key) != NULL:
string = &minus_suf
suffixes.push_back(self.lexicon.get(&suffix))
break
if pre_len and suf_len and (pre_len + suf_len) <= string.n:
string_from_slice(string, string.chars, pre_len, string.n - suf_len)
string_slice(string, string.chars, pre_len, string.n - suf_len)
prefixes.push_back(self.lexicon.get(&prefix))
suffixes.push_back(self.lexicon.get(&suffix))
elif pre_len:
@@ -140,26 +135,37 @@ cdef class Language:
elif suf_len:
string = &minus_suf
suffixes.push_back(self.lexicon.get(&suffix))
if self.specials.get(string.key):
break
return string
self._attach_tokens(tokens_v, string, &prefixes, &suffixes)
self._save_cached(tokens_v, orig_key, orig_size)
cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
cdef int _attach_tokens(self, Tokens tokens,
int idx, String* string,
vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes) except -1:
cdef size_t i
cdef int split
cdef LexemeC** lexemes
cdef LexemeC* lexeme
for lexeme in deref(prefixes):
tokens.push_back(lexeme)
if not _extend_from_map(tokens, string, self.specials):
self._split_body_token(tokens, string)
cdef String span
idx = tokens.extend(idx, prefixes.data(), prefixes.size())
if string.n != 0:
lexemes = <LexemeC**>self.cache.get(string.key)
if lexemes != NULL:
idx = tokens.extend(idx, lexemes, 0)
else:
split = self._find_infix(string.chars, string.n)
if split == 0 or split == -1:
idx = tokens.push_back(idx, self.lexicon.get(string))
else:
string_slice(&span, string.chars, 0, split)
idx = tokens.push_back(idx, self.lexicon.get(&span))
string_slice(&span, string.chars, split, split+1)
idx = tokens.push_back(idx, self.lexicon.get(&span))
string_slice(&span, string.chars, split + 1, string.n)
idx = tokens.push_back(idx, self.lexicon.get(&span))
cdef vector[LexemeC*].reverse_iterator it = suffixes.rbegin()
while it != suffixes.rend():
tokens.push_back(deref(it))
idx = tokens.push_back(idx, deref(it))
preinc(it)
cdef int _save_cached(self, vector[LexemeC*] *tokens, uint64_t key, size_t n) except -1:
@@ -171,15 +177,17 @@ cdef class Language:
lexemes[i + 1] = NULL
self.cache.set(key, lexemes)
cdef int _split_body_token(self, vector[LexemeC*] *tokens, String* string) except -1:
tokens.push_back(self.lexicon.get(string))
cdef int _find_infix(self, Py_UNICODE* chars, size_t length) except -1:
cdef unicode string = chars[:length]
match = self.infix_re.search(string)
return match.start() if match is not None else 0
cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
cdef unicode string = chars[:length]
match = self.prefix_re.search(string)
return (match.end() - match.start()) if match is not None else 0
cdef int _find_suffix(self, Py_UNICODE* chars, size_t length):
cdef int _find_suffix(self, Py_UNICODE* chars, size_t length) except -1:
cdef unicode string = chars[:length]
match = self.suffix_re.search(string)
return (match.end() - match.start()) if match is not None else 0
@@ -212,27 +220,30 @@ cdef class Language:
cdef class Lexicon:
def __cinit__(self, lexemes):
self._mem = Pool()
self.mem = Pool()
self._dict = PreshMap(2 ** 20)
self.size = 0
cdef String string
cdef dict lexeme_dict
cdef LexemeC* lexeme
for lexeme_dict in lexemes:
string_from_unicode(&string, lexeme_dict['string'])
lexeme = <LexemeC*>self._mem.alloc(1, sizeof(LexemeC))
for py_string, lexeme_dict in lexemes.iteritems():
string_from_unicode(&string, py_string)
lexeme = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
lexeme_unpack(lexeme, lexeme_dict)
self._dict.set(string.key, lexeme)
self.lexemes.push_back(lexeme)
self.size += 1
def __getitem__(self, size_t i):
return Lexeme(<size_t>self.lexemes.at(i))
cdef LexemeC* get(self, String* string) except NULL:
cdef LexemeC* lex
lex = <LexemeC*>self._dict.get(string.key)
if lex != NULL:
return lex
lex = <LexemeC*>self._mem.alloc(1, sizeof(LexemeC))
lex = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
cdef unicode unicode_string = string.chars[:string.n]
lexeme_unpack(lex, get_lexeme_dict(self.size, unicode_string))
self._dict.set(string.key, lex)
@@ -255,38 +266,12 @@ cdef class Lexicon:
return Lexeme(<size_t>lexeme)
cdef int _extend_from_map(vector[LexemeC*] *tokens, String* string, PreshMap map_) except -1:
if string.n == 0:
return 1
lexemes = <LexemeC**>map_.get(string.key)
if lexemes == NULL:
return 0
cdef size_t i = 0
while lexemes[i] != NULL:
tokens.push_back(lexemes[i])
i += 1
return 1
cdef void string_from_unicode(String* s, unicode uni):
cdef Py_UNICODE* c_uni = <Py_UNICODE*>uni
string_from_slice(s, c_uni, 0, len(uni))
string_slice(s, c_uni, 0, len(uni))
cdef inline void string_from_slice(String* s, Py_UNICODE* chars, size_t start, size_t end) nogil:
cdef inline void string_slice(String* s, Py_UNICODE* chars, size_t start, size_t end) nogil:
s.chars = &chars[start]
s.n = end - start
s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
cdef inline void string_slice_prefix(String* s, String* prefix, size_t n) nogil:
string_from_slice(prefix, s.chars, 0, n)
s.chars += n
s.n -= n
s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
cdef inline void string_slice_suffix(String* s, String* suffix, size_t n) nogil:
string_from_slice(suffix, s.chars, s.n - n, s.n)
s.n -= n
s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
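The reorganised tokenizer above peels affixes off each whitespace-delimited chunk, consults the cache and specials maps, and splits whatever body remains on an infix match before attaching prefixes, body, and suffixes in order. A rough pure-Python sketch of that splitting order, assuming edge-anchored prefix/suffix patterns and a single-character infix split (as the split / split + 1 slicing in _attach_tokens suggests); caching and special cases are left out:

import re

prefix_re = re.compile(r"^[(\"']")      # assumed patterns, for illustration only
suffix_re = re.compile(r"[)\"',.]$")
infix_re = re.compile(r"[-~]")

def split_token(string):
    prefixes, suffixes = [], []
    while string:
        pre = prefix_re.search(string)
        if pre is not None:
            prefixes.append(pre.group())
            string = string[pre.end():]
            continue
        suf = suffix_re.search(string)
        if suf is not None:
            suffixes.insert(0, suf.group())   # mirrors the reverse-iterator attach
            string = string[:suf.start()]
            continue
        break
    if not string:
        body = []
    else:
        infix = infix_re.search(string)
        if infix is None:
            body = [string]
        else:
            i = infix.start()
            body = [string[:i], string[i:i + 1], string[i + 1:]]
    return prefixes + body + suffixes

assert split_token('("ice-age",') == ['(', '"', 'ice', '-', 'age', '"', ',']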

View File

@@ -79,7 +79,7 @@ def canon_case(string, prob, cluster, case_stats, tag_stats):
def word_shape(string, *args):
length = len(string)
shape = ""
shape = []
last = ""
shape_char = ""
seq = 0
@@ -99,8 +99,8 @@ def word_shape(string, *args):
seq = 0
last = shape_char
if seq < 5:
shape += shape_char
return shape
shape.append(shape_char)
return ''.join(shape)
def non_sparse(string, prob, cluster, case_stats, tag_stats):
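The only change to word_shape above is building the shape as a list and joining it once at the end, instead of growing a string by repeated concatenation. For context, a hedged reconstruction of what the function computes, assuming the usual mapping (upper -> 'X', lower -> 'x', digit -> 'd', other characters kept as-is) and the run cap visible in the diff:

def word_shape(string):
    shape = []
    last = ""
    seq = 0
    for char in string:
        if char.isalpha():
            shape_char = "X" if char.isupper() else "x"
        elif char.isdigit():
            shape_char = "d"
        else:
            shape_char = char
        if shape_char == last:
            seq += 1
        else:
            seq = 0
        last = shape_char
        if seq < 5:                 # cap long runs of the same shape character
            shape.append(shape_char)
    return ''.join(shape)

assert word_shape("C3P0") == "XdXd"
assert word_shape("Honnibal") == "Xxxxxx"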

View File

@@ -2,14 +2,10 @@ from spacy.lexeme cimport LexemeC
from libcpp.vector cimport vector
cdef struct Token:
int i
int pos
LexemeC* lex
cdef class Tokens:
cdef vector[Token] v
cdef vector[LexemeC*] lex
cdef vector[int] idx
cdef vector[int] pos
cdef int extend(self, int i, LexemeC** lexemes, int n) except -1
cdef int push_back(self, int i, LexemeC* lexeme) except -1
@@ -21,6 +17,7 @@ cdef class Tokens:
cpdef bint check_dist_flag(self, size_t i, size_t flag_id) except *
cpdef unicode string_view(self, size_t i, size_t view_id)
cpdef unicode string(self, size_t i)
cpdef unicode orig(self, size_t i)
cpdef unicode norm(self, size_t i)
cpdef unicode shape(self, size_t i)

View File

@@ -25,17 +25,20 @@ cdef class Tokens:
"""
def __cinit__(self, string_length=0):
size = int(string_length / 3) if string_length >= 3 else 1
self.v = vector[Token]()
self.v.reserve(size)
self.lex.reserve(size)
self.idx.reserve(size)
self.pos.reserve(size)
def __getitem__(self, i):
return Lexeme(<size_t>self.v.at(i).lex)
return Lexeme(<size_t>self.lex.at(i))
def __len__(self):
return self.v.size()
return self.lex.size()
cdef int push_back(self, int idx, LexemeC* lexeme) except -1:
self.v.push_back(Token(idx, 0, lexeme))
self.lex.push_back(lexeme)
self.idx.push_back(idx)
self.pos.push_back(0)
return idx + lexeme.ints[<int>LexInt_length]
cdef int extend(self, int idx, LexemeC** lexemes, int n) except -1:
@@ -46,120 +49,124 @@ cdef class Tokens:
i = 0
while lexemes[i] != NULL:
idx = self.push_back(idx, lexemes[i])
i += 1
else:
for i in range(n):
idx = self.push_back(idx, lexemes[i])
return idx
cpdef int id(self, size_t i) except -1:
return self.v.at(i).lex.ints[<int>LexInt_id]
return self.lex.at(i).ints[<int>LexInt_id]
cpdef float prob(self, size_t i) except 1:
return self.v.at(i).lex.floats[<int>LexFloat_prob]
return self.lex.at(i).floats[<int>LexFloat_prob]
cpdef int cluster(self, size_t i) except *:
return self.v.at(i).lex.ints[<int>LexInt_cluster]
return self.lex.at(i).ints[<int>LexInt_cluster]
cpdef bint check_orth_flag(self, size_t i, size_t flag_id) except *:
return lexeme_check_orth_flag(self.v.at(i).lex, flag_id)
return lexeme_check_orth_flag(self.lex.at(i), flag_id)
cpdef bint check_dist_flag(self, size_t i, size_t flag_id) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, flag_id)
return lexeme_check_dist_flag(self.lex.at(i), flag_id)
cpdef unicode string_view(self, size_t i, size_t view_id):
return lexeme_get_string(self.v.at(i).lex, view_id)
return lexeme_get_string(self.lex.at(i), view_id)
# Provide accessor methods for the features supported by the language.
# Without these, clients have to use the underlying string_view and check_flag
# methods, which requires them to know the IDs.
cpdef unicode string(self, size_t i):
return self.orig(i)
cpdef unicode orig(self, size_t i):
cdef bytes utf8_string = self.v.at(i).lex.strings[<int>LexStr_orig]
cdef bytes utf8_string = self.lex.at(i).strings[<int>LexStr_orig]
cdef unicode string = utf8_string.decode('utf8')
return string
cpdef unicode norm(self, size_t i):
cdef bytes utf8_string = self.v.at(i).lex.strings[<int>LexStr_norm]
cdef bytes utf8_string = self.lex.at(i).strings[<int>LexStr_norm]
cdef unicode string = utf8_string.decode('utf8')
return string
cpdef unicode shape(self, size_t i):
return lexeme_get_string(self.v.at(i).lex, LexStr_shape)
return lexeme_get_string(self.lex.at(i), LexStr_shape)
cpdef unicode unsparse(self, size_t i):
return lexeme_get_string(self.v.at(i).lex, LexStr_unsparse)
return lexeme_get_string(self.lex.at(i), LexStr_unsparse)
cpdef unicode asciied(self, size_t i):
return lexeme_get_string(self.v.at(i).lex, LexStr_asciied)
return lexeme_get_string(self.lex.at(i), LexStr_asciied)
cpdef bint is_alpha(self, size_t i) except *:
return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_alpha)
return lexeme_check_orth_flag(self.lex.at(i), LexOrth_alpha)
cpdef bint is_ascii(self, size_t i) except *:
return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_ascii)
return lexeme_check_orth_flag(self.lex.at(i), LexOrth_ascii)
cpdef bint is_digit(self, size_t i) except *:
return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_digit)
return lexeme_check_orth_flag(self.lex.at(i), LexOrth_digit)
cpdef bint is_lower(self, size_t i) except *:
return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_lower)
return lexeme_check_orth_flag(self.lex.at(i), LexOrth_lower)
cpdef bint is_punct(self, size_t i) except *:
return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_punct)
return lexeme_check_orth_flag(self.lex.at(i), LexOrth_punct)
cpdef bint is_space(self, size_t i) except *:
return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_space)
return lexeme_check_orth_flag(self.lex.at(i), LexOrth_space)
cpdef bint is_title(self, size_t i) except *:
return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_title)
return lexeme_check_orth_flag(self.lex.at(i), LexOrth_title)
cpdef bint is_upper(self, size_t i) except *:
return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_upper)
return lexeme_check_orth_flag(self.lex.at(i), LexOrth_upper)
cpdef bint can_adj(self, size_t i) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_adj)
return lexeme_check_dist_flag(self.lex.at(i), LexDist_adj)
cpdef bint can_adp(self, size_t i) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_adp)
return lexeme_check_dist_flag(self.lex.at(i), LexDist_adp)
cpdef bint can_adv(self, size_t i) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_adv)
return lexeme_check_dist_flag(self.lex.at(i), LexDist_adv)
cpdef bint can_conj(self, size_t i) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_conj)
return lexeme_check_dist_flag(self.lex.at(i), LexDist_conj)
cpdef bint can_det(self, size_t i) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_det)
return lexeme_check_dist_flag(self.lex.at(i), LexDist_det)
cpdef bint can_noun(self, size_t i) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_noun)
return lexeme_check_dist_flag(self.lex.at(i), LexDist_noun)
cpdef bint can_num(self, size_t i) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_num)
return lexeme_check_dist_flag(self.lex.at(i), LexDist_num)
cpdef bint can_pdt(self, size_t i) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_pdt)
return lexeme_check_dist_flag(self.lex.at(i), LexDist_pdt)
cpdef bint can_pos(self, size_t i) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_pos)
return lexeme_check_dist_flag(self.lex.at(i), LexDist_pos)
cpdef bint can_pron(self, size_t i) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_pron)
return lexeme_check_dist_flag(self.lex.at(i), LexDist_pron)
cpdef bint can_prt(self, size_t i) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_prt)
return lexeme_check_dist_flag(self.lex.at(i), LexDist_prt)
cpdef bint can_punct(self, size_t i) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_punct)
return lexeme_check_dist_flag(self.lex.at(i), LexDist_punct)
cpdef bint can_verb(self, size_t i) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_verb)
return lexeme_check_dist_flag(self.lex.at(i), LexDist_verb)
cpdef bint oft_lower(self, size_t i) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_lower)
return lexeme_check_dist_flag(self.lex.at(i), LexDist_lower)
cpdef bint oft_title(self, size_t i) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_title)
return lexeme_check_dist_flag(self.lex.at(i), LexDist_title)
cpdef bint oft_upper(self, size_t i) except *:
return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_upper)
return lexeme_check_dist_flag(self.lex.at(i), LexDist_upper)
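The comment in tokens.pyx above explains why the per-feature accessors exist: without them, callers would have to go through string_view and check_orth_flag and know the view and flag IDs themselves. A hedged illustration of the two equivalent access styles, using the English instance exercised in the tests below and assuming LexStr_shape and LexOrth_alpha are importable from spacy.lexeme (the tests' star-imports suggest they are):

from spacy.en import EN
from spacy.lexeme import LexStr_shape, LexOrth_alpha

tokens = EN.tokenize(u"Hello world")

# per-feature convenience accessor vs. the generic view/flag machinery
assert tokens.shape(0) == tokens.string_view(0, LexStr_shape)
assert tokens.is_alpha(0) == tokens.check_orth_flag(0, LexOrth_alpha)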

View File

@@ -4,3 +4,5 @@ ctypedef uint64_t hash_t
ctypedef char* utf8_t
ctypedef uint64_t flag_t
ctypedef uintptr_t id_t

View File

@@ -1,7 +1,7 @@
import os
from os import path
import codecs
import json
import ujson
import re
DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
@@ -16,28 +16,36 @@ def read_lang_data(name):
tokenization = read_tokenization(data_dir)
prefix = read_prefix(data_dir)
suffix = read_suffix(data_dir)
infix = read_infix(data_dir)
lex_loc = path.join(data_dir, 'lexemes.json')
if path.exists(lex_loc):
with open(lex_loc) as file_:
lexemes = ujson.load(file_)
else:
lexemes = []
return tokenization, prefix, suffix, lexemes
lexemes = {}
return tokenization, prefix, suffix, infix, lexemes
def read_prefix(data_dir):
with utf8open(path.join(data_dir, 'prefix')) as file_:
entries = file_.read().split('\n')
expression = '|'.join(['^' + re.escape(piece) for piece in entries])
expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
return expression
def read_suffix(data_dir):
with utf8open(path.join(data_dir, 'suffix')) as file_:
entries = file_.read().split('\n')
expression = '|'.join([re.escape(piece) + '$' for piece in entries])
expression = '|'.join([re.escape(piece) + '$' for piece in entries if piece.strip()])
return expression
def read_infix(data_dir):
with utf8open(path.join(data_dir, 'infix')) as file_:
entries = file_.read().split('\n')
expression = '|'.join([piece for piece in entries if piece.strip()])
return expression
def read_tokenization(lang):
loc = path.join(DATA_DIR, lang, 'tokenization')
entries = []
@@ -60,3 +68,16 @@ def read_tokenization(lang):
seen.add(chunk)
entries.append((chunk, pieces))
return entries
def align_tokens(ref, indices):
start = 0
queue = list(indices)
for token in ref:
end = start + len(token)
emit = []
while queue and queue[0][1] <= end:
emit.append(queue.pop(0))
yield token, emit
start = end
assert not queue
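align_tokens is a new helper that pairs each reference token with the (start, end) spans that finish inside it. Since start advances by len(token) with no allowance for whitespace, the indices are evidently taken over the tokens' concatenated text. A hypothetical check, with the import path assumed from the surrounding file:

from spacy.util import align_tokens

# two spans collapse onto a single reference token
assert list(align_tokens(["cannot"], [(0, 3), (3, 6)])) == [("cannot", [(0, 3), (3, 6)])]

# one-to-one alignment
assert list(align_tokens(["can", "not"], [(0, 3), (3, 6)])) == [("can", [(0, 3)]), ("not", [(3, 6)])]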

View File

@@ -7,20 +7,20 @@ from spacy.lexeme import *
def test_is_alpha():
the = EN.lookup('the')
the = EN.lexicon.lookup('the')
assert the.check_orth_flag(LexOrth_alpha)
year = EN.lookup('1999')
year = EN.lexicon.lookup('1999')
assert not year.check_orth_flag(LexOrth_alpha)
mixed = EN.lookup('hello1')
mixed = EN.lexicon.lookup('hello1')
assert not mixed.check_orth_flag(LexOrth_alpha)
def test_is_digit():
the = EN.lookup('the')
the = EN.lexicon.lookup('the')
assert not the.check_orth_flag(LexOrth_digit)
year = EN.lookup('1999')
year = EN.lexicon.lookup('1999')
assert year.check_orth_flag(LexOrth_digit)
mixed = EN.lookup('hello1')
mixed = EN.lexicon.lookup('hello1')
assert not mixed.check_orth_flag(LexOrth_digit)

View File

@@ -9,7 +9,7 @@ from spacy.lexeme import *
@pytest.fixture
def C3P0():
return EN.lookup("C3P0")
return EN.lexicon.lookup("C3P0")
def test_shape(C3P0):
@@ -17,11 +17,11 @@ def test_shape(C3P0):
def test_length():
t = EN.lookup('the')
t = EN.lexicon.lookup('the')
assert t.length == 3
t = EN.lookup("n't")
t = EN.lexicon.lookup("n't")
assert t.length == 3
t = EN.lookup("'s")
t = EN.lexicon.lookup("'s")
assert t.length == 2
t = EN.lookup('Xxxx')
t = EN.lexicon.lookup('Xxxx')
assert t.length == 4

View File

@@ -27,7 +27,7 @@ def test_punct():
def test_digits():
lex_ids = EN.tokenize('The year: 1984.')
assert lex_ids.string(3) == "1984"
assert lex_ids.orig(3) == "1984"
assert len(lex_ids) == 5
assert lex_ids[0].string == EN.lexicon.lookup('The').string
assert lex_ids[3].string == EN.lexicon.lookup('1984').string
@@ -101,4 +101,4 @@ def test_cnts6():
def test_cnts7():
text = 'But then the 6,000-year ice age came...'
tokens = EN.tokenize(text)
assert len(tokens) == 8
assert len(tokens) == 10

View File

@@ -4,31 +4,31 @@ from spacy.en import EN
def test_neq():
addr = EN.lookup('Hello')
assert EN.lookup('bye').string != addr.string
addr = EN.lexicon.lookup('Hello')
assert EN.lexicon.lookup('bye').string != addr.string
def test_eq():
addr = EN.lookup('Hello')
assert EN.lookup('Hello').string == addr.string
addr = EN.lexicon.lookup('Hello')
assert EN.lexicon.lookup('Hello').string == addr.string
def test_round_trip():
hello = EN.lookup('Hello')
hello = EN.lexicon.lookup('Hello')
assert hello.string == 'Hello'
def test_case_neq():
addr = EN.lookup('Hello')
assert EN.lookup('hello').string != addr.string
addr = EN.lexicon.lookup('Hello')
assert EN.lexicon.lookup('hello').string != addr.string
def test_punct_neq():
addr = EN.lookup('Hello')
assert EN.lookup('Hello,').string != addr.string
addr = EN.lexicon.lookup('Hello')
assert EN.lexicon.lookup('Hello,').string != addr.string
def test_short():
addr = EN.lookup('I')
addr = EN.lexicon.lookup('I')
assert addr.string == 'I'
assert addr.string != 'not'