From 01469b088825ed151fd1c828817887e1959e1ee0 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 18 Aug 2014 19:14:00 +0200
Subject: [PATCH] * Refactor spacy so that chunks return arrays of lexemes, so
 that there is properly one lexeme per word.

---
 setup.py                     |  21 ++---
 spacy/__init__.py            |  14 ++--
 spacy/en.pxd                 |   2 +-
 spacy/en.pyx                 |  10 ++-
 spacy/en_ptb.pxd             |   2 +-
 spacy/en_ptb.pyx             |   5 +-
 spacy/lexeme.pxd             |   4 +-
 spacy/lexeme.pyx             |  16 +---
 spacy/spacy.pxd              |  21 +++--
 spacy/spacy.pyx              | 152 ++++++++++++-----------------------
 spacy/tokens.pxd             |   1 +
 spacy/util.py                |   7 +-
 tests/test_contractions.py   |  19 +++--
 tests/test_group_by.py       |   2 +-
 tests/test_orth.py           |   2 +-
 tests/test_post_punct.py     |  11 +--
 tests/test_pre_punct.py      |  16 ++--
 tests/test_rules.py          |   6 +-
 tests/test_surround_punct.py |   9 +--
 tests/test_tokenizer.py      |  15 ++--
 20 files changed, 123 insertions(+), 212 deletions(-)

diff --git a/setup.py b/setup.py
index 1b0093808..4abaf4ae4 100644
--- a/setup.py
+++ b/setup.py
@@ -39,29 +39,20 @@ cython_includes = ['.']
 if 'VIRTUAL_ENV' in os.environ:
     includes += glob(path.join(os.environ['VIRTUAL_ENV'], 'include', 'site', '*'))
-    cython_includes += glob(path.join(os.environ['VIRTUAL_ENV'], 'lib', '*'))
 else:
     # If you're not using virtualenv, set your include dir here.
     pass

 exts = [
+    Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.en", ["spacy/en.pyx"], language="c++",
-              include_dirs=includes, cython_include_dirs=cython_includes),
-    Extension("spacy.en_ptb", ["spacy/en_ptb.pyx"], language="c++", include_dirs=includes,
-              cython_include_dirs=cython_includes),
-    Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes,
-              cython_include_dirs=cython_includes),
-    Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes,
-              cython_include_dirs=cython_includes),
-    Extension("spacy._hashing", ["spacy/_hashing.pyx"], language="c++", include_dirs=includes,
-              cython_include_dirs=cython_includes),
-    Extension("spacy.chartree", ["spacy/chartree.pyx"], language="c++", include_dirs=includes,
-              cython_include_dirs=cython_includes),
-    Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes,
-              cython_include_dirs=cython_includes),
+              include_dirs=includes),
+    Extension("spacy.en_ptb", ["spacy/en_ptb.pyx"], language="c++", include_dirs=includes),
+    Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
+    Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.string_tools", ["spacy/string_tools.pyx"], language="c++",
-              include_dirs=includes, cython_include_dirs=cython_includes),
+              include_dirs=includes),
 ]

diff --git a/spacy/__init__.py b/spacy/__init__.py
index 9f7c7932c..16d71aec6 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -1,16 +1,14 @@
 from .lexeme import lex_of
-from .lexeme import sic_of
 from .lexeme import length_of

 from .tokens import Tokens

 # Don't know how to get the enum Python visible :(
-SIC = 0
-LEX = 1
-NORM = 2
-SHAPE = 3
-LAST3 = 4
-LENGTH = 5
+LEX = 0
+NORM = 1
+SHAPE = 2
+LAST3 = 3
+LENGTH = 4

-__all__ = [Tokens, lex_of, sic_of, length_of, SIC, LEX, NORM, SHAPE, LAST3, LENGTH]
+__all__ = [Tokens, lex_of, length_of, LEX, NORM, SHAPE, LAST3, LENGTH]

diff --git a/spacy/en.pxd b/spacy/en.pxd
index ee58118a9..9f0edb791 100644
--- a/spacy/en.pxd
+++ b/spacy/en.pxd
@@ -9,7 +9,7 @@ from spacy.tokens cimport Tokens


 cdef class English(spacy.Language):
-    cdef int find_split(self, unicode word, size_t length)
+    cdef int find_split(self, unicode word)


 cdef English EN

diff --git a/spacy/en.pyx b/spacy/en.pyx
index 3245d8fa9..f90af1549 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -17,10 +17,13 @@ cimport spacy


 cdef class English(spacy.Language):
-    cdef int find_split(self, unicode word, size_t length):
+    cdef int find_split(self, unicode word):
+        cdef size_t length = len(word)
         cdef int i = 0
+        if word.startswith("'s") or word.startswith("'S"):
+            return 2
         # Contractions
-        if word.endswith("'s"):
+        if word.endswith("'s") and length >= 3:
             return length - 2
         # Leading punctuation
         if is_punct(word, 0, length):
@@ -36,7 +39,6 @@ cdef class English(spacy.Language):
 cdef bint is_punct(unicode word, size_t i, size_t length):
     # Don't count appostrophes as punct if the next char is a letter
     if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
-        # ...Unless we're at 0
         return i == 0
     if word[i] == "-" and i < (length - 1) and word[i+1] == '-':
         return False
@@ -57,7 +59,7 @@ cpdef Tokens tokenize(unicode string):


 cpdef Lexeme_addr lookup(unicode string) except 0:
-    return EN.lookup_chunk(string)
+    return EN.lookup(string)


 cpdef unicode unhash(StringHash hash_value):

diff --git a/spacy/en_ptb.pxd b/spacy/en_ptb.pxd
index eaa0f8471..2f139a94f 100644
--- a/spacy/en_ptb.pxd
+++ b/spacy/en_ptb.pxd
@@ -8,7 +8,7 @@ from spacy.tokens cimport Tokens


 cdef class EnglishPTB(Language):
-    cdef int find_split(self, unicode word, size_t length)
+    cdef int find_split(self, unicode word)


 cdef EnglishPTB EN_PTB

diff --git a/spacy/en_ptb.pyx b/spacy/en_ptb.pyx
index 078b91b40..f70b26d45 100644
--- a/spacy/en_ptb.pyx
+++ b/spacy/en_ptb.pyx
@@ -17,7 +17,8 @@ cimport spacy


 cdef class EnglishPTB(Language):
-    cdef int find_split(self, unicode word, size_t length):
+    cdef int find_split(self, unicode word):
+        length = len(word)
         cdef int i = 0
         # Contractions
         if word.endswith("'s"):
@@ -53,7 +54,7 @@ cpdef Tokens tokenize(unicode string):


 cpdef Lexeme_addr lookup(unicode string) except 0:
-    return EN_PTB.lookup_chunk(string)
+    return EN_PTB.lookup_chunk(string)


 cpdef unicode unhash(StringHash hash_value):

diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 145a043c8..90d06587e 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -32,14 +32,13 @@ cdef struct Lexeme:
     Distribution* dist  # Distribution info, lazy loaded
     Orthography* orth   # Extra orthographic views
-    Lexeme* tail        # Lexemes are linked lists, to deal with sub-tokens
+    #Lexeme* tail        # Lexemes are linked lists, to deal with sub-tokens


 cdef Lexeme BLANK_WORD = Lexeme(0, 0, NULL, NULL, NULL)


 cdef enum StringAttr:
-    SIC
     LEX
     NORM
     SHAPE
@@ -49,7 +48,6 @@ cdef enum StringAttr:


 cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0
-cpdef StringHash sic_of(size_t lex_id) except 0
 cpdef StringHash lex_of(size_t lex_id) except 0
 cpdef StringHash norm_of(size_t lex_id) except 0
 cpdef StringHash shape_of(size_t lex_id) except 0

diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 430033db0..42c93ec60 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -22,9 +22,7 @@ from spacy.spacy cimport StringHash


 cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
-    if attr == SIC:
-        return sic_of(lex_id)
-    elif attr == LEX:
+    if attr == LEX:
         return lex_of(lex_id)
     elif attr == NORM:
         return norm_of(lex_id)
@@ -38,18 +36,6 @@ cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
         raise StandardError


-cpdef StringHash sic_of(size_t lex_id) except 0:
-    '''Access the `sic' field of the Lexeme pointed to by lex_id.
-
-    The sic field stores the hash of the whitespace-delimited string-chunk used to
-    construct the Lexeme.
-
-    >>> [unhash(sic_of(lex_id)) for lex_id in from_string(u'Hi! world')]
-    [u'Hi!', u'', u'world]
-    '''
-    return (<Lexeme*>lex_id).sic
-
-
 cpdef StringHash lex_of(size_t lex_id) except 0:
     '''Access the `lex' field of the Lexeme pointed to by lex_id.

diff --git a/spacy/spacy.pxd b/spacy/spacy.pxd
index 3afc9a467..813eaa438 100644
--- a/spacy/spacy.pxd
+++ b/spacy/spacy.pxd
@@ -3,8 +3,6 @@ from libc.stdint cimport uint32_t
 from libc.stdint cimport uint64_t

 from sparsehash.dense_hash_map cimport dense_hash_map
-from _hashing cimport FixedTable
-from _hashing cimport WordTree

 # Circular import problems here
 ctypedef size_t Lexeme_addr
@@ -28,22 +26,21 @@ from spacy._hashing cimport WordTree

 cdef class Language:
     cdef object name
-    cdef WordTree vocab
-    cdef WordTree distri
-    cdef WordTree ortho
+    cdef dense_hash_map[StringHash, size_t] chunks
+    cdef dense_hash_map[StringHash, size_t] vocab
     cdef dict bacov

-    cpdef Tokens tokenize(self, unicode text)
+    cdef Tokens tokenize(self, unicode text)

-    cdef Lexeme_addr lookup(self, unicode string) except 0
-    cdef Lexeme_addr lookup_chunk(self, unicode string) except 0
-    cdef Orthography* lookup_orth(self, unicode lex) except NULL
-    cdef Distribution* lookup_dist(self, unicode lex) except NULL
+    cdef Lexeme* lookup(self, unicode string) except NULL
+    cdef Lexeme** lookup_chunk(self, unicode string) except NULL

-    cdef Lexeme* new_lexeme(self, unicode key, unicode lex) except NULL
+    cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL
+    cdef Lexeme* new_lexeme(self, unicode lex) except NULL
     cdef Orthography* new_orth(self, unicode lex) except NULL
     cdef Distribution* new_dist(self, unicode lex) except NULL

     cdef unicode unhash(self, StringHash hashed)

-    cdef int find_split(self, unicode word, size_t length)
+    cpdef list find_substrings(self, unicode word)
+    cdef int find_split(self, unicode word)

diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx
index d49138801..7da7c475f 100644
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@@ -5,7 +5,6 @@ from libc.stdlib cimport calloc, free
 from libcpp.pair cimport pair
 from cython.operator cimport dereference as deref

-from murmurhash cimport mrmr

 from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport BLANK_WORD
@@ -64,86 +63,56 @@ cdef class Language:
     def __cinit__(self, name):
         self.name = name
         self.bacov = {}
-        self.vocab = WordTree(0, 5)
-        self.ortho = WordTree(0, 5)
-        self.distri = WordTree(0, 5)
+        self.chunks = dense_hash_map[StringHash, size_t]()
+        self.vocab = dense_hash_map[StringHash, size_t]()
+        self.chunks.set_empty_key(0)
+        self.vocab.set_empty_key(0)
         self.load_tokenization(util.read_tokenization(name))

-    cpdef Tokens tokenize(self, unicode characters):
+    cdef Tokens tokenize(self, unicode characters):
         cdef size_t i = 0
         cdef size_t start = 0
-
+        cdef Lexeme** chunk
         cdef Tokens tokens = Tokens(self)
-        cdef Lexeme* token
-        for c in characters:
-            if _is_whitespace(c):
-                if start < i:
-                    token = self.lookup_chunk(characters[start:i])
-                    while token != NULL:
-                        tokens.append(token)
-                        token = token.tail
-                start = i + 1
-            i += 1
-        if start < i:
-            token = self.lookup_chunk(characters[start:])
-            while token != NULL:
-                tokens.append(token)
-                token = token.tail
+        for chunk_str in characters.split():
+            chunk = self.lookup_chunk(chunk_str)
+            i = 0
+            while chunk[i] != NULL:
+                tokens.append(chunk[i])
+                i += 1
         return tokens

-    cdef Lexeme_addr lookup(self, unicode string) except 0:
-        cdef size_t length = len(string)
-        if length == 0:
-            return &BLANK_WORD
+    cdef Lexeme* lookup(self, unicode string) except NULL:
+        if len(string) == 0:
+            return &BLANK_WORD
+        cdef Lexeme* word = self.vocab[hash(string)]
+        if word == NULL:
+            word = self.new_lexeme(string)
+        return word

-        cdef StringHash hashed = hash(string)
-        # First, check words seen 2+ times
-        cdef Lexeme* word_ptr = self.vocab.get(string)
-        if word_ptr == NULL:
-            word_ptr = self.new_lexeme(string, string)
-        return word_ptr
-
-    cdef Lexeme_addr lookup_chunk(self, unicode string) except 0:
-        '''Fetch a Lexeme representing a word string. If the word has not been seen,
-        construct one, splitting off any attached punctuation or clitics. A
-        reference to BLANK_WORD is returned for the empty string.
-        '''
-        cdef size_t length = len(string)
-        if length == 0:
-            return &BLANK_WORD
-        # First, check words seen 2+ times
-        cdef Lexeme* word_ptr = self.vocab.get(string)
+    cdef Lexeme** lookup_chunk(self, unicode string) except NULL:
+        assert len(string) != 0
+        cdef Lexeme** chunk = self.chunks[hash(string)]
         cdef int split
-        if word_ptr == NULL:
-            split = self.find_split(string, length)
-            if split != 0 and split != -1 and split < length:
-                word_ptr = self.new_lexeme(string, string[:split])
-                word_ptr.tail = self.lookup_chunk(string[split:])
-            else:
-                word_ptr = self.new_lexeme(string, string)
-        return word_ptr
+        if chunk == NULL:
+            chunk = self.new_chunk(string, self.find_substrings(string))
+        return chunk

-    cdef Orthography* lookup_orth(self, unicode lex):
-        cdef Orthography* orth = self.ortho.get(lex)
-        if orth == NULL:
-            orth = self.new_orth(lex)
-        return orth
+    cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL:
+        cdef Lexeme** chunk = calloc(len(substrings) + 1, sizeof(Lexeme*))
+        for i, substring in enumerate(substrings):
+            chunk[i] = self.lookup(substring)
+            chunk[i + 1] = NULL
+        self.chunks[hash(string)] = chunk
+        return chunk

-    cdef Distribution* lookup_dist(self, unicode lex):
-        cdef Distribution* dist = self.distri.get(lex)
-        if dist == NULL:
-            dist = self.new_dist(lex)
-        return dist
-
-    cdef Lexeme* new_lexeme(self, unicode key, unicode string) except NULL:
+    cdef Lexeme* new_lexeme(self, unicode string) except NULL:
         cdef Lexeme* word = calloc(1, sizeof(Lexeme))
-        word.sic = hash(key)
         word.lex = hash(string)
         self.bacov[word.lex] = string
-        self.bacov[word.sic] = key
-        word.orth = self.lookup_orth(string)
-        word.dist = self.lookup_dist(string)
-        self.vocab.set(key, word)
+        word.orth = self.new_orth(string)
+        word.dist = self.new_dist(string)
+        self.vocab[word.lex] = word
         return word

     cdef Orthography* new_orth(self, unicode lex) except NULL:
@@ -170,30 +139,33 @@ cdef class Language:
         self.bacov[orth.norm] = norm
         self.bacov[orth.shape] = shape

-        self.ortho.set(lex, orth)
         return orth

     cdef Distribution* new_dist(self, unicode lex) except NULL:
         dist = calloc(1, sizeof(Distribution))
-        self.distri.set(lex, dist)
         return dist

     cdef unicode unhash(self, StringHash hash_value):
         '''Fetch a string from the reverse index, given its hash value.'''
         return self.bacov[hash_value]

-    cdef int find_split(self, unicode word, size_t length):
-        return -1
+    cpdef list find_substrings(self, unicode word):
+        substrings = []
+        while word:
+            split = self.find_split(word)
+            if split == 0:
+                substrings.append(word)
+                break
+            substrings.append(word[:split])
+            word = word[split:]
+        return substrings
+
+    cdef int find_split(self, unicode word):
+        return len(word)

     def load_tokenization(self, token_rules=None):
-        cdef Lexeme* word
-        cdef StringHash hashed
-        for chunk, lex, tokens in token_rules:
-            word = self.new_lexeme(chunk, lex)
-            for i, lex in enumerate(tokens):
-                token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
-                word.tail = self.new_lexeme(token_string, lex)
-                word = word.tail
+        for chunk, tokens in token_rules:
+            self.new_chunk(chunk, tokens)

     def load_clusters(self):
         cdef Lexeme* w
@@ -209,24 +181,4 @@ cdef class Language:
             # the first 4 bits. See redshift._parse_features.pyx
             cluster = int(cluster_str[::-1], 2)
             upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0))
-            word = self.new_lexeme(token_string, token_string)
-
-
-cdef inline bint _is_whitespace(unsigned char c) nogil:
-    if c == b' ':
-        return True
-    elif c == b'\n':
-        return True
-    elif c == b'\t':
-        return True
-    else:
-        return False
-
-
-cpdef vector[size_t] expand_chunk(size_t addr) except *:
-    cdef vector[size_t] tokens = vector[size_t]()
-    word = <Lexeme*>addr
-    while word != NULL:
-        tokens.push_back(word)
-        word = word.tail
-    return tokens
+            self.new_lexeme(token_string)

diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd
index 5359761c0..ba692280f 100644
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@@ -1,5 +1,6 @@
 from libcpp.vector cimport vector
 from spacy.spacy cimport Lexeme_addr
+from spacy.lexeme cimport Lexeme

 from cython.operator cimport dereference as deref
 from spacy.spacy cimport Language

diff --git a/spacy/util.py b/spacy/util.py
index 4e080d0b3..4d12014ca 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -32,13 +32,12 @@ def read_tokenization(lang):
                 continue
             pieces = line.split()
             chunk = pieces.pop(0)
-            lex = pieces.pop(0)
             assert chunk not in seen, chunk
             seen.add(chunk)
-            entries.append((chunk, lex, pieces))
+            entries.append((chunk, list(pieces)))
             if chunk[0].isalpha() and chunk[0].islower():
                 chunk = chunk[0].title() + chunk[1:]
-                lex = lex[0].title() + lex[1:]
+                pieces[0] = pieces[0][0].title() + pieces[0][1:]
                 seen.add(chunk)
-                entries.append((chunk, lex, pieces))
+                entries.append((chunk, pieces))
     return entries

diff --git a/tests/test_contractions.py b/tests/test_contractions.py
index aa11faa39..1839b15f5 100644
--- a/tests/test_contractions.py
+++ b/tests/test_contractions.py
@@ -1,44 +1,43 @@
 from __future__ import unicode_literals

-from spacy.spacy import expand_chunk
-from spacy.en import lookup, unhash
+from spacy.en import tokenize, lookup, unhash

 from spacy import lex_of


 def test_possess():
-    tokens = expand_chunk(lookup("Mike's"))
-    assert len(tokens) == 2
+    tokens = tokenize("Mike's")
     assert unhash(lex_of(tokens[0])) == "Mike"
     assert unhash(lex_of(tokens[1])) == "'s"
+    assert len(tokens) == 2


 def test_apostrophe():
-    tokens = expand_chunk(lookup("schools'"))
+    tokens = tokenize("schools'")
     assert len(tokens) == 2
     assert unhash(lex_of(tokens[1])) == "'"
     assert unhash(lex_of(tokens[0])) == "schools"


 def test_LL():
-    tokens = expand_chunk(lookup("we'll"))
+    tokens = tokenize("we'll")
     assert len(tokens) == 2
     assert unhash(lex_of(tokens[1])) == "will"
     assert unhash(lex_of(tokens[0])) == "we"


 def test_aint():
-    tokens = expand_chunk(lookup("ain't"))
+    tokens = tokenize("ain't")
     assert len(tokens) == 2
     assert unhash(lex_of(tokens[0])) == "are"
     assert unhash(lex_of(tokens[1])) == "not"


 def test_capitalized():
-    tokens = expand_chunk(lookup("can't"))
+    tokens = tokenize("can't")
     assert len(tokens) == 2
-    tokens = expand_chunk(lookup("Can't"))
+    tokens = tokenize("Can't")
     assert len(tokens) == 2
-    tokens = expand_chunk(lookup("Ain't"))
+    tokens = tokenize("Ain't")
     assert len(tokens) == 2
     assert unhash(lex_of(tokens[0])) == "Are"

diff --git a/tests/test_group_by.py b/tests/test_group_by.py
index 2f9dd6ce0..9f83c5ce9 100644
--- a/tests/test_group_by.py
+++ b/tests/test_group_by.py
@@ -5,7 +5,7 @@ import pytest
 from spacy import en
 from spacy.lexeme import lex_of

-from spacy import SIC, LEX, NORM, SHAPE, LAST3
+from spacy import LEX, NORM, SHAPE, LAST3


 def test_group_by_lex():

diff --git a/tests/test_orth.py b/tests/test_orth.py
index f13fa90bf..503394916 100644
--- a/tests/test_orth.py
+++ b/tests/test_orth.py
@@ -4,7 +4,7 @@ import pytest

 from spacy.en import lookup, unhash

-from spacy.lexeme import sic_of, lex_of, norm_of, shape_of, first_of, length_of
+from spacy.lexeme import lex_of, norm_of, shape_of, first_of, length_of
 from spacy.lexeme import shape_of

 @pytest.fixture

diff --git a/tests/test_post_punct.py b/tests/test_post_punct.py
index 0138819db..f8391235a 100644
--- a/tests/test_post_punct.py
+++ b/tests/test_post_punct.py
@@ -1,8 +1,8 @@
 from __future__ import unicode_literals

 from spacy import lex_of
-from spacy.spacy import expand_chunk
 from spacy.en import lookup
+from spacy.en import tokenize
 from spacy.en import unhash

 import pytest
@@ -17,8 +17,7 @@ def test_close(close_puncts):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p
-        token = lookup(string)
-        tokens = expand_chunk(token)
+        tokens = tokenize(string)
         assert len(tokens) == 2
         assert unhash(lex_of(tokens[1])) == p
         assert unhash(lex_of(tokens[0])) == word_str
@@ -28,9 +27,7 @@ def test_two_different_close(close_puncts):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + "'"
-        token = lookup(string)
-        assert unhash(lex_of(token)) == word_str
-        tokens = expand_chunk(token)
+        tokens = tokenize(string)
         assert len(tokens) == 3
         assert unhash(lex_of(tokens[0])) == word_str
         assert unhash(lex_of(tokens[1])) == p
@@ -41,7 +38,7 @@ def test_three_same_close(close_puncts):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + p + p
-        tokens = expand_chunk(lookup(string))
+        tokens = tokenize(string)
         assert len(tokens) == 4
         assert unhash(lex_of(tokens[0])) == word_str
         assert unhash(lex_of(tokens[1])) == p

diff --git a/tests/test_pre_punct.py b/tests/test_pre_punct.py
index d1cd10bf6..5a4a4d072 100644
--- a/tests/test_pre_punct.py
+++ b/tests/test_pre_punct.py
@@ -1,8 +1,8 @@
 from __future__ import unicode_literals

 from spacy import lex_of
-from spacy.spacy import expand_chunk
 from spacy.en import lookup
+from spacy.en import tokenize
 from spacy.en import unhash

 import pytest
@@ -17,9 +17,7 @@ def test_open(open_puncts):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + word_str
-        token = lookup(string)
-        assert unhash(lex_of(token)) == p
-        tokens = expand_chunk(token)
+        tokens = tokenize(string)
         assert len(tokens) == 2
         assert unhash(lex_of(tokens[0])) == p
         assert unhash(lex_of(tokens[1])) == word_str
@@ -29,9 +27,7 @@ def test_two_different_open(open_puncts):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + "`" + word_str
-        token = lookup(string)
-        assert unhash(lex_of(token)) == p
-        tokens = expand_chunk(token)
+        tokens = tokenize(string)
         assert len(tokens) == 3
         assert unhash(lex_of(tokens[0])) == p
         assert unhash(lex_of(tokens[1])) == "`"
@@ -42,9 +38,7 @@ def test_three_same_open(open_puncts):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + p + p + word_str
-        token = lookup(string)
-        assert unhash(lex_of(token)) == p
-        tokens = expand_chunk(token)
+        tokens = tokenize(string)
         assert len(tokens) == 4
         assert unhash(lex_of(tokens[0])) == p
         assert unhash(lex_of(tokens[3])) == word_str
@@ -52,6 +46,6 @@ def test_three_same_open(open_puncts):

 def test_open_appostrophe():
     string = "'The"
-    tokens = expand_chunk(lookup(string))
+    tokens = tokenize(string)
     assert len(tokens) == 2
     assert unhash(lex_of(tokens[0])) == "'"

diff --git a/tests/test_rules.py b/tests/test_rules.py
index f95f1f820..b19a1c3f1 100644
--- a/tests/test_rules.py
+++ b/tests/test_rules.py
@@ -5,7 +5,7 @@ def test_load_en():
     rules = util.read_tokenization('en')
     assert len(rules) != 0
     aint = [rule for rule in rules if rule[0] == "ain't"][0]
-    chunk, lex, pieces = aint
+    chunk, pieces = aint
     assert chunk == "ain't"
-    assert lex == "are"
-    assert pieces == ["not"]
+    assert pieces[0] == "are"
+    assert pieces[1] == "not"

diff --git a/tests/test_surround_punct.py b/tests/test_surround_punct.py
index bef9cc83a..2c3a7f837 100644
--- a/tests/test_surround_punct.py
+++ b/tests/test_surround_punct.py
@@ -1,7 +1,7 @@
 from __future__ import unicode_literals

-from spacy import lex_of, sic_of
-from spacy.spacy import expand_chunk
+from spacy import lex_of
+from spacy.en import tokenize
 from spacy.en import lookup
 from spacy.en import unhash

@@ -17,19 +17,18 @@ def test_token(paired_puncts):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = open_ + word_str + close_
-        tokens = expand_chunk(lookup(string))
+        tokens = tokenize(string)
         assert len(tokens) == 3
         assert unhash(lex_of(tokens[0])) == open_
         assert unhash(lex_of(tokens[1])) == word_str
         assert unhash(lex_of(tokens[2])) == close_
-        assert unhash(sic_of(tokens[0])) == string


 def test_two_different(paired_puncts):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = "`" + open_ + word_str + close_ + "'"
-        tokens = expand_chunk(lookup(string))
+        tokens = tokenize(string)
         assert len(tokens) == 5
         assert unhash(lex_of(tokens[0])) == "`"
         assert unhash(lex_of(tokens[1])) == open_

diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index c3760c6fb..a0dbdc129 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -19,15 +19,12 @@ def test_two_words():


 def test_punct():
-    lex_ids = tokenize('hello, possums.')
-    assert len(lex_ids) == 4
-    assert lex_ids[0] != lookup('hello')
-    assert lex_of(lex_ids[0]) == lex_of(lookup('hello'))
-    assert lex_ids[2] == lookup('possums.')
-    assert lex_of(lex_ids[2]) == lex_of(lookup('possums.'))
-    assert lex_of(lex_ids[2]) == lex_of(lookup('possums'))
-    assert lex_of(lex_ids[1]) != lex_of(lookup('hello'))
-    assert lex_ids[0] != lookup('hello.')
+    tokens = tokenize('hello, possums.')
+    assert len(tokens) == 4
+    assert lex_of(tokens[0]) == lex_of(lookup('hello'))
+    assert lex_of(tokens[1]) == lex_of(lookup(','))
+    assert lex_of(tokens[2]) == lex_of(lookup('possums'))
+    assert lex_of(tokens[1]) != lex_of(lookup('hello'))


 def test_digits():
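Usage sketch (illustrative note, not part of the patch). After this refactor a
whitespace-delimited chunk is cached as a NULL-terminated array of Lexeme
pointers, and tokenize() appends one lexeme per word. The snippet below mirrors
the behaviour exercised by tests/test_contractions.py, and restates the
splitting loop of Language.find_substrings in plain Python; the trailing_dot
helper is a hypothetical stand-in for the English find_split rules in en.pyx,
used only to make the loop concrete.

    from spacy.en import tokenize, unhash
    from spacy import lex_of

    # One lexeme per word: the chunk "Mike's" expands to ["Mike", "'s"],
    # as asserted in tests/test_contractions.py::test_possess.
    tokens = tokenize("Mike's")
    assert [unhash(lex_of(tokens[i])) for i in range(len(tokens))] == ["Mike", "'s"]

    # Plain-Python mirror of Language.find_substrings: keep splitting the
    # chunk until find_split() reports nothing left to split off.
    def find_substrings(word, find_split):
        substrings = []
        while word:
            split = find_split(word)
            if split == 0:
                substrings.append(word)
                break
            substrings.append(word[:split])
            word = word[split:]
        return substrings

    # Hypothetical find_split that peels one trailing period off a chunk.
    def trailing_dot(w):
        return len(w) - 1 if len(w) > 1 and w.endswith('.') else 0

    assert find_substrings('possums.', trailing_dot) == ['possums', '.']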