Merge branch 'feature/contractions' into develop

2026-03-07 05:11:27 +03:00 · 2014-07-07 05:11:43 +02:00 · 2014-07-07 05:11:43 +02:00 · aaae66114c
commit aaae66114c
parent 556f6a18ca e60b958b7d
36 changed files with 926535 additions and 12119 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,6 +1,10 @@
 # Vim
 *.swp

+spacy/*.cpp
+ext/murmurhash.cpp
+ext/sparsehash.cpp
+
 _build/
 .env/

--- a/data/en/case
+++ b/data/en/case
--- a/data/en/clusters
+++ b/data/en/clusters
--- a/data/en/tokenization
+++ b/data/en/tokenization
@ -0,0 +1,93 @@
+# https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
+#  21:09, 25 June 2014
+#*--*  --
+#*---* ---
+#*'s  's
+
+ain't   are not
+aren't  are not
+can't   can not
+could've    could have
+couldn't    could not
+couldn't've could not have
+didn't  did not
+doesn't does not
+don't   do not
+hadn't  had not
+hadn't've   had not have
+hasn't  has not
+haven't have not
+he'd    he would
+he'd've he would have
+he'll   he will
+he's    he 's
+how'd   how would
+how'll  how will
+how's   how 's
+I'd I would
+I'd've  I would have
+I'll    I will
+I'm I am
+I've    I have
+isn't   is not
+it'd    it would
+it'd've it would have
+it'll   it will
+it's    it 's
+let's   let 's
+mightn't    might not
+mightn't've might not have
+might've    might have
+mustn't must not
+must've must have
+needn't need not
+not've  not have
+shan't  shall not
+she'd   she would
+she'd've    she would have
+she'll  she will
+she's   she 's
+should've   should have
+shouldn't   should not
+shouldn't've    should not have
+that's  that 's
+there'd there would
+there'd've  there would have
+there's there is
+they'd  they would
+they'd've   they would have
+they'll they will
+they're they are
+they've they have
+wasn't  was not
+we'd    we would
+we'd've we would have
+we'll   we will
+we're   we are
+we've   we have
+weren't were not
+what'll what will
+what're what are
+what's  what 's
+what've what have
+when's  when 's
+where'd where would
+where's where 's
+where've    where have
+who'd   who would
+who'll  who will
+who're  who are
+who's   who 's
+who've  who have
+why'll  why will
+why're  why are
+why's   why 's
+won't   will not
+would've    would have
+wouldn't    would not
+wouldn't've would not have
+you'd   you would
+you'd've    you would have
+you'll  you will
+you're  you are
+you've  you have
--- a/data/en_ptb/case
+++ b/data/en_ptb/case
--- a/data/en_ptb/clusters
+++ b/data/en_ptb/clusters
--- a/data/en_ptb/tokenization
+++ b/data/en_ptb/tokenization
@ -0,0 +1,104 @@
+# https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
+#  21:09, 25 June 2014
+#*--*  --
+#*---* ---
+#*'s  's
+
+cannot  can not
+d'ye    d' ye
+gimme   gim me
+gonna   gon na
+lemme   lem me
+more'n  more 'n
+'tis    't is
+'twas   't was
+wanna   wan na
+whaddya wha dd ya
+whatcha wha t cha
+ain't   ai n't
+aren't  are n't
+can't   ca n't
+could've    could 've
+couldn't    could n't
+couldn't've could n't 've
+didn't  did n't
+doesn't does n't
+don't   do n't
+hadn't  had n't
+hadn't've   had n't 've
+hasn't  has n't
+haven't have n't
+he'd    he 'd
+he'd've he 'd 've
+he'll   he 'll
+he's    he 's
+how'd   how 'd
+how'll  how 'll
+how's   how 's
+I'd I 'd
+I'd've  I 'd 've
+I'll    I 'll
+I'm I 'm
+I've    I 've
+isn't   is n't
+it'd    it 'd
+it'd've it 'd 've
+it'll   it 'll
+it's    it 's
+let's   let 's
+mightn't    might n't
+mightn't've might n't 've
+might've    might 've
+mustn't must n't
+must've must 've
+needn't need n't
+not've  not 've
+shan't  sha n't
+she'd   she 'd
+she'd've    she 'd 've
+she'll  she 'll
+she's   she 's
+should've   should 've
+shouldn't   should n't
+shouldn't've    should n't 've
+that's  that 's
+there'd there 'd
+there'd've  there 'd 've
+there's there 's
+they'd  they 'd
+they'd've   they 'd 've
+they'll they 'll
+they're they 're
+they've they 've
+wasn't  was n't
+we'd    we 'd
+we'd've we 'd 've
+we'll   we 'll
+we're   we 're
+we've   we 've
+weren't were n't
+what'll what 'll
+what're what 're
+what's  what 's
+what've what 've
+when's  when 's
+where'd where 'd
+where's where 's
+where've    where 've
+who'd   who 'd
+who'll  who 'll
+who're  who 're
+who's   who 's
+who've  who 've
+why'll  why 'll
+why're  why 're
+why's   why 's
+won't   wo n't
+would've    would 've
+wouldn't    would n't
+wouldn't've would n't 've
+you'd   you 'd
+you'd've    you 'd 've
+you'll  you 'll
+you're  you 're
+you've  you 've
--- a/ext/murmurhash.cpp
+++ b/ext/murmurhash.cpp
--- a/ext/sparsehash.cpp
+++ b/ext/sparsehash.cpp
--- a/setup.py
+++ b/setup.py
@ -45,8 +45,18 @@ exts = [
              ["spacy/en.pyx", "ext/MurmurHash3.cpp", "ext/MurmurHash2.cpp"],
              language="c++",
              include_dirs=[path.join(HERE, 'ext')]),
+    Extension("spacy.en_ptb",
+              ["spacy/en_ptb.pyx", "ext/MurmurHash3.cpp", "ext/MurmurHash2.cpp"],
+              language="c++",
+              include_dirs=[path.join(HERE, 'ext')]),
+ 
    Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes),
+    Extension("spacy.spacy",
+             ["spacy/spacy.pyx", "ext/MurmurHash3.cpp", "ext/MurmurHash2.cpp"],
+             language="c++", include_dirs=includes),
+    Extension("spacy.string_tools",
+             ["spacy/string_tools.pyx", "ext/MurmurHash3.cpp", "ext/MurmurHash2.cpp"],
+             language="c++", include_dirs=includes),
 ]


--- a/spacy/en.cpp
+++ b/spacy/en.cpp
--- a/spacy/en.pxd
+++ b/spacy/en.pxd
@ -1,17 +1,15 @@
-from ext.sparsehash cimport dense_hash_map
-from spacy.lexeme cimport StringHash
+from libcpp.vector cimport vector
+
+from spacy.spacy cimport StringHash
+from spacy.spacy cimport Vocab
 from spacy.lexeme cimport Lexeme
+from spacy.lexeme cimport Lexeme_addr


-ctypedef Py_UNICODE* string_ptr
-ctypedef size_t Lexeme_addr # For python interop 
-ctypedef Lexeme* Lexeme_ptr
-
-
-cdef dense_hash_map[StringHash, Lexeme_ptr] LEXEMES
+cdef Vocab VOCAB
+cdef dict BACOV


 cpdef Lexeme_addr lookup(unicode word) except 0
-cpdef Lexeme_addr lookup_chunk(unicode chunk, int start, int end) except 0
-cdef StringHash hash_string(unicode s, size_t length) except 0
+cpdef vector[Lexeme_addr] tokenize(unicode string) except *
 cpdef unicode unhash(StringHash hash_value)
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@ -6,160 +6,69 @@ from __future__ import unicode_literals

 from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
+from libcpp.vector cimport vector

 from spacy.lexeme cimport Lexeme
-from ext.murmurhash cimport MurmurHash64A
-from ext.murmurhash cimport MurmurHash64B
+from spacy.string_tools cimport substr
+from . import util
+
+cimport spacy
+
+BACOV = {}
+VOCAB = Vocab()
+VOCAB.set_empty_key(0)


-STRINGS = {}
-LEXEMES = dense_hash_map[StringHash, Lexeme_ptr]()
-LEXEMES.set_empty_key(0)
+spacy.load_tokenization(VOCAB, BACOV, util.read_tokenization('en'))


-cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)
-
+cpdef vector[Lexeme_addr] tokenize(unicode string) except *:
+    '''Tokenize a string with this module's English rules.
+
+    Delegates to spacy.tokenize, passing this module's VOCAB and BACOV tables
+    and its find_split function, and returns the resulting vector of Lexeme
+    addresses.
+    '''
+    return spacy.tokenize(VOCAB, BACOV, find_split, string)
+ 

 cpdef Lexeme_addr lookup(unicode string) except 0:
-    '''.. function:: enumerate(sequence[, start=0])
-    Fetch a Lexeme representing a word string. If the word has not been seen,
-    construct one, splitting off any attached punctuation or clitics.  A
-    reference to BLANK_WORD is returned for the empty string.
-    
-    To specify the boundaries of the word if it has not been seen, use lookup_chunk.
-    '''
-    if string == '':
-        return <Lexeme_addr>&BLANK_WORD
-    cdef size_t length = len(string)
-    cdef StringHash hashed = hash_string(string, length)
-    cdef Lexeme* word_ptr = LEXEMES[hashed]
-    cdef size_t n
-    if word_ptr == NULL:
-        word_ptr = _add(hashed, string, _find_split(string, length), length)
-    return <Lexeme_addr>word_ptr
-
-
-cpdef Lexeme_addr lookup_chunk(unicode string, int start, int end) except 0:
-    '''Fetch a Lexeme representing a word string. If the word has not been seen,
-    construct one, given the specified start and end indices.  A negative index
-    significes 0 for start, and the string length for end --- i.e. the string
-    will not be sliced if start == -1 and end == -1.
-    
-    A reference to BLANK_WORD is returned for the empty string.
-    '''
-    if string == '':
-        return <Lexeme_addr>&BLANK_WORD
-    cdef size_t length = len(string)
-    cdef StringHash hashed = hash_string(string, length)
-    cdef Lexeme* chunk_ptr = LEXEMES[hashed]
-    if chunk_ptr == NULL:
-        chunk_ptr = _add(hashed, string, start, length)
-    return <Lexeme_addr>chunk_ptr
-
-
-cdef StringHash hash_string(unicode s, size_t length) except 0:
-    '''Hash unicode with MurmurHash64A'''
-    assert length
-    return MurmurHash64A(<string_ptr>s, length * sizeof(Py_UNICODE), 0)
+    return spacy.lookup(VOCAB, BACOV, find_split, -1, string)


 cpdef unicode unhash(StringHash hash_value):
-    '''Fetch a string from the reverse index, given its hash value.'''
-    cdef string_ptr string = STRINGS[hash_value]
-    if string == NULL:
-        raise ValueError(hash_value)
-
-    return string
+    return spacy.unhash(BACOV, hash_value)


-cdef unicode normalize_word_string(unicode word):
-    '''Return a normalized version of the word, mapping:
-    - 4 digit strings into !YEAR
-    - Other digit strings into !DIGITS
-    - All other strings into lower-case
-    '''
+cdef vector[StringHash] make_string_views(unicode word):
    cdef unicode s
-    if word.isdigit() and len(word) == 4:
-        return '!YEAR'
-    elif word[0].isdigit():
-        return '!DIGITS'
-    else:
-        return word.lower()
-    
-
-cpdef unicode _substr(unicode string, int start, int end, size_t length):
-    if end >= length:
-        end = -1
-    if start >= length:
-        start = 0
-    if start <= 0 and end < 0:
-        return string
-    elif start < 0:
-        start = 0
-    elif end < 0:
-        end = length
-    return string[start:end]
+    return vector[StringHash]()
+    #if word.isdigit() and len(word) == 4:
+    #    return '!YEAR'
+    #elif word[0].isdigit():
+    #    return '!DIGITS'
+    #else:
+    #    return word.lower()
  

-cdef Lexeme* _add(StringHash hashed, unicode string, int split, size_t length) except NULL:
-    assert string
-    assert split <= length
-    word = _init_lexeme(string, hashed, split, length)
-    LEXEMES[hashed] = word
-    STRINGS[hashed] = string
-    return word
-
-
-cdef Lexeme* _init_lexeme(unicode string, StringHash hashed,
-                          int split, size_t length) except NULL:
-    assert split <= length
-    cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
-
-    word.first = <Py_UNICODE>(string[0] if string else 0)
-    word.sic = hashed
-    
-    cdef unicode tail_string
-    cdef unicode lex 
-    if split != 0 and split < length:
-        lex = _substr(string, 0, split, length)
-        tail_string = _substr(string, split, length, length)
-    else:
-        lex = string
-        tail_string = ''
-    assert lex
-    cdef unicode normed = normalize_word_string(lex)
-    cdef unicode last3 = _substr(string, length - 3, length, length)
-
-    assert normed
-    assert len(normed)
-    
-    word.lex = hash_string(lex, len(lex))
-    word.normed = hash_string(normed, len(normed))
-    word.last3 = hash_string(last3, len(last3))
-
-    STRINGS[word.lex] = lex
-    STRINGS[word.normed] = normed
-    STRINGS[word.last3] = last3
-
-    # These are loaded later
-    word.prob = 0
-    word.cluster = 0
-    word.oft_upper = False
-    word.oft_title = False
-    
-    # Now recurse, and deal with the tail
-    if tail_string:
-        word.tail = <Lexeme*>lookup(tail_string)
-    return word
-
-
-cdef size_t _find_split(unicode word, size_t length):
-    cdef size_t i = 0
-    if word[0].isalnum():
-        while i < length and word[i].isalnum():
-            i += 1
-    else:
-        # Split off a punctuation character, or a sequence of the same punctuation character
-        while i < length and not word[i].isalnum() and (i == 0 or word[i-1] == word[i]):
+cdef int find_split(unicode word, size_t length):
+    cdef int i = 0
+    # Contractions
+    if word.endswith("'s"):
+        return length - 2
+    # Leading punctuation
+    if is_punct(word, 0, length):
+        return 1
+    elif length >= 1:
+        # Split off all trailing punctuation characters
+        i = 0
+        while i < length and not is_punct(word, i, length):
            i += 1
    return i
+
+
+cdef bint is_punct(unicode word, size_t i, size_t length):
+    '''Return True if word[i] should be treated as punctuation.
+
+    A non-alphanumeric character counts as punctuation unless it is an
+    apostrophe directly followed by a letter, or a comma or period directly
+    followed by a digit.
+    '''
+    cdef bint has_next = i < (length - 1)
+    if has_next:
+        # Don't count apostrophes as punct if the next char is a letter
+        if word[i] == "'" and word[i+1].isalpha():
+            return False
+        # Don't count commas or periods as punct if the next char is a number
+        if (word[i] == "," or word[i] == ".") and word[i+1].isdigit():
+            return False
+    return not word[i].isalnum()
--- a/spacy/en_ptb.pxd
+++ b/spacy/en_ptb.pxd
@ -0,0 +1,15 @@
+from libcpp.vector cimport vector
+
+from spacy.spacy cimport StringHash
+from spacy.spacy cimport Vocab
+from spacy.lexeme cimport Lexeme
+from spacy.lexeme cimport Lexeme_addr
+
+
+cdef Vocab VOCAB
+cdef dict BACOV
+
+
+cpdef Lexeme_addr lookup(unicode word) except 0
+cpdef vector[Lexeme_addr] tokenize(unicode string) except *
+cpdef unicode unhash(StringHash hash_value)
--- a/spacy/en_ptb.pyx
+++ b/spacy/en_ptb.pyx
@ -0,0 +1,74 @@
+'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
+so that strings can be retrieved from hashes.  Use 64-bit hash values and
+boldly assume no collisions.
+'''
+from __future__ import unicode_literals
+
+from libc.stdlib cimport malloc, calloc, free
+from libc.stdint cimport uint64_t
+from libcpp.vector cimport vector
+
+from spacy.lexeme cimport Lexeme
+from spacy.string_tools cimport substr
+from . import util
+
+cimport spacy
+
+BACOV = {}
+VOCAB = Vocab()
+VOCAB.set_empty_key(0)
+
+
+spacy.load_tokenization(VOCAB, BACOV, util.read_tokenization('en_ptb'))
+
+
+cpdef vector[Lexeme_addr] tokenize(unicode string) except *:
+    '''Tokenize a string with this module's en_ptb rules.
+
+    Delegates to spacy.tokenize, passing this module's VOCAB and BACOV tables
+    and its find_split function, and returns the resulting vector of Lexeme
+    addresses.
+    '''
+    return spacy.tokenize(VOCAB, BACOV, find_split, string)
+ 
+
+cpdef Lexeme_addr lookup(unicode string) except 0:
+    '''Return the address of the Lexeme for string, creating it if unseen.
+
+    Delegates to spacy.lookup with this module's VOCAB, BACOV and find_split.
+    The start argument is passed as -1, which makes spacy.lookup call
+    find_split to choose the split point for a string it has not seen before.
+    '''
+    return spacy.lookup(VOCAB, BACOV, find_split, -1, string)
+
+
+cpdef unicode unhash(StringHash hash_value):
+    '''Return the string recorded for hash_value in this module's BACOV.
+
+    Delegates to spacy.unhash, which indexes the BACOV dict directly.
+    NOTE(review): a hash that was never registered presumably surfaces as a
+    KeyError from that dict lookup -- confirm.
+    '''
+    return spacy.unhash(BACOV, hash_value)
+
+
+cdef vector[StringHash] make_string_views(unicode word):
+    '''Placeholder: returns an empty vector for every word.
+
+    The commented-out lines below sketch a normalisation scheme (4-digit
+    strings to !YEAR, other digit-initial strings to !DIGITS, everything else
+    lower-cased); none of it is executed.
+    '''
+    # NOTE(review): s is declared but never used.
+    cdef unicode s
+    return vector[StringHash]()
+    #if word.isdigit() and len(word) == 4:
+    #    return '!YEAR'
+    #elif word[0].isdigit():
+    #    return '!DIGITS'
+    #else:
+    #    return word.lower()
+  
+
+cdef int find_split(unicode word, size_t length):
+    '''Return the index at which word should be cut into head and tail.
+
+    A trailing 's is cut off as a unit; a leading punctuation character is
+    cut off on its own; otherwise the cut falls at the first character that
+    is_punct() accepts, or at length if there is none.
+    '''
+    cdef int idx = 0
+    # Contractions
+    if word.endswith("'s"):
+        return length - 2
+    # Leading punctuation
+    if is_punct(word, 0, length):
+        return 1
+    if length >= 1:
+        # Advance to the first punctuation character, if any
+        while idx < length and not is_punct(word, idx, length):
+            idx += 1
+    return idx
+
+
+cdef bint is_punct(unicode word, size_t i, size_t length):
+    '''Return True if word[i] should be split off as punctuation (PTB rules).
+
+    - A period is never punctuation here.
+    - A hyphen immediately followed by another hyphen is punctuation.
+    - An apostrophe immediately followed by a letter is not punctuation.
+    - Otherwise only the characters , ; : @ # $ % & ! ? [ ( { } ) ] count.
+    '''
+    # Typed flag: avoids creating a Python bool object on every call.
+    cdef bint is_final = i == (length - 1)
+    if word[i] == '.':
+        return False
+    if not is_final and word[i] == '-' and word[i+1] == '-':
+        return True
+    # Don't count apostrophes as punct if the next char is a letter
+    if not is_final and word[i] == "'" and word[i+1].isalpha():
+        return False
+    # Membership in a literal instead of rebuilding a set on each call; this
+    # function runs once per character while scanning for a split point.
+    return word[i] in ',;:@#$%&!?[({})]'
--- a/spacy/lexeme.cpp
+++ b/spacy/lexeme.cpp
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@ -1,9 +1,12 @@
 from libc.stdint cimport uint64_t

-
+# Put these above import to avoid circular import problem
 ctypedef int ClusterID
 ctypedef uint64_t StringHash
+ctypedef size_t Lexeme_addr

+from spacy.spacy cimport Vocab
+from spacy.spacy cimport Splitter

 cdef struct Lexeme:
    StringHash sic # Hash of the original string
@ -20,6 +23,12 @@ cdef struct Lexeme:
    Lexeme* tail # Lexemes are linked lists, to deal with sub-tokens


+cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)
+
+cdef Lexeme* init_lexeme(Vocab vocab, dict bacov, Splitter find_split,
+                         unicode string, StringHash hashed,
+                         int split, size_t length) except NULL
+ 
 # Use these to access the Lexeme fields via get_attr(Lexeme*, LexAttr), which
 # has a conditional to pick out the correct item.  This allows safe iteration
 # over the Lexeme, via:
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@ -2,6 +2,60 @@
 Mostly useful from Python-space. From Cython-space, you can just cast to
 Lexeme* yourself.
 '''
+from __future__ import unicode_literals
+
+from spacy.string_tools cimport substr
+from spacy.spacy cimport hash_string
+from spacy.spacy cimport lookup
+
+from libc.stdlib cimport malloc, calloc, free
+from libc.stdint cimport uint64_t
+from libcpp.vector cimport vector
+
+
+cdef Lexeme* init_lexeme(Vocab vocab, dict bacov, Splitter find_split,
+                         unicode string, StringHash hashed,
+                         int split, size_t length) except NULL:
+    '''Allocate and fill a Lexeme for string, recursing into any tail.
+
+    string is cut at split: the part before it becomes this Lexeme's lex, and
+    a non-empty remainder is looked up (creating it if needed) and linked as
+    word.tail. When split is 0 or not less than length, the whole string is
+    the lex and no tail is set. The lex, normed and last3 strings are recorded
+    in bacov under their hashes. hashed is stored as word.sic.
+
+    Raises MemoryError if the struct cannot be allocated.
+
+    NOTE(review): vocab is taken by value here, whereas the helpers in
+    spacy.pyx take Vocab&; entries added by the recursive lookup below may
+    therefore land in a copy of the table -- confirm.
+    '''
+    assert split <= length
+    cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
+    # calloc returns NULL on failure; without this check the field writes
+    # below would dereference a NULL pointer.
+    if word == NULL:
+        raise MemoryError()
+
+    word.first = <Py_UNICODE>(string[0] if string else 0)
+    word.sic = hashed
+
+    cdef unicode tail_string
+    cdef unicode lex
+    if split != 0 and split < length:
+        lex = substr(string, 0, split, length)
+        tail_string = substr(string, split, length, length)
+    else:
+        lex = string
+        tail_string = ''
+    assert lex
+    # Normalisation is currently disabled; every lexeme gets the same
+    # placeholder normed form.
+    #cdef unicode normed = normalize_word_string(lex)
+    cdef unicode normed = '?'
+    cdef unicode last3 = substr(string, length - 3, length, length)
+
+    assert normed
+    assert len(normed)
+
+    word.lex = hash_string(lex, len(lex))
+    word.normed = hash_string(normed, len(normed))
+    word.last3 = hash_string(last3, len(last3))
+
+    # Keep the reverse index in step so each hash can be turned back into text
+    bacov[word.lex] = lex
+    bacov[word.normed] = normed
+    bacov[word.last3] = last3
+
+    # These are loaded later
+    word.prob = 0
+    word.cluster = 0
+    word.oft_upper = False
+    word.oft_title = False
+
+    # Now recurse, and deal with the tail
+    if tail_string:
+        word.tail = <Lexeme*>lookup(vocab, bacov, find_split, -1, tail_string)
+    return word


 cpdef StringHash sic_of(size_t lex_id) except 0:
--- a/spacy/spacy.cpp
+++ b/spacy/spacy.cpp
--- a/spacy/spacy.pxd
+++ b/spacy/spacy.pxd
@ -1,5 +1,24 @@
 from libcpp.vector cimport vector
+from libc.stdint cimport uint64_t
+
+from ext.sparsehash cimport dense_hash_map
+
+# Circular import problems here
+ctypedef size_t Lexeme_addr
+ctypedef uint64_t StringHash
+ctypedef dense_hash_map[StringHash, Lexeme_addr] Vocab
+ctypedef int (*Splitter)(unicode word, size_t length)
+
+
 from spacy.lexeme cimport Lexeme

-
+cdef load_tokenization(Vocab& vocab, dict bacov, token_rules)
+cdef vector[Lexeme_addr] tokenize(Vocab& vocab, dict bacov, Splitter splitter,
+                                  unicode string) except *
+cdef Lexeme_addr lookup(Vocab& vocab, dict bacov, Splitter splitter, int start,
+                        unicode string) except 0
+cdef StringHash hash_string(unicode s, size_t length) except 0
+cdef unicode unhash(dict bacov, StringHash hash_value)
+ 
+ 
 cpdef vector[size_t] expand_chunk(size_t addr) except *
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@ -1,5 +1,78 @@
 from __future__ import unicode_literals
-from spacy.lexeme cimport Lexeme
+
+from ext.murmurhash cimport MurmurHash64A
+from ext.murmurhash cimport MurmurHash64B
+
+from spacy.lexeme cimport init_lexeme
+from spacy.lexeme cimport BLANK_WORD
+
+from spacy.string_tools cimport is_whitespace
+
+from . import util
+
+
+cdef load_tokenization(Vocab& vocab, dict bacov, token_rules):
+    '''Pre-populate vocab and bacov from special-case tokenization rules.
+
+    token_rules is iterated as (chunk, lex, tokens) triples. For each one a
+    head Lexeme for lex is registered under the hash of chunk, and each entry
+    of tokens is appended to it through the tail pointer, giving a linked list
+    of Lexemes for the chunk. Asserts that the chunk's hash is not already
+    present in vocab.
+    '''
+    cdef Lexeme* word
+    cdef StringHash hashed
+    for chunk, lex, tokens in token_rules:
+        hashed = hash_string(chunk, len(chunk))
+        assert vocab[hashed] == 0, chunk
+        # split == length, so the head lexeme is stored whole with no tail
+        # computed; no splitter is needed, hence NULL.
+        word = _add(vocab, bacov, <Splitter>NULL, hashed, lex, len(lex), len(lex))
+        for i, lex in enumerate(tokens):
+            # Sub-tokens are registered under a synthetic
+            # 'chunk:@:index:@:lex' key rather than under the hash of lex.
+            token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
+            length = len(token_string)
+            hashed = hash_string(token_string, length)
+            word.tail = _add(vocab, bacov, <Splitter>NULL, hashed, lex, 0, len(lex))
+            # Advance so the next sub-token is chained after this one
+            word = word.tail
+
+
+cdef vector[Lexeme_addr] tokenize(Vocab& vocab, dict bacov, Splitter splitter,
+                                  unicode string) except *:
+    '''Split string on whitespace and collect the Lexemes of each chunk.
+
+    Characters are accumulated into a chunk until is_whitespace() matches.
+    Each non-empty chunk is passed to lookup() with start=-1, and every Lexeme
+    on the returned tail-linked list is appended, in order, to the output
+    vector. Runs of whitespace produce no tokens.
+    '''
+    cdef size_t length = len(string)
+    cdef Py_UNICODE* characters = <Py_UNICODE*>string
+
+    cdef size_t i
+    cdef Py_UNICODE c
+
+    cdef vector[Lexeme_addr] tokens = vector[Lexeme_addr]()
+    cdef unicode current = u''
+    cdef Lexeme* token
+    for i in range(length):
+        c = characters[i]
+        if is_whitespace(c):
+            if current:
+                token = <Lexeme*>lookup(vocab, bacov, splitter, -1, current)
+                # Walk the linked list: head token first, then its sub-tokens
+                while token != NULL:
+                    tokens.push_back(<Lexeme_addr>token)
+                    token = token.tail
+            current = u''
+        else:
+            current += c
+    # Flush the final chunk: the string need not end in whitespace
+    if current:
+        token = <Lexeme*>lookup(vocab, bacov, splitter, -1, current)
+        while token != NULL:
+            tokens.push_back(<Lexeme_addr>token)
+            token = token.tail
+    return tokens
+
+
+cdef Lexeme_addr lookup(Vocab& vocab, dict bacov, Splitter find_split, int start,
+                        unicode string) except 0:
+    '''Fetch a Lexeme representing a word string. If the word has not been seen,
+    construct one, splitting off any attached punctuation or clitics.  A
+    reference to BLANK_WORD is returned for the empty string.
+
+    start gives the split index to use when the word has not been seen; pass
+    -1 to have find_split(string, length) choose it. start is ignored when the
+    word is already in vocab.
+    '''
+    if string == '':
+        return <Lexeme_addr>&BLANK_WORD
+    cdef size_t length = len(string)
+    cdef StringHash hashed = hash_string(string, length)
+    cdef Lexeme* word_ptr = <Lexeme*>vocab[hashed]
+    if word_ptr == NULL:
+        start = find_split(string, length) if start == -1 else start
+        word_ptr = _add(vocab, bacov, find_split, hashed, string, start, length)
+    return <Lexeme_addr>word_ptr


 cpdef vector[size_t] expand_chunk(size_t addr) except *:
@ -11,62 +84,22 @@ cpdef vector[size_t] expand_chunk(size_t addr) except *:
    return tokens


-"""
-cpdef vector[size_t] ids_from_text(unicode text) except *:
-    cdef size_t length = len(text)
-    cdef Py_UNICODE* characters = <Py_UNICODE*>text
-
-    cdef size_t i
-    cdef Py_UNICODE c
-
-    cdef vector[size_t] tokens = vector[size_t]()
-    cdef unicode current = u''
-    cdef Lexeme* token
-    cdef int alnum_end = -1
-    cdef size_t alnum_start = 0
-    cdef bint seen_alnum = False
-    for i in range(length):
-        c = characters[i]
-        if is_whitespace(c):
-            token = <Lexeme*>lookup(current)
-            tokens.push_back(<size_t>token)
-            clitic = 0
-            while token.clitics[clitic]:
-                tokens.push_back(token.clitics[clitic])
-                clitic += 1
-            current = u''
-            alnum_start = 0
-            alnum_end = -1
-            seen_alnum = False
-        else:
-            if not seen_alnum and c.isalnum():
-                alnum_start = i
-                seen_alnum = True
-            elif seen_alnum and alnum_end == -1 and not c.isalnum():
-                alnum_end = i
-            current += c
-    if current:
-        token = <Lexeme*>lookup(current)
-        tokens.push_back(<size_t>token)
-        clitic = 0
-        while token.clitics[clitic]:
-            tokens.push_back(token.clitics[clitic])
-            clitic += 1
-    return tokens
-"""
-
-#cdef vector[Tokens] group_by(Tokens tokens, LexAttr field) except *:
-#    pass
+cdef StringHash hash_string(unicode s, size_t length) except 0:
+    '''Return the MurmurHash64A hash (seed 0) of the unicode buffer of s.
+
+    length is the number of Py_UNICODE units to hash and must be non-zero.
+    '''
+    assert length
+    cdef size_t n_bytes = length * sizeof(Py_UNICODE)
+    return MurmurHash64A(<Py_UNICODE*>s, n_bytes, 0)


-cdef inline bint is_whitespace(Py_UNICODE c):
-    # TODO: Support other unicode spaces
-    # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
-    if c == u' ':
-        return True
-    elif c == u'\n':
-        return True
-    elif c == u'\t':
-        return True
-    else:
-        return False
+cdef unicode unhash(dict bacov, StringHash hash_value):
+    '''Fetch a string from the reverse index, given its hash value.
+
+    bacov maps hash values to the strings they were computed from; it is
+    indexed directly, with no fallback for hashes that are not present.
+    '''
+    return bacov[hash_value]
+
+
+cdef Lexeme* _add(Vocab& vocab, dict bacov, Splitter find_split, StringHash hashed,
+                  unicode string, int split, size_t length) except NULL:
+    '''Create a Lexeme for string and register it under hashed.
+
+    Builds the struct with init_lexeme (cutting string at split), stores its
+    address in vocab[hashed], records string in the reverse index
+    bacov[hashed], and returns the new Lexeme pointer.
+    '''
+    assert string
+    assert split <= length
+    word = init_lexeme(vocab, bacov, find_split, string, hashed, split, length)
+    vocab[hashed] = <Lexeme_addr>word
+    bacov[hashed] = string
+    return word
--- a/spacy/string_tools.pxd
+++ b/spacy/string_tools.pxd
@ -0,0 +1,3 @@
+cpdef unicode substr(unicode string, int start, int end, size_t length)
+
+cdef bint is_whitespace(Py_UNICODE c)
--- a/spacy/string_tools.pyx
+++ b/spacy/string_tools.pyx
@ -0,0 +1,25 @@
+cpdef unicode substr(unicode string, int start, int end, size_t length):
+    '''Return string[start:end] after normalising out-of-range indices.
+
+    length is supplied by the caller as the length of string. An end at or
+    past length is treated as "to the end of the string"; a start at or past
+    length is reset to 0; a negative start is clamped to 0. When the
+    normalised request covers the whole string, string itself is returned
+    without slicing.
+    '''
+    # An end at or beyond the string length becomes the "no end" marker
+    if end >= length:
+        end = -1
+    # A start at or beyond the string length falls back to the beginning
+    if start >= length:
+        start = 0
+    # Nothing to trim on either side: return the original string
+    if start <= 0 and end < 0:
+        return string
+    elif start < 0:
+        start = 0
+    elif end < 0:
+        end = length
+    return string[start:end]
+  
+
+cdef bint is_whitespace(Py_UNICODE c):
+    '''Return True if c is a space, newline or tab character.'''
+    # TODO: Support other unicode spaces
+    # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
+    return c == u' ' or c == u'\n' or c == u'\t'
--- a/spacy/util.py
+++ b/spacy/util.py
@ -1,3 +1,10 @@
+import os
+from os import path
+import codecs
+
+DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
+
+
 def utf8open(loc, mode='r'):
    return codecs.open(loc, mode, 'utf8')

@ -12,23 +19,28 @@ def load_case_stats(data_dir):
    return case_stats


-def load_clitics(data_dir):
-    clitics_loc = path.join(data_dir, 'clitics.txt')
+def read_tokenization(lang):
+    loc = path.join(DATA_DIR, lang, 'tokenization')
    entries = []
    seen = set()
-    with utf8open(clitics_loc) as clitics_file:
-        for line in clitics_file:
+    with utf8open(loc) as file_:
+        for line in file_:
            line = line.strip()
            if line.startswith('#'):
                continue
            if not line:
                continue
-            clitics = line.split()
-            word = clitics.pop(0)
-            norm_form = clitics.pop(0)
-            assert word not in seen, word
-            seen.add(word)
-            entries.append((word, norm_form, clitics))
+            pieces = line.split()
+            chunk = pieces.pop(0)
+            lex = pieces.pop(0)
+            assert chunk not in seen, chunk
+            seen.add(chunk)
+            entries.append((chunk, lex, pieces))
+            if chunk[0].isalpha() and chunk[0].islower():
+                chunk = chunk[0].title() + chunk[1:]
+                lex = lex[0].title() + lex[1:]
+                seen.add(chunk)
+                entries.append((chunk, lex, pieces))
    return entries
 

--- a/tests/.test_tokenizer.py.swo
+++ b/tests/.test_tokenizer.py.swo
--- a/tests/sun.tokens
+++ b/tests/sun.tokens
@ -0,0 +1,4 @@
+The Sun is the star at the center of the Solar System. It is almost perfectly spherical and consists of hot plasma interwoven with magnetic fields. [ 12 ] [ 13 ] It has a diameter of about 1 , 392 , 684 km ( 865 , 374 mi ) , [ 5 ] around 109 times that of Earth , and its mass ( 1.989×1030 kilograms , approximately 330 , 000 times the mass of Earth ) accounts for about 99.86 % of the total mass of the Solar System. [ 14 ] Chemically , about three quarters of the Sun 's mass consists of hydrogen , while the rest is mostly helium. The remaining 1.69 % ( equal to 5 , 600 times the mass of Earth ) consists of heavier elements , including oxygen , carbon , neon and iron , among others. [ 15 ] 
+
+The Sun formed about 4.567 billion [ a ] [ 16 ] years ago from the gravitational collapse of a region within a large molecular cloud. Most of the matter gathered in the center , while the rest flattened into an orbiting disk that would become the Solar System. The central mass became increasingly hot and dense , eventually initiating thermonuclear fusion in its core. It is thought that almost all stars form by this process. The Sun is a G-type main-sequence star ( G2V ) based on spectral class and it is informally designated as a yellow dwarf because its visible radiation is most intense in the yellow-green portion of the spectrum , and although it is actually white in color , from the surface of the Earth it may appear yellow because of atmospheric scattering of blue light. [ 17 ] In the spectral class label , G2 indicates its surface temperature , of approximately 5778 K ( 5505 °C ) , and V indicates that the Sun , like most stars , is a main-sequence star , and thus generates its energy by nuclear fusion of hydrogen nuclei into helium. In its core , the Sun fuses about 620 million metric tons of hydrogen each second. [ 18 ] [ 19 ] 
+Once regarded by astronomers as a small and relatively insignificant star , the Sun is now thought to be brighter than about 85 % of the stars in the Milky Way , most of which are red dwarfs. [ 20 ] [ 21 ] The absolute magnitude of the Sun is +4.83 ; however , as the star closest to Earth , the Sun is by far the brightest object in the sky with an apparent magnitude of −26.74. [ 22 ] [ 23 ] This is about 13 billion times brighter than the next brightest star , Sirius , with an apparent magnitude of −1.46. The Sun 's hot corona continuously expands in space creating the solar wind , a stream of charged particles that extends to the heliopause at roughly 100 astronomical units. The bubble in the interstellar medium formed by the solar wind , the heliosphere , is the largest continuous structure in the Solar System. [ 24 ] [ 25 ] 
--- a/tests/sun.txt
+++ b/tests/sun.txt
@ -0,0 +1,4 @@
+The Sun is the star at the center of the Solar System. It is almost perfectly spherical and consists of hot plasma interwoven with magnetic fields.[12][13] It has a diameter of about 1,392,684 km (865,374 mi),[5] around 109 times that of Earth, and its mass (1.989×1030 kilograms, approximately 330,000 times the mass of Earth) accounts for about 99.86% of the total mass of the Solar System.[14] Chemically, about three quarters of the Sun's mass consists of hydrogen, while the rest is mostly helium. The remaining 1.69% (equal to 5,600 times the mass of Earth) consists of heavier elements, including oxygen, carbon, neon and iron, among others.[15]
+
+The Sun formed about 4.567 billion[a][16] years ago from the gravitational collapse of a region within a large molecular cloud. Most of the matter gathered in the center, while the rest flattened into an orbiting disk that would become the Solar System. The central mass became increasingly hot and dense, eventually initiating thermonuclear fusion in its core. It is thought that almost all stars form by this process. The Sun is a G-type main-sequence star (G2V) based on spectral class and it is informally designated as a yellow dwarf because its visible radiation is most intense in the yellow-green portion of the spectrum, and although it is actually white in color, from the surface of the Earth it may appear yellow because of atmospheric scattering of blue light.[17] In the spectral class label, G2 indicates its surface temperature, of approximately 5778 K (5505 °C), and V indicates that the Sun, like most stars, is a main-sequence star, and thus generates its energy by nuclear fusion of hydrogen nuclei into helium. In its core, the Sun fuses about 620 million metric tons of hydrogen each second.[18][19]
+Once regarded by astronomers as a small and relatively insignificant star, the Sun is now thought to be brighter than about 85% of the stars in the Milky Way, most of which are red dwarfs.[20][21] The absolute magnitude of the Sun is +4.83; however, as the star closest to Earth, the Sun is by far the brightest object in the sky with an apparent magnitude of −26.74.[22][23] This is about 13 billion times brighter than the next brightest star, Sirius, with an apparent magnitude of −1.46. The Sun's hot corona continuously expands in space creating the solar wind, a stream of charged particles that extends to the heliopause at roughly 100 astronomical units. The bubble in the interstellar medium formed by the solar wind, the heliosphere, is the largest continuous structure in the Solar System.[24][25]
--- a/tests/test_contractions.py
+++ b/tests/test_contractions.py
@ -0,0 +1,44 @@
+from __future__ import unicode_literals
+
+from spacy.spacy import expand_chunk
+from spacy.en import lookup, unhash
+
+from spacy import lex_of
+
+
+def test_possess():
+    """Check that "Mike's" expands into "Mike" followed by "'s"."""
+    pieces = expand_chunk(lookup("Mike's"))
+    assert len(pieces) == 2
+    owner, marker = pieces[0], pieces[1]
+    assert unhash(lex_of(owner)) == "Mike"
+    assert unhash(lex_of(marker)) == "'s"
+
+
+def test_apostrophe():
+    """Check that the trailing apostrophe of "schools'" is its own token."""
+    pieces = expand_chunk(lookup("schools'"))
+    assert len(pieces) == 2
+    stem, mark = pieces[0], pieces[1]
+    assert unhash(lex_of(mark)) == "'"
+    assert unhash(lex_of(stem)) == "schools"
+
+
+def test_LL():
+    """Check that "we'll" expands into "we" followed by "will"."""
+    pieces = expand_chunk(lookup("we'll"))
+    assert len(pieces) == 2
+    subject, verb = pieces[0], pieces[1]
+    assert unhash(lex_of(verb)) == "will"
+    assert unhash(lex_of(subject)) == "we"
+
+
+def test_aint():
+    """Check that "ain't" expands into "are" followed by "not"."""
+    pieces = expand_chunk(lookup("ain't"))
+    assert len(pieces) == 2
+    verb, negation = pieces[0], pieces[1]
+    assert unhash(lex_of(verb)) == "are"
+    assert unhash(lex_of(negation)) == "not"
+
+
+def test_capitalized():
+    """Check contractions still expand to two tokens when capitalized."""
+    for form in ("can't", "Can't", "Ain't"):
+        pieces = expand_chunk(lookup(form))
+        assert len(pieces) == 2
+    # ``pieces`` now holds the expansion of "Ain't", the last form tried.
+    assert unhash(lex_of(pieces[0])) == "Are"
--- a/tests/test_post_punct.py
+++ b/tests/test_post_punct.py
@ -18,11 +18,10 @@ def test_close(close_puncts):
    for p in close_puncts:
        string = word_str + p
        token = lookup(string)
-        assert unhash(lex_of(token)) == word_str
        tokens = expand_chunk(token)
        assert len(tokens) == 2
-        assert unhash(lex_of(tokens[0])) == word_str
        assert unhash(lex_of(tokens[1])) == p
+        assert unhash(lex_of(tokens[0])) == word_str


 def test_two_different_close(close_puncts):
@ -43,6 +42,6 @@ def test_three_same_close(close_puncts):
    for p in close_puncts:
        string = word_str + p + p + p
        tokens = expand_chunk(lookup(string))
-        assert len(tokens) == 2
+        assert len(tokens) == 4
        assert unhash(lex_of(tokens[0])) == word_str
-        assert unhash(lex_of(tokens[1])) == p + p + p
+        assert unhash(lex_of(tokens[1])) == p
--- a/tests/test_pre_punct.py
+++ b/tests/test_pre_punct.py
@ -43,8 +43,8 @@ def test_three_same_open(open_puncts):
    for p in open_puncts:
        string = p + p + p + word_str
        token = lookup(string)
-        assert unhash(lex_of(token)) == p + p + p
+        assert unhash(lex_of(token)) == p
        tokens = expand_chunk(token)
-        assert len(tokens) == 2
-        assert unhash(lex_of(tokens[0])) == p + p + p
-        assert unhash(lex_of(tokens[1])) == word_str
+        assert len(tokens) == 4
+        assert unhash(lex_of(tokens[0])) == p
+        assert unhash(lex_of(tokens[3])) == word_str
--- a/tests/test_ptb_match_wiki_sun.py
+++ b/tests/test_ptb_match_wiki_sun.py
@ -0,0 +1,46 @@
+from __future__ import unicode_literals
+
+from spacy.en import unhash
+from spacy import lex_of
+from spacy.util import utf8open
+from spacy.en_ptb import tokenize, lookup, unhash
+
+import pytest
+import os
+from os import path
+
+
+HERE = path.dirname(__file__)
+
+
+@pytest.fixture
+def sun_txt():
+    """Return the full contents of ``sun.txt`` located next to this file."""
+    handle = utf8open(path.join(HERE, 'sun.txt'))
+    return handle.read()
+
+
+@pytest.fixture
+def my_tokens(sun_txt):
+    """Tokenize the sample text and return each token's string form."""
+    assert len(sun_txt) != 0
+    strings = []
+    for token in tokenize(sun_txt):
+        strings.append(unhash(lex_of(token)))
+    return strings
+
+
+@pytest.fixture
+def sed_tokens():
+    """Return the whitespace-separated tokens stored in ``sun.tokens``."""
+    reference = utf8open(path.join(HERE, 'sun.tokens'))
+    return reference.read().split()
+
+
+def test_compare_tokens(my_tokens, sed_tokens):
+    """Check that our tokens match the reference tokens one for one.
+
+    The sequences are compared pairwise over their common prefix first, so
+    a mismatch is reported at the earliest position where they diverge.
+    The lengths are compared afterwards to catch missing or extra tokens.
+    """
+    for i, (mine, reference) in enumerate(zip(my_tokens, sed_tokens)):
+        assert mine == reference, (
+            "token %d differs: got %r, expected %r" % (i, mine, reference))
+    assert len(my_tokens) == len(sed_tokens)
+
+
+
--- a/tests/test_rules.py
+++ b/tests/test_rules.py
@ -0,0 +1,11 @@
+from spacy import util
+
+
+def test_load_en():
+    """Check the English rules load and include the expected "ain't" entry."""
+    rules = util.read_tokenization('en')
+    assert len(rules) != 0
+    matching = [entry for entry in rules if entry[0] == "ain't"]
+    chunk, lex, pieces = matching[0]
+    assert chunk == "ain't"
+    assert lex == "are"
+    assert pieces == ["not"]
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@ -0,0 +1,47 @@
+from __future__ import unicode_literals
+
+from spacy.en import tokenize
+from spacy.en import lookup
+
+from spacy.lexeme import lex_of
+
+
+def test_single_word():
+    """Check a lone word tokenizes to the value ``lookup`` gives for it."""
+    first = tokenize(u'hello')[0]
+    assert first == lookup(u'hello')
+
+
+def test_two_words():
+    """Check two space-separated words give two different tokens."""
+    ids = tokenize(u'hello possums')
+    assert len(ids) == 2
+    first, second = ids[0], ids[1]
+    assert first == lookup(u'hello')
+    assert first != second
+
+
+def test_punct():
+    """Check how words with attached punctuation are tokenized.
+
+    The assertions are kept in their original order because each
+    ``lookup`` call may touch shared vocabulary state.
+    """
+    ids = tokenize('hello, possums.')
+    assert len(ids) == 4
+    # The first entry differs from the bare word's lookup but shares its lexeme.
+    assert ids[0] != lookup('hello')
+    assert lex_of(ids[0]) == lex_of(lookup('hello'))
+    # The third entry equals the lookup of the word with its full stop attached.
+    assert ids[2] == lookup('possums.')
+    assert lex_of(ids[2]) == lex_of(lookup('possums.'))
+    assert lex_of(ids[2]) == lex_of(lookup('possums'))
+    assert lex_of(ids[1]) != lex_of(lookup('hello'))
+    assert ids[0] != lookup('hello.')
+
+
+def test_digits():
+    """Check token count and selected lexemes for text containing a number."""
+    ids = tokenize('The year: 1984.')
+    assert len(ids) == 5
+    for position, string in ((0, 'The'), (3, '1984'), (4, '.')):
+        assert lex_of(ids[position]) == lex_of(lookup(string))
+
+
+def test_contraction():
+    """Check token counts and selected lexemes for text containing "don't"."""
+    ids = tokenize("don't giggle")
+    assert len(ids) == 3
+    assert lex_of(ids[1]) == lex_of(lookup("not"))
+    # NOTE(review): the 3-token result above implies "don't" yields two
+    # tokens, which would make this sentence 5 tokens with "!" at index 4.
+    # Confirm that 4 tokens / index 3 is the intended expectation.
+    ids = tokenize("i said don't!")
+    assert len(ids) == 4
+    assert lex_of(ids[3]) == lex_of(lookup('!'))
--- a/tests/test_vocab.py
+++ b/tests/test_vocab.py
@ -28,3 +28,10 @@ def test_case_neq():
 def test_punct_neq():
    addr = lookup('Hello')
    assert lookup('Hello,') != addr
+
+
+def test_short():
+    """Check short words round-trip through ``lookup`` and ``unhash``."""
+    for word in ('I', 'not'):
+        addr = lookup(word)
+        assert unhash(lex_of(addr)) == word
--- a/tests/test_wiki_sun.py
+++ b/tests/test_wiki_sun.py
@ -0,0 +1,25 @@
+from __future__ import unicode_literals
+
+from spacy.en import unhash
+from spacy import lex_of
+from spacy import en
+from spacy.util import utf8open
+
+import pytest
+import os
+from os import path
+
+
+HERE = path.dirname(__file__)
+
+
+@pytest.fixture
+def sun_txt():
+    """Return the full contents of ``sun.txt`` located next to this file."""
+    handle = utf8open(path.join(HERE, 'sun.txt'))
+    return handle.read()
+
+
+def test_tokenize(sun_txt):
+    """Check the sample text tokenizes without error into some tokens."""
+    assert len(sun_txt) != 0
+    tokens = en.tokenize(sun_txt)
+    # The text is non-empty, so tokenizing it should give at least one token.
+    assert len(tokens) != 0
--- a/tests/tokenizer.sed
+++ b/tests/tokenizer.sed
@ -0,0 +1,82 @@
+#!/bin/sed -f
+
+# Sed script to produce Penn Treebank tokenization on arbitrary raw text.
+# Yeah, sure.
+
+# expected input: raw text with ONE SENTENCE TOKEN PER LINE
+
+# by Robert MacIntyre, University of Pennsylvania, late 1995.
+
+# If this wasn't such a trivial program, I'd include all that stuff about
+# no warranty, free use, etc. from the GNU General Public License.  If you
+# want to be picky, assume that all of its terms apply.  Okay?
+
+# attempt to get correct directional quotes
+# (every s command in this script uses "=" as its delimiter instead of "/")
+# a double quote at the very start of the line becomes an opening ``
+s=^"=`` =g
+# so does a double quote right after a space or one of ( [ { <
+s=\([ ([{<]\)"=\1 `` =g
+# close quotes handled at end
+
+# pad every "..." with spaces so it stands alone
+s=\.\.\.= ... =g
+# pad each of these punctuation characters with spaces ("&" is the match)
+s=[,;:@#$%&]= & =g
+
+# Assume sentence tokenization has been done first, so split FINAL periods
+# only. 
+# \1 is the character before the period (itself not a period), \2 is the
+# period, \3 is any run of closing brackets/quotes after it; spaces and tabs
+# at the end of the line are replaced by a single space.
+s=\([^.]\)\([.]\)\([])}>"']*\)[ 	]*$=\1 \2\3 =g
+# however, we may as well split ALL question marks and exclamation points,
+# since they shouldn't have the abbrev.-marker ambiguity problem
+s=[?!]= & =g
+
+# parentheses, brackets, etc.
+# (the "]" placed first inside the bracket expression is a literal "]")
+s=[][(){}<>]= & =g
+# Some taggers, such as Adwait Ratnaparkhi's MXPOST, use the parsed-file
+# version of these symbols.
+# UNCOMMENT THE FOLLOWING 6 LINES if you're using MXPOST.
+# s/(/-LRB-/g
+# s/)/-RRB-/g
+# s/\[/-LSB-/g
+# s/\]/-RSB-/g
+# s/{/-LCB-/g
+# s/}/-RCB-/g
+
+# pad every double hyphen with spaces
+s=--= -- =g
+
+# NOTE THAT SPLIT WORDS ARE NOT MARKED.  Obviously this isn't great, since
+# you might someday want to know how the words originally fit together --
+# but it's too late to make a better system now, given the millions of
+# words we've already done "wrong".
+
+# First off, add a space to the beginning and end of each line, to reduce
+# necessary number of regexps.
+s=$= =
+s=^= =
+
+# any double quote still present at this point becomes '' padded with spaces
+s="= '' =g
+# possessive or close-single-quote
+s=\([^']\)' =\1 ' =g
+# as in it's, I'm, we'd
+s='\([sSmMdD]\) = '\1 =g
+# remaining clitics ('ll, 're, 've, n't), lower-case then upper-case; each
+# pattern only matches when the clitic is followed by a space
+s='ll = 'll =g
+s='re = 're =g
+s='ve = 've =g
+s=n't = n't =g
+s='LL = 'LL =g
+s='RE = 'RE =g
+s='VE = 'VE =g
+s=N'T = N'T =g
+
+# split a fixed list of fused words (e.g. "cannot" -> "can not"); \1 carries
+# over the first letter so its case is preserved, and each pattern requires a
+# space on both sides, so only whole words match
+s= \([Cc]\)annot = \1an not =g
+s= \([Dd]\)'ye = \1' ye =g
+s= \([Gg]\)imme = \1im me =g
+s= \([Gg]\)onna = \1on na =g
+s= \([Gg]\)otta = \1ot ta =g
+s= \([Ll]\)emme = \1em me =g
+s= \([Mm]\)ore'n = \1ore 'n =g
+s= '\([Tt]\)is = '\1 is =g
+s= '\([Tt]\)was = '\1 was =g
+s= \([Ww]\)anna = \1an na =g
+# s= \([Ww]\)haddya = \1ha dd ya =g
+# s= \([Ww]\)hatcha = \1ha t cha =g
+
+# clean out extra spaces
+# collapse each run of spaces into a single space
+s=  *= =g
+# then drop any spaces left at the start of the line
+s=^ *==g