diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py
index 933de124e..4ea837ed0 100644
--- a/spacy/en/__init__.py
+++ b/spacy/en/__init__.py
@@ -12,13 +12,24 @@ from .attrs import get_flags
 
 
 def get_lex_props(string):
-    return {'flags': get_flags(string), 'length': len(string),
-            'sic': string, 'norm1': string, 'norm2': string, 'shape': string,
-            'prefix': string[0], 'suffix': string[-3:], 'cluster': 0, 'prob': 0,
-            'sentiment': 0}
+    return {
+        'flags': get_flags(string),
+        'length': len(string),
+        'sic': string,
+        'norm1': string,
+        'norm2': string,
+        'shape': orth.word_shape(string),
+        'prefix': string[0],
+        'suffix': string[-3:],
+        'cluster': 0,
+        'prob': 0,
+        'sentiment': 0
+    }
+LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')
+
 
 class English(object):
     """The English NLP pipeline.
diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 8686f8e6a..32626f122 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -16,7 +16,7 @@ cdef class Lexeme:
     cdef readonly attr_t id
     cdef readonly attr_t length
 
-    cdef readonly unicode sic
+    cdef readonly attr_t sic
     cdef readonly unicode norm1
     cdef readonly unicode norm2
     cdef readonly unicode shape
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index dfc82d46e..fbdcd31da 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -1,3 +1,4 @@
+# cython: embedsignature=True
 from cpython.ref cimport Py_INCREF
 from cymem.cymem cimport Pool
 from murmurhash.mrmr cimport hash64
@@ -29,6 +30,7 @@ cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store
 
 
 cdef class Lexeme:
+    """A dummy docstring"""
     def __init__(self):
         pass
 
@@ -42,7 +44,7 @@ cdef Lexeme Lexeme_cinit(const LexemeC* c, StringStore strings):
 
     py.id = c.id
     py.length = c.length
-    py.sic = strings[c.sic]
+    py.sic = c.sic
     py.norm1 = strings[c.norm1]
     py.norm2 = strings[c.norm2]
     py.shape = strings[c.shape]
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index df9b89dc3..e5792aa9a 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -53,8 +53,8 @@ cdef class StringStore:
         self.mem = Pool()
         self._map = PreshMap()
         self._resize_at = 10000
-        self.size = 1
         self.strings = self.mem.alloc(self._resize_at, sizeof(Utf8Str))
+        self.size = 1
 
     property size:
         def __get__(self):
@@ -64,7 +64,9 @@
         cdef bytes byte_string
         cdef const Utf8Str* utf8str
         if isinstance(string_or_id, int) or isinstance(string_or_id, long):
-            if string_or_id < 1 or string_or_id >= self.size:
+            if string_or_id == 0:
+                return u''
+            elif string_or_id < 1 or string_or_id >= self.size:
                 raise IndexError(string_or_id)
             utf8str = &self.strings[string_or_id]
             return utf8str.chars[:utf8str.length].decode('utf8')
diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx
index 61aab89b1..4c0156df3 100644
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -120,9 +120,9 @@ cdef class Tokens:
            attr_ids (list[int]): A list of attribute ID ints.
 
        Returns:
-          feat_array (numpy.ndarray[long, ndim=2]): A feature matrix, with one
-            row per word, and one column per attribute indicated in the input
-            attr_ids.
+          feat_array (numpy.ndarray[long, ndim=2]):
+            A feature matrix, with one row per word, and one column per attribute
+            indicated in the input attr_ids.
        """
        cdef int i, j
        cdef attr_id_t feature
@@ -278,7 +278,7 @@ cdef class Token:
 
     property sic:
         def __get__(self):
-            return self._seq.vocab.strings[self._seq.data[self.i].lex.sic]
+            return self._seq.data[self.i].lex.sic
 
     property head:
         """The token predicted by the parser to be the head of the current token."""
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 4043b14e0..800947964 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -77,14 +77,15 @@ cdef class Vocab:
        unseen unicode string is given, a new lexeme is created and stored.
 
        Args:
-            id_or_string (int or unicode): The integer ID of a word, or its unicode
-                string. If an int >= Lexicon.size, IndexError is raised.
-                If id_or_string is neither an int nor a unicode string, ValueError
-                is raised.
+            id_or_string (int or unicode):
+                The integer ID of a word, or its unicode string. If an int >= Lexicon.size,
+                IndexError is raised. If id_or_string is neither an int nor a unicode string,
+                ValueError is raised.
 
        Returns:
-            lexeme (Lexeme): An instance of the Lexeme Python class, with data
-                copied on instantiation.
+            lexeme (Lexeme):
+                An instance of the Lexeme Python class, with data copied on
+                instantiation.
        '''
        cdef UniStr c_str
        cdef const LexemeC* lexeme
@@ -92,9 +93,11 @@
            if id_or_string >= self.lexemes.size():
                raise IndexError
            lexeme = self.lexemes.at(id_or_string)
-        else:
+        elif type(id_or_string) == unicode:
            slice_unicode(&c_str, id_or_string, 0, len(id_or_string))
            lexeme = self.get(self.mem, &c_str)
+        else:
+            raise ValueError("Vocab unable to map type: %s. Maps unicode --> int or int --> unicode" % str(type(id_or_string)))
        return Lexeme_cinit(lexeme, self.strings)
 
    def __setitem__(self, unicode py_str, dict props):
diff --git a/tests/test_intern.py b/tests/test_intern.py
index 5375ebb2a..74f0d6bcf 100644
--- a/tests/test_intern.py
+++ b/tests/test_intern.py
@@ -27,10 +27,6 @@ def test_save_unicode(sstore):
     assert Hello_i == 1
 
 
-def test_zero_id(sstore):
-    with pytest.raises(IndexError):
-        sstore[0]
-
 def test_retrieve_id(sstore):
     A_i = sstore[b'A']
     assert sstore.size == 1
diff --git a/tests/test_lemmatizer.py b/tests/test_lemmatizer.py
index bc9231259..5a6a8fc62 100644
--- a/tests/test_lemmatizer.py
+++ b/tests/test_lemmatizer.py
@@ -1,14 +1,14 @@
 from __future__ import unicode_literals
 
 from spacy.en.lemmatizer import Lemmatizer, read_index, read_exc
-from spacy.en import DATA_DIR
+from spacy.en import LOCAL_DATA_DIR
 from os import path
 
 import pytest
 
 
 def test_read_index():
-    wn = path.join(DATA_DIR, 'wordnet')
+    wn = path.join(LOCAL_DATA_DIR, 'wordnet')
     index = read_index(path.join(wn, 'index.noun'))
     assert 'man' in index
     assert 'plantes' not in index
@@ -16,14 +16,14 @@
 
 
 def test_read_exc():
-    wn = path.join(DATA_DIR, 'wordnet')
+    wn = path.join(LOCAL_DATA_DIR, 'wordnet')
     exc = read_exc(path.join(wn, 'verb.exc'))
     assert exc['was'] == ('be',)
 
 
 @pytest.fixture
 def lemmatizer():
-    return Lemmatizer(path.join(DATA_DIR, 'wordnet'), 0, 0, 0)
+    return Lemmatizer(path.join(LOCAL_DATA_DIR, 'wordnet'), 0, 0, 0)
 
 
 def test_noun_lemmas(lemmatizer):
diff --git a/tests/test_lexeme_flags.py b/tests/test_lexeme_flags.py
index 3a4776b80..e04360d98 100644
--- a/tests/test_lexeme_flags.py
+++ b/tests/test_lexeme_flags.py
@@ -13,17 +13,17 @@ def EN():
 
 def test_is_alpha(EN):
     the = EN.vocab['the']
-    assert the['flags'] & (1 << IS_ALPHA)
+    assert the.flags & (1 << IS_ALPHA)
     year = EN.vocab['1999']
-    assert not year['flags'] & (1 << IS_ALPHA)
+    assert not year.flags & (1 << IS_ALPHA)
     mixed = EN.vocab['hello1']
-    assert not mixed['flags'] & (1 << IS_ALPHA)
+    assert not mixed.flags & (1 << IS_ALPHA)
 
 
 def test_is_digit(EN):
     the = EN.vocab['the']
-    assert not the['flags'] & (1 << IS_DIGIT)
+    assert not the.flags & (1 << IS_DIGIT)
     year = EN.vocab['1999']
-    assert year['flags'] & (1 << IS_DIGIT)
+    assert year.flags & (1 << IS_DIGIT)
     mixed = EN.vocab['hello1']
-    assert not mixed['flags'] & (1 << IS_DIGIT)
+    assert not mixed.flags & (1 << IS_DIGIT)
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index cb02bbcff..c5b50041f 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -33,17 +33,17 @@ def test_punct(EN):
 def test_digits(EN):
     tokens = EN('The year: 1984.')
     assert len(tokens) == 5
-    assert tokens[0].sic == EN.vocab['The']['sic']
-    assert tokens[3].sic == EN.vocab['1984']['sic']
+    assert tokens[0].sic == EN.vocab['The'].sic
+    assert tokens[3].sic == EN.vocab['1984'].sic
 
 
 def test_contraction(EN):
     tokens = EN("don't giggle")
     assert len(tokens) == 3
-    assert tokens[1].sic == EN.vocab["n't"]['sic']
+    assert tokens[1].sic == EN.vocab["n't"].sic
     tokens = EN("i said don't!")
     assert len(tokens) == 5
-    assert tokens[4].sic == EN.vocab['!']['sic']
+    assert tokens[4].sic == EN.vocab['!'].sic
 
 
 def test_contraction_punct(EN):
diff --git a/tests/test_vocab.py b/tests/test_vocab.py
index 0a739ad0e..a83fa82d3 100644
--- a/tests/test_vocab.py
+++ b/tests/test_vocab.py
@@ -11,24 +11,24 @@ def EN():
 
 def test_neq(EN):
     addr = EN.vocab['Hello']
-    assert EN.vocab['bye']['sic'] != addr['sic']
+    assert EN.vocab['bye'].sic != addr.sic
 
 
 def test_eq(EN):
     addr = EN.vocab['Hello']
-    assert EN.vocab['Hello']['sic'] == addr['sic']
+    assert EN.vocab['Hello'].sic == addr.sic
 
 
 def test_case_neq(EN):
     addr = EN.vocab['Hello']
-    assert EN.vocab['hello']['sic'] != addr['sic']
+    assert EN.vocab['hello'].sic != addr.sic
 
 
 def test_punct_neq(EN):
     addr = EN.vocab['Hello']
-    assert EN.vocab['Hello,']['sic'] != addr['sic']
+    assert EN.vocab['Hello,'].sic != addr.sic
 
 
 def test_shape_attr(EN):
     example = EN.vocab['example']
-    assert example['sic'] != example['shape']
+    assert example.sic != example.shape
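A short usage sketch of the behaviour the patch introduces, pieced together from the updated tests. It is illustrative only, not part of the diff, and assumes the bundled English data is installed and that English() can be constructed with its defaults (the code base of this era targets Python 2):

    from __future__ import unicode_literals

    from spacy.en import English

    EN = English()

    # Lexeme attributes are now plain properties instead of dict-style lookups,
    # and 'sic' is an integer string-ID rather than the string itself.
    hello = EN.vocab['Hello']
    assert EN.vocab['hello'].sic != hello.sic   # IDs are case-sensitive
    assert hello.sic != hello.shape             # 'shape' now comes from orth.word_shape

    # Token.sic returns the same integer ID stored on the underlying lexeme.
    tokens = EN('The year: 1984.')
    assert tokens[3].sic == EN.vocab['1984'].sic

    # Vocab.__getitem__ maps int <-> unicode only; other key types raise ValueError.
    try:
        EN.vocab[3.14]
    except ValueError:
        pass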