From ce2edd6312ee7fcf6bcafe235cb6cb4a1406d1e1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 12 Jan 2015 10:26:22 +1100 Subject: [PATCH] * Tmp commit. Refactoring to create a Python Lexeme class. --- spacy/en/__init__.py | 17 ++-- spacy/en/download.py | 2 +- spacy/en/pos.pxd | 4 +- spacy/en/pos.pyx | 4 +- spacy/lexeme.pxd | 10 +-- spacy/lexeme.pyx | 9 +- spacy/morphology.pxd | 2 +- spacy/strings.pxd | 3 + spacy/strings.pyx | 6 ++ spacy/structs.pxd | 6 +- spacy/tokenizer.pxd | 10 +-- spacy/tokenizer.pyx | 30 +++---- spacy/tokens.pxd | 41 +++++++-- spacy/tokens.pyx | 207 +++++++++++++++++++++++++------------------ spacy/vocab.pxd | 10 +-- spacy/vocab.pyx | 93 +++++++++++++------ 16 files changed, 281 insertions(+), 173 deletions(-) diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index 94ab36291..633ba48e4 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals from os import path +from .. import orth from ..vocab import Vocab from ..tokenizer import Tokenizer from ..syntax.parser import GreedyParser @@ -10,12 +11,10 @@ from .pos import POS_TAGS from .attrs import get_flags -DATA_DIR = path.join(path.dirname(__file__), 'data') - - def get_lex_props(string): return {'flags': get_flags(string), 'dense': 1} +LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data') class English(object): """The English NLP pipeline. @@ -44,16 +43,18 @@ class English(object): parser (spacy.syntax.parser.GreedyParser): A greedy shift-reduce dependency parser. """ - def __init__(self, data_dir=None): - if data_dir is None: - data_dir = path.join(path.dirname(__file__), 'data') + def __init__(self, data_dir=LOCAL_DATA_DIR): self._data_dir = data_dir self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab'), get_lex_props=get_lex_props) tag_names = list(POS_TAGS.keys()) tag_names.sort() - self.tokenizer = Tokenizer.from_dir(self.vocab, path.join(data_dir, 'tokenizer'), - POS_TAGS, tag_names) + if data_dir is None: + self.tokenizer = Tokenizer(self.vocab, {}, None, None, None, + POS_TAGS, tag_names) + else: + self.tokenizer = Tokenizer.from_dir(self.vocab, path.join(data_dir, 'tokenizer'), + POS_TAGS, tag_names) self.strings = self.vocab.strings self._tagger = None self._parser = None diff --git a/spacy/en/download.py b/spacy/en/download.py index 9f74f0620..709fd7cb4 100644 --- a/spacy/en/download.py +++ b/spacy/en/download.py @@ -4,7 +4,7 @@ import tarfile import shutil import requests -URL = 'https://s3-us-west-1.amazonaws.com/media.spacynlp.com/en.tgz' +PARSER_URL = 'https://s3-us-west-1.amazonaws.com/media.spacynlp.com/en.tgz' DEST_DIR = path.join(path.dirname(__file__), 'data', 'deps') diff --git a/spacy/en/pos.pxd b/spacy/en/pos.pxd index 223b7aef3..d3697b97e 100644 --- a/spacy/en/pos.pxd +++ b/spacy/en/pos.pxd @@ -3,7 +3,7 @@ from cymem.cymem cimport Pool from .._ml cimport Model from ..strings cimport StringStore -from ..structs cimport TokenC, Lexeme, Morphology, PosTag +from ..structs cimport TokenC, LexemeC, Morphology, PosTag from ..typedefs cimport univ_tag_t from .lemmatizer import Lemmatizer @@ -21,5 +21,5 @@ cdef class EnPosTagger: cdef readonly int n_tags cdef int set_morph(self, const int i, TokenC* tokens) except -1 - cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1 + cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1 diff --git a/spacy/en/pos.pyx b/spacy/en/pos.pyx index b3b5b8d4b..114aea2ce 100644 --- a/spacy/en/pos.pyx +++ b/spacy/en/pos.pyx @@ -12,7 +12,7 @@ from 
..typedefs cimport univ_tag_t from ..typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB from ..typedefs cimport X, PUNCT, EOL from ..typedefs cimport id_t -from ..structs cimport TokenC, Morphology, Lexeme +from ..structs cimport TokenC, Morphology, LexemeC from ..tokens cimport Tokens from ..morphology cimport set_morph_from_dict from .._ml cimport arg_max @@ -290,7 +290,7 @@ cdef class EnPosTagger: tokens[i].lemma = cached.lemma tokens[i].morph = cached.morph - cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1: + cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1: if self.lemmatizer is None: return lex.sic cdef bytes py_string = self.strings[lex.sic] diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 35826ef55..5f26ec266 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -1,21 +1,21 @@ from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t, attr_id_t from .typedefs cimport ID, SIC, DENSE, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, POS_TYPE -from .structs cimport Lexeme +from .structs cimport LexemeC from .strings cimport StringStore -cdef Lexeme EMPTY_LEXEME +cdef LexemeC EMPTY_LEXEME -cpdef Lexeme init(id_t i, unicode string, hash_t hashed, StringStore store, +cdef LexemeC init(id_t i, unicode string, hash_t hashed, StringStore store, dict props) except * -cdef inline bint check_flag(const Lexeme* lexeme, attr_id_t flag_id) nogil: +cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil: return lexeme.flags & (1 << flag_id) -cdef inline attr_t get_attr(const Lexeme* lex, attr_id_t feat_name) nogil: +cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil: if feat_name < (sizeof(flags_t) * 8): return check_flag(lex, feat_name) elif feat_name == ID: diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 1423f30c9..e77c90ead 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -7,12 +7,12 @@ from libc.string cimport memset from .orth cimport word_shape -memset(&EMPTY_LEXEME, 0, sizeof(Lexeme)) +memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) -cpdef Lexeme init(id_t i, unicode string, hash_t hashed, +cdef LexemeC init(id_t i, unicode string, hash_t hashed, StringStore string_store, dict props) except *: - cdef Lexeme lex + cdef LexemeC lex lex.id = i lex.length = len(string) lex.sic = string_store[string] @@ -27,3 +27,6 @@ cpdef Lexeme init(id_t i, unicode string, hash_t hashed, lex.flags = props.get('flags', 0) return lex + + + diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index f2cb22b74..5dfee4250 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -1,4 +1,4 @@ -from .structs cimport TokenC, Lexeme, Morphology, PosTag +from .structs cimport TokenC, Morphology, PosTag cdef int set_morph_from_dict(Morphology* morph, dict props) except -1 diff --git a/spacy/strings.pxd b/spacy/strings.pxd index 9c16cfe1c..178ae51b6 100644 --- a/spacy/strings.pxd +++ b/spacy/strings.pxd @@ -3,6 +3,9 @@ from preshed.maps cimport PreshMap from murmurhash.mrmr cimport hash64 from .structs cimport Utf8Str, UniStr +from .typedefs cimport hash_t + +cpdef hash_t hash_string(unicode string) except 0 cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end) nogil: diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 67d375ed7..29afde45c 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -1,6 +1,7 @@ import codecs from libc.string cimport memcpy +from murmurhash.mrmr cimport hash64 from .typedefs cimport hash_t @@ -9,6 +10,11 @@ 
from .typedefs cimport hash_t SEPARATOR = '\n|-SEP-|\n' +cpdef hash_t hash_string(unicode string) except 0: + chars = string + return hash64(chars, len(string) * sizeof(Py_UNICODE), 0) + + """ cdef class SymbolMap: def __init__(self): diff --git a/spacy/structs.pxd b/spacy/structs.pxd index ee476eed6..8ddddf4d2 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -3,7 +3,9 @@ from libc.stdint cimport uint8_t, uint32_t from .typedefs cimport flags_t, attr_t, id_t, hash_t, univ_tag_t -cdef struct Lexeme: +cdef struct LexemeC: + const float* vec + flags_t flags attr_t id @@ -38,7 +40,7 @@ cdef struct PosTag: cdef struct TokenC: - const Lexeme* lex + const LexemeC* lex Morphology morph univ_tag_t pos int fine_pos diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index add47425c..2837a4c47 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -6,14 +6,14 @@ from preshed.maps cimport PreshMap from cymem.cymem cimport Pool from .typedefs cimport hash_t -from .structs cimport Lexeme, TokenC, Morphology, UniStr +from .structs cimport LexemeC, TokenC, Morphology, UniStr from .strings cimport StringStore from .tokens cimport Tokens from .vocab cimport Vocab, _Cached cdef union LexemesOrTokens: - const Lexeme* const* lexemes + const LexemeC* const* lexemes TokenC* tokens @@ -33,10 +33,10 @@ cdef class Tokenizer: cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1 cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1 - cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes, - vector[Lexeme*] *suffixes) except NULL + cdef UniStr* _split_affixes(self, UniStr* string, vector[LexemeC*] *prefixes, + vector[LexemeC*] *suffixes) except NULL cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string, - vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except -1 + vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1 cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1 cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1 cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1 diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index bede109c7..d0494917a 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -53,7 +53,7 @@ cdef class Tokenizer: cdef int idx = 0 for i, py_string in enumerate(strings): slice_unicode(&string_struct, py_string, 0, len(py_string)) - tokens.push_back(idx, self.vocab.get(tokens.mem, &string_struct)) + tokens.push_back(idx, self.vocab.get(tokens.mem, &string_struct)) idx += len(py_string) + 1 return tokens @@ -75,7 +75,7 @@ cdef class Tokenizer: string (unicode): The string to be tokenized. Returns: - tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes. + tokens (Tokens): A Tokens object, giving access to a sequence of LexemeCs. 
""" cdef int length = len(string) cdef Tokens tokens = Tokens(self.vocab, length) @@ -121,8 +121,8 @@ cdef class Tokenizer: return True cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1: - cdef vector[Lexeme*] prefixes - cdef vector[Lexeme*] suffixes + cdef vector[LexemeC*] prefixes + cdef vector[LexemeC*] suffixes cdef hash_t orig_key cdef int orig_size orig_key = span.key @@ -131,8 +131,8 @@ cdef class Tokenizer: self._attach_tokens(tokens, start, span, &prefixes, &suffixes) self._save_cached(&tokens.data[orig_size], orig_key, tokens.length - orig_size) - cdef UniStr* _split_affixes(self, UniStr* string, vector[const Lexeme*] *prefixes, - vector[const Lexeme*] *suffixes) except NULL: + cdef UniStr* _split_affixes(self, UniStr* string, vector[const LexemeC*] *prefixes, + vector[const LexemeC*] *suffixes) except NULL: cdef size_t i cdef UniStr prefix cdef UniStr suffix @@ -174,12 +174,12 @@ cdef class Tokenizer: return string cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string, - vector[const Lexeme*] *prefixes, - vector[const Lexeme*] *suffixes) except -1: + vector[const LexemeC*] *prefixes, + vector[const LexemeC*] *suffixes) except -1: cdef bint cache_hit cdef int split - cdef const Lexeme* const* lexemes - cdef Lexeme* lexeme + cdef const LexemeC* const* lexemes + cdef LexemeC* lexeme cdef UniStr span cdef int i if prefixes.size(): @@ -200,7 +200,7 @@ cdef class Tokenizer: idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span)) slice_unicode(&span, string.chars, split + 1, string.n) idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span)) - cdef vector[const Lexeme*].reverse_iterator it = suffixes.rbegin() + cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin() while it != suffixes.rend(): idx = tokens.push_back(idx, deref(it)) preinc(it) @@ -213,10 +213,10 @@ cdef class Tokenizer: cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) cached.length = n cached.is_lex = True - lexemes = self.mem.alloc(n, sizeof(Lexeme**)) + lexemes = self.mem.alloc(n, sizeof(LexemeC**)) for i in range(n): lexemes[i] = tokens[i].lex - cached.data.lexemes = lexemes + cached.data.lexemes = lexemes self._cache.set(key, cached) cdef int _find_infix(self, Py_UNICODE* chars, size_t length) except -1: @@ -243,7 +243,7 @@ cdef class Tokenizer: cdef unicode form cdef unicode lemma cdef dict props - cdef Lexeme** lexemes + cdef LexemeC** lexemes cdef hash_t hashed cdef UniStr string for chunk, substrings in sorted(rules.items()): @@ -252,7 +252,7 @@ cdef class Tokenizer: form = props['F'] lemma = props.get("L", None) slice_unicode(&string, form, 0, len(form)) - tokens[i].lex = self.vocab.get(self.vocab.mem, &string) + tokens[i].lex = self.vocab.get(self.vocab.mem, &string) if lemma: tokens[i].lemma = self.vocab.strings[lemma] if 'pos' in props: diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index e50e688ac..35a7c2b63 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -5,13 +5,13 @@ from cython.view cimport array as cvarray from cymem.cymem cimport Pool from thinc.typedefs cimport atom_t -from .typedefs cimport flags_t, attr_id_t, attr_t -from .structs cimport Morphology, TokenC, Lexeme +from .typedefs cimport flags_t, attr_id_t, attr_t, univ_tag_t +from .structs cimport Morphology, TokenC, LexemeC from .vocab cimport Vocab from .strings cimport StringStore -ctypedef const Lexeme* const_Lexeme_ptr +ctypedef const LexemeC* const_Lexeme_ptr ctypedef TokenC* TokenC_ptr ctypedef fused LexemeOrToken: @@ -19,10 +19,10 @@ ctypedef fused 
LexemeOrToken: TokenC_ptr -cdef attr_t get_lex_attr(const Lexeme* lex, attr_id_t feat_name) nogil +cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil cdef attr_t get_token_attr(const TokenC* lex, attr_id_t feat_name) nogil -cdef inline bint check_flag(const Lexeme* lexeme, attr_id_t flag_id) nogil: +cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil: return lexeme.flags & (1 << flag_id) @@ -42,5 +42,32 @@ cdef class Tokens: cdef class Token: - cdef Tokens _seq - cdef readonly int i + cdef cvarray vec + + cdef readonly flags_t flags + + cdef readonly attr_t id + cdef readonly attr_t sic + cdef readonly attr_t dense + cdef readonly attr_t shape + cdef readonly attr_t prefix + cdef readonly attr_t suffix + + cdef readonly attr_t length + cdef readonly attr_t cluster + cdef readonly attr_t pos_type + + cdef readonly float prob + cdef readonly float sentiment + + cdef readonly Morphology morph + cdef readonly univ_tag_t pos + cdef readonly int fine_pos + cdef readonly int idx + cdef readonly int lemma + cdef readonly int sense + cdef readonly int dep_tag + + cdef readonly int head_offset + cdef readonly uint32_t l_kids + cdef readonly uint32_t r_kids diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 0d8bc91b0..7e73ab4f8 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -32,7 +32,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil: return get_lex_attr(token.lex, feat_name) -cdef attr_t get_lex_attr(const Lexeme* lex, attr_id_t feat_name) nogil: +cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil: if feat_name < (sizeof(flags_t) * 8): return check_flag(lex, feat_name) elif feat_name == ID: @@ -85,7 +85,7 @@ cdef class Tokens: token (Token): """ bounds_check(i, self.length, PADDING) - return Token(self, i) + return cinit_token(&self.data[i]) def __iter__(self): """Iterate over the tokens. @@ -174,26 +174,57 @@ cdef class Tokens: self.data[i].lex = &EMPTY_LEXEME -@cython.freelist(64) +cdef Token cinit_token(const TokenC* c_tok): + cdef const LexemeC* lex = c_tok.lex + cdef Token py_tok = Token.__new__(Token) + + cyarr = cvarray(shape=(300,), itemsize=sizeof(float), format="i") + py_tok.vec = cyarr + + py_tok.flags = lex.flags + py_tok.id = lex.id + py_tok.sic = lex.sic + py_tok.dense = lex.dense + py_tok.shape = lex.shape + py_tok.prefix = lex.prefix + py_tok.suffix = lex.suffix + py_tok.length = lex.length + py_tok.cluster = lex.cluster + py_tok.pos_type = lex.pos_type + + py_tok.prob = lex.prob + py_tok.sentiment = lex.sentiment + + py_tok.morph = c_tok.morph + py_tok.pos = c_tok.pos + py_tok.fine_pos = c_tok.fine_pos + py_tok.idx = c_tok.idx + py_tok.lemma = c_tok.lemma + py_tok.sense = c_tok.sense + py_tok.dep_tag = c_tok.dep_tag + py_tok.head_offset = c_tok.head + py_tok.l_kids = c_tok.l_kids + py_tok.r_kids = c_tok.r_kids + return py_tok + + cdef class Token: """An individual token. - - Internally, the Token is a tuple (i, tokens) --- it delegates to the Tokens - object. 
""" - def __init__(self, Tokens tokens, int i): - self._seq = tokens - self.i = i + def __init__(self): + pass + #self._seq = tokens + #self.i = i - def __unicode__(self): - cdef const TokenC* t = &self._seq.data[self.i] - cdef int end_idx = t.idx + t.lex.length - if self.i + 1 == self._seq.length: - return self.string - if end_idx == t[1].idx: - return self.string - else: - return self.string + ' ' + #def __unicode__(self): + # cdef const TokenC* t = &self._seq.data[self.i] + # cdef int end_idx = t.idx + t.lex.length + # if self.i + 1 == self._seq.length: + # return self.string + # if end_idx == t[1].idx: + # return self.string + # else: + # return self.string + ' ' def __len__(self): """The number of unicode code-points in the original string. @@ -201,87 +232,87 @@ cdef class Token: Returns: length (int): """ - return self._seq.data[self.i].lex.length + return self.length - property idx: - """The index into the original string at which the token starts. + #property idx: + # """The index into the original string at which the token starts. - The following is supposed to always be true: - - >>> original_string[token.idx:token.idx len(token) == token.string - """ - def __get__(self): - return self._seq.data[self.i].idx + # The following is supposed to always be true: + # + # >>> original_string[token.idx:token.idx len(token) == token.string + # """ + # def __get__(self): + # return self._seq.data[self.i].idx - property cluster: - """The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering - - Similar words have better-than-chance likelihood of having similar cluster - IDs, although the clustering is quite noisy. Cluster IDs make good features, - and help to make models slightly more robust to domain variation. + #property cluster: + # """The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering + # + # Similar words have better-than-chance likelihood of having similar cluster + # IDs, although the clustering is quite noisy. Cluster IDs make good features, + # and help to make models slightly more robust to domain variation. - A common trick is to use only the first N bits of a cluster ID in a feature, - as the more general part of the hierarchical clustering is often more accurate - than the lower categories. + # A common trick is to use only the first N bits of a cluster ID in a feature, + # as the more general part of the hierarchical clustering is often more accurate + # than the lower categories. - To assist in this, I encode the cluster IDs little-endian, to allow a simple - bit-mask: + # To assist in this, I encode the cluster IDs little-endian, to allow a simple + # bit-mask: - >>> six_bits = cluster & (2**6 - 1) - """ - def __get__(self): - return self._seq.data[self.i].lex.cluster + # >>> six_bits = cluster & (2**6 - 1) + # """ + # def __get__(self): + # return self._seq.data[self.i].lex.cluster - property string: - """The unicode string of the word, with no whitespace padding.""" - def __get__(self): - cdef const TokenC* t = &self._seq.data[self.i] - if t.lex.sic == 0: - return '' - cdef bytes utf8string = self._seq.vocab.strings[t.lex.sic] - return utf8string.decode('utf8') + #property string: + # """The unicode string of the word, with no whitespace padding.""" + # def __get__(self): + # cdef const TokenC* t = &self._seq.data[self.i] + # if t.lex.sic == 0: + # return '' + # cdef bytes utf8string = self._seq.vocab.strings[t.lex.sic] + # return utf8string.decode('utf8') - property lemma: - """The unicode string of the word's lemma. 
If no part-of-speech tag is - assigned, the most common part-of-speech tag of the word is used. - """ - def __get__(self): - cdef const TokenC* t = &self._seq.data[self.i] - if t.lemma == 0: - return self.string - cdef bytes utf8string = self._seq.vocab.strings[t.lemma] - return utf8string.decode('utf8') + #property lemma: + # """The unicode string of the word's lemma. If no part-of-speech tag is + # assigned, the most common part-of-speech tag of the word is used. + # """ + # def __get__(self): + # cdef const TokenC* t = &self._seq.data[self.i] + # if t.lemma == 0: + # return self.string + # cdef bytes utf8string = self._seq.vocab.strings[t.lemma] + # return utf8string.decode('utf8') - property dep_tag: - """The ID integer of the word's dependency label. If no parse has been - assigned, defaults to 0. - """ - def __get__(self): - return self._seq.data[self.i].dep_tag + #property dep_tag: + # """The ID integer of the word's dependency label. If no parse has been + # assigned, defaults to 0. + # """ + # def __get__(self): + # return self._seq.data[self.i].dep_tag - property pos: - """The ID integer of the word's part-of-speech tag, from the 13-tag - Google Universal Tag Set. Constants for this tag set are available in - spacy.typedefs. - """ - def __get__(self): - return self._seq.data[self.i].pos + #property pos: + # """The ID integer of the word's part-of-speech tag, from the 13-tag + # Google Universal Tag Set. Constants for this tag set are available in + # spacy.typedefs. + # """ + # def __get__(self): + # return self._seq.data[self.i].pos - property fine_pos: - """The ID integer of the word's fine-grained part-of-speech tag, as assigned - by the tagger model. Fine-grained tags include morphological information, - and other distinctions, and allow a more accurate tagger to be trained. - """ + #property fine_pos: + # """The ID integer of the word's fine-grained part-of-speech tag, as assigned + # by the tagger model. Fine-grained tags include morphological information, + # and other distinctions, and allow a more accurate tagger to be trained. 
+ # """ - def __get__(self): - return self._seq.data[self.i].fine_pos + # def __get__(self): + # return self._seq.data[self.i].fine_pos - property sic: - def __get__(self): - return self._seq.data[self.i].lex.sic + #property sic: + # def __get__(self): + # return self._seq.data[self.i].lex.sic - property head: - """The token predicted by the parser to be the head of the current token.""" - def __get__(self): - cdef const TokenC* t = &self._seq.data[self.i] - return Token(self._seq, self.i + t.head) + #property head: + # """The token predicted by the parser to be the head of the current token.""" + # def __get__(self): + # cdef const TokenC* t = &self._seq.data[self.i] + # return Token(self._seq, self.i + t.head) diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index dc3eb7aba..203d3c7a5 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -4,16 +4,16 @@ from preshed.maps cimport PreshMap from cymem.cymem cimport Pool from murmurhash.mrmr cimport hash64 -from .structs cimport Lexeme, TokenC, UniStr +from .structs cimport LexemeC, TokenC, UniStr from .typedefs cimport utf8_t, id_t, hash_t from .strings cimport StringStore -cdef Lexeme EMPTY_LEXEME +cdef LexemeC EMPTY_LEXEME cdef union LexemesOrTokens: - const Lexeme* const* lexemes + const LexemeC* const* lexemes TokenC* tokens @@ -27,9 +27,9 @@ cdef class Vocab: cpdef public get_lex_props cdef Pool mem cpdef readonly StringStore strings - cdef vector[Lexeme*] lexemes + cdef vector[LexemeC*] lexemes - cdef const Lexeme* get(self, Pool mem, UniStr* s) except NULL + cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL cdef PreshMap _map diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 6b6fee922..a63edb6b4 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -2,20 +2,27 @@ from libc.stdio cimport fopen, fclose, fread, fwrite, FILE from libc.string cimport memset from os import path +import codecs from .lexeme cimport EMPTY_LEXEME from .lexeme cimport init as lexeme_init from .strings cimport slice_unicode -from .typedefs cimport flags_t +from .strings cimport hash_string from .orth cimport word_shape -memset(&EMPTY_LEXEME, 0, sizeof(Lexeme)) +DEF MAX_VEC_SIZE = 100000 -cpdef Lexeme init_lexeme(id_t i, unicode string, hash_t hashed, +cdef float[MAX_VEC_SIZE] EMPTY_VEC +memset(EMPTY_VEC, 0, sizeof(EMPTY_VEC)) +memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) +EMPTY_LEXEME.vec = EMPTY_VEC + + +cdef LexemeC init_lexeme(id_t i, unicode string, hash_t hashed, StringStore string_store, dict props) except *: - cdef Lexeme lex + cdef LexemeC lex lex.id = i lex.length = len(string) lex.sic = string_store[string] @@ -28,13 +35,12 @@ cpdef Lexeme init_lexeme(id_t i, unicode string, hash_t hashed, lex.suffix = string_store[string[-3:]] lex.shape = string_store[word_shape(string)] - cdef object flags_val = props.get('flags', 0) - lex.flags = flags_val + lex.flags = props.get('flags', 0) return lex cdef class Vocab: - '''A map container for a language's Lexeme structs. + '''A map container for a language's LexemeC structs. ''' def __init__(self, data_dir=None, get_lex_props=None): self.mem = Pool() @@ -50,24 +56,25 @@ cdef class Vocab: if not path.isdir(data_dir): raise IOError("Path %s is a file, not a dir -- cannot load Vocab." 
% data_dir) self.strings.load(path.join(data_dir, 'strings.txt')) - self.load(path.join(data_dir, 'lexemes.bin')) + self.load_lexemes(path.join(data_dir, 'lexemes.bin')) + #self.load_vectors(path.join(data_dir, 'deps.words')) def __len__(self): """The current number of lexemes stored.""" return self.lexemes.size() - cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL: - '''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme + cdef const LexemeC* get(self, Pool mem, UniStr* string) except NULL: + '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme if necessary, using memory acquired from the given pool. If the pool is the lexicon's own memory, the lexeme is saved in the lexicon.''' - cdef Lexeme* lex - lex = self._map.get(string.key) + cdef LexemeC* lex + lex = self._map.get(string.key) if lex != NULL: return lex if string.n < 3: mem = self.mem cdef unicode py_string = string.chars[:string.n] - lex = mem.alloc(sizeof(Lexeme), 1) + lex = mem.alloc(sizeof(LexemeC), 1) lex[0] = init_lexeme(self.lexemes.size(), py_string, string.key, self.strings, self.get_lex_props(py_string)) if mem is self.mem: @@ -81,13 +88,13 @@ cdef class Vocab: def __getitem__(self, id_or_string): '''Retrieve a lexeme, given an int ID or a unicode string. If a previously - unseen unicode string is given, a new Lexeme is created and stored. + unseen unicode string is given, a new LexemeC is created and stored. This function relies on Cython's struct-to-dict conversion. Python clients receive a dict keyed by strings (byte or unicode, depending on Python 2/3), - with int values. Cython clients can instead receive a Lexeme struct value. + with int values. Cython clients can instead receive a LexemeC struct value. More efficient Cython access is provided by Lexicon.get, which returns - a Lexeme*. + a LexemeC*. Args: id_or_string (int or unicode): The integer ID of a word, or its unicode @@ -96,24 +103,26 @@ cdef class Vocab: is raised. Returns: - lexeme (dict): A Lexeme struct instance, which Cython translates into + lexeme (dict): A LexemeC struct instance, which Cython translates into a dict if the operator is called from Python. ''' if type(id_or_string) == int: if id_or_string >= self.lexemes.size(): raise IndexError - return self.lexemes.at(id_or_string)[0] + return {} + #return self.lexemes.at(id_or_string)[0] cdef UniStr string slice_unicode(&string, id_or_string, 0, len(id_or_string)) - cdef const Lexeme* lexeme = self.get(self.mem, &string) - return lexeme[0] + cdef const LexemeC* lexeme = self.get(self.mem, &string) + return {} + #return lexeme[0] def __setitem__(self, unicode uni_string, dict props): cdef UniStr s slice_unicode(&s, uni_string, 0, len(uni_string)) # Cast through the const here, since we're allowed to change our own - # Lexemes. - lex = self.get(self.mem, &s) + # LexemeCs. 
+ lex = self.get(self.mem, &s) lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props) def dump(self, loc): @@ -128,30 +137,30 @@ cdef class Vocab: key = self._map.c_map.cells[i].key if key == 0: continue - lexeme = self._map.c_map.cells[i].value + lexeme = self._map.c_map.cells[i].value st = fwrite(&key, sizeof(key), 1, fp) assert st == 1 - st = fwrite(lexeme, sizeof(Lexeme), 1, fp) + st = fwrite(lexeme, sizeof(LexemeC), 1, fp) assert st == 1 st = fclose(fp) assert st == 0 - def load(self, loc): + def load_lexemes(self, loc): if not path.exists(loc): - raise IOError('Lexemes file not found at %s' % loc) + raise IOError('LexemeCs file not found at %s' % loc) cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc cdef FILE* fp = fopen(bytes_loc, 'rb') assert fp != NULL cdef size_t st - cdef Lexeme* lexeme + cdef LexemeC* lexeme cdef hash_t key i = 0 while True: st = fread(&key, sizeof(key), 1, fp) if st != 1: break - lexeme = self.mem.alloc(sizeof(Lexeme), 1) - st = fread(lexeme, sizeof(Lexeme), 1, fp) + lexeme = self.mem.alloc(sizeof(LexemeC), 1) + st = fread(lexeme, sizeof(LexemeC), 1, fp) if st != 1: break self._map.set(key, lexeme) @@ -160,3 +169,29 @@ cdef class Vocab: self.lexemes[lexeme.id] = lexeme i += 1 fclose(fp) + + def load_vectors(self, loc): + cdef int i + cdef unicode line + cdef unicode word + cdef unicode val_str + cdef hash_t key + cdef LexemeC* lex + cdef float* vec + + with codecs.open(loc, 'r', 'utf8') as file_: + for line in file_: + pieces = line.split() + word = pieces.pop(0) + if len(pieces) >= MAX_VEC_SIZE: + sizes = (len(pieces), MAX_VEC_SIZE) + msg = ("Your vector is %d elements." + "The compile-time limit is %d elements." % sizes) + raise ValueError(msg) + key = hash_string(word) + lex = self._map.get(key) + if lex is not NULL: + vec = self.mem.alloc(len(pieces), sizeof(float)) + for i, val_str in enumerate(pieces): + vec[i] = float(val_str) + lex.vec = vec
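

A minimal sketch of the plain-text vector format that the new Vocab.load_vectors parses, assuming only what the parsing loop above shows: one entry per line, the word first, then its float components, all whitespace-separated, with fewer than MAX_VEC_SIZE (100000) values per row. The write_vectors helper, the example words, and the file location are illustrative assumptions; only the loading code in the patch is authoritative.

    import codecs
    from os import path

    def write_vectors(loc, vectors):
        # vectors: dict mapping unicode word -> sequence of floats.
        # Hypothetical helper; mirrors what load_vectors expects to read back.
        with codecs.open(loc, 'w', 'utf8') as file_:
            for word, values in vectors.items():
                if len(values) >= 100000:  # load_vectors' compile-time MAX_VEC_SIZE
                    raise ValueError("Vector row exceeds the compile-time limit")
                file_.write(word + u' ' + u' '.join(str(v) for v in values) + u'\n')

    # e.g., assuming the location hinted at by the commented-out call above:
    # write_vectors(path.join(data_dir, 'vocab', 'deps.words'),
    #               {u'apple': [0.1, -0.2, 0.3]})
    # load_vectors only fills LexemeC.vec for words already present in the
    # vocab's map; unknown words keep EMPTY_VEC.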