* Tmp commit. Refactoring to create a Python Lexeme class.

Matthew Honnibal 2015-01-12 10:26:22 +11:00
parent 61904e590f
commit ce2edd6312
16 changed files with 281 additions and 173 deletions

View File

@ -1,6 +1,7 @@
from __future__ import unicode_literals
from os import path
from .. import orth
from ..vocab import Vocab
from ..tokenizer import Tokenizer
from ..syntax.parser import GreedyParser
@ -10,12 +11,10 @@ from .pos import POS_TAGS
from .attrs import get_flags
DATA_DIR = path.join(path.dirname(__file__), 'data')
def get_lex_props(string):
return {'flags': get_flags(string), 'dense': 1}
LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')
class English(object):
"""The English NLP pipeline.
@ -44,14 +43,16 @@ class English(object):
parser (spacy.syntax.parser.GreedyParser):
A greedy shift-reduce dependency parser.
"""
def __init__(self, data_dir=None):
if data_dir is None:
data_dir = path.join(path.dirname(__file__), 'data')
def __init__(self, data_dir=LOCAL_DATA_DIR):
self._data_dir = data_dir
self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab'),
get_lex_props=get_lex_props)
tag_names = list(POS_TAGS.keys())
tag_names.sort()
if data_dir is None:
self.tokenizer = Tokenizer(self.vocab, {}, None, None, None,
POS_TAGS, tag_names)
else:
self.tokenizer = Tokenizer.from_dir(self.vocab, path.join(data_dir, 'tokenizer'),
POS_TAGS, tag_names)
self.strings = self.vocab.strings

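For orientation, a hedged usage sketch of the refactored constructor, using only names visible in this hunk (the spacy.en import path is assumed from the package layout; the assertions are illustrative, not part of the commit):

from spacy.en import English, LOCAL_DATA_DIR

nlp = English()                           # same as English(data_dir=LOCAL_DATA_DIR)
assert nlp._data_dir == LOCAL_DATA_DIR    # the directory is stashed on the instance
print(len(nlp.vocab))                     # Vocab.__len__: number of lexemes loaded
assert nlp.strings is nlp.vocab.strings   # strings is an alias for the vocab's store
# English(data_dir=None) skips Tokenizer.from_dir and builds a bare in-memory tokenizer.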
View File

@ -4,7 +4,7 @@ import tarfile
import shutil
import requests
URL = 'https://s3-us-west-1.amazonaws.com/media.spacynlp.com/en.tgz'
PARSER_URL = 'https://s3-us-west-1.amazonaws.com/media.spacynlp.com/en.tgz'
DEST_DIR = path.join(path.dirname(__file__), 'data', 'deps')

View File

@ -3,7 +3,7 @@ from cymem.cymem cimport Pool
from .._ml cimport Model
from ..strings cimport StringStore
from ..structs cimport TokenC, Lexeme, Morphology, PosTag
from ..structs cimport TokenC, LexemeC, Morphology, PosTag
from ..typedefs cimport univ_tag_t
from .lemmatizer import Lemmatizer
@ -21,5 +21,5 @@ cdef class EnPosTagger:
cdef readonly int n_tags
cdef int set_morph(self, const int i, TokenC* tokens) except -1
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1

View File

@ -12,7 +12,7 @@ from ..typedefs cimport univ_tag_t
from ..typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
from ..typedefs cimport X, PUNCT, EOL
from ..typedefs cimport id_t
from ..structs cimport TokenC, Morphology, Lexeme
from ..structs cimport TokenC, Morphology, LexemeC
from ..tokens cimport Tokens
from ..morphology cimport set_morph_from_dict
from .._ml cimport arg_max
@ -290,7 +290,7 @@ cdef class EnPosTagger:
tokens[i].lemma = cached.lemma
tokens[i].morph = cached.morph
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1:
if self.lemmatizer is None:
return lex.sic
cdef bytes py_string = self.strings[lex.sic]

View File

@ -1,21 +1,21 @@
from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t, attr_id_t
from .typedefs cimport ID, SIC, DENSE, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, POS_TYPE
from .structs cimport Lexeme
from .structs cimport LexemeC
from .strings cimport StringStore
cdef Lexeme EMPTY_LEXEME
cdef LexemeC EMPTY_LEXEME
cpdef Lexeme init(id_t i, unicode string, hash_t hashed, StringStore store,
cdef LexemeC init(id_t i, unicode string, hash_t hashed, StringStore store,
dict props) except *
cdef inline bint check_flag(const Lexeme* lexeme, attr_id_t flag_id) nogil:
cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
return lexeme.flags & (1 << flag_id)
cdef inline attr_t get_attr(const Lexeme* lex, attr_id_t feat_name) nogil:
cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
if feat_name < (sizeof(flags_t) * 8):
return check_flag(lex, feat_name)
elif feat_name == ID:

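The flag helpers above are plain bit tests against the packed flags_t field, with get_attr routing any feature ID below sizeof(flags_t) * 8 to check_flag. A hedged Python sketch of the same idea (the flag IDs are invented for illustration):

IS_ALPHA, IS_DIGIT = 0, 1                  # hypothetical flag IDs
flags = 1 << IS_ALPHA                      # a flags value with only IS_ALPHA set

def check_flag(flags, flag_id):
    # mirrors: lexeme.flags & (1 << flag_id)
    return bool(flags & (1 << flag_id))

assert check_flag(flags, IS_ALPHA)
assert not check_flag(flags, IS_DIGIT)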
View File

@ -7,12 +7,12 @@ from libc.string cimport memset
from .orth cimport word_shape
memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
cdef LexemeC init(id_t i, unicode string, hash_t hashed,
StringStore string_store, dict props) except *:
cdef Lexeme lex
cdef LexemeC lex
lex.id = i
lex.length = len(string)
lex.sic = string_store[string]
@ -27,3 +27,6 @@ cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
lex.flags = props.get('flags', 0)
return lex

View File

@ -1,4 +1,4 @@
from .structs cimport TokenC, Lexeme, Morphology, PosTag
from .structs cimport TokenC, Morphology, PosTag
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1

View File

@ -3,6 +3,9 @@ from preshed.maps cimport PreshMap
from murmurhash.mrmr cimport hash64
from .structs cimport Utf8Str, UniStr
from .typedefs cimport hash_t
cpdef hash_t hash_string(unicode string) except 0
cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end) nogil:

View File

@ -1,6 +1,7 @@
import codecs
from libc.string cimport memcpy
from murmurhash.mrmr cimport hash64
from .typedefs cimport hash_t
@ -9,6 +10,11 @@ from .typedefs cimport hash_t
SEPARATOR = '\n|-SEP-|\n'
cpdef hash_t hash_string(unicode string) except 0:
chars = <Py_UNICODE*>string
return hash64(chars, len(string) * sizeof(Py_UNICODE), 0)
"""
cdef class SymbolMap:
def __init__(self):

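Since hash_string is declared cpdef, it is callable from Python as well as Cython. A hedged sketch of what it provides (the spacy.strings module path and the example word are assumptions for illustration):

from spacy.strings import hash_string

key = hash_string(u'apple')            # 64-bit MurmurHash over the Py_UNICODE buffer
assert key == hash_string(u'apple')    # deterministic for equal strings
assert key != 0                        # `except 0` reserves zero for signalling errors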
View File

@ -3,7 +3,9 @@ from libc.stdint cimport uint8_t, uint32_t
from .typedefs cimport flags_t, attr_t, id_t, hash_t, univ_tag_t
cdef struct Lexeme:
cdef struct LexemeC:
const float* vec
flags_t flags
attr_t id
@ -38,7 +40,7 @@ cdef struct PosTag:
cdef struct TokenC:
const Lexeme* lex
const LexemeC* lex
Morphology morph
univ_tag_t pos
int fine_pos

View File

@ -6,14 +6,14 @@ from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool
from .typedefs cimport hash_t
from .structs cimport Lexeme, TokenC, Morphology, UniStr
from .structs cimport LexemeC, TokenC, Morphology, UniStr
from .strings cimport StringStore
from .tokens cimport Tokens
from .vocab cimport Vocab, _Cached
cdef union LexemesOrTokens:
const Lexeme* const* lexemes
const LexemeC* const* lexemes
TokenC* tokens
@ -33,10 +33,10 @@ cdef class Tokenizer:
cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1
cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
vector[Lexeme*] *suffixes) except NULL
cdef UniStr* _split_affixes(self, UniStr* string, vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes) except NULL
cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except -1
vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1

View File

@ -53,7 +53,7 @@ cdef class Tokenizer:
cdef int idx = 0
for i, py_string in enumerate(strings):
slice_unicode(&string_struct, py_string, 0, len(py_string))
tokens.push_back(idx, <const Lexeme*>self.vocab.get(tokens.mem, &string_struct))
tokens.push_back(idx, <const LexemeC*>self.vocab.get(tokens.mem, &string_struct))
idx += len(py_string) + 1
return tokens
@ -75,7 +75,7 @@ cdef class Tokenizer:
string (unicode): The string to be tokenized.
Returns:
tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes.
tokens (Tokens): A Tokens object, giving access to a sequence of LexemeCs.
"""
cdef int length = len(string)
cdef Tokens tokens = Tokens(self.vocab, length)
@ -121,8 +121,8 @@ cdef class Tokenizer:
return True
cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
cdef vector[Lexeme*] prefixes
cdef vector[Lexeme*] suffixes
cdef vector[LexemeC*] prefixes
cdef vector[LexemeC*] suffixes
cdef hash_t orig_key
cdef int orig_size
orig_key = span.key
@ -131,8 +131,8 @@ cdef class Tokenizer:
self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
self._save_cached(&tokens.data[orig_size], orig_key, tokens.length - orig_size)
cdef UniStr* _split_affixes(self, UniStr* string, vector[const Lexeme*] *prefixes,
vector[const Lexeme*] *suffixes) except NULL:
cdef UniStr* _split_affixes(self, UniStr* string, vector[const LexemeC*] *prefixes,
vector[const LexemeC*] *suffixes) except NULL:
cdef size_t i
cdef UniStr prefix
cdef UniStr suffix
@ -174,12 +174,12 @@ cdef class Tokenizer:
return string
cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
vector[const Lexeme*] *prefixes,
vector[const Lexeme*] *suffixes) except -1:
vector[const LexemeC*] *prefixes,
vector[const LexemeC*] *suffixes) except -1:
cdef bint cache_hit
cdef int split
cdef const Lexeme* const* lexemes
cdef Lexeme* lexeme
cdef const LexemeC* const* lexemes
cdef LexemeC* lexeme
cdef UniStr span
cdef int i
if prefixes.size():
@ -200,7 +200,7 @@ cdef class Tokenizer:
idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span))
slice_unicode(&span, string.chars, split + 1, string.n)
idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span))
cdef vector[const Lexeme*].reverse_iterator it = suffixes.rbegin()
cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
while it != suffixes.rend():
idx = tokens.push_back(idx, deref(it))
preinc(it)
@ -213,10 +213,10 @@ cdef class Tokenizer:
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
cached.length = n
cached.is_lex = True
lexemes = <const Lexeme**>self.mem.alloc(n, sizeof(Lexeme**))
lexemes = <const LexemeC**>self.mem.alloc(n, sizeof(LexemeC**))
for i in range(n):
lexemes[i] = tokens[i].lex
cached.data.lexemes = <const Lexeme* const*>lexemes
cached.data.lexemes = <const LexemeC* const*>lexemes
self._cache.set(key, cached)
cdef int _find_infix(self, Py_UNICODE* chars, size_t length) except -1:
@ -243,7 +243,7 @@ cdef class Tokenizer:
cdef unicode form
cdef unicode lemma
cdef dict props
cdef Lexeme** lexemes
cdef LexemeC** lexemes
cdef hash_t hashed
cdef UniStr string
for chunk, substrings in sorted(rules.items()):
@ -252,7 +252,7 @@ cdef class Tokenizer:
form = props['F']
lemma = props.get("L", None)
slice_unicode(&string, form, 0, len(form))
tokens[i].lex = <Lexeme*>self.vocab.get(self.vocab.mem, &string)
tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, &string)
if lemma:
tokens[i].lemma = self.vocab.strings[lemma]
if 'pos' in props:

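The special-case rules consumed at the end of this file map a chunk to a list of per-token property dicts. A hedged sketch of the expected shape, based on the keys read above (the contraction and lemma are illustrative):

# 'F' is the surface form; 'L' (optional) is a lemma; 'pos' (optional) a tag.
special_cases = {
    u"don't": [
        {'F': u"do"},
        {'F': u"n't", 'L': u"not"},
    ],
}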
View File

@ -5,13 +5,13 @@ from cython.view cimport array as cvarray
from cymem.cymem cimport Pool
from thinc.typedefs cimport atom_t
from .typedefs cimport flags_t, attr_id_t, attr_t
from .structs cimport Morphology, TokenC, Lexeme
from .typedefs cimport flags_t, attr_id_t, attr_t, univ_tag_t
from .structs cimport Morphology, TokenC, LexemeC
from .vocab cimport Vocab
from .strings cimport StringStore
ctypedef const Lexeme* const_Lexeme_ptr
ctypedef const LexemeC* const_Lexeme_ptr
ctypedef TokenC* TokenC_ptr
ctypedef fused LexemeOrToken:
@ -19,10 +19,10 @@ ctypedef fused LexemeOrToken:
TokenC_ptr
cdef attr_t get_lex_attr(const Lexeme* lex, attr_id_t feat_name) nogil
cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil
cdef attr_t get_token_attr(const TokenC* lex, attr_id_t feat_name) nogil
cdef inline bint check_flag(const Lexeme* lexeme, attr_id_t flag_id) nogil:
cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
return lexeme.flags & (1 << flag_id)
@ -42,5 +42,32 @@ cdef class Tokens:
cdef class Token:
cdef Tokens _seq
cdef readonly int i
cdef cvarray vec
cdef readonly flags_t flags
cdef readonly attr_t id
cdef readonly attr_t sic
cdef readonly attr_t dense
cdef readonly attr_t shape
cdef readonly attr_t prefix
cdef readonly attr_t suffix
cdef readonly attr_t length
cdef readonly attr_t cluster
cdef readonly attr_t pos_type
cdef readonly float prob
cdef readonly float sentiment
cdef readonly Morphology morph
cdef readonly univ_tag_t pos
cdef readonly int fine_pos
cdef readonly int idx
cdef readonly int lemma
cdef readonly int sense
cdef readonly int dep_tag
cdef readonly int head_offset
cdef readonly uint32_t l_kids
cdef readonly uint32_t r_kids

View File

@ -32,7 +32,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
return get_lex_attr(token.lex, feat_name)
cdef attr_t get_lex_attr(const Lexeme* lex, attr_id_t feat_name) nogil:
cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
if feat_name < (sizeof(flags_t) * 8):
return check_flag(lex, feat_name)
elif feat_name == ID:
@ -85,7 +85,7 @@ cdef class Tokens:
token (Token):
"""
bounds_check(i, self.length, PADDING)
return Token(self, i)
return cinit_token(&self.data[i])
def __iter__(self):
"""Iterate over the tokens.
@ -174,26 +174,57 @@ cdef class Tokens:
self.data[i].lex = &EMPTY_LEXEME
@cython.freelist(64)
cdef Token cinit_token(const TokenC* c_tok):
cdef const LexemeC* lex = c_tok.lex
cdef Token py_tok = Token.__new__(Token)
cyarr = cvarray(shape=(300,), itemsize=sizeof(float), format="i")
py_tok.vec = cyarr
py_tok.flags = lex.flags
py_tok.id = lex.id
py_tok.sic = lex.sic
py_tok.dense = lex.dense
py_tok.shape = lex.shape
py_tok.prefix = lex.prefix
py_tok.suffix = lex.suffix
py_tok.length = lex.length
py_tok.cluster = lex.cluster
py_tok.pos_type = lex.pos_type
py_tok.prob = lex.prob
py_tok.sentiment = lex.sentiment
py_tok.morph = c_tok.morph
py_tok.pos = c_tok.pos
py_tok.fine_pos = c_tok.fine_pos
py_tok.idx = c_tok.idx
py_tok.lemma = c_tok.lemma
py_tok.sense = c_tok.sense
py_tok.dep_tag = c_tok.dep_tag
py_tok.head_offset = c_tok.head
py_tok.l_kids = c_tok.l_kids
py_tok.r_kids = c_tok.r_kids
return py_tok
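Because cinit_token copies every field eagerly, a Token no longer needs a back-reference to its Tokens sequence for basic attribute access. A hedged sketch of Python-level use (tokens is any already-built Tokens instance; the index is illustrative):

token = tokens[5]          # __getitem__ now returns cinit_token(&tokens.data[5])
# cdef readonly attributes declared in tokens.pxd and filled in cinit_token:
print(token.idx, token.length, token.cluster, token.pos, token.lemma)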
cdef class Token:
"""An individual token.
Internally, the Token is a tuple (i, tokens) --- it delegates to the Tokens
object.
"""
def __init__(self, Tokens tokens, int i):
self._seq = tokens
self.i = i
def __init__(self):
pass
#self._seq = tokens
#self.i = i
def __unicode__(self):
cdef const TokenC* t = &self._seq.data[self.i]
cdef int end_idx = t.idx + t.lex.length
if self.i + 1 == self._seq.length:
return self.string
if end_idx == t[1].idx:
return self.string
else:
return self.string + ' '
#def __unicode__(self):
# cdef const TokenC* t = &self._seq.data[self.i]
# cdef int end_idx = t.idx + t.lex.length
# if self.i + 1 == self._seq.length:
# return self.string
# if end_idx == t[1].idx:
# return self.string
# else:
# return self.string + ' '
def __len__(self):
"""The number of unicode code-points in the original string.
@ -201,87 +232,87 @@ cdef class Token:
Returns:
length (int):
"""
return self._seq.data[self.i].lex.length
return self.length
property idx:
"""The index into the original string at which the token starts.
#property idx:
# """The index into the original string at which the token starts.
The following is supposed to always be true:
# The following is supposed to always be true:
#
#        >>> original_string[token.idx:token.idx + len(token)] == token.string
# """
# def __get__(self):
# return self._seq.data[self.i].idx
>>> original_string[token.idx:token.idx + len(token)] == token.string
"""
def __get__(self):
return self._seq.data[self.i].idx
#property cluster:
# """The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering
#
# Similar words have better-than-chance likelihood of having similar cluster
# IDs, although the clustering is quite noisy. Cluster IDs make good features,
# and help to make models slightly more robust to domain variation.
property cluster:
"""The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering
# A common trick is to use only the first N bits of a cluster ID in a feature,
# as the more general part of the hierarchical clustering is often more accurate
# than the lower categories.
Similar words have better-than-chance likelihood of having similar cluster
IDs, although the clustering is quite noisy. Cluster IDs make good features,
and help to make models slightly more robust to domain variation.
# To assist in this, I encode the cluster IDs little-endian, to allow a simple
# bit-mask:
A common trick is to use only the first N bits of a cluster ID in a feature,
as the more general part of the hierarchical clustering is often more accurate
than the lower categories.
# >>> six_bits = cluster & (2**6 - 1)
# """
# def __get__(self):
# return self._seq.data[self.i].lex.cluster
To assist in this, I encode the cluster IDs little-endian, to allow a simple
bit-mask:
#property string:
# """The unicode string of the word, with no whitespace padding."""
# def __get__(self):
# cdef const TokenC* t = &self._seq.data[self.i]
# if t.lex.sic == 0:
# return ''
# cdef bytes utf8string = self._seq.vocab.strings[t.lex.sic]
# return utf8string.decode('utf8')
>>> six_bits = cluster & (2**6 - 1)
"""
def __get__(self):
return self._seq.data[self.i].lex.cluster
#property lemma:
# """The unicode string of the word's lemma. If no part-of-speech tag is
# assigned, the most common part-of-speech tag of the word is used.
# """
# def __get__(self):
# cdef const TokenC* t = &self._seq.data[self.i]
# if t.lemma == 0:
# return self.string
# cdef bytes utf8string = self._seq.vocab.strings[t.lemma]
# return utf8string.decode('utf8')
property string:
"""The unicode string of the word, with no whitespace padding."""
def __get__(self):
cdef const TokenC* t = &self._seq.data[self.i]
if t.lex.sic == 0:
return ''
cdef bytes utf8string = self._seq.vocab.strings[t.lex.sic]
return utf8string.decode('utf8')
#property dep_tag:
# """The ID integer of the word's dependency label. If no parse has been
# assigned, defaults to 0.
# """
# def __get__(self):
# return self._seq.data[self.i].dep_tag
property lemma:
"""The unicode string of the word's lemma. If no part-of-speech tag is
assigned, the most common part-of-speech tag of the word is used.
"""
def __get__(self):
cdef const TokenC* t = &self._seq.data[self.i]
if t.lemma == 0:
return self.string
cdef bytes utf8string = self._seq.vocab.strings[t.lemma]
return utf8string.decode('utf8')
#property pos:
# """The ID integer of the word's part-of-speech tag, from the 13-tag
# Google Universal Tag Set. Constants for this tag set are available in
# spacy.typedefs.
# """
# def __get__(self):
# return self._seq.data[self.i].pos
property dep_tag:
"""The ID integer of the word's dependency label. If no parse has been
assigned, defaults to 0.
"""
def __get__(self):
return self._seq.data[self.i].dep_tag
#property fine_pos:
# """The ID integer of the word's fine-grained part-of-speech tag, as assigned
# by the tagger model. Fine-grained tags include morphological information,
# and other distinctions, and allow a more accurate tagger to be trained.
# """
property pos:
"""The ID integer of the word's part-of-speech tag, from the 13-tag
Google Universal Tag Set. Constants for this tag set are available in
spacy.typedefs.
"""
def __get__(self):
return self._seq.data[self.i].pos
# def __get__(self):
# return self._seq.data[self.i].fine_pos
property fine_pos:
"""The ID integer of the word's fine-grained part-of-speech tag, as assigned
by the tagger model. Fine-grained tags include morphological information,
and other distinctions, and allow a more accurate tagger to be trained.
"""
#property sic:
# def __get__(self):
# return self._seq.data[self.i].lex.sic
def __get__(self):
return self._seq.data[self.i].fine_pos
property sic:
def __get__(self):
return self._seq.data[self.i].lex.sic
property head:
"""The token predicted by the parser to be the head of the current token."""
def __get__(self):
cdef const TokenC* t = &self._seq.data[self.i]
return Token(self._seq, self.i + t.head)
#property head:
# """The token predicted by the parser to be the head of the current token."""
# def __get__(self):
# cdef const TokenC* t = &self._seq.data[self.i]
# return Token(self._seq, self.i + t.head)

View File

@ -4,16 +4,16 @@ from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64
from .structs cimport Lexeme, TokenC, UniStr
from .structs cimport LexemeC, TokenC, UniStr
from .typedefs cimport utf8_t, id_t, hash_t
from .strings cimport StringStore
cdef Lexeme EMPTY_LEXEME
cdef LexemeC EMPTY_LEXEME
cdef union LexemesOrTokens:
const Lexeme* const* lexemes
const LexemeC* const* lexemes
TokenC* tokens
@ -27,9 +27,9 @@ cdef class Vocab:
cpdef public get_lex_props
cdef Pool mem
cpdef readonly StringStore strings
cdef vector[Lexeme*] lexemes
cdef vector[LexemeC*] lexemes
cdef const Lexeme* get(self, Pool mem, UniStr* s) except NULL
cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL
cdef PreshMap _map

View File

@ -2,20 +2,27 @@ from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from libc.string cimport memset
from os import path
import codecs
from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport init as lexeme_init
from .strings cimport slice_unicode
from .typedefs cimport flags_t
from .strings cimport hash_string
from .orth cimport word_shape
memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))
DEF MAX_VEC_SIZE = 100000
cpdef Lexeme init_lexeme(id_t i, unicode string, hash_t hashed,
cdef float[MAX_VEC_SIZE] EMPTY_VEC
memset(EMPTY_VEC, 0, sizeof(EMPTY_VEC))
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
EMPTY_LEXEME.vec = EMPTY_VEC
cdef LexemeC init_lexeme(id_t i, unicode string, hash_t hashed,
StringStore string_store, dict props) except *:
cdef Lexeme lex
cdef LexemeC lex
lex.id = i
lex.length = len(string)
lex.sic = string_store[string]
@ -28,13 +35,12 @@ cpdef Lexeme init_lexeme(id_t i, unicode string, hash_t hashed,
lex.suffix = string_store[string[-3:]]
lex.shape = string_store[word_shape(string)]
cdef object flags_val = props.get('flags', 0)
lex.flags = <flags_t>flags_val
lex.flags = props.get('flags', 0)
return lex
cdef class Vocab:
'''A map container for a language's Lexeme structs.
'''A map container for a language's LexemeC structs.
'''
def __init__(self, data_dir=None, get_lex_props=None):
self.mem = Pool()
@ -50,24 +56,25 @@ cdef class Vocab:
if not path.isdir(data_dir):
raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
self.strings.load(path.join(data_dir, 'strings.txt'))
self.load(path.join(data_dir, 'lexemes.bin'))
self.load_lexemes(path.join(data_dir, 'lexemes.bin'))
#self.load_vectors(path.join(data_dir, 'deps.words'))
def __len__(self):
"""The current number of lexemes stored."""
return self.lexemes.size()
cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL:
'''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme
cdef const LexemeC* get(self, Pool mem, UniStr* string) except NULL:
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
if necessary, using memory acquired from the given pool. If the pool
is the lexicon's own memory, the lexeme is saved in the lexicon.'''
cdef Lexeme* lex
lex = <Lexeme*>self._map.get(string.key)
cdef LexemeC* lex
lex = <LexemeC*>self._map.get(string.key)
if lex != NULL:
return lex
if string.n < 3:
mem = self.mem
cdef unicode py_string = string.chars[:string.n]
lex = <Lexeme*>mem.alloc(sizeof(Lexeme), 1)
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
lex[0] = init_lexeme(self.lexemes.size(), py_string, string.key, self.strings,
self.get_lex_props(py_string))
if mem is self.mem:
@ -81,13 +88,13 @@ cdef class Vocab:
def __getitem__(self, id_or_string):
'''Retrieve a lexeme, given an int ID or a unicode string. If a previously
unseen unicode string is given, a new Lexeme is created and stored.
unseen unicode string is given, a new LexemeC is created and stored.
This function relies on Cython's struct-to-dict conversion. Python clients
receive a dict keyed by strings (byte or unicode, depending on Python 2/3),
with int values. Cython clients can instead receive a Lexeme struct value.
with int values. Cython clients can instead receive a LexemeC struct value.
More efficient Cython access is provided by Lexicon.get, which returns
a Lexeme*.
a LexemeC*.
Args:
id_or_string (int or unicode): The integer ID of a word, or its unicode
@ -96,24 +103,26 @@ cdef class Vocab:
is raised.
Returns:
lexeme (dict): A Lexeme struct instance, which Cython translates into
lexeme (dict): A LexemeC struct instance, which Cython translates into
a dict if the operator is called from Python.
'''
if type(id_or_string) == int:
if id_or_string >= self.lexemes.size():
raise IndexError
return self.lexemes.at(id_or_string)[0]
return {}
#return self.lexemes.at(id_or_string)[0]
cdef UniStr string
slice_unicode(&string, id_or_string, 0, len(id_or_string))
cdef const Lexeme* lexeme = self.get(self.mem, &string)
return lexeme[0]
cdef const LexemeC* lexeme = self.get(self.mem, &string)
return {}
#return lexeme[0]
def __setitem__(self, unicode uni_string, dict props):
cdef UniStr s
slice_unicode(&s, uni_string, 0, len(uni_string))
# Cast through the const here, since we're allowed to change our own
# Lexemes.
lex = <Lexeme*><void*>self.get(self.mem, &s)
# LexemeCs.
lex = <LexemeC*><void*>self.get(self.mem, &s)
lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props)
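A hedged sketch of the mapping interface described in the docstring above; note that in this tmp commit __getitem__ currently returns an empty dict stub, and the 'flags'/'dense' keys follow get_lex_props in spacy/en:

from spacy.en import English

vocab = English().vocab                    # any Vocab instance
props = vocab[u'dog']                      # by string: dict of lexeme attrs (stubbed to {} here)
props = vocab[103]                         # by integer ID; IndexError if out of range
vocab[u'dog'] = {'flags': 0, 'dense': 1}   # __setitem__ re-initialises the LexemeC in place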
def dump(self, loc):
@ -128,30 +137,30 @@ cdef class Vocab:
key = self._map.c_map.cells[i].key
if key == 0:
continue
lexeme = <Lexeme*>self._map.c_map.cells[i].value
lexeme = <LexemeC*>self._map.c_map.cells[i].value
st = fwrite(&key, sizeof(key), 1, fp)
assert st == 1
st = fwrite(lexeme, sizeof(Lexeme), 1, fp)
st = fwrite(lexeme, sizeof(LexemeC), 1, fp)
assert st == 1
st = fclose(fp)
assert st == 0
def load(self, loc):
def load_lexemes(self, loc):
if not path.exists(loc):
raise IOError('Lexemes file not found at %s' % loc)
raise IOError('LexemeCs file not found at %s' % loc)
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
cdef FILE* fp = fopen(<char*>bytes_loc, 'rb')
assert fp != NULL
cdef size_t st
cdef Lexeme* lexeme
cdef LexemeC* lexeme
cdef hash_t key
i = 0
while True:
st = fread(&key, sizeof(key), 1, fp)
if st != 1:
break
lexeme = <Lexeme*>self.mem.alloc(sizeof(Lexeme), 1)
st = fread(lexeme, sizeof(Lexeme), 1, fp)
lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
st = fread(lexeme, sizeof(LexemeC), 1, fp)
if st != 1:
break
self._map.set(key, lexeme)
@ -160,3 +169,29 @@ cdef class Vocab:
self.lexemes[lexeme.id] = lexeme
i += 1
fclose(fp)
def load_vectors(self, loc):
cdef int i
cdef unicode line
cdef unicode word
cdef unicode val_str
cdef hash_t key
cdef LexemeC* lex
cdef float* vec
with codecs.open(loc, 'r', 'utf8') as file_:
for line in file_:
pieces = line.split()
word = pieces.pop(0)
if len(pieces) >= MAX_VEC_SIZE:
sizes = (len(pieces), MAX_VEC_SIZE)
msg = ("Your vector is %d elements."
"The compile-time limit is %d elements." % sizes)
raise ValueError(msg)
key = hash_string(word)
lex = <LexemeC*>self._map.get(key)
if lex is not NULL:
vec = <float*>self.mem.alloc(len(pieces), sizeof(float))
for i, val_str in enumerate(pieces):
vec[i] = float(val_str)
lex.vec = vec
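For reference, a hedged sketch of the whitespace-separated format load_vectors expects, one word per line followed by its float components (the words and values are illustrative):

the 0.418 0.24968 -0.41242 0.1217
dog -0.17686 0.29569 0.73207 -0.31817

It would be invoked along the lines of the commented-out call in Vocab.__init__, e.g. self.load_vectors(path.join(data_dir, 'deps.words')), with each parsed word keyed by hash_string and its vector attached to the matching LexemeC.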