From cad0cca4e3b7c50f45e1e1084d7d3d2fbc6db7ae Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Aug 2015 22:04:34 +0200 Subject: [PATCH] * Tmp --- spacy/en/__init__.py | 1 - spacy/lexeme.pxd | 126 ++++++++++++++----------------------------- spacy/lexeme.pyx | 109 ++++++++++++++++++++++++------------- spacy/matcher.pyx | 34 ++++++------ spacy/strings.pyx | 5 ++ spacy/vocab.pyx | 45 +++++----------- 6 files changed, 147 insertions(+), 173 deletions(-) diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index c81630a72..a04b615da 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -80,7 +80,6 @@ class English(object): Packer=None, load_vectors=True ): - self.data_dir = data_dir if path.exists(path.join(data_dir, 'vocab', 'oov_prob')): diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index f7b210281..321f7c616 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -8,97 +8,53 @@ from .strings cimport StringStore from numpy cimport ndarray - cdef LexemeC EMPTY_LEXEME - -cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore strings, - const float* empty_vec) except -1 - cdef class Lexeme: - cdef readonly ndarray repvec - - cdef readonly flags_t flags - cdef readonly attr_t id - cdef readonly attr_t length - + cdef LexemeC* c + cdef readonly Vocab vocab cdef readonly attr_t orth - cdef readonly attr_t lower - cdef readonly attr_t norm - cdef readonly attr_t shape - cdef readonly attr_t prefix - cdef readonly attr_t suffix - cdef readonly unicode orth_ - cdef readonly unicode lower_ - cdef readonly unicode norm_ - cdef readonly unicode shape_ - cdef readonly unicode prefix_ - cdef readonly unicode suffix_ + cdef int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1: + lex.length = props['length'] + lex.orth = vocab.strings[props['orth']] + lex.lower = vocab.strings[props['lower']] + lex.norm = vocab.strings[props['norm']] + lex.shape = vocab.strings[props['shape']] + lex.prefix = vocab.strings[props['prefix']] + 
lex.suffix = vocab.strings[props['suffix']] - cdef readonly attr_t cluster - cdef readonly float prob - cdef readonly float sentiment - cdef readonly float l2_norm + lex.cluster = props['cluster'] + lex.prob = props['prob'] + lex.sentiment = props['sentiment'] + + lex.flags = props['flags'] + lex.repvec = empty_vec - # Workaround for an apparent bug in the way the decorator is handled --- - # TODO: post bug report / patch to Cython. @staticmethod - cdef inline Lexeme from_ptr(const LexemeC* ptr, StringStore strings, int repvec_length): - cdef Lexeme py = Lexeme.__new__(Lexeme, repvec_length) - for i in range(repvec_length): - py.repvec[i] = ptr.repvec[i] - py.l2_norm = ptr.l2_norm - py.flags = ptr.flags - py.id = ptr.id - py.length = ptr.length + cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil: + if feat_name < (sizeof(flags_t) * 8): + return Lexeme.check_flag(lex, feat_name) + elif feat_name == ID: + return lex.id + elif feat_name == ORTH: + return lex.orth + elif feat_name == LOWER: + return lex.lower + elif feat_name == NORM: + return lex.norm + elif feat_name == SHAPE: + return lex.shape + elif feat_name == PREFIX: + return lex.prefix + elif feat_name == SUFFIX: + return lex.suffix + elif feat_name == LENGTH: + return lex.length + elif feat_name == CLUSTER: + return lex.cluster + else: + return 0 - py.orth = ptr.orth - py.lower = ptr.lower - py.norm = ptr.norm - py.shape = ptr.shape - py.prefix = ptr.prefix - py.suffix = ptr.suffix - - py.orth_ = strings[ptr.orth] - py.lower_ = strings[ptr.lower] - py.norm_ = strings[ptr.norm] - py.shape_ = strings[ptr.shape] - py.prefix_ = strings[ptr.prefix] - py.suffix_ = strings[ptr.suffix] - - py.cluster = ptr.cluster - py.prob = ptr.prob - py.sentiment = ptr.sentiment - return py - - cpdef bint check_flag(self, attr_id_t flag_id) except -1 - - -cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil: - return lexeme.flags & (1 << flag_id) - - -cdef inline attr_t 
get_attr(const LexemeC* lex, attr_id_t feat_name) nogil: - if feat_name < (sizeof(flags_t) * 8): - return check_flag(lex, feat_name) - elif feat_name == ID: - return lex.id - elif feat_name == ORTH: - return lex.orth - elif feat_name == LOWER: - return lex.lower - elif feat_name == NORM: - return lex.norm - elif feat_name == SHAPE: - return lex.shape - elif feat_name == PREFIX: - return lex.prefix - elif feat_name == SUFFIX: - return lex.suffix - elif feat_name == LENGTH: - return lex.length - elif feat_name == CLUSTER: - return lex.cluster - else: - return 0 + cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil: + return lexeme.flags & (1 << flag_id) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 07f151114..f0b3303f1 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -17,70 +17,105 @@ from .attrs cimport IS_OOV memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) -cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store, - const float* empty_vec) except -1: - lex.length = props['length'] - lex.orth = string_store[props['orth']] - lex.lower = string_store[props['lower']] - lex.norm = string_store[props['norm']] - lex.shape = string_store[props['shape']] - lex.prefix = string_store[props['prefix']] - lex.suffix = string_store[props['suffix']] - - lex.cluster = props['cluster'] - lex.prob = props['prob'] - lex.sentiment = props['sentiment'] - - lex.flags = props['flags'] - lex.repvec = empty_vec - - cdef class Lexeme: """An entry in the vocabulary. A Lexeme has no string context --- it's a word-type, as opposed to a word token. It therefore has no part-of-speech tag, dependency parse, or lemma (lemmatization depends on the part-of-speech tag). 
""" - def __cinit__(self, int vec_size): - self.repvec = numpy.ndarray(shape=(vec_size,), dtype=numpy.float32) + def __init__(self, Vocab vocab, int orth): + self.vocab = vocab + self.orth = orth + self.c = vocab.get_by_orth(orth) - @property - def has_repvec(self): - return self.l2_norm != 0 + property orth: + def __get__(self): + return self.c.orth + + property lower: + def __get__(self): return self.c.lower + def __set__(self, int x): self.c.lower = x + + property norm: + def __get__(self): return self.c.norm + def __set__(self, int x): self.c.norm = x - cpdef bint check_flag(self, attr_id_t flag_id) except -1: - cdef flags_t one = 1 - return self.flags & (one << flag_id) + property shape: + def __get__(self): return self.c.shape + def __set__(self, int x): self.c.shape = x + + property prefix: + def __get__(self): return self.c.prefix + def __set__(self, int x): self.c.prefix = x + + property suffix: + def __get__(self): return self.c.suffix + def __set__(self, int x): self.c.suffix = x + + property orth_: + def __get__(self): + return self.vocab.strings[self.c.orth] + + property lower_: + def __get__(self): return self.vocab.strings[self.c.lower] + def __set__(self, unicode x): self.c.lower = self.vocab.strings[x] + + property norm_: + def __get__(self): return self.c.norm + def __set__(self, unicode x): self.c.norm = self.vocab.strings[x] + + property shape_: + def __get__(self): return self.vocab.strings[self.c.shape] + def __set__(self, unicode x): self.c.shape = self.vocab.strings[x] + + property prefix_: + def __get__(self): return self.c.prefix + def __set__(self, unicode x): self.c.prefix = self.vocab.strings[x] + + property suffix_: + def __get__(self): return self.c.suffix + def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x] property is_oov: - def __get__(self): return self.check_flag(IS_OOV) + def __get__(self): return Lexeme.check_flag(self.c, IS_OOV) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_OOV, x) property 
is_alpha: - def __get__(self): return self.check_flag(IS_ALPHA) + def __get__(self): return Lexeme.check_flag(self.c, IS_ALPHA) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ALPHA, x) property is_ascii: - def __get__(self): return self.check_flag(IS_ASCII) + def __get__(self): return Lexeme.check_flag(self.c, IS_ASCII) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ASCII, x) property is_digit: - def __get__(self): return self.check_flag(IS_DIGIT) + def __get__(self): return Lexeme.check_flag(self.c, IS_DIGIT) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_DIGIT, x) property is_lower: - def __get__(self): return self.check_flag(IS_LOWER) + def __get__(self): return Lexeme.check_flag(self.c, IS_LOWER) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_LOWER, x) property is_title: - def __get__(self): return self.check_flag(IS_TITLE) + def __get__(self): return Lexeme.check_flag(self.c, IS_TITLE) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_TITLE, x) property is_punct: - def __get__(self): return self.check_flag(IS_PUNCT) + def __get__(self): return Lexeme.check_flag(self.c, IS_PUNCT) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_PUNCT, x) property is_space: - def __get__(self): return self.check_flag(IS_SPACE) + def __get__(self): return Lexeme.check_flag(self.c, IS_SPACE) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_SPACE, x) property like_url: - def __get__(self): return self.check_flag(LIKE_URL) + def __get__(self): return Lexeme.check_flag(self.c, LIKE_URL) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_URL, x) property like_num: - def __get__(self): return self.check_flag(LIKE_NUM) + def __get__(self): return Lexeme.check_flag(self.c, LIKE_NUM) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_NUM, x) property like_email: - def __get__(self): return self.check_flag(LIKE_EMAIL) + def __get__(self): return Lexeme.check_flag(self.c, 
LIKE_EMAIL) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_EMAIL, x) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index ee2ceaecc..72473b073 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -12,6 +12,8 @@ from .tokens.doc cimport get_token_attr from .tokens.doc cimport Doc from .vocab cimport Vocab +from libcpp.vector cimport vector + try: import ujson as json except ImportError: @@ -96,28 +98,26 @@ def map_attr_name(attr): cdef class Matcher: cdef Pool mem - cdef Pattern** patterns + cdef vector[Pattern*] patterns cdef readonly int n_patterns def __init__(self, vocab, patterns): self.mem = Pool() - n_patterns = sum([len(specs) for etype, attrs, specs in patterns.values()]) - self.patterns = self.mem.alloc(n_patterns, sizeof(Pattern*)) - cdef int i = 0 for entity_key, (etype, attrs, specs) in sorted(patterns.items()): - if isinstance(entity_key, basestring): - entity_key = vocab.strings[entity_key] - if isinstance(etype, basestring): - etype = vocab.strings[etype] - elif etype is None: - etype = -1 - # TODO: Do something more clever about multiple patterns for single - # entity - for spec in specs: - spec = _convert_strings(spec, vocab.strings) - self.patterns[i] = init_pattern(self.mem, spec, etype) - i += 1 - self.n_patterns = len(patterns) + self.add(vocab, entity_key, etype, attrs, specs) + + def add(self, vocab, entity_key, etype, attrs, specs): + if isinstance(entity_key, basestring): + entity_key = vocab.strings[entity_key] + if isinstance(etype, basestring): + etype = vocab.strings[etype] + elif etype is None: + etype = -1 + # TODO: Do something more clever about multiple patterns for single + # entity + for spec in specs: + spec = _convert_strings(spec, vocab.strings) + self.patterns.push_back(init_pattern(self.mem, spec, etype)) @classmethod def from_dir(cls, vocab, data_dir): diff --git a/spacy/strings.pyx b/spacy/strings.pyx index b35ed2ccb..c187a6aa6 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -108,6 +108,11 @@ cdef class 
StringStore: else: raise TypeError(type(string_or_id)) + def __iter__(self): + cdef int i + for i in range(self.size): + yield self[i] + cdef const Utf8Str* intern(self, unsigned char* chars, int length) except NULL: # 0 means missing, but we don't bother offsetting the index. key = hash64(chars, length * sizeof(char), 0) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index ac2e11e11..dcb7d575c 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -36,24 +36,20 @@ EMPTY_LEXEME.repvec = EMPTY_VEC cdef class Vocab: '''A map container for a language's LexemeC structs. ''' - def __init__(self, data_dir=None, get_lex_props=None, load_vectors=True, - pos_tags=None, oov_prob=-30): - if oov_prob is None: - oov_prob = -30 + def __init__(self, data_dir=None, get_lex_attr=None): self.mem = Pool() self._by_hash = PreshMap() self._by_orth = PreshMap() self.strings = StringStore() self.pos_tags = pos_tags if pos_tags is not None else {} - - self.lexeme_props_getter = get_lex_props + + self.get_lex_attr = get_lex_attr self.repvec_length = 0 self.length = 0 self._add_lex_to_vocab(0, &EMPTY_LEXEME) if data_dir is not None: if not path.exists(data_dir): raise IOError("Directory %s not found -- cannot load Vocab." % data_dir) - if data_dir is not None: if not path.isdir(data_dir): raise IOError("Path %s is a file, not a dir -- cannot load Vocab." 
% data_dir) self.load_lexemes(path.join(data_dir, 'strings.txt'), @@ -63,7 +59,6 @@ cdef class Vocab: self._serializer = None self.data_dir = data_dir - self.oov_prob = oov_prob property serializer: def __get__(self): @@ -91,18 +86,8 @@ cdef class Vocab: lex = self._by_hash.get(key) if lex != NULL: return lex - cdef bint is_oov = mem is not self.mem - if len(string) < 3: - mem = self.mem - lex = mem.alloc(sizeof(LexemeC), 1) - props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov) - set_lex_struct_props(lex, props, self.strings, EMPTY_VEC) - if is_oov: - lex.id = 0 else: - self._add_lex_to_vocab(key, lex) - assert lex != NULL, string - return lex + return self._new_lexeme(mem, string) cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL: '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme @@ -114,18 +99,21 @@ cdef class Vocab: lex = self._by_orth.get(orth) if lex != NULL: return lex - cdef unicode string = self.strings[orth] + else: + return self._new_lexeme(mem, self.strings[orth]) + + cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL: cdef bint is_oov = mem is not self.mem if len(string) < 3: mem = self.mem lex = mem.alloc(sizeof(LexemeC), 1) - props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov) - set_lex_struct_props(lex, props, self.strings, EMPTY_VEC) + for attr, func in self.get_lex_attr.items(): + Lexeme.set_struct_attr(lex, attr, func(string)) if is_oov: lex.id = 0 else: - self._add_lex_to_vocab(hash_string(string), lex) - assert lex != NULL, orth + self._add_lex_to_vocab(hash_string(string), lex) + assert lex != NULL, string return lex cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1: @@ -171,15 +159,6 @@ cdef class Vocab: "int --> Lexeme" % str(type(id_or_string))) return Lexeme.from_ptr(lexeme, self.strings, self.repvec_length) - def __setitem__(self, unicode string, dict props): - cdef hash_t key = hash_string(string) - cdef LexemeC* lex - 
lex = self._by_hash.get(key) - if lex == NULL: - lex = self.mem.alloc(sizeof(LexemeC), 1) - set_lex_struct_props(lex, props, self.strings, EMPTY_VEC) - self._add_lex_to_vocab(key, lex) - def dump(self, loc): if path.exists(loc): assert not path.isdir(loc)