From d2fc104a26b8832162847b946d0d3973e95cfaaa Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 19:45:15 +0200 Subject: [PATCH] * Begin merge of Gazetteer and DE branches --- spacy/lexeme.pxd | 50 +++++++++++++++++++++++++++++------------- spacy/lexeme.pyx | 30 +++++++++++-------------- spacy/matcher.pyx | 7 +++--- spacy/tokens/doc.pyx | 5 ++--- spacy/tokens/token.pyx | 26 +++++++++++----------- spacy/vocab.pxd | 1 + spacy/vocab.pyx | 12 +++++----- 7 files changed, 74 insertions(+), 57 deletions(-) diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 321f7c616..130966765 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -4,6 +4,7 @@ from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTE from .structs cimport LexemeC from .strings cimport StringStore +from .vocab cimport Vocab from numpy cimport ndarray @@ -15,21 +16,31 @@ cdef class Lexeme: cdef readonly Vocab vocab cdef readonly attr_t orth - cdef int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1: - lex.length = props['length'] - lex.orth = vocab.strings[props['orth']] - lex.lower = vocab.strings[props['lower']] - lex.norm = vocab.strings[props['norm']] - lex.shape = vocab.strings[props['shape']] - lex.prefix = vocab.strings[props['prefix']] - lex.suffix = vocab.strings[props['suffix']] - - lex.cluster = props['cluster'] - lex.prob = props['prob'] - lex.sentiment = props['sentiment'] - - lex.flags = props['flags'] - lex.repvec = empty_vec + @staticmethod + cdef inline Lexeme from_ptr(LexemeC* lex, Vocab vocab, int vector_length): + cdef Lexeme self = Lexeme.__new__(Lexeme, vocab, lex.orth) + self.c = lex + self.vocab = vocab + self.orth = lex.orth + + @staticmethod + cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil: + if name < (sizeof(flags_t) * 8): + Lexeme.set_flag(lex, name, value) + elif name == ID: + lex.id = value + elif name == LOWER: + lex.lower = value + elif name == NORM: + lex.norm = value + elif name == SHAPE: + lex.shape = value + elif name == PREFIX: + lex.prefix = value + elif name == SUFFIX: + lex.suffix = value + elif name == CLUSTER: + lex.cluster = value @staticmethod cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil: @@ -56,5 +67,14 @@ cdef class Lexeme: else: return 0 + @staticmethod cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil: return lexeme.flags & (1 << flag_id) + + @staticmethod + cdef inline bint set_flag(LexemeC* lexeme, attr_id_t flag_id, bint value) nogil: + cdef flags_t one = 1 + if value: + lexeme.flags |= one << flag_id + else: + lexeme.flags &= ~(one << flag_id) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index f0b3303f1..832f4fec7 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -26,12 +26,8 @@ cdef class Lexeme: def __init__(self, Vocab vocab, int orth): self.vocab = vocab self.orth = orth - self.c = vocab.get_by_orth(orth) + self.c = vocab.get_by_orth(vocab.mem, orth) - property orth: - def __get__(self): - return self.c.orth - property lower: def __get__(self): return self.c.lower def __set__(self, int x): self.c.lower = x @@ -78,44 +74,44 @@ cdef class Lexeme: property is_oov: def __get__(self): return Lexeme.check_flag(self.c, IS_OOV) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_OOV, x) + def __set__(self, bint x): Lexeme.set_flag(self.c, IS_OOV, x) property is_alpha: def __get__(self): return Lexeme.check_flag(self.c, IS_ALPHA) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ALPHA, x) + def 
__set__(self, bint x): Lexeme.set_flag(self.c, IS_ALPHA, x) property is_ascii: def __get__(self): return Lexeme.check_flag(self.c, IS_ASCII) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ASCII, x) + def __set__(self, bint x): Lexeme.set_flag(self.c, IS_ASCII, x) property is_digit: def __get__(self): return Lexeme.check_flag(self.c, IS_DIGIT) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_DIGIT, x) + def __set__(self, bint x): Lexeme.set_flag(self.c, IS_DIGIT, x) property is_lower: def __get__(self): return Lexeme.check_flag(self.c, IS_LOWER) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_LOWER, x) + def __set__(self, bint x): Lexeme.set_flag(self.c, IS_LOWER, x) property is_title: def __get__(self): return Lexeme.check_flag(self.c, IS_TITLE) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_TITLE, x) + def __set__(self, bint x): Lexeme.set_flag(self.c, IS_TITLE, x) property is_punct: def __get__(self): return Lexeme.check_flag(self.c, IS_PUNCT) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_PUNCT, x) + def __set__(self, bint x): Lexeme.set_flag(self.c, IS_PUNCT, x) property is_space: def __get__(self): return Lexeme.check_flag(self.c, IS_SPACE) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_SPACE, x) + def __set__(self, bint x): Lexeme.set_flag(self.c, IS_SPACE, x) property like_url: def __get__(self): return Lexeme.check_flag(self.c, LIKE_URL) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_URL, x) + def __set__(self, bint x): Lexeme.set_flag(self.c, LIKE_URL, x) property like_num: - def __get__(self): return Lexeme.like_num(self.c, IKE_NUM) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_NUM, x) + def __get__(self): return Lexeme.check_flag(self.c, LIKE_NUM) + def __set__(self, bint x): Lexeme.set_flag(self.c, LIKE_NUM, x) property like_email: def __get__(self): return Lexeme.check_flag(self.c, LIKE_EMAIL) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_EMAIL, x) + def __set__(self, bint x): Lexeme.set_flag(self.c, LIKE_EMAIL, x) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 72473b073..caafe6498 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -102,21 +102,22 @@ cdef class Matcher: cdef readonly int n_patterns def __init__(self, vocab, patterns): + self.vocab = vocab self.mem = Pool() for entity_key, (etype, attrs, specs) in sorted(patterns.items()): self.add(entity_key, etype, attrs, specs) def add(self, entity_key, etype, attrs, specs): if isinstance(entity_key, basestring): - entity_key = vocab.strings[entity_key] + entity_key = self.vocab.strings[entity_key] if isinstance(etype, basestring): - etype = vocab.strings[etype] + etype = self.vocab.strings[etype] elif etype is None: etype = -1 # TODO: Do something more clever about multiple patterns for single # entity for spec in specs: - spec = _convert_strings(spec, vocab.strings) + spec = _convert_strings(spec, self.vocab.strings) self.patterns.push_back(init_pattern(self.mem, spec, etype)) @classmethod diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 955e9b45f..4ba0d675a 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -5,6 +5,7 @@ from libc.stdint cimport uint32_t import numpy import struct +from ..lexeme cimport Lexeme from ..lexeme cimport EMPTY_LEXEME from ..typedefs cimport attr_t, flags_t from ..attrs cimport attr_id_t @@ -13,8 +14,6 @@ from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE from ..parts_of_speech import UNIV_POS_NAMES 
from ..parts_of_speech cimport CONJ, PUNCT, NOUN from ..parts_of_speech cimport univ_pos_t -from ..lexeme cimport check_flag -from ..lexeme cimport get_attr as get_lex_attr from .spans cimport Span from .token cimport Token from ..serialize.bits cimport BitArray @@ -48,7 +47,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil: elif feat_name == ENT_TYPE: return token.ent_type else: - return get_lex_attr(token.lex, feat_name) + return Lexeme.get_struct_attr(token.lex, feat_name) cdef class Doc: diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index cc50fdd08..2fa1366a1 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -1,6 +1,5 @@ from libc.string cimport memcpy from cpython.mem cimport PyMem_Malloc, PyMem_Free -from ..lexeme cimport check_flag # Compiler crashes on memory view coercion without this. Should report bug. from cython.view cimport array as cvarray cimport numpy as np @@ -9,6 +8,7 @@ np.import_array() import numpy +from ..lexeme cimport Lexeme from ..parts_of_speech import UNIV_POS_NAMES from ..attrs cimport LEMMA @@ -42,7 +42,7 @@ cdef class Token: return self.string cpdef bint check_flag(self, attr_id_t flag_id) except -1: - return check_flag(self.c.lex, flag_id) + return Lexeme.check_flag(self.c.lex, flag_id) def nbor(self, int i=1): return self.doc[self.i+i] @@ -286,37 +286,37 @@ cdef class Token: return self.vocab.strings[self.c.dep] property is_oov: - def __get__(self): return check_flag(self.c.lex, IS_OOV) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_OOV) property is_alpha: - def __get__(self): return check_flag(self.c.lex, IS_ALPHA) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ALPHA) property is_ascii: - def __get__(self): return check_flag(self.c.lex, IS_ASCII) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ASCII) property is_digit: - def __get__(self): return check_flag(self.c.lex, IS_DIGIT) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_DIGIT) property is_lower: - def __get__(self): return check_flag(self.c.lex, IS_LOWER) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_LOWER) property is_title: - def __get__(self): return check_flag(self.c.lex, IS_TITLE) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_TITLE) property is_punct: - def __get__(self): return check_flag(self.c.lex, IS_PUNCT) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_PUNCT) property is_space: - def __get__(self): return check_flag(self.c.lex, IS_SPACE) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_SPACE) property like_url: - def __get__(self): return check_flag(self.c.lex, LIKE_URL) + def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_URL) property like_num: - def __get__(self): return check_flag(self.c.lex, LIKE_NUM) + def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_NUM) property like_email: - def __get__(self): return check_flag(self.c.lex, LIKE_EMAIL) + def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_EMAIL) _pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()} diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index 2503cdcee..710a1b5ec 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -37,6 +37,7 @@ cdef class Vocab: cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1 + cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL cdef PreshMap _by_hash cdef 
PreshMap _by_orth diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index dcb7d575c..2d67e59f2 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -12,7 +12,6 @@ import math import json from .lexeme cimport EMPTY_LEXEME -from .lexeme cimport set_lex_struct_props from .lexeme cimport Lexeme from .strings cimport hash_string from .orth cimport word_shape @@ -36,12 +35,13 @@ EMPTY_LEXEME.repvec = EMPTY_VEC cdef class Vocab: '''A map container for a language's LexemeC structs. ''' - def __init__(self, data_dir=None, get_lex_attr=None): + def __init__(self, data_dir=None, get_lex_attr=None, load_vectors=False): self.mem = Pool() self._by_hash = PreshMap() self._by_orth = PreshMap() self.strings = StringStore() - self.pos_tags = pos_tags if pos_tags is not None else {} + #self.pos_tags = pos_tags if pos_tags is not None else {} + self.pos_tags = {} self.get_lex_attr = get_lex_attr self.repvec_length = 0 @@ -112,7 +112,7 @@ cdef class Vocab: if is_oov: lex.id = 0 else: - self._add_lex_to_vocab(key, lex) + self._add_lex_to_vocab(hash_string(string), lex) assert lex != NULL, string return lex @@ -125,7 +125,7 @@ cdef class Vocab: cdef attr_t orth cdef size_t addr for orth, addr in self._by_orth.items(): - yield Lexeme.from_ptr(addr, self.strings, self.repvec_length) + yield Lexeme.from_ptr(addr, self, self.repvec_length) def __getitem__(self, id_or_string): '''Retrieve a lexeme, given an int ID or a unicode string. If a previously @@ -157,7 +157,7 @@ cdef class Vocab: raise ValueError("Vocab unable to map type: " "%s. Maps unicode --> Lexeme or " "int --> Lexeme" % str(type(id_or_string))) - return Lexeme.from_ptr(lexeme, self.strings, self.repvec_length) + return Lexeme.from_ptr(lexeme, self, self.repvec_length) def dump(self, loc): if path.exists(loc):
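
The patch above moves the flag helpers onto `Lexeme` as inline static methods: `set_flag`/`check_flag` keep every boolean lexical attribute as one bit of a single `flags_t` word, and `set_struct_attr` routes attribute IDs below `sizeof(flags_t) * 8` into that bit field while larger IDs (`LOWER`, `NORM`, `SHAPE`, ...) go to dedicated struct fields, all without holding the GIL. Below is a minimal plain-Python sketch of that layout; the class name, the example attribute IDs and `NUM_FLAG_BITS` are illustrative stand-ins, not spaCy's API.

```python
# Illustrative sketch of the bit-flag storage behind Lexeme.set_flag/check_flag.
# FlagLexeme, NUM_FLAG_BITS and the ID values are hypothetical, not spaCy API.

IS_ALPHA, IS_DIGIT, IS_LOWER, LIKE_NUM = 1, 2, 5, 13   # example flag IDs
LOWER, NORM, SHAPE = 70, 71, 72                        # example non-flag attribute IDs
NUM_FLAG_BITS = 64                                     # sizeof(flags_t) * 8 in the C struct


class FlagLexeme:
    def __init__(self):
        self.flags = 0          # packed boolean attributes, one bit per flag
        self.lower = 0          # non-flag attributes get dedicated fields
        self.norm = 0
        self.shape = 0

    def set_flag(self, flag_id, value):
        # Mirrors Lexeme.set_flag: set or clear one bit in the flags word.
        if value:
            self.flags |= 1 << flag_id
        else:
            self.flags &= ~(1 << flag_id)

    def check_flag(self, flag_id):
        # Mirrors Lexeme.check_flag: mask-and-test a single bit.
        return bool(self.flags & (1 << flag_id))

    def set_struct_attr(self, name, value):
        # Mirrors Lexeme.set_struct_attr: small IDs are flags, larger IDs are fields.
        if name < NUM_FLAG_BITS:
            self.set_flag(name, value)
        elif name == LOWER:
            self.lower = value
        elif name == NORM:
            self.norm = value
        elif name == SHAPE:
            self.shape = value


lex = FlagLexeme()
lex.set_struct_attr(IS_ALPHA, True)
lex.set_struct_attr(LOWER, 1234)
assert lex.check_flag(IS_ALPHA) and not lex.check_flag(LIKE_NUM)
assert lex.lower == 1234
```

Packing the booleans into one integer keeps the per-lexeme struct small and makes each flag read a single mask-and-test, which is why the Cython versions in the patch can be declared `nogil`.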
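The `vocab.pyx` hunks also show `Lexeme.from_ptr` now receiving the whole `Vocab` rather than just its `StringStore`, and `Vocab.__getitem__` dispatching on whether it was given an integer orth ID or a unicode string, backed by the `_by_hash`/`_by_orth` maps declared in `vocab.pxd`. A rough standalone sketch of that double-keyed lookup, using plain dicts in place of `PreshMap` and hypothetical names (`ToyVocab`, `ToyLexeme`, `_intern`):

```python
# Sketch of a vocab that can be indexed by orth ID or by string.
# ToyVocab, ToyLexeme and _intern are hypothetical names, not spaCy's API.

class ToyLexeme:
    def __init__(self, orth, text):
        self.orth = orth    # integer ID of the string
        self.text = text


class ToyVocab:
    def __init__(self):
        self._strings = {}   # string -> orth ID (stand-in for StringStore)
        self._by_orth = {}   # orth ID -> ToyLexeme (stand-in for PreshMap _by_orth)

    def _intern(self, string):
        orth = self._strings.setdefault(string, len(self._strings) + 1)
        if orth not in self._by_orth:
            self._by_orth[orth] = ToyLexeme(orth, string)
        return orth

    def __getitem__(self, id_or_string):
        # Mirrors the int-or-unicode dispatch in Vocab.__getitem__.
        if isinstance(id_or_string, str):
            return self._by_orth[self._intern(id_or_string)]
        elif isinstance(id_or_string, int):
            return self._by_orth[id_or_string]
        raise ValueError(
            "ToyVocab maps unicode -> lexeme or int -> lexeme, got %s"
            % type(id_or_string))


vocab = ToyVocab()
lex = vocab["hello"]           # string lookup interns and creates the entry
assert vocab[lex.orth] is lex  # integer lookup returns the same object
```

Interning on the string path while treating an unknown integer ID as a plain miss appears to match the behaviour the hunk's docstring describes for previously unseen strings.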