From 34b68a18ab04a4cb63411acced6f30ce0b4ba027 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 16 Aug 2014 19:59:38 +0200
Subject: [PATCH] * Progress to getting WordTree working. Tests pass, but so
 far it's slower.

---
 spacy/_hashing.pxd | 14 ++++++++++++++
 spacy/_hashing.pyx | 45 +++++++++++++++++++++++++++++++++++++++++++++
 spacy/spacy.pxd    |  6 ++++--
 spacy/spacy.pyx    | 40 +++++++++++++++++++++-------------------
 4 files changed, 84 insertions(+), 21 deletions(-)

diff --git a/spacy/_hashing.pxd b/spacy/_hashing.pxd
index d87704c1a..f9722f492 100644
--- a/spacy/_hashing.pxd
+++ b/spacy/_hashing.pxd
@@ -1,5 +1,7 @@
 from libc.stdint cimport uint64_t
 
+from chartree cimport CharTree
+
 
 cdef class FixedTable:
     cdef size_t size
@@ -9,3 +11,15 @@ cdef class FixedTable:
     cdef size_t insert(self, uint64_t key, size_t value) nogil
     cdef size_t get(self, uint64_t key) nogil
     cdef int erase(self, uint64_t key) nogil
+
+
+cdef class WordTree:
+    cdef size_t max_length
+    cdef size_t default
+    cdef CharTree* _trees
+    cdef dict _dict
+
+    cdef size_t get(self, unicode string) except *
+    cdef int set(self, unicode string, size_t value) except *
+    cdef bint contains(self, unicode string) except *
+
diff --git a/spacy/_hashing.pyx b/spacy/_hashing.pyx
index 99c8e7406..eb71dff51 100644
--- a/spacy/_hashing.pyx
+++ b/spacy/_hashing.pyx
@@ -1,6 +1,8 @@
 from libc.stdlib cimport calloc, free
 import cython
 
+cimport chartree
+
 
 cdef class FixedTable:
     def __cinit__(self, const size_t size):
@@ -51,3 +53,46 @@ cdef class FixedTable:
 @cython.cdivision
 cdef inline size_t _find(uint64_t key, size_t size) nogil:
     return key % size
+
+
+cdef class WordTree:
+    def __cinit__(self, size_t default, size_t max_length):
+        self.max_length = max_length
+        self.default = default
+        self._trees = <CharTree*>calloc(max_length, sizeof(CharTree))
+        for i in range(self.max_length):
+            chartree.init(&self._trees[i], i)
+        self._dict = {}
+
+    cdef size_t get(self, unicode ustring) except *:
+        cdef bytes bstring = ustring.encode('utf8')
+        cdef size_t length = len(bstring)
+        if length >= self.max_length:
+            return self._dict.get(bstring, 0)
+        else:
+            return chartree.getitem(&self._trees[length], bstring)
+
+    cdef int set(self, unicode ustring, size_t value) except *:
+        cdef bytes bstring = ustring.encode('utf8')
+        cdef size_t length = len(bstring)
+        if length >= self.max_length:
+            self._dict[bstring] = value
+        else:
+            chartree.setitem(&self._trees[length], bstring, value)
+
+    cdef bint contains(self, unicode ustring) except *:
+        cdef bytes bstring = ustring.encode('utf8')
+        cdef size_t length = len(bstring)
+        if length >= self.max_length:
+            return bstring in self._dict
+        else:
+            return chartree.contains(&self._trees[length], bstring)
+
+    def __getitem__(self, unicode key):
+        return self.get(key)
+
+    def __setitem__(self, unicode key, size_t value):
+        self.set(key, value)
+
+    def __contains__(self, unicode key):
+        return self.contains(key)
diff --git a/spacy/spacy.pxd b/spacy/spacy.pxd
index a286a27f4..65b31f176 100644
--- a/spacy/spacy.pxd
+++ b/spacy/spacy.pxd
@@ -4,6 +4,7 @@
 from libc.stdint cimport uint64_t
 from sparsehash.dense_hash_map cimport dense_hash_map
 from _hashing cimport FixedTable
+from _hashing cimport WordTree
 
 # Circular import problems here
 ctypedef size_t Lexeme_addr
@@ -22,11 +23,12 @@ ctypedef int ClusterID
 from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport Distribution
 from spacy.lexeme cimport Orthography
+from spacy._hashing cimport WordTree
 
 
 cdef class Language:
     cdef object name
-    cdef Vocab* vocab
+    cdef WordTree vocab
     cdef Vocab* distri
     cdef Vocab* ortho
     cdef dict bacov
@@ -38,7 +40,7 @@ cdef class Language:
     cdef Orthography* lookup_orth(self, StringHash key, unicode lex) except NULL
     cdef Distribution* lookup_dist(self, StringHash key) except NULL
 
-    cdef Lexeme* new_lexeme(self, StringHash key, unicode lex) except NULL
+    cdef Lexeme* new_lexeme(self, unicode key, unicode lex) except NULL
     cdef Orthography* new_orth(self, StringHash hashed, unicode lex) except NULL
     cdef Distribution* new_dist(self, StringHash key) except NULL
diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx
index 743ebc771..301b9d412 100644
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@@ -5,6 +5,7 @@ from libc.stdlib cimport calloc, free
 from libcpp.pair cimport pair
 from cython.operator cimport dereference as deref
+from murmurhash cimport mrmr
 
 from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport BLANK_WORD
@@ -15,6 +16,13 @@ from os import path
 
 cimport cython
 
+#cdef inline StringHash hash_string(unicode string, size_t length):
+#    '''Hash unicode with MurmurHash64A'''
+#    return hash(string)
+#    #cdef bytes byte_string = string.encode('utf8')
+#    #return mrmr.hash32(byte_string, len(byte_string) * sizeof(char), 0)
+
+
 def get_normalized(unicode lex, size_t length):
     if lex.isalpha() and lex.islower():
         return lex
@@ -56,10 +64,9 @@ cdef class Language:
     def __cinit__(self, name):
         self.name = name
         self.bacov = {}
-        self.vocab = new Vocab()
+        self.vocab = WordTree(0, 5)
         self.ortho = new Vocab()
         self.distri = new Vocab()
-        self.vocab[0].set_empty_key(0)
         self.distri[0].set_empty_key(0)
         self.ortho[0].set_empty_key(0)
         self.load_tokenization(util.read_tokenization(name))
@@ -93,9 +100,9 @@ cdef class Language:
         cdef StringHash hashed = hash(string)
         # First, check words seen 2+ times
-        cdef Lexeme* word_ptr = self.vocab[0][hashed]
+        cdef Lexeme* word_ptr = <Lexeme*>self.vocab.get(string)
         if word_ptr == NULL:
-            word_ptr = self.new_lexeme(hashed, string)
+            word_ptr = self.new_lexeme(string, string)
         return <Lexeme_addr>word_ptr
 
 
     cdef Lexeme_addr lookup_chunk(self, unicode string) except 0:
@@ -106,18 +113,16 @@ cdef class Language:
         cdef size_t length = len(string)
         if length == 0:
             return <Lexeme_addr>&BLANK_WORD
-        cdef StringHash hashed = hash(string)
         # First, check words seen 2+ times
-        cdef Lexeme* word_ptr = self.vocab[0][hashed]
+        cdef Lexeme* word_ptr = <Lexeme*>self.vocab.get(string)
         cdef int split
         if word_ptr == NULL:
             split = self.find_split(string, length)
             if split != 0 and split != -1 and split < length:
-                word_ptr = self.new_lexeme(hashed, string[:split])
+                word_ptr = self.new_lexeme(string, string[:split])
                 word_ptr.tail = <Lexeme*>self.lookup_chunk(string[split:])
-                self.bacov[hashed] = string
             else:
-                word_ptr = self.new_lexeme(hashed, string)
+                word_ptr = self.new_lexeme(string, string)
         return <Lexeme_addr>word_ptr
 
     cdef Orthography* lookup_orth(self, StringHash hashed, unicode lex):
@@ -132,14 +137,15 @@ cdef class Language:
             dist = self.new_dist(hashed)
         return dist
 
-    cdef Lexeme* new_lexeme(self, StringHash key, unicode string) except NULL:
+    cdef Lexeme* new_lexeme(self, unicode key, unicode string) except NULL:
         cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
-        word.sic = key
+        word.sic = hash(key)
         word.lex = hash(string)
         self.bacov[word.lex] = string
+        self.bacov[word.sic] = key
         word.orth = self.lookup_orth(word.lex, string)
         word.dist = self.lookup_dist(word.lex)
-        self.vocab[0][key] = word
+        self.vocab.set(key, <size_t>word)
         return word
 
     cdef Orthography* new_orth(self, StringHash hashed, unicode lex) except NULL:
@@ -185,13 +191,10 @@ cdef class Language:
         cdef Lexeme* word
         cdef StringHash hashed
         for chunk, lex, tokens in token_rules:
-            hashed = hash(chunk)
-            word = self.new_lexeme(hashed, lex)
+            word = self.new_lexeme(chunk, lex)
             for i, lex in enumerate(tokens):
                 token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
-                length = len(token_string)
-                hashed = hash(token_string)
-                word.tail = self.new_lexeme(hashed, lex)
+                word.tail = self.new_lexeme(token_string, lex)
                 word = word.tail
 
     def load_clusters(self):
@@ -208,8 +211,7 @@ cdef class Language:
                 # the first 4 bits. See redshift._parse_features.pyx
                 cluster = int(cluster_str[::-1], 2)
                 upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0))
-                hashed = hash(token_string)
-                word = self.init_lexeme(hashed, token_string)
+                word = self.new_lexeme(token_string, token_string)
 
 
 cdef inline bint _is_whitespace(unsigned char c) nogil:
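
The idea behind WordTree, for readers skimming the diff: instead of one hash
table keyed by a 64-bit string hash, vocabulary entries are bucketed by UTF-8
byte length, so the CharTree for length n only ever stores keys of exactly n
bytes, and anything at or over max_length falls back to a plain Python dict.
A rough pure-Python sketch of that dispatch (dicts stand in for the CharTree
buckets; the names are illustrative, not the chartree API, and the patch
hard-codes 0 as the miss value in the dict path where the sketch routes it
through default):

    class WordTreeSketch(object):
        """Length-bucketed vocab: one bucket per UTF-8 byte length, plus a
        dict fallback for strings of max_length bytes or more."""
        def __init__(self, default, max_length):
            self.default = default      # value reported for unseen keys
            self.max_length = max_length
            self._trees = [{} for _ in range(max_length)]  # CharTree stand-ins
            self._dict = {}

        def _bucket(self, bstring):
            # Dispatch on encoded byte length, as WordTree.get/set/contains do
            if len(bstring) >= self.max_length:
                return self._dict
            return self._trees[len(bstring)]

        def get(self, ustring):
            bstring = ustring.encode('utf8')
            return self._bucket(bstring).get(bstring, self.default)

        def set(self, ustring, value):
            bstring = ustring.encode('utf8')
            self._bucket(bstring)[bstring] = value

        def contains(self, ustring):
            bstring = ustring.encode('utf8')
            return bstring in self._bucket(bstring)

    vocab = WordTreeSketch(default=0, max_length=5)
    vocab.set(u'the', 42)
    assert vocab.get(u'the') == 42
    assert vocab.get(u'unseen') == 0 and not vocab.contains(u'unseen')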
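The other change threaded through spacy.pyx: new_lexeme now takes the raw
unicode key instead of a precomputed StringHash, hashes it itself into
word.sic separately from the lex string's hash in word.lex, and registers
both hashes in bacov, the hash-to-string reverse map ("vocab" spelled
backwards). A minimal sketch of the invariant being maintained, using
Python's hash() as the patch does (intern is a hypothetical stand-in for
new_lexeme's bookkeeping, not a function in the codebase):

    bacov = {}

    def intern(key, lex):
        # key: the original chunk; lex: its (possibly normalized) word form
        sic, lexh = hash(key), hash(lex)
        bacov[lexh] = lex       # word.lex -> lex string
        bacov[sic] = key        # word.sic -> original chunk
        return sic, lexh

    sic, lexh = intern(u"don't", u"do")
    assert bacov[sic] == u"don't" and bacov[lexh] == u"do"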