From 85d68e8e956fc211232dd3123d2b18f8f5f33ad6 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <matthew@honnibal.com>
Date: Sat, 13 Sep 2014 03:14:43 +0200
Subject: [PATCH] * Replaced the dense_hash_map cache with our own PointerHash table. Similar timing.

---
 spacy/_hashing.pxd |  8 +++++---
 spacy/_hashing.pyx | 26 ++++++++++++++++++++------
 spacy/en.pyx       |  3 ++-
 spacy/lang.pxd     |  3 ++-
 spacy/lang.pyx     | 18 +++++++++++-------
 5 files changed, 40 insertions(+), 18 deletions(-)

diff --git a/spacy/_hashing.pxd b/spacy/_hashing.pxd
index a7448a639..2be9d109d 100644
--- a/spacy/_hashing.pxd
+++ b/spacy/_hashing.pxd
@@ -1,5 +1,7 @@
-ctypedef key_t size_t
-ctypedef val_t size_t
+from libc.stdint cimport uint64_t
+
+ctypedef uint64_t key_t  # hash key type; PointerHash tests cell.key != 0 to detect an occupied cell, so 0 is not usable as a key
+ctypedef size_t val_t  # stored value type; lang.pyx stores a LexemeC** cast to size_t in it
 
 
 cdef struct Cell:
@@ -14,5 +16,5 @@ cdef class PointerHash:
 
     cdef size_t find_slot(self, key_t key)
     cdef Cell* lookup(self, key_t key)
-    cdef void insert(self, key_t key)
+    cdef void insert(self, key_t key, val_t value)
     cdef void resize(self, size_t new_size)
diff --git a/spacy/_hashing.pyx b/spacy/_hashing.pyx
index 4c0637478..2645d2bcf 100644
--- a/spacy/_hashing.pyx
+++ b/spacy/_hashing.pyx
@@ -1,3 +1,8 @@
+# cython: profile=True
+from libc.stdlib cimport calloc, free
+cimport cython
+
+
 cdef class PointerHash:
     def __cinit__(self, size_t initial_size=8):
         self.size = initial_size
@@ -10,20 +15,26 @@ cdef class PointerHash:
         free(self.cells)
 
     def __getitem__(self, key_t key):
+        assert key != 0  # 0 is the empty-cell marker (see the cell.key != 0 test on the return line), so it cannot be looked up
         cdef Cell* cell = self.lookup(key)
         return cell.value if cell.key != 0 else None
 
     def __setitem__(self, key_t key,  val_t value):
-        self.insert(key, value
+        assert key != 0  # a cell whose key is 0 is treated as empty by lookup/find_slot, so 0 cannot be stored as a key
+        self.insert(key, value)  # delegates to the C-level insert
 
+    @cython.cdivision  # C semantics for the % below: no ZeroDivisionError check, so self.size must be non-zero
     cdef size_t find_slot(self, key_t key):
-        cdef size_t i = key % self.size
+        cdef size_t i = (key % self.size)  # starting slot; the loop below steps one cell at a time, wrapping at self.size, until it hits this key or an empty (key == 0) cell
         while self.cells[i].key != 0 and self.cells[i].key != key:
             i = (i + 1) % self.size
         return i
 
+    @cython.cdivision  # C semantics for the % below: no ZeroDivisionError check, so self.size must be non-zero
     cdef Cell* lookup(self, key_t key):
-        cdef size_t i = self.find_slot(key)
+        cdef size_t i = (key % self.size)  # same probe sequence as find_slot, written out inline instead of calling it
+        while self.cells[i].key != 0 and self.cells[i].key != key:
+            i = (i + 1) % self.size  # on exit, cell i either holds this key or is empty (key == 0); callers must check cell.key
         return &self.cells[i]
 
     cdef void insert(self, key_t key, val_t value):
@@ -36,7 +47,7 @@ cdef class PointerHash:
             self.resize(self.size * 2)
 
     cdef void resize(self, size_t new_size):
-        assert new_size & (new_size - 1)) == 0 # Must be a power of 2
+        assert (new_size & (new_size - 1)) == 0 # Must be a power of 2
         assert self.filled * 4 <= new_size * 3
         
         self.size = new_size
@@ -46,6 +57,9 @@ cdef class PointerHash:
 
         self.size = new_size
         self.cells = <Cell*>calloc(new_size, sizeof(Cell))
-
+        
+        self.filled = 0  # count is reset before re-inserting; presumably insert() increments it again (insert body not visible here, confirm)
+        cdef size_t i
         for i in range(old_size):
-            self.insert(self.cells[i].key, self.cells[i].value)
+            if self.cells[i].key != 0:  # NOTE(review): self.cells was replaced by a zeroed calloc() array just above, so this scans the NEW table and never finds a key
+                self.insert(self.cells[i].key, self.cells[i].value)  # NOTE(review): presumably the previous cell array should be read here and freed after the loop; confirm
diff --git a/spacy/en.pyx b/spacy/en.pyx
index eb2486711..a3ce4da59 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -43,6 +43,7 @@ from libc.stdint cimport uint64_t
 cimport lang
 from spacy.lexeme cimport lexeme_check_flag
 from spacy.lexeme cimport lexeme_string_view
+from spacy._hashing cimport PointerHash
 
 from spacy import util
 
@@ -236,7 +237,7 @@ cdef class English(Language):
     fl_is_digit = Flag_IsDigit
     v_shape = View_WordShape
     def __cinit__(self, name, user_string_features, user_flag_features):
-        self.cache.set_empty_key(0)
+        self.cache = PointerHash(2 ** 25)  # NOTE(review): lang.pyx also assigns self.cache = PointerHash(2 ** 22) in a Language method; confirm which table is meant to survive and that both allocations are intended
         self.specials.set_empty_key(0)
         lang_data = util.read_lang_data(name)
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index 28afd6e28..619993ebc 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -3,6 +3,7 @@ from libc.stdint cimport uint64_t
 from spacy.word cimport Lexeme
 from spacy.tokens cimport Tokens
 from spacy.lexeme cimport LexemeC
+from spacy._hashing cimport PointerHash
 
 from libcpp.utility cimport pair
 from libcpp.vector cimport vector
@@ -77,7 +78,7 @@ cdef class Lexicon:
 
 cdef class Language:
     cdef unicode name
-    cdef dense_hash_map[uint64_t, size_t] cache
+    cdef PointerHash cache
     cdef dense_hash_map[uint64_t, size_t] specials
     cpdef readonly Lexicon lexicon
     cpdef readonly object tokens_class
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index 3c4823972..a9ed5be3d 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -19,6 +19,8 @@ from spacy.tokens import Tokens
 from spacy.lexeme cimport LexemeC, lexeme_init
 from murmurhash.mrmr cimport hash64
 
+from spacy._hashing cimport PointerHash
+from spacy._hashing cimport Cell
 
 cdef class Language:
     """Base class for language-specific tokenizers.
@@ -40,7 +42,7 @@ cdef class Language:
         if string_features is None:
             string_features = []
         self.name = name
-        self.cache.set_empty_key(0)
+        self.cache = PointerHash(2 ** 22)
         self.specials.set_empty_key(0)
         lang_data = read_lang_data(name)
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
@@ -110,17 +112,19 @@ cdef class Language:
         return tokens
 
     cdef int _tokenize(self, Tokens tokens, String* string):
-        cdef LexemeC** lexemes = <LexemeC**>self.cache[string.key]
-        lexemes = <LexemeC**>self.cache[string.key]
+        cdef Cell* cell = self.cache.lookup(string.key)
+        cdef LexemeC** lexemes 
         cdef size_t i
-        if lexemes != NULL:
+        if cell.key != 0:
+            lexemes = <LexemeC**>cell.value
             i = 0
             while lexemes[i] != NULL:
                 tokens.push_back(lexemes[i])
                 i += 1
             return 0
-        cdef uint64_t hashed = string.key
 
+        cell.key = string.key  # cache miss: claim the empty cell returned by lookup(); cell.value is assigned later once the lexemes array is built
+        self.cache.filled += 1  # NOTE(review): bumps the counter directly, so the resize call inside PointerHash.insert() never runs on this path; confirm the table cannot fill up
         cdef size_t first_token = tokens.length
         cdef int split
         cdef int remaining = string.n
@@ -141,7 +145,7 @@ cdef class Language:
         cdef size_t j
         for i, j in enumerate(range(first_token, tokens.length)):
             lexemes[i] = tokens.lexemes[j]
-        self.cache[hashed] = <size_t>lexemes
+        cell.value = <size_t>lexemes
 
     cdef int _split_one(self, Py_UNICODE* characters, size_t length):
         return length
@@ -169,7 +173,7 @@ cdef class Language:
             lexemes[i + 1] = NULL
             string_from_unicode(&string, uni_string)
             self.specials[string.key] = <size_t>lexemes
-            self.cache[string.key] = <size_t>lexemes
+            self.cache.insert(string.key, <size_t>lexemes)
 
 
 cdef class Lexicon: