* Replaced the cache with our own hash table (PointerHash). Timing is similar.

2025-10-18 09:44:16 +03:00 · 2014-09-13 03:14:43 +02:00 · 2014-09-13 03:14:43 +02:00 · 85d68e8e95
commit 85d68e8e95
parent c8db76e3e1
5 changed files with 40 additions and 18 deletions
--- a/spacy/_hashing.pxd
+++ b/spacy/_hashing.pxd
@ -1,5 +1,7 @@
-ctypedef key_t size_t
-ctypedef val_t size_t
+from libc.stdint cimport uint64_t
+
+ctypedef uint64_t key_t
+ctypedef size_t val_t


 cdef struct Cell:
@ -14,5 +16,5 @@ cdef class PointerHash:

    cdef size_t find_slot(self, key_t key)
    cdef Cell* lookup(self, key_t key)
-    cdef void insert(self, key_t key)
+    cdef void insert(self, key_t key, val_t value)
    cdef void resize(self, size_t new_size)
--- a/spacy/_hashing.pyx
+++ b/spacy/_hashing.pyx
@ -1,3 +1,8 @@
+# cython: profile=True
+from libc.stdlib cimport calloc, free
+cimport cython
+
+
 cdef class PointerHash:
    def __cinit__(self, size_t initial_size=8):
        self.size = initial_size
@ -10,20 +15,26 @@ cdef class PointerHash:
        free(self.cells)

    def __getitem__(self, key_t key):
+        assert key != 0
        cdef Cell* cell = self.lookup(key)
        return cell.value if cell.key != 0 else None

    def __setitem__(self, key_t key,  val_t value):
-        self.insert(key, value
+        assert key != 0
+        self.insert(key, value)

+    @cython.cdivision
    cdef size_t find_slot(self, key_t key):
-        cdef size_t i = key % self.size
+        cdef size_t i = (key % self.size)
        while self.cells[i].key != 0 and self.cells[i].key != key:
            i = (i + 1) % self.size
        return i

+    @cython.cdivision
    cdef Cell* lookup(self, key_t key):
-        cdef size_t i = self.find_slot(key)
+        cdef size_t i = (key % self.size)
+        while self.cells[i].key != 0 and self.cells[i].key != key:
+            i = (i + 1) % self.size
        return &self.cells[i]

    cdef void insert(self, key_t key, val_t value):
@ -36,7 +47,7 @@ cdef class PointerHash:
            self.resize(self.size * 2)

    cdef void resize(self, size_t new_size):
-        assert new_size & (new_size - 1)) == 0 # Must be a power of 2
+        assert (new_size & (new_size - 1)) == 0 # Must be a power of 2
        assert self.filled * 4 <= new_size * 3
        
        self.size = new_size
@ -47,5 +58,8 @@ cdef class PointerHash:
        self.size = new_size
        self.cells = <Cell*>calloc(new_size, sizeof(Cell))
        
+        self.filled = 0
+        cdef size_t i
        for i in range(old_size):
+            if self.cells[i].key != 0:
                self.insert(self.cells[i].key, self.cells[i].value)
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@ -43,6 +43,7 @@ from libc.stdint cimport uint64_t
 cimport lang
 from spacy.lexeme cimport lexeme_check_flag
 from spacy.lexeme cimport lexeme_string_view
+from spacy._hashing cimport PointerHash

 from spacy import util

@ -236,7 +237,7 @@ cdef class English(Language):
    fl_is_digit = Flag_IsDigit
    v_shape = View_WordShape
    def __cinit__(self, name, user_string_features, user_flag_features):
-        self.cache.set_empty_key(0)
+        self.cache = PointerHash(2 ** 25)
        self.specials.set_empty_key(0)
        lang_data = util.read_lang_data(name)
        rules, words, probs, clusters, case_stats, tag_stats = lang_data
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@ -3,6 +3,7 @@ from libc.stdint cimport uint64_t
 from spacy.word cimport Lexeme
 from spacy.tokens cimport Tokens
 from spacy.lexeme cimport LexemeC
+from spacy._hashing cimport PointerHash

 from libcpp.utility cimport pair
 from libcpp.vector cimport vector
@ -77,7 +78,7 @@ cdef class Lexicon:

 cdef class Language:
    cdef unicode name
-    cdef dense_hash_map[uint64_t, size_t] cache
+    cdef PointerHash cache
    cdef dense_hash_map[uint64_t, size_t] specials
    cpdef readonly Lexicon lexicon
    cpdef readonly object tokens_class
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@ -19,6 +19,8 @@ from spacy.tokens import Tokens
 from spacy.lexeme cimport LexemeC, lexeme_init
 from murmurhash.mrmr cimport hash64

+from spacy._hashing cimport PointerHash
+from spacy._hashing cimport Cell

 cdef class Language:
    """Base class for language-specific tokenizers.
@ -40,7 +42,7 @@ cdef class Language:
        if string_features is None:
            string_features = []
        self.name = name
-        self.cache.set_empty_key(0)
+        self.cache = PointerHash(2 ** 22)
        self.specials.set_empty_key(0)
        lang_data = read_lang_data(name)
        rules, words, probs, clusters, case_stats, tag_stats = lang_data
@ -110,17 +112,19 @@ cdef class Language:
        return tokens

    cdef int _tokenize(self, Tokens tokens, String* string):
-        cdef LexemeC** lexemes = <LexemeC**>self.cache[string.key]
-        lexemes = <LexemeC**>self.cache[string.key]
+        cdef Cell* cell = self.cache.lookup(string.key)
+        cdef LexemeC** lexemes 
        cdef size_t i
-        if lexemes != NULL:
+        if cell.key != 0:
+            lexemes = <LexemeC**>cell.value
            i = 0
            while lexemes[i] != NULL:
                tokens.push_back(lexemes[i])
                i += 1
            return 0
-        cdef uint64_t hashed = string.key

+        cell.key = string.key
+        self.cache.filled += 1
        cdef size_t first_token = tokens.length
        cdef int split
        cdef int remaining = string.n
@ -141,7 +145,7 @@ cdef class Language:
        cdef size_t j
        for i, j in enumerate(range(first_token, tokens.length)):
            lexemes[i] = tokens.lexemes[j]
-        self.cache[hashed] = <size_t>lexemes
+        cell.value = <size_t>lexemes

    cdef int _split_one(self, Py_UNICODE* characters, size_t length):
        return length
@ -169,7 +173,7 @@ cdef class Language:
            lexemes[i + 1] = NULL
            string_from_unicode(&string, uni_string)
            self.specials[string.key] = <size_t>lexemes
-            self.cache[string.key] = <size_t>lexemes
+            self.cache.insert(string.key, <size_t>lexemes)


 cdef class Lexicon: