* Add FixedTable for hashing

2025-11-04 09:57:26 +03:00 · 2014-08-01 07:27:21 +01:00 · 2014-08-01 07:27:21 +01:00 · f39211b2b1
commit f39211b2b1
parent a44e15f623
4 changed files with 67 additions and 23 deletions
--- a/spacy/_hashing.pxd
+++ b/spacy/_hashing.pxd
@ -0,0 +1,11 @@
 from libc.stdint cimport uint64_t
 cdef class FixedTable:
    # Fixed-size table mapping uint64 keys to size_t values; every key maps
    # to a single bucket and the arrays below are indexed by bucket.

    # Number of buckets; also the length of the keys and values arrays.
    cdef size_t size
    # Key stored in each bucket; a zeroed slot marks an empty bucket.
    cdef uint64_t* keys
    # Value stored in each bucket, parallel to keys.
    cdef size_t* values
    # Store value in key's bucket, overwriting whatever that bucket held.
    cdef int insert(self, uint64_t key, size_t value) nogil
    # Return the value stored for key, or 0 when its bucket holds another key.
    cdef size_t get(self, uint64_t key) nogil
    # Remove key from the table (its bucket's key slot is reset to 0).
    cdef int erase(self, uint64_t key) nogil
--- a/spacy/_hashing.pyx
+++ b/spacy/_hashing.pyx
@ -0,0 +1,48 @@
 from libc.stdlib cimport calloc, free
 import cython
 cdef class FixedTable:
    """Fixed-size, direct-mapped table from uint64 keys to size_t values.

    Every key maps to exactly one bucket, chosen by ``_find(key, size)``.
    There is no collision handling: inserting a key whose bucket is already
    occupied overwrites the previous entry. A zeroed key slot marks an empty
    bucket, and a lookup that misses returns 0, so neither the key 0 nor the
    value 0 can be distinguished from "absent".
    """
    def __cinit__(self, const size_t size):
        # A zero-sized table would make every bucket computation a modulo by
        # zero (C division semantics are enabled on _find), so reject it here.
        if size == 0:
            raise ValueError("FixedTable size must be greater than zero")
        self.size = size
        # calloc zero-fills both arrays, so every bucket starts out empty.
        self.keys = <uint64_t*>calloc(self.size, sizeof(uint64_t))
        self.values = <size_t*>calloc(self.size, sizeof(size_t))
        if self.keys == NULL or self.values == NULL:
            # __dealloc__ still runs for a partially constructed object and
            # releases whichever array was allocated; free(NULL) is a no-op.
            raise MemoryError()
    def __dealloc__(self):
        free(self.keys)
        free(self.values)
    def __getitem__(self, uint64_t key):
        """Return the value stored for ``key``, or 0 if it is not present."""
        return self.get(key)
    def __setitem__(self, uint64_t key, size_t value):
        """Store ``value`` under ``key``, evicting any entry in its bucket."""
        self.insert(key, value)
    def pop(self, uint64_t key):
        """Remove ``key`` from the table if present. Returns None."""
        # Previously called the non-existent ``self.delete`` and therefore
        # always raised AttributeError; the removal method is ``erase``.
        self.erase(key)
    def bucket(self, uint64_t key):
        """Return the index of the bucket that ``key`` maps to."""
        return _find(key, self.size)
    cdef int insert(self, uint64_t key, size_t value) nogil:
        # Unconditionally claim the bucket: a colliding older entry is lost.
        cdef size_t bucket = _find(key, self.size)
        self.keys[bucket] = key
        self.values[bucket] = value
        return 0
    cdef size_t get(self, uint64_t key) nogil:
        # A bucket only answers for the exact key it currently stores.
        cdef size_t bucket = _find(key, self.size)
        if self.keys[bucket] == key:
            return self.values[bucket]
        else:
            return 0
    cdef int erase(self, uint64_t key) nogil:
        cdef size_t bucket = _find(key, self.size)
        # Only clear the bucket when it really holds this key; otherwise
        # erasing an absent key would evict an unrelated colliding entry.
        if self.keys[bucket] == key:
            self.keys[bucket] = 0
            # Reset the value too so no stale value is visible via key 0.
            self.values[bucket] = 0
        return 0
@cython.cdivision(True)
 cdef inline size_t _find(uint64_t key, size_t size) nogil:
    # Bucket index for a key: the remainder of key divided by the table size.
    # C division semantics are enabled, so no zero-divisor check is emitted;
    # callers must pass a non-zero size.
    cdef size_t bucket = key % size
    return bucket
--- a/spacy/spacy.pxd
+++ b/spacy/spacy.pxd
@ -2,14 +2,12 @@ from libcpp.vector cimport vector
 from libc.stdint cimport uint64_t
 from sparsehash.dense_hash_map cimport dense_hash_map
-from sparsehash.sparse_hash_map cimport sparse_hash_map
+from _hashing cimport FixedTable
 # Circular import problems here
 ctypedef size_t Lexeme_addr
 ctypedef uint64_t StringHash
 ctypedef dense_hash_map[StringHash, size_t] Vocab
 ctypedef sparse_hash_map[StringHash, size_t] SparseVocab
 from spacy.lexeme cimport Lexeme
 from spacy.tokens cimport Tokens
@ -27,7 +25,7 @@ from spacy.lexeme cimport Orthography
 cdef class Language:
    cdef object name
-    cdef SparseVocab* happax
+    cdef FixedTable happax
    cdef Vocab* vocab
    cdef Vocab* distri
    cdef Vocab* ortho
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@ -55,18 +55,17 @@ def set_orth_flags(lex, length):
    return 0
-DEF MAX_HAPPAX = 1000000
+DEF MAX_HAPPAX = 1048576
 cdef class Language:
    def __cinit__(self, name):
        self.name = name
        self.bacov = {}
-        self.happax = new SparseVocab()
+        self.happax = FixedTable(MAX_HAPPAX)
        self.vocab = new Vocab()
        self.ortho = new Vocab()
        self.distri = new Vocab()
        self.happax[0].set_deleted_key(0)
        self.vocab[0].set_empty_key(0)
        self.distri[0].set_empty_key(0)
        self.ortho[0].set_empty_key(0)
@ -108,7 +107,7 @@ cdef class Language:
    cdef StringHash hash_string(self, Py_UNICODE* s, size_t length) except 0:
        '''Hash unicode with MurmurHash64A'''
        # The buffer is hashed as raw bytes: `length` code units of
        # sizeof(Py_UNICODE) bytes each. The trailing 0 argument is presumably
        # the hash seed -- confirm against the mrmr module.
        # `except 0` reserves a return value of 0 for signalling an exception
        # to Cython callers.
-        return mrmr.hash64(<Py_UNICODE*>s, length * sizeof(Py_UNICODE), 0)
+        return mrmr.real_hash64(<Py_UNICODE*>s, length * sizeof(Py_UNICODE), 0)
    cdef unicode unhash(self, StringHash hash_value):
        '''Fetch a string from the reverse index, given its hash value.'''
@ -128,32 +127,20 @@ cdef class Language:
        cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
        if word_ptr == NULL:
            # Now check words seen exactly once
-            word_ptr = <Lexeme*>self.happax[0][hashed]
+            word_ptr = <Lexeme*>self.happax.get(hashed)
            if word_ptr == NULL:
                start = self.find_split(string, length) if start == -1 else start
                word_ptr = self._add(hashed, string, start, length)
            else:
                # Second time word seen, move to vocab
                self.vocab[0][hashed] = <Lexeme_addr>word_ptr
-                self.happax[0].erase(hashed)
+                self.happax.erase(hashed)
        return <Lexeme_addr>word_ptr
    cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
        # Create a new lexeme for `string` with init_lexeme, store its address
        # in self.happax under `hashed`, record the string in self.bacov keyed
        # by the same hash, and return the new lexeme pointer.
        cdef size_t i
        cdef sparse_hash_map[StringHash, size_t].iterator it
        cdef pair[StringHash, size_t] last_elem
        # NOTE(review): this eviction branch uses the sparse_hash_map API
        # (size(), end(), erase()) through self.happax[0]; confirm whether it
        # still applies now that happax is declared as a FixedTable.
        if self.happax[0].size() >= MAX_HAPPAX:
            # Delete last element.
            last_elem = deref(self.happax[0].end())
            free(<Orthography*>self.ortho[0][last_elem.first])
            # TODO: Do this when we set distributions
            #free(<Distribution*>self.distri[0][last_elem.first])
            free(<Lexeme*>last_elem.second)
            self.happax[0].erase(last_elem.first)
            self.ortho[0].erase(last_elem.first)
            self.distri[0].erase(last_elem.first)
        word = self.init_lexeme(string, hashed, split, length)
-        self.happax[0][hashed] = <Lexeme_addr>word
+        self.happax.insert(hashed, <size_t>word)
        self.bacov[hashed] = string
        return word