* Add FixedTable for hashing

This commit is contained in:
Matthew Honnibal 2014-08-01 07:27:21 +01:00
parent a44e15f623
commit f39211b2b1
4 changed files with 67 additions and 23 deletions

11
spacy/_hashing.pxd Normal file
View File

@ -0,0 +1,11 @@
from libc.stdint cimport uint64_t
cdef class FixedTable:
    # C-level attributes: the slot count and two parallel arrays that are
    # indexed by bucket number.
    cdef:
        size_t size
        uint64_t* keys
        size_t* values

    # C-level API; all three methods are declared nogil.
    cdef int insert(self, uint64_t key, size_t value) nogil
    cdef size_t get(self, uint64_t key) nogil
    cdef int erase(self, uint64_t key) nogil

48
spacy/_hashing.pyx Normal file
View File

@ -0,0 +1,48 @@
from libc.stdlib cimport calloc, free
import cython
cdef class FixedTable:
    """Fixed-size, lossy hash table mapping uint64 keys to size_t values.

    Every key maps to exactly one bucket (see `_find`); there is no probing,
    so inserting a key whose bucket is already occupied overwrites the
    previous entry. Buckets start zeroed, and looking up a key that is not
    stored returns 0, so a stored key of 0 cannot be told apart from an
    empty bucket.
    """
    def __cinit__(self, const size_t size):
        # A zero-slot table would make the bucket computation divide by zero.
        if size == 0:
            raise ValueError("FixedTable size must be greater than zero")
        self.size = size
        self.keys = <uint64_t*>calloc(self.size, sizeof(uint64_t))
        self.values = <size_t*>calloc(self.size, sizeof(size_t))
        # calloc signals failure with NULL; fail loudly instead of writing
        # through a NULL pointer later. __dealloc__ releases whichever
        # allocation succeeded.
        if self.keys == NULL or self.values == NULL:
            raise MemoryError()

    def __dealloc__(self):
        # free(NULL) is a no-op, so this is safe after a failed __cinit__.
        free(self.keys)
        free(self.values)

    def __getitem__(self, uint64_t key):
        """Return the value stored for `key`, or 0 if `key` is not stored."""
        return self.get(key)

    def __setitem__(self, uint64_t key, size_t value):
        """Store `value` for `key`, replacing whatever occupied its bucket."""
        self.insert(key, value)

    def pop(self, uint64_t key):
        """Remove `key` from the table if it is stored. Returns None."""
        # The C-level removal method is `erase`; there is no `delete`.
        self.erase(key)

    def bucket(self, uint64_t key):
        """Return the bucket index that `key` maps to."""
        return _find(key, self.size)

    cdef int insert(self, uint64_t key, size_t value) nogil:
        """Write `key`/`value` into the key's bucket, evicting any occupant.

        Always returns 0.
        """
        cdef size_t bucket = _find(key, self.size)
        self.keys[bucket] = key
        self.values[bucket] = value
        return 0

    cdef size_t get(self, uint64_t key) nogil:
        """Return the value stored for `key`, or 0 if its bucket holds
        a different key."""
        cdef size_t bucket = _find(key, self.size)
        if self.keys[bucket] == key:
            return self.values[bucket]
        return 0

    cdef int erase(self, uint64_t key) nogil:
        """Clear the bucket for `key`, but only if it actually holds `key`.

        Always returns 0.
        """
        cdef size_t bucket = _find(key, self.size)
        # Only touch the bucket when it belongs to `key`: a colliding key may
        # have replaced it, and that entry must survive. The value is reset
        # too so that a later lookup of key 0 in this bucket does not return
        # a stale value.
        if self.keys[bucket] == key:
            self.keys[bucket] = 0
            self.values[bucket] = 0
        return 0
@cython.cdivision(True)
cdef inline size_t _find(uint64_t key, size_t size) nogil:
    """Return the bucket index for `key` in a table with `size` slots.

    C division semantics are enabled for this function, so no check for a
    zero divisor is generated; `size` must be non-zero.
    """
    cdef size_t bucket = key % size
    return bucket

View File

@ -2,14 +2,12 @@ from libcpp.vector cimport vector
from libc.stdint cimport uint64_t from libc.stdint cimport uint64_t
from sparsehash.dense_hash_map cimport dense_hash_map from sparsehash.dense_hash_map cimport dense_hash_map
from sparsehash.sparse_hash_map cimport sparse_hash_map from _hashing cimport FixedTable
# Circular import problems here # Circular import problems here
ctypedef size_t Lexeme_addr ctypedef size_t Lexeme_addr
ctypedef uint64_t StringHash ctypedef uint64_t StringHash
ctypedef dense_hash_map[StringHash, size_t] Vocab ctypedef dense_hash_map[StringHash, size_t] Vocab
ctypedef sparse_hash_map[StringHash, size_t] SparseVocab
from spacy.lexeme cimport Lexeme from spacy.lexeme cimport Lexeme
from spacy.tokens cimport Tokens from spacy.tokens cimport Tokens
@ -27,7 +25,7 @@ from spacy.lexeme cimport Orthography
cdef class Language: cdef class Language:
cdef object name cdef object name
cdef SparseVocab* happax cdef FixedTable happax
cdef Vocab* vocab cdef Vocab* vocab
cdef Vocab* distri cdef Vocab* distri
cdef Vocab* ortho cdef Vocab* ortho

View File

@ -55,18 +55,17 @@ def set_orth_flags(lex, length):
return 0 return 0
DEF MAX_HAPPAX = 1000000 DEF MAX_HAPPAX = 1048576
cdef class Language: cdef class Language:
def __cinit__(self, name): def __cinit__(self, name):
self.name = name self.name = name
self.bacov = {} self.bacov = {}
self.happax = new SparseVocab() self.happax = FixedTable(MAX_HAPPAX)
self.vocab = new Vocab() self.vocab = new Vocab()
self.ortho = new Vocab() self.ortho = new Vocab()
self.distri = new Vocab() self.distri = new Vocab()
self.happax[0].set_deleted_key(0)
self.vocab[0].set_empty_key(0) self.vocab[0].set_empty_key(0)
self.distri[0].set_empty_key(0) self.distri[0].set_empty_key(0)
self.ortho[0].set_empty_key(0) self.ortho[0].set_empty_key(0)
@ -108,7 +107,7 @@ cdef class Language:
cdef StringHash hash_string(self, Py_UNICODE* s, size_t length) except 0: cdef StringHash hash_string(self, Py_UNICODE* s, size_t length) except 0:
'''Hash unicode with MurmurHash64A''' '''Hash unicode with MurmurHash64A'''
return mrmr.hash64(<Py_UNICODE*>s, length * sizeof(Py_UNICODE), 0) return mrmr.real_hash64(<Py_UNICODE*>s, length * sizeof(Py_UNICODE), 0)
cdef unicode unhash(self, StringHash hash_value): cdef unicode unhash(self, StringHash hash_value):
'''Fetch a string from the reverse index, given its hash value.''' '''Fetch a string from the reverse index, given its hash value.'''
@ -128,32 +127,20 @@ cdef class Language:
cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed] cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
if word_ptr == NULL: if word_ptr == NULL:
# Now check words seen exactly once # Now check words seen exactly once
word_ptr = <Lexeme*>self.happax[0][hashed] word_ptr = <Lexeme*>self.happax.get(hashed)
if word_ptr == NULL: if word_ptr == NULL:
start = self.find_split(string, length) if start == -1 else start start = self.find_split(string, length) if start == -1 else start
word_ptr = self._add(hashed, string, start, length) word_ptr = self._add(hashed, string, start, length)
else: else:
# Second time word seen, move to vocab # Second time word seen, move to vocab
self.vocab[0][hashed] = <Lexeme_addr>word_ptr self.vocab[0][hashed] = <Lexeme_addr>word_ptr
self.happax[0].erase(hashed) self.happax.erase(hashed)
return <Lexeme_addr>word_ptr return <Lexeme_addr>word_ptr
cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length): cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
cdef size_t i cdef size_t i
cdef sparse_hash_map[StringHash, size_t].iterator it
cdef pair[StringHash, size_t] last_elem
if self.happax[0].size() >= MAX_HAPPAX:
# Delete last element.
last_elem = deref(self.happax[0].end())
free(<Orthography*>self.ortho[0][last_elem.first])
# TODO: Do this when we set distributions
#free(<Distribution*>self.distri[0][last_elem.first])
free(<Lexeme*>last_elem.second)
self.happax[0].erase(last_elem.first)
self.ortho[0].erase(last_elem.first)
self.distri[0].erase(last_elem.first)
word = self.init_lexeme(string, hashed, split, length) word = self.init_lexeme(string, hashed, split, length)
self.happax[0][hashed] = <Lexeme_addr>word self.happax.insert(hashed, <size_t>word)
self.bacov[hashed] = string self.bacov[hashed] = string
return word return word