* Switch to 32-bit hash for strings

This commit is contained in:
Matthew Honnibal 2014-08-02 21:51:52 +01:00
parent 365a2af756
commit d6e07aa922
3 changed files with 11 additions and 9 deletions

View File

@ -1,8 +1,9 @@
from libc.stdint cimport uint32_t
from libc.stdint cimport uint64_t from libc.stdint cimport uint64_t
# Put these above import to avoid circular import problem # Put these above import to avoid circular import problem
ctypedef int ClusterID ctypedef int ClusterID
ctypedef uint64_t StringHash ctypedef uint32_t StringHash
ctypedef size_t Lexeme_addr ctypedef size_t Lexeme_addr
ctypedef char Bits8 ctypedef char Bits8
ctypedef uint64_t Bits64 ctypedef uint64_t Bits64

View File

@ -1,4 +1,5 @@
from libcpp.vector cimport vector from libcpp.vector cimport vector
from libc.stdint cimport uint32_t
from libc.stdint cimport uint64_t from libc.stdint cimport uint64_t
from sparsehash.dense_hash_map cimport dense_hash_map from sparsehash.dense_hash_map cimport dense_hash_map
@ -6,7 +7,7 @@ from _hashing cimport FixedTable
# Circular import problems here # Circular import problems here
ctypedef size_t Lexeme_addr ctypedef size_t Lexeme_addr
ctypedef uint64_t StringHash ctypedef uint32_t StringHash
ctypedef dense_hash_map[StringHash, size_t] Vocab ctypedef dense_hash_map[StringHash, size_t] Vocab
from spacy.lexeme cimport Lexeme from spacy.lexeme cimport Lexeme

View File

@ -107,11 +107,11 @@ cdef class Language:
cdef StringHash hash_string(self, Py_UNICODE* s, size_t length) except 0: cdef StringHash hash_string(self, Py_UNICODE* s, size_t length) except 0:
'''Hash unicode with MurmurHash64A''' '''Hash unicode with 32-bit MurmurHash'''
return mrmr.hash64(<Py_UNICODE*>s, length * sizeof(Py_UNICODE), 0) return mrmr.hash32(<Py_UNICODE*>s, length * sizeof(Py_UNICODE), 0)
cdef unicode unhash(self, StringHash hash_value): cdef unicode unhash(self, StringHash hash_value):
'''Fetch a string from the reverse index, given its hash value.''' '''Fetch a string from the reverse index, given its hash value.'''
return self.bacov[hash_value] return self.bacov[hash_value].decode('utf8')
cdef Lexeme_addr lookup(self, int start, Py_UNICODE* string, size_t length) except 0: cdef Lexeme_addr lookup(self, int start, Py_UNICODE* string, size_t length) except 0:
'''Fetch a Lexeme representing a word string. If the word has not been seen, '''Fetch a Lexeme representing a word string. If the word has not been seen,
@ -147,7 +147,7 @@ cdef class Language:
self._happax_to_vocab(self.happax.keys[hashed % self.happax.size], self._happax_to_vocab(self.happax.keys[hashed % self.happax.size],
self.happax.values[hashed % self.happax.size]) self.happax.values[hashed % self.happax.size])
self.happax.insert(hashed, <size_t>word) self.happax.insert(hashed, <size_t>word)
self.bacov[hashed] = string self.bacov[hashed] = string.encode('utf8')
return word return word
cpdef Tokens tokenize(self, unicode string): cpdef Tokens tokenize(self, unicode string):
@ -202,7 +202,7 @@ cdef class Language:
tail_string = '' tail_string = ''
word.lex = self.hash_string(lex, len(lex)) word.lex = self.hash_string(lex, len(lex))
self.bacov[word.lex] = lex self.bacov[word.lex] = lex.encode('utf8')
word.orth = <Orthography*>self.ortho[0][word.lex] word.orth = <Orthography*>self.ortho[0][word.lex]
if word.orth == NULL: if word.orth == NULL:
word.orth = self.init_orth(word.lex, lex) word.orth = self.init_orth(word.lex, lex)
@ -231,9 +231,9 @@ cdef class Language:
orth.shape = self.hash_string(shape, len(shape)) orth.shape = self.hash_string(shape, len(shape))
orth.norm = self.hash_string(norm, len(norm)) orth.norm = self.hash_string(norm, len(norm))
self.bacov[orth.last3] = last3 self.bacov[orth.last3] = last3.encode('utf8')
self.bacov[orth.shape] = shape self.bacov[orth.shape] = shape.encode('utf8')
self.bacov[orth.norm] = norm self.bacov[orth.norm] = norm.encode('utf8')
self.ortho[0][hashed] = <size_t>orth self.ortho[0][hashed] = <size_t>orth
return orth return orth