Mirror of https://github.com/explosion/spaCy.git
commit d6e07aa922
parent 365a2af756

    Switch to 32bit hash for strings
@@ -1,8 +1,9 @@
+from libc.stdint cimport uint32_t
 from libc.stdint cimport uint64_t
 
 # Put these above import to avoid circular import problem
 ctypedef int ClusterID
-ctypedef uint64_t StringHash
+ctypedef uint32_t StringHash
 ctypedef size_t Lexeme_addr
 ctypedef char Bits8
 ctypedef uint64_t Bits64
@@ -1,4 +1,5 @@
 from libcpp.vector cimport vector
+from libc.stdint cimport uint32_t
 from libc.stdint cimport uint64_t
 
 from sparsehash.dense_hash_map cimport dense_hash_map
@@ -6,7 +7,7 @@ from _hashing cimport FixedTable
 
 # Circular import problems here
 ctypedef size_t Lexeme_addr
-ctypedef uint64_t StringHash
+ctypedef uint32_t StringHash
 ctypedef dense_hash_map[StringHash, size_t] Vocab
 from spacy.lexeme cimport Lexeme
 
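
The hunks above narrow StringHash from uint64_t to uint32_t, so every key in Vocab (a dense_hash_map from StringHash to a size_t lexeme address) now fits in 32 bits. Below is a minimal Python sketch of that key/value shape, not spaCy code: a plain dict stands in for dense_hash_map, and zlib.crc32 stands in for the MurmurHash call only because it is in the standard library and also returns an unsigned 32-bit value.

    # Minimal sketch only: a dict stands in for dense_hash_map[StringHash, size_t],
    # and zlib.crc32 stands in for MurmurHash purely to stay dependency-free.
    import zlib

    def string_hash_32(s):
        # Stand-in 32-bit string hash; the result is unsigned and
        # fits the new uint32_t StringHash.
        h = zlib.crc32(s.encode('utf8'))
        assert 0 <= h <= 0xFFFFFFFF
        return h

    vocab = {}                       # StringHash -> Lexeme_addr (size_t)
    key = string_hash_32(u'spaCy')
    vocab[key] = 0x1000              # hypothetical lexeme address, for illustration only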
@@ -107,11 +107,11 @@ cdef class Language:
 
     cdef StringHash hash_string(self, Py_UNICODE* s, size_t length) except 0:
         '''Hash unicode with MurmurHash64A'''
-        return mrmr.hash64(<Py_UNICODE*>s, length * sizeof(Py_UNICODE), 0)
+        return mrmr.hash32(<Py_UNICODE*>s, length * sizeof(Py_UNICODE), 0)
 
     cdef unicode unhash(self, StringHash hash_value):
         '''Fetch a string from the reverse index, given its hash value.'''
-        return self.bacov[hash_value]
+        return self.bacov[hash_value].decode('utf8')
 
     cdef Lexeme_addr lookup(self, int start, Py_UNICODE* string, size_t length) except 0:
         '''Fetch a Lexeme representing a word string. If the word has not been seen,
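
The hash_string/unhash pair in the hunk above now round-trips a unicode string through a 32-bit key and a byte-valued reverse index (bacov), which the hunks below start filling with UTF-8 bytes and which unhash decodes on the way out. A rough Python sketch of that round trip, under the same stand-in assumptions as above (zlib.crc32 in place of mrmr.hash32, a dict in place of the C++ map):

    import zlib

    bacov = {}                        # StringHash -> UTF-8 bytes of the original string

    def hash_string(s):
        # Stand-in for mrmr.hash32(<Py_UNICODE*>s, length * sizeof(Py_UNICODE), 0)
        return zlib.crc32(s.encode('utf8'))

    def intern(s):
        h = hash_string(s)
        bacov[h] = s.encode('utf8')   # mirrors the bacov assignments changed below
        return h

    def unhash(h):
        # Mirrors the added .decode('utf8') in Language.unhash
        return bacov[h].decode('utf8')

    assert unhash(intern(u'tokenize')) == u'tokenize'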
@@ -147,7 +147,7 @@ cdef class Language:
         self._happax_to_vocab(self.happax.keys[hashed % self.happax.size],
                               self.happax.values[hashed % self.happax.size])
         self.happax.insert(hashed, <size_t>word)
-        self.bacov[hashed] = string
+        self.bacov[hashed] = string.encode('utf8')
         return word
 
     cpdef Tokens tokenize(self, unicode string):
@@ -202,7 +202,7 @@ cdef class Language:
             tail_string = ''
 
         word.lex = self.hash_string(lex, len(lex))
-        self.bacov[word.lex] = lex
+        self.bacov[word.lex] = lex.encode('utf8')
         word.orth = <Orthography*>self.ortho[0][word.lex]
         if word.orth == NULL:
             word.orth = self.init_orth(word.lex, lex)
@@ -231,9 +231,9 @@ cdef class Language:
         orth.shape = self.hash_string(shape, len(shape))
         orth.norm = self.hash_string(norm, len(norm))
 
-        self.bacov[orth.last3] = last3
-        self.bacov[orth.shape] = shape
-        self.bacov[orth.norm] = norm
+        self.bacov[orth.last3] = last3.encode('utf8')
+        self.bacov[orth.shape] = shape.encode('utf8')
+        self.bacov[orth.norm] = norm.encode('utf8')
 
         self.ortho[0][hashed] = <size_t>orth
         return orth
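
The last hunk applies the same pattern to the derived Orthography strings: last3, shape, and norm are each hashed with the same 32-bit function and their UTF-8 bytes kept in bacov so they can be unhashed later. Here is a self-contained toy sketch of that bookkeeping; the way last3, shape, and norm are derived below is an assumption for illustration only, since their real definitions live elsewhere in the file and are not part of this diff.

    import zlib

    bacov = {}

    def hash_string(s):
        return zlib.crc32(s.encode('utf8'))       # stand-in 32-bit hash, as above

    def toy_orth_strings(word):
        # Hypothetical derivations, for illustration only.
        shape = ''.join('X' if c.isupper() else 'x' if c.islower() else
                        'd' if c.isdigit() else c for c in word)
        return {'last3': word[-3:], 'shape': shape, 'norm': word.lower()}

    def init_orth(word):
        keys = {}
        for name, s in toy_orth_strings(word).items():
            h = hash_string(s)
            bacov[h] = s.encode('utf8')           # e.g. bacov[orth.shape] = shape.encode('utf8')
            keys[name] = h
        return keys

    # Each derived string gets its own 32-bit key and stays recoverable from bacov.
    print(init_orth(u'Hello'))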