mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 17:54:39 +03:00
* Add FixedTable for hashing
This commit is contained in:
parent
a44e15f623
commit
f39211b2b1
11
spacy/_hashing.pxd
Normal file
11
spacy/_hashing.pxd
Normal file
|
@@ -0,0 +1,11 @@
|
||||||
|
# Declarations for FixedTable: a fixed-size hash table mapping uint64_t
# keys to size_t values.  There is no collision probing -- insert()
# overwrites whatever occupies the bucket (see _hashing.pyx), so the
# table behaves as a forgetful cache.  Key 0 marks an empty bucket.
from libc.stdint cimport uint64_t


cdef class FixedTable:
    cdef size_t size       # number of buckets, fixed at construction
    cdef uint64_t* keys    # bucket -> stored key (0 == empty)
    cdef size_t* values    # bucket -> stored value

    cdef int insert(self, uint64_t key, size_t value) nogil
    cdef size_t get(self, uint64_t key) nogil
    cdef int erase(self, uint64_t key) nogil
|
48
spacy/_hashing.pyx
Normal file
48
spacy/_hashing.pyx
Normal file
|
@@ -0,0 +1,48 @@
|
||||||
|
from libc.stdlib cimport calloc, free
|
||||||
|
import cython
|
||||||
|
|
||||||
|
|
||||||
|
cdef class FixedTable:
    """A fixed-size hash table mapping uint64_t keys to size_t values.

    There is no collision resolution: insert() simply overwrites whatever
    occupied the target bucket, so the table acts as a forgetful cache.
    Key 0 is reserved to mean "empty bucket" and must not be used as a
    real key.  get() returns 0 on a miss, so a stored value of 0 is
    indistinguishable from "not found".
    """
    def __cinit__(self, const size_t size):
        self.size = size
        # calloc zero-fills, so every bucket starts out empty (key == 0).
        self.keys = <uint64_t*>calloc(self.size, sizeof(uint64_t))
        self.values = <size_t*>calloc(self.size, sizeof(size_t))
        # Robustness fix: calloc can return NULL; previously this was
        # unchecked and the first insert would have dereferenced NULL.
        if self.keys is NULL or self.values is NULL:
            raise MemoryError()

    def __dealloc__(self):
        # free(NULL) is a no-op, so this is safe even if __cinit__ raised
        # after only one allocation succeeded.
        free(self.keys)
        free(self.values)

    def __getitem__(self, uint64_t key):
        return self.get(key)

    def __setitem__(self, uint64_t key, size_t value):
        self.insert(key, value)

    def pop(self, uint64_t key):
        # Bug fix: this called self.delete(key), but no delete() method
        # exists -- the cdef method is named erase(), and cdef-class
        # attribute lookup makes the old call a compile error.
        self.erase(key)

    def bucket(self, uint64_t key):
        """Return the bucket index *key* hashes to."""
        return _find(key, self.size)

    cdef int insert(self, uint64_t key, size_t value) nogil:
        cdef size_t bucket = _find(key, self.size)
        # Unconditional overwrite: a colliding key evicts the old entry.
        self.keys[bucket] = key
        self.values[bucket] = value

    cdef size_t get(self, uint64_t key) nogil:
        cdef size_t bucket = _find(key, self.size)
        if self.keys[bucket] == key:
            return self.values[bucket]
        else:
            # 0 doubles as the "miss" sentinel.
            return 0

    cdef int erase(self, uint64_t key) nogil:
        cdef size_t bucket = _find(key, self.size)
        # Bug fix: only clear the bucket if it actually holds *key*.
        # Previously the bucket was zeroed unconditionally, so erasing a
        # key whose bucket had since been taken by a colliding key would
        # wrongly destroy the other key's entry.
        if self.keys[bucket] == key:
            self.keys[bucket] = 0
        # Stale value is left behind deliberately: get() checks the key
        # before returning the value, so it is never observed.
|
# Fix: the cdivision directive used as a decorator takes a boolean
# argument; bare @cython.cdivision is not a valid directive form.
@cython.cdivision(True)
cdef inline size_t _find(uint64_t key, size_t size) nogil:
    # Map a key to a bucket index with plain C '%' (no Python-level
    # zero-division check, thanks to cdivision).  Assumes size != 0 --
    # TODO(review): confirm callers never construct a zero-sized table.
    return key % size
||||||
|
|
|
@@ -2,14 +2,12 @@ from libcpp.vector cimport vector
|
||||||
from libc.stdint cimport uint64_t
|
from libc.stdint cimport uint64_t
|
||||||
|
|
||||||
from sparsehash.dense_hash_map cimport dense_hash_map
|
from sparsehash.dense_hash_map cimport dense_hash_map
|
||||||
from sparsehash.sparse_hash_map cimport sparse_hash_map
|
from _hashing cimport FixedTable
|
||||||
|
|
||||||
|
|
||||||
# Circular import problems here
|
# Circular import problems here
|
||||||
ctypedef size_t Lexeme_addr
|
ctypedef size_t Lexeme_addr
|
||||||
ctypedef uint64_t StringHash
|
ctypedef uint64_t StringHash
|
||||||
ctypedef dense_hash_map[StringHash, size_t] Vocab
|
ctypedef dense_hash_map[StringHash, size_t] Vocab
|
||||||
ctypedef sparse_hash_map[StringHash, size_t] SparseVocab
|
|
||||||
from spacy.lexeme cimport Lexeme
|
from spacy.lexeme cimport Lexeme
|
||||||
|
|
||||||
from spacy.tokens cimport Tokens
|
from spacy.tokens cimport Tokens
|
||||||
|
@@ -27,7 +25,7 @@ from spacy.lexeme cimport Orthography
|
||||||
|
|
||||||
cdef class Language:
|
cdef class Language:
|
||||||
cdef object name
|
cdef object name
|
||||||
cdef SparseVocab* happax
|
cdef FixedTable happax
|
||||||
cdef Vocab* vocab
|
cdef Vocab* vocab
|
||||||
cdef Vocab* distri
|
cdef Vocab* distri
|
||||||
cdef Vocab* ortho
|
cdef Vocab* ortho
|
||||||
|
|
|
@@ -55,18 +55,17 @@ def set_orth_flags(lex, length):
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
DEF MAX_HAPPAX = 1000000
|
DEF MAX_HAPPAX = 1048576
|
||||||
|
|
||||||
|
|
||||||
cdef class Language:
|
cdef class Language:
|
||||||
def __cinit__(self, name):
|
def __cinit__(self, name):
|
||||||
self.name = name
|
self.name = name
|
||||||
self.bacov = {}
|
self.bacov = {}
|
||||||
self.happax = new SparseVocab()
|
self.happax = FixedTable(MAX_HAPPAX)
|
||||||
self.vocab = new Vocab()
|
self.vocab = new Vocab()
|
||||||
self.ortho = new Vocab()
|
self.ortho = new Vocab()
|
||||||
self.distri = new Vocab()
|
self.distri = new Vocab()
|
||||||
self.happax[0].set_deleted_key(0)
|
|
||||||
self.vocab[0].set_empty_key(0)
|
self.vocab[0].set_empty_key(0)
|
||||||
self.distri[0].set_empty_key(0)
|
self.distri[0].set_empty_key(0)
|
||||||
self.ortho[0].set_empty_key(0)
|
self.ortho[0].set_empty_key(0)
|
||||||
|
@@ -108,7 +107,7 @@ cdef class Language:
|
||||||
|
|
||||||
cdef StringHash hash_string(self, Py_UNICODE* s, size_t length) except 0:
|
cdef StringHash hash_string(self, Py_UNICODE* s, size_t length) except 0:
|
||||||
'''Hash unicode with MurmurHash64A'''
|
'''Hash unicode with MurmurHash64A'''
|
||||||
return mrmr.hash64(<Py_UNICODE*>s, length * sizeof(Py_UNICODE), 0)
|
return mrmr.real_hash64(<Py_UNICODE*>s, length * sizeof(Py_UNICODE), 0)
|
||||||
|
|
||||||
cdef unicode unhash(self, StringHash hash_value):
|
cdef unicode unhash(self, StringHash hash_value):
|
||||||
'''Fetch a string from the reverse index, given its hash value.'''
|
'''Fetch a string from the reverse index, given its hash value.'''
|
||||||
|
@@ -128,32 +127,20 @@ cdef class Language:
|
||||||
cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
|
cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
|
||||||
if word_ptr == NULL:
|
if word_ptr == NULL:
|
||||||
# Now check words seen exactly once
|
# Now check words seen exactly once
|
||||||
word_ptr = <Lexeme*>self.happax[0][hashed]
|
word_ptr = <Lexeme*>self.happax.get(hashed)
|
||||||
if word_ptr == NULL:
|
if word_ptr == NULL:
|
||||||
start = self.find_split(string, length) if start == -1 else start
|
start = self.find_split(string, length) if start == -1 else start
|
||||||
word_ptr = self._add(hashed, string, start, length)
|
word_ptr = self._add(hashed, string, start, length)
|
||||||
else:
|
else:
|
||||||
# Second time word seen, move to vocab
|
# Second time word seen, move to vocab
|
||||||
self.vocab[0][hashed] = <Lexeme_addr>word_ptr
|
self.vocab[0][hashed] = <Lexeme_addr>word_ptr
|
||||||
self.happax[0].erase(hashed)
|
self.happax.erase(hashed)
|
||||||
return <Lexeme_addr>word_ptr
|
return <Lexeme_addr>word_ptr
|
||||||
|
|
||||||
cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
|
cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
|
||||||
cdef size_t i
|
cdef size_t i
|
||||||
cdef sparse_hash_map[StringHash, size_t].iterator it
|
|
||||||
cdef pair[StringHash, size_t] last_elem
|
|
||||||
if self.happax[0].size() >= MAX_HAPPAX:
|
|
||||||
# Delete last element.
|
|
||||||
last_elem = deref(self.happax[0].end())
|
|
||||||
free(<Orthography*>self.ortho[0][last_elem.first])
|
|
||||||
# TODO: Do this when we set distributions
|
|
||||||
#free(<Distribution*>self.distri[0][last_elem.first])
|
|
||||||
free(<Lexeme*>last_elem.second)
|
|
||||||
self.happax[0].erase(last_elem.first)
|
|
||||||
self.ortho[0].erase(last_elem.first)
|
|
||||||
self.distri[0].erase(last_elem.first)
|
|
||||||
word = self.init_lexeme(string, hashed, split, length)
|
word = self.init_lexeme(string, hashed, split, length)
|
||||||
self.happax[0][hashed] = <Lexeme_addr>word
|
self.happax.insert(hashed, <size_t>word)
|
||||||
self.bacov[hashed] = string
|
self.bacov[hashed] = string
|
||||||
return word
|
return word
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user