mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	* Add FixedTable for hashing
This commit is contained in:
		
							parent
							
								
									a44e15f623
								
							
						
					
					
						commit
						f39211b2b1
					
				
							
								
								
									
										11
									
								
								spacy/_hashing.pxd
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										11
									
								
								spacy/_hashing.pxd
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,11 @@
 | 
				
			||||||
 | 
					from libc.stdint cimport uint64_t
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef class FixedTable:
					    # Storage for the table: `size` buckets, each bucket being one slot
					    # in `keys` paired with the slot at the same index in `values`.
					    cdef:
					        size_t size
					        uint64_t* keys
					        size_t* values

					    # C-level operations; all are declared nogil so they can be called
					    # without holding the GIL.
					    cdef int insert(self, uint64_t key, size_t value) nogil
					    cdef size_t get(self, uint64_t key) nogil
					    cdef int erase(self, uint64_t key) nogil
 | 
				
			||||||
							
								
								
									
										48
									
								
								spacy/_hashing.pyx
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										48
									
								
								spacy/_hashing.pyx
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,48 @@
 | 
				
			||||||
 | 
					from libc.stdlib cimport calloc, free
 | 
				
			||||||
 | 
					import cython
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef class FixedTable:
					    '''Fixed-size, direct-mapped table from uint64_t keys to size_t values.

					    Every key maps to exactly one bucket (key % size). There is no probing
					    or chaining: inserting a key whose bucket is already occupied
					    overwrites the previous occupant. An empty bucket holds key 0 and
					    value 0, so a stored value of 0 cannot be told apart from a missing
					    key.
					    '''
					    def __cinit__(self, const size_t size):
					        '''Allocate zero-filled key and value arrays with `size` buckets.

					        Raises ValueError if size is 0 and MemoryError if allocation fails.
					        '''
					        # _find() uses C modulo (cdivision), so a zero size would be
					        # undefined behaviour on the first lookup.
					        if size == 0:
					            raise ValueError("FixedTable size must be greater than zero")
					        self.size = size
					        self.keys = <uint64_t*>calloc(self.size, sizeof(uint64_t))
					        self.values = <size_t*>calloc(self.size, sizeof(size_t))
					        if self.keys == NULL or self.values == NULL:
					            # Whichever array was allocated is released by __dealloc__;
					            # free(NULL) is a no-op for the other one.
					            raise MemoryError()

					    def __dealloc__(self):
					        '''Release the key and value arrays.'''
					        free(self.keys)
					        free(self.values)

					    def __getitem__(self, uint64_t key):
					        '''Return the value stored for key, or 0 if key is not present.'''
					        return self.get(key)

					    def __setitem__(self, uint64_t key, size_t value):
					        '''Store value for key, replacing whatever occupied its bucket.'''
					        self.insert(key, value)

					    def pop(self, uint64_t key):
					        '''Remove key from the table and return the value it had.

					        Returns 0 if key was not present.
					        '''
					        # The original called self.delete(), which does not exist; the
					        # removal method on this class is erase().
					        cdef size_t value = self.get(key)
					        self.erase(key)
					        return value

					    def bucket(self, uint64_t key):
					        '''Return the index of the bucket that key maps to.'''
					        return _find(key, self.size)

					    cdef int insert(self, uint64_t key, size_t value) nogil:
					        '''Write key/value into key's bucket, evicting any previous entry.

					        Always returns 0.
					        '''
					        cdef size_t bucket = _find(key, self.size)
					        self.keys[bucket] = key
					        self.values[bucket] = value
					        return 0

					    cdef size_t get(self, uint64_t key) nogil:
					        '''Return the value stored for key, or 0 if its bucket holds another key.'''
					        cdef size_t bucket = _find(key, self.size)
					        if self.keys[bucket] == key:
					            return self.values[bucket]
					        else:
					            return 0

					    cdef int erase(self, uint64_t key) nogil:
					        '''Remove key if it is present.

					        Returns 1 if an entry was removed, 0 if key was not in the table.
					        '''
					        cdef size_t bucket = _find(key, self.size)
					        # Only clear the bucket when it really holds this key; otherwise
					        # erasing an absent key would evict an unrelated colliding entry.
					        if self.keys[bucket] != key:
					            return 0
					        self.keys[bucket] = 0
					        # Reset the value as well so a later get(0) on bucket 0 cannot
					        # return stale data.
					        self.values[bucket] = 0
					        return 1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@cython.cdivision(True)
					cdef inline size_t _find(uint64_t key, size_t size) nogil:
					    '''Return the bucket index for key in a table of `size` buckets.'''
					    # cdivision selects plain C modulo semantics for this function.
					    cdef size_t bucket = key % size
					    return bucket
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -2,14 +2,12 @@ from libcpp.vector cimport vector
 | 
				
			||||||
from libc.stdint cimport uint64_t
 | 
					from libc.stdint cimport uint64_t
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from sparsehash.dense_hash_map cimport dense_hash_map
 | 
					from sparsehash.dense_hash_map cimport dense_hash_map
 | 
				
			||||||
from sparsehash.sparse_hash_map cimport sparse_hash_map
 | 
					from _hashing cimport FixedTable
 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Circular import problems here
 | 
					# Circular import problems here
 | 
				
			||||||
ctypedef size_t Lexeme_addr
 | 
					ctypedef size_t Lexeme_addr
 | 
				
			||||||
ctypedef uint64_t StringHash
 | 
					ctypedef uint64_t StringHash
 | 
				
			||||||
ctypedef dense_hash_map[StringHash, size_t] Vocab
 | 
					ctypedef dense_hash_map[StringHash, size_t] Vocab
 | 
				
			||||||
ctypedef sparse_hash_map[StringHash, size_t] SparseVocab
 | 
					 | 
				
			||||||
from spacy.lexeme cimport Lexeme
 | 
					from spacy.lexeme cimport Lexeme
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from spacy.tokens cimport Tokens
 | 
					from spacy.tokens cimport Tokens
 | 
				
			||||||
| 
						 | 
					@ -27,7 +25,7 @@ from spacy.lexeme cimport Orthography
 | 
				
			||||||
 | 
					
 | 
				
			||||||
cdef class Language:
 | 
					cdef class Language:
 | 
				
			||||||
    cdef object name
 | 
					    cdef object name
 | 
				
			||||||
    cdef SparseVocab* happax
 | 
					    cdef FixedTable happax
 | 
				
			||||||
    cdef Vocab* vocab
 | 
					    cdef Vocab* vocab
 | 
				
			||||||
    cdef Vocab* distri
 | 
					    cdef Vocab* distri
 | 
				
			||||||
    cdef Vocab* ortho
 | 
					    cdef Vocab* ortho
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -55,18 +55,17 @@ def set_orth_flags(lex, length):
 | 
				
			||||||
    return 0
 | 
					    return 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
DEF MAX_HAPPAX = 1000000
 | 
					DEF MAX_HAPPAX = 1048576
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
cdef class Language:
 | 
					cdef class Language:
 | 
				
			||||||
    def __cinit__(self, name):
 | 
					    def __cinit__(self, name):
 | 
				
			||||||
        self.name = name
 | 
					        self.name = name
 | 
				
			||||||
        self.bacov = {}
 | 
					        self.bacov = {}
 | 
				
			||||||
        self.happax = new SparseVocab()
 | 
					        self.happax = FixedTable(MAX_HAPPAX)
 | 
				
			||||||
        self.vocab = new Vocab()
 | 
					        self.vocab = new Vocab()
 | 
				
			||||||
        self.ortho = new Vocab()
 | 
					        self.ortho = new Vocab()
 | 
				
			||||||
        self.distri = new Vocab()
 | 
					        self.distri = new Vocab()
 | 
				
			||||||
        self.happax[0].set_deleted_key(0)
 | 
					 | 
				
			||||||
        self.vocab[0].set_empty_key(0)
 | 
					        self.vocab[0].set_empty_key(0)
 | 
				
			||||||
        self.distri[0].set_empty_key(0)
 | 
					        self.distri[0].set_empty_key(0)
 | 
				
			||||||
        self.ortho[0].set_empty_key(0)
 | 
					        self.ortho[0].set_empty_key(0)
 | 
				
			||||||
| 
						 | 
					@ -108,7 +107,7 @@ cdef class Language:
 | 
				
			||||||
   
 | 
					   
 | 
				
			||||||
    cdef StringHash hash_string(self, Py_UNICODE* s, size_t length) except 0:
 | 
					    cdef StringHash hash_string(self, Py_UNICODE* s, size_t length) except 0:
 | 
				
			||||||
        '''Hash unicode with MurmurHash64A'''
 | 
					        '''Hash unicode with MurmurHash64A'''
 | 
				
			||||||
        return mrmr.hash64(<Py_UNICODE*>s, length * sizeof(Py_UNICODE), 0)
 | 
					        return mrmr.real_hash64(<Py_UNICODE*>s, length * sizeof(Py_UNICODE), 0)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    cdef unicode unhash(self, StringHash hash_value):
 | 
					    cdef unicode unhash(self, StringHash hash_value):
 | 
				
			||||||
        '''Fetch a string from the reverse index, given its hash value.'''
 | 
					        '''Fetch a string from the reverse index, given its hash value.'''
 | 
				
			||||||
| 
						 | 
					@ -128,32 +127,20 @@ cdef class Language:
 | 
				
			||||||
        cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
 | 
					        cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
 | 
				
			||||||
        if word_ptr == NULL:
 | 
					        if word_ptr == NULL:
 | 
				
			||||||
            # Now check words seen exactly once
 | 
					            # Now check words seen exactly once
 | 
				
			||||||
            word_ptr = <Lexeme*>self.happax[0][hashed]
 | 
					            word_ptr = <Lexeme*>self.happax.get(hashed)
 | 
				
			||||||
            if word_ptr == NULL:
 | 
					            if word_ptr == NULL:
 | 
				
			||||||
                start = self.find_split(string, length) if start == -1 else start
 | 
					                start = self.find_split(string, length) if start == -1 else start
 | 
				
			||||||
                word_ptr = self._add(hashed, string, start, length)
 | 
					                word_ptr = self._add(hashed, string, start, length)
 | 
				
			||||||
            else:
 | 
					            else:
 | 
				
			||||||
                # Second time word seen, move to vocab
 | 
					                # Second time word seen, move to vocab
 | 
				
			||||||
                self.vocab[0][hashed] = <Lexeme_addr>word_ptr
 | 
					                self.vocab[0][hashed] = <Lexeme_addr>word_ptr
 | 
				
			||||||
                self.happax[0].erase(hashed)
 | 
					                self.happax.erase(hashed)
 | 
				
			||||||
        return <Lexeme_addr>word_ptr
 | 
					        return <Lexeme_addr>word_ptr
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
 | 
					    cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
 | 
				
			||||||
        cdef size_t i
 | 
					        cdef size_t i
 | 
				
			||||||
        cdef sparse_hash_map[StringHash, size_t].iterator it
 | 
					 | 
				
			||||||
        cdef pair[StringHash, size_t] last_elem
 | 
					 | 
				
			||||||
        if self.happax[0].size() >= MAX_HAPPAX:
 | 
					 | 
				
			||||||
            # Delete last element.
 | 
					 | 
				
			||||||
            last_elem = deref(self.happax[0].end())
 | 
					 | 
				
			||||||
            free(<Orthography*>self.ortho[0][last_elem.first])
 | 
					 | 
				
			||||||
            # TODO: Do this when we set distributions
 | 
					 | 
				
			||||||
            #free(<Distribution*>self.distri[0][last_elem.first])
 | 
					 | 
				
			||||||
            free(<Lexeme*>last_elem.second)
 | 
					 | 
				
			||||||
            self.happax[0].erase(last_elem.first)
 | 
					 | 
				
			||||||
            self.ortho[0].erase(last_elem.first)
 | 
					 | 
				
			||||||
            self.distri[0].erase(last_elem.first)
 | 
					 | 
				
			||||||
        word = self.init_lexeme(string, hashed, split, length)
 | 
					        word = self.init_lexeme(string, hashed, split, length)
 | 
				
			||||||
        self.happax[0][hashed] = <Lexeme_addr>word
 | 
					        self.happax.insert(hashed, <size_t>word)
 | 
				
			||||||
        self.bacov[hashed] = string
 | 
					        self.bacov[hashed] = string
 | 
				
			||||||
        return word   
 | 
					        return word   
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user