mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 02:36:32 +03:00
* Replaced cache with own hash table. Similar timing
This commit is contained in:
parent
c8db76e3e1
commit
85d68e8e95
|
@ -1,5 +1,7 @@
|
|||
ctypedef key_t size_t
|
||||
ctypedef val_t size_t
|
||||
from libc.stdint cimport uint64_t
|
||||
|
||||
ctypedef uint64_t key_t
|
||||
ctypedef size_t val_t
|
||||
|
||||
|
||||
cdef struct Cell:
|
||||
|
@ -14,5 +16,5 @@ cdef class PointerHash:
|
|||
|
||||
cdef size_t find_slot(self, key_t key)
|
||||
cdef Cell* lookup(self, key_t key)
|
||||
cdef void insert(self, key_t key)
|
||||
cdef void insert(self, key_t key, val_t value)
|
||||
cdef void resize(self, size_t new_size)
|
||||
|
|
|
@ -1,3 +1,8 @@
|
|||
# cython: profile=True
|
||||
from libc.stdlib cimport calloc, free
|
||||
cimport cython
|
||||
|
||||
|
||||
cdef class PointerHash:
|
||||
def __cinit__(self, size_t initial_size=8):
|
||||
self.size = initial_size
|
||||
|
@ -10,20 +15,26 @@ cdef class PointerHash:
|
|||
free(self.cells)
|
||||
|
||||
def __getitem__(self, key_t key):
|
||||
assert key != 0
|
||||
cdef Cell* cell = self.lookup(key)
|
||||
return cell.value if cell.key != 0 else None
|
||||
|
||||
def __setitem__(self, key_t key, val_t value):
|
||||
self.insert(key, value
|
||||
assert key != 0
|
||||
self.insert(key, value)
|
||||
|
||||
@cython.cdivision
|
||||
cdef size_t find_slot(self, key_t key):
|
||||
cdef size_t i = key % self.size
|
||||
cdef size_t i = (key % self.size)
|
||||
while self.cells[i].key != 0 and self.cells[i].key != key:
|
||||
i = (i + 1) % self.size
|
||||
return i
|
||||
|
||||
@cython.cdivision
|
||||
cdef Cell* lookup(self, key_t key):
|
||||
cdef size_t i = self.find_slot(key)
|
||||
cdef size_t i = (key % self.size)
|
||||
while self.cells[i].key != 0 and self.cells[i].key != key:
|
||||
i = (i + 1) % self.size
|
||||
return &self.cells[i]
|
||||
|
||||
cdef void insert(self, key_t key, val_t value):
|
||||
|
@ -36,7 +47,7 @@ cdef class PointerHash:
|
|||
self.resize(self.size * 2)
|
||||
|
||||
cdef void resize(self, size_t new_size):
|
||||
assert new_size & (new_size - 1)) == 0 # Must be a power of 2
|
||||
assert (new_size & (new_size - 1)) == 0 # Must be a power of 2
|
||||
assert self.filled * 4 <= new_size * 3
|
||||
|
||||
self.size = new_size
|
||||
|
@ -47,5 +58,8 @@ cdef class PointerHash:
|
|||
self.size = new_size
|
||||
self.cells = <Cell*>calloc(new_size, sizeof(Cell))
|
||||
|
||||
self.filled = 0
|
||||
cdef size_t i
|
||||
for i in range(old_size):
|
||||
if self.cells[i].key != 0:
|
||||
self.insert(self.cells[i].key, self.cells[i].value)
|
||||
|
|
|
@ -43,6 +43,7 @@ from libc.stdint cimport uint64_t
|
|||
cimport lang
|
||||
from spacy.lexeme cimport lexeme_check_flag
|
||||
from spacy.lexeme cimport lexeme_string_view
|
||||
from spacy._hashing cimport PointerHash
|
||||
|
||||
from spacy import util
|
||||
|
||||
|
@ -236,7 +237,7 @@ cdef class English(Language):
|
|||
fl_is_digit = Flag_IsDigit
|
||||
v_shape = View_WordShape
|
||||
def __cinit__(self, name, user_string_features, user_flag_features):
|
||||
self.cache.set_empty_key(0)
|
||||
self.cache = PointerHash(2 ** 25)
|
||||
self.specials.set_empty_key(0)
|
||||
lang_data = util.read_lang_data(name)
|
||||
rules, words, probs, clusters, case_stats, tag_stats = lang_data
|
||||
|
|
|
@ -3,6 +3,7 @@ from libc.stdint cimport uint64_t
|
|||
from spacy.word cimport Lexeme
|
||||
from spacy.tokens cimport Tokens
|
||||
from spacy.lexeme cimport LexemeC
|
||||
from spacy._hashing cimport PointerHash
|
||||
|
||||
from libcpp.utility cimport pair
|
||||
from libcpp.vector cimport vector
|
||||
|
@ -77,7 +78,7 @@ cdef class Lexicon:
|
|||
|
||||
cdef class Language:
|
||||
cdef unicode name
|
||||
cdef dense_hash_map[uint64_t, size_t] cache
|
||||
cdef PointerHash cache
|
||||
cdef dense_hash_map[uint64_t, size_t] specials
|
||||
cpdef readonly Lexicon lexicon
|
||||
cpdef readonly object tokens_class
|
||||
|
|
|
@ -19,6 +19,8 @@ from spacy.tokens import Tokens
|
|||
from spacy.lexeme cimport LexemeC, lexeme_init
|
||||
from murmurhash.mrmr cimport hash64
|
||||
|
||||
from spacy._hashing cimport PointerHash
|
||||
from spacy._hashing cimport Cell
|
||||
|
||||
cdef class Language:
|
||||
"""Base class for language-specific tokenizers.
|
||||
|
@ -40,7 +42,7 @@ cdef class Language:
|
|||
if string_features is None:
|
||||
string_features = []
|
||||
self.name = name
|
||||
self.cache.set_empty_key(0)
|
||||
self.cache = PointerHash(2 ** 22)
|
||||
self.specials.set_empty_key(0)
|
||||
lang_data = read_lang_data(name)
|
||||
rules, words, probs, clusters, case_stats, tag_stats = lang_data
|
||||
|
@ -110,17 +112,19 @@ cdef class Language:
|
|||
return tokens
|
||||
|
||||
cdef int _tokenize(self, Tokens tokens, String* string):
|
||||
cdef LexemeC** lexemes = <LexemeC**>self.cache[string.key]
|
||||
lexemes = <LexemeC**>self.cache[string.key]
|
||||
cdef Cell* cell = self.cache.lookup(string.key)
|
||||
cdef LexemeC** lexemes
|
||||
cdef size_t i
|
||||
if lexemes != NULL:
|
||||
if cell.key != 0:
|
||||
lexemes = <LexemeC**>cell.value
|
||||
i = 0
|
||||
while lexemes[i] != NULL:
|
||||
tokens.push_back(lexemes[i])
|
||||
i += 1
|
||||
return 0
|
||||
cdef uint64_t hashed = string.key
|
||||
|
||||
cell.key = string.key
|
||||
self.cache.filled += 1
|
||||
cdef size_t first_token = tokens.length
|
||||
cdef int split
|
||||
cdef int remaining = string.n
|
||||
|
@ -141,7 +145,7 @@ cdef class Language:
|
|||
cdef size_t j
|
||||
for i, j in enumerate(range(first_token, tokens.length)):
|
||||
lexemes[i] = tokens.lexemes[j]
|
||||
self.cache[hashed] = <size_t>lexemes
|
||||
cell.value = <size_t>lexemes
|
||||
|
||||
cdef int _split_one(self, Py_UNICODE* characters, size_t length):
|
||||
return length
|
||||
|
@ -169,7 +173,7 @@ cdef class Language:
|
|||
lexemes[i + 1] = NULL
|
||||
string_from_unicode(&string, uni_string)
|
||||
self.specials[string.key] = <size_t>lexemes
|
||||
self.cache[string.key] = <size_t>lexemes
|
||||
self.cache.insert(string.key, <size_t>lexemes)
|
||||
|
||||
|
||||
cdef class Lexicon:
|
||||
|
|
Loading…
Reference in New Issue
Block a user