diff --git a/spacy/_hashing.pxd b/spacy/_hashing.pxd index a7448a639..2be9d109d 100644 --- a/spacy/_hashing.pxd +++ b/spacy/_hashing.pxd @@ -1,5 +1,7 @@ -ctypedef key_t size_t -ctypedef val_t size_t +from libc.stdint cimport uint64_t + +ctypedef uint64_t key_t +ctypedef size_t val_t cdef struct Cell: @@ -14,5 +16,5 @@ cdef class PointerHash: cdef size_t find_slot(self, key_t key) cdef Cell* lookup(self, key_t key) - cdef void insert(self, key_t key) + cdef void insert(self, key_t key, val_t value) cdef void resize(self, size_t new_size) diff --git a/spacy/_hashing.pyx b/spacy/_hashing.pyx index 4c0637478..2645d2bcf 100644 --- a/spacy/_hashing.pyx +++ b/spacy/_hashing.pyx @@ -1,3 +1,8 @@ +# cython: profile=True +from libc.stdlib cimport calloc, free +cimport cython + + cdef class PointerHash: def __cinit__(self, size_t initial_size=8): self.size = initial_size @@ -10,20 +15,26 @@ cdef class PointerHash: free(self.cells) def __getitem__(self, key_t key): + assert key != 0 cdef Cell* cell = self.lookup(key) return cell.value if cell.key != 0 else None def __setitem__(self, key_t key, val_t value): - self.insert(key, value + assert key != 0 + self.insert(key, value) + @cython.cdivision cdef size_t find_slot(self, key_t key): - cdef size_t i = key % self.size + cdef size_t i = (key % self.size) while self.cells[i].key != 0 and self.cells[i].key != key: i = (i + 1) % self.size return i + @cython.cdivision cdef Cell* lookup(self, key_t key): - cdef size_t i = self.find_slot(key) + cdef size_t i = (key % self.size) + while self.cells[i].key != 0 and self.cells[i].key != key: + i = (i + 1) % self.size return &self.cells[i] cdef void insert(self, key_t key, val_t value): @@ -36,7 +47,7 @@ cdef class PointerHash: self.resize(self.size * 2) cdef void resize(self, size_t new_size): - assert new_size & (new_size - 1)) == 0 # Must be a power of 2 + assert (new_size & (new_size - 1)) == 0 # Must be a power of 2 assert self.filled * 4 <= new_size * 3 self.size = new_size @@ -46,6 +57,9 @@ cdef class PointerHash: self.size = new_size self.cells = calloc(new_size, sizeof(Cell)) - + + self.filled = 0 + cdef size_t i for i in range(old_size): - self.insert(self.cells[i].key, self.cells[i].value) + if self.cells[i].key != 0: + self.insert(self.cells[i].key, self.cells[i].value) diff --git a/spacy/en.pyx b/spacy/en.pyx index eb2486711..a3ce4da59 100644 --- a/spacy/en.pyx +++ b/spacy/en.pyx @@ -43,6 +43,7 @@ from libc.stdint cimport uint64_t cimport lang from spacy.lexeme cimport lexeme_check_flag from spacy.lexeme cimport lexeme_string_view +from spacy._hashing cimport PointerHash from spacy import util @@ -236,7 +237,7 @@ cdef class English(Language): fl_is_digit = Flag_IsDigit v_shape = View_WordShape def __cinit__(self, name, user_string_features, user_flag_features): - self.cache.set_empty_key(0) + self.cache = PointerHash(2 ** 25) self.specials.set_empty_key(0) lang_data = util.read_lang_data(name) rules, words, probs, clusters, case_stats, tag_stats = lang_data diff --git a/spacy/lang.pxd b/spacy/lang.pxd index 28afd6e28..619993ebc 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -3,6 +3,7 @@ from libc.stdint cimport uint64_t from spacy.word cimport Lexeme from spacy.tokens cimport Tokens from spacy.lexeme cimport LexemeC +from spacy._hashing cimport PointerHash from libcpp.utility cimport pair from libcpp.vector cimport vector @@ -77,7 +78,7 @@ cdef class Lexicon: cdef class Language: cdef unicode name - cdef dense_hash_map[uint64_t, size_t] cache + cdef PointerHash cache cdef dense_hash_map[uint64_t, size_t] specials cpdef readonly Lexicon lexicon cpdef readonly object tokens_class diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 3c4823972..a9ed5be3d 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -19,6 +19,8 @@ from spacy.tokens import Tokens from spacy.lexeme cimport LexemeC, lexeme_init from murmurhash.mrmr cimport hash64 +from spacy._hashing cimport PointerHash +from spacy._hashing cimport Cell cdef class Language: """Base class for language-specific tokenizers. @@ -40,7 +42,7 @@ cdef class Language: if string_features is None: string_features = [] self.name = name - self.cache.set_empty_key(0) + self.cache = PointerHash(2 ** 22) self.specials.set_empty_key(0) lang_data = read_lang_data(name) rules, words, probs, clusters, case_stats, tag_stats = lang_data @@ -110,17 +112,19 @@ cdef class Language: return tokens cdef int _tokenize(self, Tokens tokens, String* string): - cdef LexemeC** lexemes = self.cache[string.key] - lexemes = self.cache[string.key] + cdef Cell* cell = self.cache.lookup(string.key) + cdef LexemeC** lexemes cdef size_t i - if lexemes != NULL: + if cell.key != 0: + lexemes = cell.value i = 0 while lexemes[i] != NULL: tokens.push_back(lexemes[i]) i += 1 return 0 - cdef uint64_t hashed = string.key + cell.key = string.key + self.cache.filled += 1 cdef size_t first_token = tokens.length cdef int split cdef int remaining = string.n @@ -141,7 +145,7 @@ cdef class Language: cdef size_t j for i, j in enumerate(range(first_token, tokens.length)): lexemes[i] = tokens.lexemes[j] - self.cache[hashed] = lexemes + cell.value = lexemes cdef int _split_one(self, Py_UNICODE* characters, size_t length): return length @@ -169,7 +173,7 @@ cdef class Language: lexemes[i + 1] = NULL string_from_unicode(&string, uni_string) self.specials[string.key] = lexemes - self.cache[string.key] = lexemes + self.cache.insert(string.key, lexemes) cdef class Lexicon: