diff --git a/setup.py b/setup.py
index d33ab750e..5c588a70e 100644
--- a/setup.py
+++ b/setup.py
@@ -46,4 +46,5 @@ else:
     exts = [
         Extension("spacy.lang", ["spacy/lang.pyx"], language="c++", include_dirs=includes),
+        Extension("spacy._hashing", ["spacy/_hashing.pyx"], language="c++", include_dirs=includes),
         Extension("spacy.word", ["spacy/word.pyx"], language="c++", include_dirs=includes),
         Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++",
diff --git a/spacy/_hashing.pxd b/spacy/_hashing.pxd
index 2be9d109d..f4c4f5b43 100644
--- a/spacy/_hashing.pxd
+++ b/spacy/_hashing.pxd
@@ -12,9 +12,9 @@ cdef struct Cell:
 cdef class PointerHash:
     cdef size_t size
     cdef size_t filled
+    cdef Cell* _last
     cdef Cell* cells
 
-    cdef size_t find_slot(self, key_t key)
-    cdef Cell* lookup(self, key_t key)
-    cdef void insert(self, key_t key, val_t value)
-    cdef void resize(self, size_t new_size)
+    cdef val_t lookup(self, key_t key)
+    cdef void insert(self, key_t key, val_t value) except *
+    cdef void resize(self, size_t new_size) except *
diff --git a/spacy/_hashing.pyx b/spacy/_hashing.pyx
index 2645d2bcf..2218fb1c5 100644
--- a/spacy/_hashing.pyx
+++ b/spacy/_hashing.pyx
@@ -6,7 +6,8 @@ cimport cython
 cdef class PointerHash:
     def __cinit__(self, size_t initial_size=8):
         self.size = initial_size
         self.filled = 0
+        self._last = NULL
         # Size must be power of two
         assert self.size & (self.size - 1) == 0
         self.cells = <Cell*>calloc(self.size, sizeof(Cell))
@@ -16,41 +17,36 @@
 
     def __getitem__(self, key_t key):
         assert key != 0
-        cdef Cell* cell = self.lookup(key)
-        return cell.value if cell.key != 0 else None
+        cdef val_t value = self.lookup(key)
+        return value if value != 0 else None
 
     def __setitem__(self, key_t key, val_t value):
         assert key != 0
+        assert value != 0
         self.insert(key, value)
 
-    @cython.cdivision
-    cdef size_t find_slot(self, key_t key):
-        cdef size_t i = (key % self.size)
-        while self.cells[i].key != 0 and self.cells[i].key != key:
-            i = (i + 1) % self.size
-        return i
+    cdef val_t lookup(self, key_t key):
+        cdef Cell* cell = _find_cell(self.cells, self.size, key)
+        self._last = cell
+        return cell.value
 
-    @cython.cdivision
-    cdef Cell* lookup(self, key_t key):
-        cdef size_t i = (key % self.size)
-        while self.cells[i].key != 0 and self.cells[i].key != key:
-            i = (i + 1) % self.size
-        return &self.cells[i]
-
-    cdef void insert(self, key_t key, val_t value):
-        cdef size_t i = self.find_slot(key)
-        if self.cells[i].key == 0:
-            self.cells[i].key = key
+    cdef void insert(self, key_t key, val_t value) except *:
+        cdef Cell* cell
+        if self._last != NULL and key == self._last.key:
+            cell = self._last
+        else:
+            cell = _find_cell(self.cells, self.size, key)
+            self._last = NULL
+        if cell.key == 0:
+            cell.key = key
             self.filled += 1
-        self.cells[i].value = value
+        cell.value = value
         if (self.filled + 1) * 4 >= (self.size * 3):
             self.resize(self.size * 2)
 
-    cdef void resize(self, size_t new_size):
+    cdef void resize(self, size_t new_size) except *:
         assert (new_size & (new_size - 1)) == 0 # Must be a power of 2
         assert self.filled * 4 <= new_size * 3
 
-        self.size = new_size
-
         cdef Cell* old_cells = self.cells
         cdef size_t old_size = self.size
@@ -60,6 +56,16 @@
         self.filled = 0
 
         cdef size_t i
         for i in range(old_size):
-            if self.cells[i].key != 0:
-                self.insert(self.cells[i].key, self.cells[i].value)
+            if old_cells[i].key != 0:
+                assert old_cells[i].value != 0, i
+                self.insert(old_cells[i].key, old_cells[i].value)
+        free(old_cells)
+
+
+@cython.cdivision
+cdef inline Cell* _find_cell(Cell* cells, size_t size, key_t key) nogil:
+    cdef size_t i = (key % size)
+    while cells[i].key != 0 and cells[i].key != key:
+        i = (i + 1) % size
+    return &cells[i]
diff --git a/spacy/en.pyx b/spacy/en.pyx
index a3ce4da59..6f801d96e 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -238,7 +238,7 @@ cdef class English(Language):
     v_shape = View_WordShape
     def __cinit__(self, name, user_string_features, user_flag_features):
         self.cache = PointerHash(2 ** 25)
-        self.specials.set_empty_key(0)
+        self.specials = PointerHash(2 ** 16)
         lang_data = util.read_lang_data(name)
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = lang.Lexicon(words, probs, clusters, case_stats, tag_stats,
diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index 619993ebc..1f61d0e95 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -15,49 +15,6 @@ cdef extern from "Python.h":
     cdef bint Py_UNICODE_ISALNUM(Py_UNICODE ch)
 
 
-cdef extern from "sparsehash/dense_hash_map" namespace "google":
-    cdef cppclass dense_hash_map[K, D]:
-        K& key_type
-        D& data_type
-        pair[K, D]& value_type
-        uint64_t size_type
-        cppclass iterator:
-            pair[K, D]& operator*() nogil
-            iterator operator++() nogil
-            iterator operator--() nogil
-            bint operator==(iterator) nogil
-            bint operator!=(iterator) nogil
-        iterator begin()
-        iterator end()
-        uint64_t size()
-        uint64_t max_size()
-        bint empty()
-        uint64_t bucket_count()
-        uint64_t bucket_size(uint64_t i)
-        uint64_t bucket(K& key)
-        double max_load_factor()
-        void max_load_vactor(double new_grow)
-        double min_load_factor()
-        double min_load_factor(double new_grow)
-        void set_resizing_parameters(double shrink, double grow)
-        void resize(uint64_t n)
-        void rehash(uint64_t n)
-        dense_hash_map()
-        dense_hash_map(uint64_t n)
-        void swap(dense_hash_map&)
-        pair[iterator, bint] insert(pair[K, D]) nogil
-        void set_empty_key(K&)
-        void set_deleted_key(K& key)
-        void clear_deleted_key()
-        void erase(iterator pos)
-        uint64_t erase(K& k)
-        void erase(iterator first, iterator last)
-        void clear()
-        void clear_no_resize()
-        pair[iterator, iterator] equal_range(K& k)
-        D& operator[](K&) nogil
-
-
 cdef struct String:
     Py_UNICODE* chars
     size_t n
@@ -70,7 +27,7 @@ cdef class Lexicon:
 
     cpdef Lexeme lookup(self, unicode string)
     cdef size_t get(self, String* s)
 
-    cdef dense_hash_map[uint64_t, size_t] _dict
+    cdef PointerHash _dict
 
     cdef list _string_features
     cdef list _flag_features
@@ -79,7 +36,7 @@
 cdef class Language:
     cdef unicode name
     cdef PointerHash cache
-    cdef dense_hash_map[uint64_t, size_t] specials
+    cdef PointerHash specials
 
     cpdef readonly Lexicon lexicon
     cpdef readonly object tokens_class
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index a9ed5be3d..172a99de2 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -43,7 +43,7 @@
             string_features = []
        self.name = name
         self.cache = PointerHash(2 ** 22)
-        self.specials.set_empty_key(0)
+        self.specials = PointerHash(2 ** 16)
         lang_data = read_lang_data(name)
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
@@ -52,10 +52,7 @@
         self.tokens_class = Tokens
 
     def __dealloc__(self):
-        cdef uint64_t hashed
-        cdef size_t lex_addr
-        for (hashed, lex_addr) in self.specials:
-            free(<void*>lex_addr)
+        pass
 
     property nr_types:
         def __get__(self):
@@ -112,28 +109,24 @@
         return tokens
 
     cdef int _tokenize(self, Tokens tokens, String* string):
-        cdef Cell* cell = self.cache.lookup(string.key)
-        cdef LexemeC** lexemes
+        cdef LexemeC** lexemes = <LexemeC**>self.cache.lookup(string.key)
         cdef size_t i
-        if cell.key != 0:
-            lexemes = <LexemeC**>cell.value
+        if lexemes != NULL:
             i = 0
             while lexemes[i] != NULL:
                 tokens.push_back(lexemes[i])
                 i += 1
             return 0
-
-        cell.key = string.key
-        self.cache.filled += 1
+        cdef uint64_t key = string.key
         cdef size_t first_token = tokens.length
         cdef int split
         cdef int remaining = string.n
         cdef String prefix
         while remaining >= 1:
             split = self._split_one(string.chars, string.n)
             remaining -= split
             string_slice_prefix(string, &prefix, split)
-            lexemes = <LexemeC**>self.specials[prefix.key]
+            lexemes = <LexemeC**>self.specials.lookup(prefix.key)
             if lexemes != NULL:
                 i = 0
                 while lexemes[i] != NULL:
@@ -145,7 +138,7 @@
         cdef size_t j
         for i, j in enumerate(range(first_token, tokens.length)):
             lexemes[i] = tokens.lexemes[j]
-        cell.value = <size_t>lexemes
+        self.cache.insert(key, <size_t>lexemes)
 
     cdef int _split_one(self, Py_UNICODE* characters, size_t length):
         return length
@@ -181,7 +174,7 @@
                  string_features, flag_features):
         self._flag_features = flag_features
         self._string_features = string_features
-        self._dict.set_empty_key(0)
+        self._dict = PointerHash(2 ** 20)
         self.size = 0
         cdef Lexeme word
         for string in words:
@@ -200,9 +193,9 @@
             self.size += 1
 
     cdef size_t get(self, String* string):
-        cdef LexemeC* lexeme = <LexemeC*>self._dict[string.key]
-        if lexeme != NULL:
-            return <size_t>lexeme
+        cdef size_t lex_addr = self._dict.lookup(string.key)
+        if lex_addr != 0:
+            return lex_addr
 
         cdef unicode uni_string = string.chars[:string.n]
         views = [string_view(uni_string, 0.0, 0, {}, {})
@@ -212,8 +205,8 @@
             if flag_feature(uni_string, 0.0, {}, {}):
                 flags.add(i)
 
-        lexeme = lexeme_init(uni_string, 0, 0, views, flags)
-        self._dict[string.key] = <size_t>lexeme
+        cdef LexemeC* lexeme = lexeme_init(uni_string, 0, 0, views, flags)
+        self._dict.insert(string.key, <size_t>lexeme)
         self.size += 1
         return <size_t>lexeme
 
diff --git a/tests/test_hashing.py b/tests/test_hashing.py
new file mode 100644
index 000000000..408f0d017
--- /dev/null
+++ b/tests/test_hashing.py
@@ -0,0 +1,21 @@
+import pytest
+
+from spacy._hashing import PointerHash
+import random
+
+
+def test_insert():
+    h = PointerHash()
+    assert h[1] is None
+    h[1] = 5
+    assert h[1] == 5
+    h[2] = 6
+    assert h[1] == 5
+    assert h[2] == 6
+
+def test_resize():
+    h = PointerHash(4)
+    for i in range(1, 100):
+        value = int(i * (random.random() + 1))
+        h[i] = value
+        assert h[i] == value