* PointerHash working, efficiency is good. 6-7 mins

This commit is contained in:
Matthew Honnibal 2014-09-13 16:43:42 +02:00
parent 85d68e8e95
commit 0447279c57
7 changed files with 75 additions and 95 deletions

View File

@ -46,6 +46,7 @@ else:
exts = [ exts = [
Extension("spacy.lang", ["spacy/lang.pyx"], language="c++", include_dirs=includes), Extension("spacy.lang", ["spacy/lang.pyx"], language="c++", include_dirs=includes),
Extension("spacy._hashing", ["spacy/_hashing.pyx"], language="c++", include_dirs=includes),
Extension("spacy.word", ["spacy/word.pyx"], language="c++", Extension("spacy.word", ["spacy/word.pyx"], language="c++",
include_dirs=includes), include_dirs=includes),
Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++",

View File

@ -12,9 +12,9 @@ cdef struct Cell:
cdef class PointerHash: cdef class PointerHash:
cdef size_t size cdef size_t size
cdef size_t filled cdef size_t filled
cdef Cell* _last
cdef Cell* cells cdef Cell* cells
cdef size_t find_slot(self, key_t key) cdef val_t lookup(self, key_t key)
cdef Cell* lookup(self, key_t key) cdef void insert(self, key_t key, val_t value) except *
cdef void insert(self, key_t key, val_t value) cdef void resize(self, size_t new_size) except *
cdef void resize(self, size_t new_size)

View File

@ -6,7 +6,9 @@ cimport cython
cdef class PointerHash: cdef class PointerHash:
def __cinit__(self, size_t initial_size=8): def __cinit__(self, size_t initial_size=8):
self.size = initial_size self.size = initial_size
self.size = 8
self.filled = 0 self.filled = 0
self._last = NULL
# Size must be power of two # Size must be power of two
assert self.size & (self.size - 1) == 0 assert self.size & (self.size - 1) == 0
self.cells = <Cell*>calloc(self.size, sizeof(Cell)) self.cells = <Cell*>calloc(self.size, sizeof(Cell))
@ -16,42 +18,37 @@ cdef class PointerHash:
def __getitem__(self, key_t key): def __getitem__(self, key_t key):
assert key != 0 assert key != 0
cdef Cell* cell = self.lookup(key) cdef val_t value = self.lookup(key)
return cell.value if cell.key != 0 else None return value if value != 0 else None
def __setitem__(self, key_t key, val_t value): def __setitem__(self, key_t key, val_t value):
assert key != 0 assert key != 0
assert value != 0
self.insert(key, value) self.insert(key, value)
@cython.cdivision cdef val_t lookup(self, key_t key):
cdef size_t find_slot(self, key_t key): cell = _find_cell(self.cells, self.size, key)
cdef size_t i = (key % self.size) self._last = cell
while self.cells[i].key != 0 and self.cells[i].key != key: return cell.value
i = (i + 1) % self.size
return i
@cython.cdivision cdef void insert(self, key_t key, val_t value) except *:
cdef Cell* lookup(self, key_t key): cdef Cell* cell
cdef size_t i = (key % self.size) if self._last != NULL and key == self._last.key:
while self.cells[i].key != 0 and self.cells[i].key != key: cell = self._last
i = (i + 1) % self.size else:
return &self.cells[i] cell = _find_cell(self.cells, self.size, key)
self._last = NULL
cdef void insert(self, key_t key, val_t value): if cell.key == 0:
cdef size_t i = self.find_slot(key) cell.key = key
if self.cells[i].key == 0:
self.cells[i].key = key
self.filled += 1 self.filled += 1
self.cells[i].value = value cell.value = value
if (self.filled + 1) * 4 >= (self.size * 3): if (self.filled + 1) * 4 >= (self.size * 3):
self.resize(self.size * 2) self.resize(self.size * 2)
cdef void resize(self, size_t new_size): cdef void resize(self, size_t new_size) except *:
assert (new_size & (new_size - 1)) == 0 # Must be a power of 2 assert (new_size & (new_size - 1)) == 0 # Must be a power of 2
assert self.filled * 4 <= new_size * 3 assert self.filled * 4 <= new_size * 3
self.size = new_size
cdef Cell* old_cells = self.cells cdef Cell* old_cells = self.cells
cdef size_t old_size = self.size cdef size_t old_size = self.size
@ -60,6 +57,17 @@ cdef class PointerHash:
self.filled = 0 self.filled = 0
cdef size_t i cdef size_t i
cdef size_t slot
for i in range(old_size): for i in range(old_size):
if self.cells[i].key != 0: if old_cells[i].key != 0:
self.insert(self.cells[i].key, self.cells[i].value) assert old_cells[i].value != 0, i
self.insert(old_cells[i].key, old_cells[i].value)
free(old_cells)
@cython.cdivision
cdef inline Cell* _find_cell(Cell* cells, size_t size, key_t key) nogil:
cdef size_t i = (key % size)
while cells[i].key != 0 and cells[i].key != key:
i = (i + 1) % size
return &cells[i]

View File

@ -238,7 +238,7 @@ cdef class English(Language):
v_shape = View_WordShape v_shape = View_WordShape
def __cinit__(self, name, user_string_features, user_flag_features): def __cinit__(self, name, user_string_features, user_flag_features):
self.cache = PointerHash(2 ** 25) self.cache = PointerHash(2 ** 25)
self.specials.set_empty_key(0) self.specials = PointerHash(2 ** 16)
lang_data = util.read_lang_data(name) lang_data = util.read_lang_data(name)
rules, words, probs, clusters, case_stats, tag_stats = lang_data rules, words, probs, clusters, case_stats, tag_stats = lang_data
self.lexicon = lang.Lexicon(words, probs, clusters, case_stats, tag_stats, self.lexicon = lang.Lexicon(words, probs, clusters, case_stats, tag_stats,

View File

@ -15,49 +15,6 @@ cdef extern from "Python.h":
cdef bint Py_UNICODE_ISALNUM(Py_UNICODE ch) cdef bint Py_UNICODE_ISALNUM(Py_UNICODE ch)
cdef extern from "sparsehash/dense_hash_map" namespace "google":
cdef cppclass dense_hash_map[K, D]:
K& key_type
D& data_type
pair[K, D]& value_type
uint64_t size_type
cppclass iterator:
pair[K, D]& operator*() nogil
iterator operator++() nogil
iterator operator--() nogil
bint operator==(iterator) nogil
bint operator!=(iterator) nogil
iterator begin()
iterator end()
uint64_t size()
uint64_t max_size()
bint empty()
uint64_t bucket_count()
uint64_t bucket_size(uint64_t i)
uint64_t bucket(K& key)
double max_load_factor()
void max_load_vactor(double new_grow)
double min_load_factor()
double min_load_factor(double new_grow)
void set_resizing_parameters(double shrink, double grow)
void resize(uint64_t n)
void rehash(uint64_t n)
dense_hash_map()
dense_hash_map(uint64_t n)
void swap(dense_hash_map&)
pair[iterator, bint] insert(pair[K, D]) nogil
void set_empty_key(K&)
void set_deleted_key(K& key)
void clear_deleted_key()
void erase(iterator pos)
uint64_t erase(K& k)
void erase(iterator first, iterator last)
void clear()
void clear_no_resize()
pair[iterator, iterator] equal_range(K& k)
D& operator[](K&) nogil
cdef struct String: cdef struct String:
Py_UNICODE* chars Py_UNICODE* chars
size_t n size_t n
@ -70,7 +27,7 @@ cdef class Lexicon:
cpdef Lexeme lookup(self, unicode string) cpdef Lexeme lookup(self, unicode string)
cdef size_t get(self, String* s) cdef size_t get(self, String* s)
cdef dense_hash_map[uint64_t, size_t] _dict cdef PointerHash _dict
cdef list _string_features cdef list _string_features
cdef list _flag_features cdef list _flag_features
@ -79,7 +36,7 @@ cdef class Lexicon:
cdef class Language: cdef class Language:
cdef unicode name cdef unicode name
cdef PointerHash cache cdef PointerHash cache
cdef dense_hash_map[uint64_t, size_t] specials cdef PointerHash specials
cpdef readonly Lexicon lexicon cpdef readonly Lexicon lexicon
cpdef readonly object tokens_class cpdef readonly object tokens_class

View File

@ -43,7 +43,7 @@ cdef class Language:
string_features = [] string_features = []
self.name = name self.name = name
self.cache = PointerHash(2 ** 22) self.cache = PointerHash(2 ** 22)
self.specials.set_empty_key(0) self.specials = PointerHash(2 ** 16)
lang_data = read_lang_data(name) lang_data = read_lang_data(name)
rules, words, probs, clusters, case_stats, tag_stats = lang_data rules, words, probs, clusters, case_stats, tag_stats = lang_data
self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats, self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
@ -52,10 +52,7 @@ cdef class Language:
self.tokens_class = Tokens self.tokens_class = Tokens
def __dealloc__(self): def __dealloc__(self):
cdef uint64_t hashed pass
cdef size_t lex_addr
for (hashed, lex_addr) in self.specials:
free(<LexemeC*>lex_addr)
property nr_types: property nr_types:
def __get__(self): def __get__(self):
@ -112,28 +109,25 @@ cdef class Language:
return tokens return tokens
cdef int _tokenize(self, Tokens tokens, String* string): cdef int _tokenize(self, Tokens tokens, String* string):
cdef Cell* cell = self.cache.lookup(string.key) cdef LexemeC** lexemes = <LexemeC**>self.cache.lookup(string.key)
cdef LexemeC** lexemes
cdef size_t i cdef size_t i
if cell.key != 0: if lexemes != NULL:
lexemes = <LexemeC**>cell.value
i = 0 i = 0
while lexemes[i] != NULL: while lexemes[i] != NULL:
tokens.push_back(lexemes[i]) tokens.push_back(lexemes[i])
i += 1 i += 1
return 0 return 0
cdef uint64_t key = string.key
cell.key = string.key
self.cache.filled += 1
cdef size_t first_token = tokens.length cdef size_t first_token = tokens.length
cdef int split cdef int split
cdef int remaining = string.n cdef int remaining = string.n
cdef String prefix cdef String prefix
cdef Cell* tmp_cell
while remaining >= 1: while remaining >= 1:
split = self._split_one(string.chars, string.n) split = self._split_one(string.chars, string.n)
remaining -= split remaining -= split
string_slice_prefix(string, &prefix, split) string_slice_prefix(string, &prefix, split)
lexemes = <LexemeC**>self.specials[prefix.key] lexemes = <LexemeC**>self.specials.lookup(prefix.key)
if lexemes != NULL: if lexemes != NULL:
i = 0 i = 0
while lexemes[i] != NULL: while lexemes[i] != NULL:
@ -145,7 +139,7 @@ cdef class Language:
cdef size_t j cdef size_t j
for i, j in enumerate(range(first_token, tokens.length)): for i, j in enumerate(range(first_token, tokens.length)):
lexemes[i] = tokens.lexemes[j] lexemes[i] = tokens.lexemes[j]
cell.value = <size_t>lexemes self.cache.insert(key, <size_t>lexemes)
cdef int _split_one(self, Py_UNICODE* characters, size_t length): cdef int _split_one(self, Py_UNICODE* characters, size_t length):
return length return length
@ -181,7 +175,7 @@ cdef class Lexicon:
string_features, flag_features): string_features, flag_features):
self._flag_features = flag_features self._flag_features = flag_features
self._string_features = string_features self._string_features = string_features
self._dict.set_empty_key(0) self._dict = PointerHash(2 ** 20)
self.size = 0 self.size = 0
cdef Lexeme word cdef Lexeme word
for string in words: for string in words:
@ -200,9 +194,9 @@ cdef class Lexicon:
self.size += 1 self.size += 1
cdef size_t get(self, String* string): cdef size_t get(self, String* string):
cdef LexemeC* lexeme = <LexemeC*>self._dict[string.key] cdef size_t lex_addr = self._dict.lookup(string.key)
if lexeme != NULL: if lex_addr != 0:
return <size_t>lexeme return lex_addr
cdef unicode uni_string = string.chars[:string.n] cdef unicode uni_string = string.chars[:string.n]
views = [string_view(uni_string, 0.0, 0, {}, {}) views = [string_view(uni_string, 0.0, 0, {}, {})
@ -212,8 +206,8 @@ cdef class Lexicon:
if flag_feature(uni_string, 0.0, {}, {}): if flag_feature(uni_string, 0.0, {}, {}):
flags.add(i) flags.add(i)
lexeme = lexeme_init(uni_string, 0, 0, views, flags) cdef LexemeC* lexeme = lexeme_init(uni_string, 0, 0, views, flags)
self._dict[string.key] = <size_t>lexeme self._dict.insert(string.key, <size_t>lexeme)
self.size += 1 self.size += 1
return <size_t>lexeme return <size_t>lexeme

20
tests/test_hashing.py Normal file
View File

@ -0,0 +1,20 @@
import pytest
from spacy._hashing import PointerHash
import random
def test_insert():
    """Check basic item assignment and retrieval on a fresh PointerHash."""
    table = PointerHash()
    # A key that has never been stored reads back as None.
    assert table[1] is None
    table[1] = 5
    assert table[1] == 5
    # Storing a second key must leave the first entry intact.
    table[2] = 6
    assert table[1] == 5
    assert table[2] == 6
def test_resize():
    """Check that entries survive the table growing past its initial size.

    Inserts many more keys than the initial capacity so the table has to
    resize, then verifies every key still maps to the value stored for it.
    """
    # A locally seeded generator keeps the inserted values reproducible.
    rng = random.Random(0)
    h = PointerHash(4)
    expected = {}
    for i in range(1, 100):
        # i >= 1 and the multiplier lies in [1, 2), so the value is never 0,
        # which PointerHash does not accept as a stored value.
        value = int(i * (rng.random() + 1))
        h[i] = value
        expected[i] = value
    # Without these checks the test only proved that inserting does not raise.
    for key, value in expected.items():
        assert h[key] == value