* Replaced the dense_hash_map cache with our own hash table (PointerHash). Timing is similar.

This commit is contained in:
Matthew Honnibal 2014-09-13 03:14:43 +02:00
parent c8db76e3e1
commit 85d68e8e95
5 changed files with 40 additions and 18 deletions

View File

@ -1,5 +1,7 @@
ctypedef key_t size_t from libc.stdint cimport uint64_t
ctypedef val_t size_t
ctypedef uint64_t key_t
ctypedef size_t val_t
cdef struct Cell: cdef struct Cell:
@ -14,5 +16,5 @@ cdef class PointerHash:
cdef size_t find_slot(self, key_t key) cdef size_t find_slot(self, key_t key)
cdef Cell* lookup(self, key_t key) cdef Cell* lookup(self, key_t key)
cdef void insert(self, key_t key) cdef void insert(self, key_t key, val_t value)
cdef void resize(self, size_t new_size) cdef void resize(self, size_t new_size)

View File

@ -1,3 +1,8 @@
# cython: profile=True
from libc.stdlib cimport calloc, free
cimport cython
cdef class PointerHash: cdef class PointerHash:
def __cinit__(self, size_t initial_size=8): def __cinit__(self, size_t initial_size=8):
self.size = initial_size self.size = initial_size
@ -10,20 +15,26 @@ cdef class PointerHash:
free(self.cells) free(self.cells)
def __getitem__(self, key_t key): def __getitem__(self, key_t key):
assert key != 0
cdef Cell* cell = self.lookup(key) cdef Cell* cell = self.lookup(key)
return cell.value if cell.key != 0 else None return cell.value if cell.key != 0 else None
def __setitem__(self, key_t key, val_t value): def __setitem__(self, key_t key, val_t value):
self.insert(key, value assert key != 0
self.insert(key, value)
@cython.cdivision
cdef size_t find_slot(self, key_t key): cdef size_t find_slot(self, key_t key):
cdef size_t i = key % self.size cdef size_t i = (key % self.size)
while self.cells[i].key != 0 and self.cells[i].key != key: while self.cells[i].key != 0 and self.cells[i].key != key:
i = (i + 1) % self.size i = (i + 1) % self.size
return i return i
@cython.cdivision
cdef Cell* lookup(self, key_t key): cdef Cell* lookup(self, key_t key):
cdef size_t i = self.find_slot(key) cdef size_t i = (key % self.size)
while self.cells[i].key != 0 and self.cells[i].key != key:
i = (i + 1) % self.size
return &self.cells[i] return &self.cells[i]
cdef void insert(self, key_t key, val_t value): cdef void insert(self, key_t key, val_t value):
@ -36,7 +47,7 @@ cdef class PointerHash:
self.resize(self.size * 2) self.resize(self.size * 2)
cdef void resize(self, size_t new_size): cdef void resize(self, size_t new_size):
assert new_size & (new_size - 1)) == 0 # Must be a power of 2 assert (new_size & (new_size - 1)) == 0 # Must be a power of 2
assert self.filled * 4 <= new_size * 3 assert self.filled * 4 <= new_size * 3
self.size = new_size self.size = new_size
@ -47,5 +58,8 @@ cdef class PointerHash:
self.size = new_size self.size = new_size
self.cells = <Cell*>calloc(new_size, sizeof(Cell)) self.cells = <Cell*>calloc(new_size, sizeof(Cell))
self.filled = 0
cdef size_t i
for i in range(old_size): for i in range(old_size):
if self.cells[i].key != 0:
self.insert(self.cells[i].key, self.cells[i].value) self.insert(self.cells[i].key, self.cells[i].value)

View File

@ -43,6 +43,7 @@ from libc.stdint cimport uint64_t
cimport lang cimport lang
from spacy.lexeme cimport lexeme_check_flag from spacy.lexeme cimport lexeme_check_flag
from spacy.lexeme cimport lexeme_string_view from spacy.lexeme cimport lexeme_string_view
from spacy._hashing cimport PointerHash
from spacy import util from spacy import util
@ -236,7 +237,7 @@ cdef class English(Language):
fl_is_digit = Flag_IsDigit fl_is_digit = Flag_IsDigit
v_shape = View_WordShape v_shape = View_WordShape
def __cinit__(self, name, user_string_features, user_flag_features): def __cinit__(self, name, user_string_features, user_flag_features):
self.cache.set_empty_key(0) self.cache = PointerHash(2 ** 25)
self.specials.set_empty_key(0) self.specials.set_empty_key(0)
lang_data = util.read_lang_data(name) lang_data = util.read_lang_data(name)
rules, words, probs, clusters, case_stats, tag_stats = lang_data rules, words, probs, clusters, case_stats, tag_stats = lang_data

View File

@ -3,6 +3,7 @@ from libc.stdint cimport uint64_t
from spacy.word cimport Lexeme from spacy.word cimport Lexeme
from spacy.tokens cimport Tokens from spacy.tokens cimport Tokens
from spacy.lexeme cimport LexemeC from spacy.lexeme cimport LexemeC
from spacy._hashing cimport PointerHash
from libcpp.utility cimport pair from libcpp.utility cimport pair
from libcpp.vector cimport vector from libcpp.vector cimport vector
@ -77,7 +78,7 @@ cdef class Lexicon:
cdef class Language: cdef class Language:
cdef unicode name cdef unicode name
cdef dense_hash_map[uint64_t, size_t] cache cdef PointerHash cache
cdef dense_hash_map[uint64_t, size_t] specials cdef dense_hash_map[uint64_t, size_t] specials
cpdef readonly Lexicon lexicon cpdef readonly Lexicon lexicon
cpdef readonly object tokens_class cpdef readonly object tokens_class

View File

@ -19,6 +19,8 @@ from spacy.tokens import Tokens
from spacy.lexeme cimport LexemeC, lexeme_init from spacy.lexeme cimport LexemeC, lexeme_init
from murmurhash.mrmr cimport hash64 from murmurhash.mrmr cimport hash64
from spacy._hashing cimport PointerHash
from spacy._hashing cimport Cell
cdef class Language: cdef class Language:
"""Base class for language-specific tokenizers. """Base class for language-specific tokenizers.
@ -40,7 +42,7 @@ cdef class Language:
if string_features is None: if string_features is None:
string_features = [] string_features = []
self.name = name self.name = name
self.cache.set_empty_key(0) self.cache = PointerHash(2 ** 22)
self.specials.set_empty_key(0) self.specials.set_empty_key(0)
lang_data = read_lang_data(name) lang_data = read_lang_data(name)
rules, words, probs, clusters, case_stats, tag_stats = lang_data rules, words, probs, clusters, case_stats, tag_stats = lang_data
@ -110,17 +112,19 @@ cdef class Language:
return tokens return tokens
cdef int _tokenize(self, Tokens tokens, String* string): cdef int _tokenize(self, Tokens tokens, String* string):
cdef LexemeC** lexemes = <LexemeC**>self.cache[string.key] cdef Cell* cell = self.cache.lookup(string.key)
lexemes = <LexemeC**>self.cache[string.key] cdef LexemeC** lexemes
cdef size_t i cdef size_t i
if lexemes != NULL: if cell.key != 0:
lexemes = <LexemeC**>cell.value
i = 0 i = 0
while lexemes[i] != NULL: while lexemes[i] != NULL:
tokens.push_back(lexemes[i]) tokens.push_back(lexemes[i])
i += 1 i += 1
return 0 return 0
cdef uint64_t hashed = string.key
cell.key = string.key
self.cache.filled += 1
cdef size_t first_token = tokens.length cdef size_t first_token = tokens.length
cdef int split cdef int split
cdef int remaining = string.n cdef int remaining = string.n
@ -141,7 +145,7 @@ cdef class Language:
cdef size_t j cdef size_t j
for i, j in enumerate(range(first_token, tokens.length)): for i, j in enumerate(range(first_token, tokens.length)):
lexemes[i] = tokens.lexemes[j] lexemes[i] = tokens.lexemes[j]
self.cache[hashed] = <size_t>lexemes cell.value = <size_t>lexemes
cdef int _split_one(self, Py_UNICODE* characters, size_t length): cdef int _split_one(self, Py_UNICODE* characters, size_t length):
return length return length
@ -169,7 +173,7 @@ cdef class Language:
lexemes[i + 1] = NULL lexemes[i + 1] = NULL
string_from_unicode(&string, uni_string) string_from_unicode(&string, uni_string)
self.specials[string.key] = <size_t>lexemes self.specials[string.key] = <size_t>lexemes
self.cache[string.key] = <size_t>lexemes self.cache.insert(string.key, <size_t>lexemes)
cdef class Lexicon: cdef class Lexicon: