* Switch the PointerHash interface to use void* values instead of size_t, to avoid casts; rename lookup/insert to get/set.

This commit is contained in:
Matthew Honnibal 2014-09-13 17:02:06 +02:00
parent 0447279c57
commit 45865be37e
2 changed files with 29 additions and 29 deletions

View File

@ -18,20 +18,19 @@ cdef class PointerHash:
def __getitem__(self, key_t key): def __getitem__(self, key_t key):
assert key != 0 assert key != 0
cdef val_t value = self.lookup(key) cdef val_t value = self.get(key)
return value if value != 0 else None return <size_t>value if value != NULL else None
def __setitem__(self, key_t key, val_t value): def __setitem__(self, key_t key, size_t value):
assert key != 0 assert key != 0 and value != 0
assert value != 0 self.set(key, <val_t>value)
self.insert(key, value)
cdef val_t lookup(self, key_t key): cdef val_t get(self, key_t key):
cell = _find_cell(self.cells, self.size, key) cell = _find_cell(self.cells, self.size, key)
self._last = cell self._last = cell
return cell.value return cell.value
cdef void insert(self, key_t key, val_t value) except *: cdef void set(self, key_t key, val_t value) except *:
cdef Cell* cell cdef Cell* cell
if self._last != NULL and key == self._last.key: if self._last != NULL and key == self._last.key:
cell = self._last cell = self._last
@ -60,8 +59,8 @@ cdef class PointerHash:
cdef size_t slot cdef size_t slot
for i in range(old_size): for i in range(old_size):
if old_cells[i].key != 0: if old_cells[i].key != 0:
assert old_cells[i].value != 0, i assert old_cells[i].value != NULL, i
self.insert(old_cells[i].key, old_cells[i].value) self.set(old_cells[i].key, old_cells[i].value)
free(old_cells) free(old_cells)

View File

@ -109,7 +109,7 @@ cdef class Language:
return tokens return tokens
cdef int _tokenize(self, Tokens tokens, String* string): cdef int _tokenize(self, Tokens tokens, String* string):
cdef LexemeC** lexemes = <LexemeC**>self.cache.lookup(string.key) cdef LexemeC** lexemes = <LexemeC**>self.cache.get(string.key)
cdef size_t i cdef size_t i
if lexemes != NULL: if lexemes != NULL:
i = 0 i = 0
@ -127,7 +127,7 @@ cdef class Language:
split = self._split_one(string.chars, string.n) split = self._split_one(string.chars, string.n)
remaining -= split remaining -= split
string_slice_prefix(string, &prefix, split) string_slice_prefix(string, &prefix, split)
lexemes = <LexemeC**>self.specials.lookup(prefix.key) lexemes = <LexemeC**>self.specials.get(prefix.key)
if lexemes != NULL: if lexemes != NULL:
i = 0 i = 0
while lexemes[i] != NULL: while lexemes[i] != NULL:
@ -139,7 +139,7 @@ cdef class Language:
cdef size_t j cdef size_t j
for i, j in enumerate(range(first_token, tokens.length)): for i, j in enumerate(range(first_token, tokens.length)):
lexemes[i] = tokens.lexemes[j] lexemes[i] = tokens.lexemes[j]
self.cache.insert(key, <size_t>lexemes) self.cache.set(key, lexemes)
cdef int _split_one(self, Py_UNICODE* characters, size_t length): cdef int _split_one(self, Py_UNICODE* characters, size_t length):
return length return length
@ -166,8 +166,8 @@ cdef class Language:
lexemes[i] = <LexemeC*>self.lexicon.get(&string) lexemes[i] = <LexemeC*>self.lexicon.get(&string)
lexemes[i + 1] = NULL lexemes[i + 1] = NULL
string_from_unicode(&string, uni_string) string_from_unicode(&string, uni_string)
self.specials[string.key] = <size_t>lexemes self.specials.set(string.key, lexemes)
self.cache.insert(string.key, <size_t>lexemes) self.cache.set(string.key, lexemes)
cdef class Lexicon: cdef class Lexicon:
@ -177,26 +177,27 @@ cdef class Lexicon:
self._string_features = string_features self._string_features = string_features
self._dict = PointerHash(2 ** 20) self._dict = PointerHash(2 ** 20)
self.size = 0 self.size = 0
cdef Lexeme word cdef String string
for string in words: for uni_string in words:
prob = probs.get(string, 0.0) prob = probs.get(uni_string, 0.0)
cluster = clusters.get(string, 0.0) cluster = clusters.get(uni_string, 0.0)
cases = case_stats.get(string, {}) cases = case_stats.get(uni_string, {})
tags = tag_stats.get(string, {}) tags = tag_stats.get(uni_string, {})
views = [string_view(string, prob, cluster, cases, tags) views = [string_view(uni_string, prob, cluster, cases, tags)
for string_view in self._string_features] for string_view in self._string_features]
flags = set() flags = set()
for i, flag_feature in enumerate(self._flag_features): for i, flag_feature in enumerate(self._flag_features):
if flag_feature(string, prob, cluster, cases, tags): if flag_feature(uni_string, prob, cluster, cases, tags):
flags.add(i) flags.add(i)
lexeme = lexeme_init(string, prob, cluster, views, flags) lexeme = lexeme_init(uni_string, prob, cluster, views, flags)
self._dict[string] = <size_t>lexeme string_from_unicode(&string, uni_string)
self._dict.set(string.key, lexeme)
self.size += 1 self.size += 1
cdef size_t get(self, String* string): cdef size_t get(self, String* string):
cdef size_t lex_addr = self._dict.lookup(string.key) cdef LexemeC* lex_addr = <LexemeC*>self._dict.get(string.key)
if lex_addr != 0: if lex_addr != NULL:
return lex_addr return <size_t>lex_addr
cdef unicode uni_string = string.chars[:string.n] cdef unicode uni_string = string.chars[:string.n]
views = [string_view(uni_string, 0.0, 0, {}, {}) views = [string_view(uni_string, 0.0, 0, {}, {})
@ -207,7 +208,7 @@ cdef class Lexicon:
flags.add(i) flags.add(i)
cdef LexemeC* lexeme = lexeme_init(uni_string, 0, 0, views, flags) cdef LexemeC* lexeme = lexeme_init(uni_string, 0, 0, views, flags)
self._dict.insert(string.key, <size_t>lexeme) self._dict.set(string.key, lexeme)
self.size += 1 self.size += 1
return <size_t>lexeme return <size_t>lexeme