mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 09:56:28 +03:00
* Switch hash interface, using void* instead of size_t, to avoid casts.
This commit is contained in:
parent
0447279c57
commit
45865be37e
|
@ -18,20 +18,19 @@ cdef class PointerHash:
|
||||||
|
|
||||||
def __getitem__(self, key_t key):
|
def __getitem__(self, key_t key):
|
||||||
assert key != 0
|
assert key != 0
|
||||||
cdef val_t value = self.lookup(key)
|
cdef val_t value = self.get(key)
|
||||||
return value if value != 0 else None
|
return <size_t>value if value != NULL else None
|
||||||
|
|
||||||
def __setitem__(self, key_t key, val_t value):
|
def __setitem__(self, key_t key, size_t value):
|
||||||
assert key != 0
|
assert key != 0 and value != 0
|
||||||
assert value != 0
|
self.set(key, <val_t>value)
|
||||||
self.insert(key, value)
|
|
||||||
|
|
||||||
cdef val_t lookup(self, key_t key):
|
cdef val_t get(self, key_t key):
|
||||||
cell = _find_cell(self.cells, self.size, key)
|
cell = _find_cell(self.cells, self.size, key)
|
||||||
self._last = cell
|
self._last = cell
|
||||||
return cell.value
|
return cell.value
|
||||||
|
|
||||||
cdef void insert(self, key_t key, val_t value) except *:
|
cdef void set(self, key_t key, val_t value) except *:
|
||||||
cdef Cell* cell
|
cdef Cell* cell
|
||||||
if self._last != NULL and key == self._last.key:
|
if self._last != NULL and key == self._last.key:
|
||||||
cell = self._last
|
cell = self._last
|
||||||
|
@ -60,8 +59,8 @@ cdef class PointerHash:
|
||||||
cdef size_t slot
|
cdef size_t slot
|
||||||
for i in range(old_size):
|
for i in range(old_size):
|
||||||
if old_cells[i].key != 0:
|
if old_cells[i].key != 0:
|
||||||
assert old_cells[i].value != 0, i
|
assert old_cells[i].value != NULL, i
|
||||||
self.insert(old_cells[i].key, old_cells[i].value)
|
self.set(old_cells[i].key, old_cells[i].value)
|
||||||
free(old_cells)
|
free(old_cells)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -109,7 +109,7 @@ cdef class Language:
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
cdef int _tokenize(self, Tokens tokens, String* string):
|
cdef int _tokenize(self, Tokens tokens, String* string):
|
||||||
cdef LexemeC** lexemes = <LexemeC**>self.cache.lookup(string.key)
|
cdef LexemeC** lexemes = <LexemeC**>self.cache.get(string.key)
|
||||||
cdef size_t i
|
cdef size_t i
|
||||||
if lexemes != NULL:
|
if lexemes != NULL:
|
||||||
i = 0
|
i = 0
|
||||||
|
@ -127,7 +127,7 @@ cdef class Language:
|
||||||
split = self._split_one(string.chars, string.n)
|
split = self._split_one(string.chars, string.n)
|
||||||
remaining -= split
|
remaining -= split
|
||||||
string_slice_prefix(string, &prefix, split)
|
string_slice_prefix(string, &prefix, split)
|
||||||
lexemes = <LexemeC**>self.specials.lookup(prefix.key)
|
lexemes = <LexemeC**>self.specials.get(prefix.key)
|
||||||
if lexemes != NULL:
|
if lexemes != NULL:
|
||||||
i = 0
|
i = 0
|
||||||
while lexemes[i] != NULL:
|
while lexemes[i] != NULL:
|
||||||
|
@ -139,7 +139,7 @@ cdef class Language:
|
||||||
cdef size_t j
|
cdef size_t j
|
||||||
for i, j in enumerate(range(first_token, tokens.length)):
|
for i, j in enumerate(range(first_token, tokens.length)):
|
||||||
lexemes[i] = tokens.lexemes[j]
|
lexemes[i] = tokens.lexemes[j]
|
||||||
self.cache.insert(key, <size_t>lexemes)
|
self.cache.set(key, lexemes)
|
||||||
|
|
||||||
cdef int _split_one(self, Py_UNICODE* characters, size_t length):
|
cdef int _split_one(self, Py_UNICODE* characters, size_t length):
|
||||||
return length
|
return length
|
||||||
|
@ -166,8 +166,8 @@ cdef class Language:
|
||||||
lexemes[i] = <LexemeC*>self.lexicon.get(&string)
|
lexemes[i] = <LexemeC*>self.lexicon.get(&string)
|
||||||
lexemes[i + 1] = NULL
|
lexemes[i + 1] = NULL
|
||||||
string_from_unicode(&string, uni_string)
|
string_from_unicode(&string, uni_string)
|
||||||
self.specials[string.key] = <size_t>lexemes
|
self.specials.set(string.key, lexemes)
|
||||||
self.cache.insert(string.key, <size_t>lexemes)
|
self.cache.set(string.key, lexemes)
|
||||||
|
|
||||||
|
|
||||||
cdef class Lexicon:
|
cdef class Lexicon:
|
||||||
|
@ -177,26 +177,27 @@ cdef class Lexicon:
|
||||||
self._string_features = string_features
|
self._string_features = string_features
|
||||||
self._dict = PointerHash(2 ** 20)
|
self._dict = PointerHash(2 ** 20)
|
||||||
self.size = 0
|
self.size = 0
|
||||||
cdef Lexeme word
|
cdef String string
|
||||||
for string in words:
|
for uni_string in words:
|
||||||
prob = probs.get(string, 0.0)
|
prob = probs.get(uni_string, 0.0)
|
||||||
cluster = clusters.get(string, 0.0)
|
cluster = clusters.get(uni_string, 0.0)
|
||||||
cases = case_stats.get(string, {})
|
cases = case_stats.get(uni_string, {})
|
||||||
tags = tag_stats.get(string, {})
|
tags = tag_stats.get(uni_string, {})
|
||||||
views = [string_view(string, prob, cluster, cases, tags)
|
views = [string_view(uni_string, prob, cluster, cases, tags)
|
||||||
for string_view in self._string_features]
|
for string_view in self._string_features]
|
||||||
flags = set()
|
flags = set()
|
||||||
for i, flag_feature in enumerate(self._flag_features):
|
for i, flag_feature in enumerate(self._flag_features):
|
||||||
if flag_feature(string, prob, cluster, cases, tags):
|
if flag_feature(uni_string, prob, cluster, cases, tags):
|
||||||
flags.add(i)
|
flags.add(i)
|
||||||
lexeme = lexeme_init(string, prob, cluster, views, flags)
|
lexeme = lexeme_init(uni_string, prob, cluster, views, flags)
|
||||||
self._dict[string] = <size_t>lexeme
|
string_from_unicode(&string, uni_string)
|
||||||
|
self._dict.set(string.key, lexeme)
|
||||||
self.size += 1
|
self.size += 1
|
||||||
|
|
||||||
cdef size_t get(self, String* string):
|
cdef size_t get(self, String* string):
|
||||||
cdef size_t lex_addr = self._dict.lookup(string.key)
|
cdef LexemeC* lex_addr = <LexemeC*>self._dict.get(string.key)
|
||||||
if lex_addr != 0:
|
if lex_addr != NULL:
|
||||||
return lex_addr
|
return <size_t>lex_addr
|
||||||
|
|
||||||
cdef unicode uni_string = string.chars[:string.n]
|
cdef unicode uni_string = string.chars[:string.n]
|
||||||
views = [string_view(uni_string, 0.0, 0, {}, {})
|
views = [string_view(uni_string, 0.0, 0, {}, {})
|
||||||
|
@ -207,7 +208,7 @@ cdef class Lexicon:
|
||||||
flags.add(i)
|
flags.add(i)
|
||||||
|
|
||||||
cdef LexemeC* lexeme = lexeme_init(uni_string, 0, 0, views, flags)
|
cdef LexemeC* lexeme = lexeme_init(uni_string, 0, 0, views, flags)
|
||||||
self._dict.insert(string.key, <size_t>lexeme)
|
self._dict.set(string.key, lexeme)
|
||||||
self.size += 1
|
self.size += 1
|
||||||
return <size_t>lexeme
|
return <size_t>lexeme
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user