* Replace the use of dense_hash_map with Python dict

This commit is contained in:
Matthew Honnibal 2014-08-22 17:13:09 +02:00
parent 6f83dca218
commit 47fbd0475a
2 changed files with 20 additions and 16 deletions

View File

@@ -2,12 +2,9 @@ from libcpp.vector cimport vector
from libc.stdint cimport uint32_t from libc.stdint cimport uint32_t
from libc.stdint cimport uint64_t from libc.stdint cimport uint64_t
from sparsehash.dense_hash_map cimport dense_hash_map
# Circular import problems here # Circular import problems here
ctypedef size_t Lexeme_addr ctypedef size_t Lexeme_addr
ctypedef uint32_t StringHash ctypedef uint32_t StringHash
ctypedef dense_hash_map[StringHash, size_t] Vocab
from spacy.lexeme cimport Lexeme from spacy.lexeme cimport Lexeme
from spacy.tokens cimport Tokens from spacy.tokens cimport Tokens
@@ -23,8 +20,8 @@ from spacy.lexeme cimport Lexeme
cdef class Language: cdef class Language:
cdef object name cdef object name
cdef dense_hash_map[StringHash, size_t] chunks cdef dict chunks
cdef dense_hash_map[StringHash, size_t] vocab cdef dict vocab
cdef dict bacov cdef dict bacov
cpdef Tokens tokenize(self, unicode text) cpdef Tokens tokenize(self, unicode text)

View File

@@ -28,10 +28,8 @@ cdef class Language:
def __cinit__(self, name): def __cinit__(self, name):
self.name = name self.name = name
self.bacov = {} self.bacov = {}
self.chunks = dense_hash_map[StringHash, size_t]() self.chunks = {}
self.vocab = dense_hash_map[StringHash, size_t]() self.vocab = {}
self.chunks.set_empty_key(0)
self.vocab.set_empty_key(0)
self.load_tokenization(util.read_tokenization(name)) self.load_tokenization(util.read_tokenization(name))
self.load_dist_info(util.read_dist_info(name)) self.load_dist_info(util.read_dist_info(name))
@@ -65,16 +63,24 @@ cdef class Language:
cdef Lexeme* lookup(self, unicode string) except NULL: cdef Lexeme* lookup(self, unicode string) except NULL:
assert len(string) != 0 assert len(string) != 0
cdef Lexeme* word = <Lexeme*>self.vocab[hash(string)] cdef Lexeme* word
if word == NULL: cdef LexID lex_id
cdef StringHash h = hash(string)
if h in self.vocab:
lex_id = self.vocab[h]
word = <Lexeme*>lex_id
else:
word = self.new_lexeme(string) word = self.new_lexeme(string)
return word return word
cdef Lexeme** lookup_chunk(self, unicode string) except NULL: cdef Lexeme** lookup_chunk(self, unicode string) except NULL:
cdef StringHash h = hash(string) cdef StringHash h = hash(string)
cdef Lexeme** chunk = <Lexeme**>self.chunks[h] cdef Lexeme** chunk
cdef int split cdef size_t chunk_id
if chunk == NULL: if h in self.chunks:
chunk_id = self.chunks[h]
chunk = <Lexeme**>chunk_id
else:
chunk = self.new_chunk(string, self.find_substrings(string)) chunk = self.new_chunk(string, self.find_substrings(string))
return chunk return chunk
@@ -83,7 +89,8 @@ cdef class Language:
for i, substring in enumerate(substrings): for i, substring in enumerate(substrings):
chunk[i] = self.lookup(substring) chunk[i] = self.lookup(substring)
chunk[i + 1] = NULL chunk[i + 1] = NULL
self.chunks[hash(string)] = <size_t>chunk cdef StringHash h = hash(string)
self.chunks[h] = <size_t>chunk
return chunk return chunk
cdef Lexeme* new_lexeme(self, unicode string) except NULL: cdef Lexeme* new_lexeme(self, unicode string) except NULL:
@@ -115,7 +122,7 @@ cdef class Language:
cdef LexID lex_id cdef LexID lex_id
cdef Lexeme* word cdef Lexeme* word
for key, lex_id in self.vocab: for key, lex_id in self.vocab.items():
word = <Lexeme*>lex_id word = <Lexeme*>lex_id
free(word.string_views) free(word.string_views)
word.string_views = <StringHash*>calloc(nr_views, sizeof(StringHash)) word.string_views = <StringHash*>calloc(nr_views, sizeof(StringHash))