From 47fbd0475ab8901ace56140c9fa2abe3ba7b1627 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 22 Aug 2014 17:13:09 +0200 Subject: [PATCH] * Replace the use of dense_hash_map with Python dict --- spacy/spacy.pxd | 7 ++----- spacy/spacy.pyx | 29 ++++++++++++++++++----------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/spacy/spacy.pxd b/spacy/spacy.pxd index e1e7a8c7f..91b361f0a 100644 --- a/spacy/spacy.pxd +++ b/spacy/spacy.pxd @@ -2,12 +2,9 @@ from libcpp.vector cimport vector from libc.stdint cimport uint32_t from libc.stdint cimport uint64_t -from sparsehash.dense_hash_map cimport dense_hash_map - # Circular import problems here ctypedef size_t Lexeme_addr ctypedef uint32_t StringHash -ctypedef dense_hash_map[StringHash, size_t] Vocab from spacy.lexeme cimport Lexeme from spacy.tokens cimport Tokens @@ -23,8 +20,8 @@ from spacy.lexeme cimport Lexeme cdef class Language: cdef object name - cdef dense_hash_map[StringHash, size_t] chunks - cdef dense_hash_map[StringHash, size_t] vocab + cdef dict chunks + cdef dict vocab cdef dict bacov cpdef Tokens tokenize(self, unicode text) diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx index 7f54b1225..57877250f 100644 --- a/spacy/spacy.pyx +++ b/spacy/spacy.pyx @@ -28,10 +28,8 @@ cdef class Language: def __cinit__(self, name): self.name = name self.bacov = {} - self.chunks = dense_hash_map[StringHash, size_t]() - self.vocab = dense_hash_map[StringHash, size_t]() - self.chunks.set_empty_key(0) - self.vocab.set_empty_key(0) + self.chunks = {} + self.vocab = {} self.load_tokenization(util.read_tokenization(name)) self.load_dist_info(util.read_dist_info(name)) @@ -65,16 +63,24 @@ cdef class Language: cdef Lexeme* lookup(self, unicode string) except NULL: assert len(string) != 0 - cdef Lexeme* word = self.vocab[hash(string)] - if word == NULL: + cdef Lexeme* word + cdef LexID lex_id + cdef StringHash h = hash(string) + if h in self.vocab: + lex_id = self.vocab[h] + word = lex_id + else: word = self.new_lexeme(string) return word cdef Lexeme** lookup_chunk(self, unicode string) except NULL: cdef StringHash h = hash(string) - cdef Lexeme** chunk = self.chunks[h] - cdef int split - if chunk == NULL: + cdef Lexeme** chunk + cdef size_t chunk_id + if h in self.chunks: + chunk_id = self.chunks[h] + chunk = chunk_id + else: chunk = self.new_chunk(string, self.find_substrings(string)) return chunk @@ -83,7 +89,8 @@ cdef class Language: for i, substring in enumerate(substrings): chunk[i] = self.lookup(substring) chunk[i + 1] = NULL - self.chunks[hash(string)] = chunk + cdef StringHash h = hash(string) + self.chunks[h] = chunk return chunk cdef Lexeme* new_lexeme(self, unicode string) except NULL: @@ -115,7 +122,7 @@ cdef class Language: cdef LexID lex_id cdef Lexeme* word - for key, lex_id in self.vocab: + for key, lex_id in self.vocab.items(): word = lex_id free(word.string_views) word.string_views = calloc(nr_views, sizeof(StringHash))