diff --git a/spacy/en.pyx b/spacy/en.pyx
index 785d21e24..dc8465fc9 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -224,7 +224,7 @@ cdef class English(Language):
     fl_is_digit = Flag_IsDigit
     v_shape = View_WordShape
     def __cinit__(self, name, user_string_features, user_flag_features):
-        self.cache = {}
+        self.cache.set_empty_key(0)
         lang_data = util.read_lang_data(name)
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = lang.Lexicon(words, probs, clusters, case_stats, tag_stats,
diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index 20a40d175..988fa715d 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -3,14 +3,52 @@ from libc.stdint cimport uint64_t
 from spacy.word cimport Lexeme
 from spacy.tokens cimport Tokens
 
-
-cdef struct Flags:
-    size_t is_alpha
-    size_t can_noun
+from libcpp.utility cimport pair
+from libcpp.vector cimport vector
+from libc.stdint cimport uint64_t, int64_t
 
 
-cdef struct ViewIDs:
-    size_t canon_form
+cdef extern from "sparsehash/dense_hash_map" namespace "google":
+    cdef cppclass dense_hash_map[K, D]:
+        K& key_type
+        D& data_type
+        pair[K, D]& value_type
+        uint64_t size_type
+        cppclass iterator:
+            pair[K, D]& operator*() nogil
+            iterator operator++() nogil
+            iterator operator--() nogil
+            bint operator==(iterator) nogil
+            bint operator!=(iterator) nogil
+        iterator begin()
+        iterator end()
+        uint64_t size()
+        uint64_t max_size()
+        bint empty()
+        uint64_t bucket_count()
+        uint64_t bucket_size(uint64_t i)
+        uint64_t bucket(K& key)
+        double max_load_factor()
+        void max_load_factor(double new_grow)
+        double min_load_factor()
+        double min_load_factor(double new_grow)
+        void set_resizing_parameters(double shrink, double grow)
+        void resize(uint64_t n)
+        void rehash(uint64_t n)
+        dense_hash_map()
+        dense_hash_map(uint64_t n)
+        void swap(dense_hash_map&)
+        pair[iterator, bint] insert(pair[K, D]) nogil
+        void set_empty_key(K&)
+        void set_deleted_key(K& key)
+        void clear_deleted_key()
+        void erase(iterator pos)
+        uint64_t erase(K& k)
+        void erase(iterator first, iterator last)
+        void clear()
+        void clear_no_resize()
+        pair[iterator, iterator] equal_range(K& k)
+        D& operator[](K&) nogil
 
 
 cdef class Lexicon:
@@ -27,13 +65,14 @@ cdef class Lexicon:
 
 cdef class Language:
     cdef unicode name
-    cdef dict cache
+    cdef dense_hash_map[uint64_t, size_t] cache
+    cdef size_t cache_size
     cpdef readonly Lexicon lexicon
     cpdef readonly object tokens_class
 
     cpdef Tokens tokenize(self, unicode text)
     cpdef Lexeme lookup(self, unicode text)
 
-    cdef _tokenize(self, Tokens tokens, unicode string)
+    cdef _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length)
     cdef list _split(self, unicode string)
     cdef int _split_one(self, unicode word)
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index 5a7f98948..dd3394152 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -40,7 +40,8 @@ cdef class Language:
         if string_features is None:
             string_features = []
         self.name = name
-        self.cache = {}
+        self.cache.set_empty_key(0)
+        self.cache_size = 0
         lang_data = read_lang_data(name)
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
@@ -102,11 +103,10 @@ cdef class Language:
     cdef _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length):
         cdef uint64_t hashed = hash64(characters, length * sizeof(Py_UNICODE), 0)
         cdef unicode string
-        cdef LexemeC** lexemes
         cdef bint free_chunk = False
         cdef size_t i = 0
-        if hashed in self.cache:
-            lexemes = self.cache[hashed]
+        cdef LexemeC** lexemes = <LexemeC**>self.cache[hashed]
+        if lexemes is not NULL:
             while lexemes[i] != NULL:
                 tokens.push_back(lexemes[i])
                 i += 1
@@ -121,8 +121,9 @@ cdef class Language:
             # The intuition here is that if an element belongs in the cache, it
             # has several chances to get in. And if the cache is large, we less
             # believe that the element belongs there.
-            if not self.cache or random.random() < (100000.0 / len(self.cache)):
+            if self.cache_size == 0 or random.random() < (100000.0 / self.cache_size):
                 self.cache[hashed] = <size_t>lexemes
+                self.cache_size += 1
             else:
                 free(lexemes)
 
@@ -172,6 +173,7 @@ cdef class Language:
             lexemes[i + 1] = NULL
         hashed = hash64(string, len(string) * sizeof(Py_UNICODE), 0)
         self.cache[hashed] = <size_t>lexemes
+        self.cache_size += 1
 
 
 cdef class Lexicon:
diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd
index ccfe45d24..6ac78de1b 100644
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@@ -5,7 +5,7 @@ cdef class Tokens:
     cdef size_t size
     cdef LexemeC** lexemes
 
-    cdef push_back(self, LexemeC* lexeme)
+    cdef int push_back(self, LexemeC* lexeme) except -1
 
     cpdef unicode string(self, size_t i)
     cpdef double prob(self, size_t i)
diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx
index 209ae94d6..9ab5170d2 100644
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -44,7 +44,7 @@ cdef class Tokens:
     def append(self, Lexeme lexeme):
         self.push_back(lexeme._c)
 
-    cdef push_back(self, LexemeC* lexeme):
+    cdef int push_back(self, LexemeC* lexeme) except -1:
         if (self.size + 1) == self.length:
             self.size *= 2
             self.lexemes = <LexemeC**>realloc(self.lexemes, self.size * sizeof(LexemeC*))
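
A note on the new lookup pattern in _tokenize, for reviewers unfamiliar with sparsehash: dense_hash_map must be given an empty key before first use (hence the set_empty_key(0) calls, which reserve key 0 to mark unused buckets), and its operator[] default-constructs a value for a missing key rather than raising. With size_t as the mapped type, a miss yields 0, which the Cython code reinterprets as a NULL LexemeC**; that is what collapses the old "in" test plus fetch into a single probe. A minimal Python sketch of these semantics, using defaultdict (which likewise inserts its default on a miss); the names are illustrative, not part of the patch:

from collections import defaultdict

# Emulates dense_hash_map[uint64_t, size_t]: probing a missing key
# inserts and returns the default 0, standing in for a NULL LexemeC**.
cache = defaultdict(int)

def lookup_or_build(hashed, build):
    lexemes = cache[hashed]     # one probe, no separate membership test
    if lexemes != 0:            # nonzero means cache hit
        return lexemes
    lexemes = build()           # miss: run the real tokenizer
    cache[hashed] = lexemes
    return lexemes

One side effect worth flagging: because 0 is reserved as the empty key, a chunk whose 64-bit hash happened to be exactly 0 could never be cached. That is vanishingly unlikely, but it is the tradeoff the sentinel scheme makes.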
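
The probabilistic admission policy described in the lang.pyx comment can be sanity-checked in plain Python. A rough simulation, assuming the same 100000.0 constant (CACHE_BUDGET is a made-up name): while the cache holds at most 100,000 entries the computed threshold is at least 1.0, so every candidate is admitted; past that point the admission probability decays as 100000/size, so the cache keeps growing, but only about as fast as the square root of the number of attempts, while a frequently recurring chunk still gets "several chances to get in":

import random

CACHE_BUDGET = 100000.0  # same constant as the condition in lang.pyx

def should_admit(cache_size):
    # Mirrors: cache_size == 0 or random.random() < (100000.0 / cache_size).
    # For cache_size <= 100000 the threshold is >= 1.0, so admission is certain.
    return cache_size == 0 or random.random() < (CACHE_BUDGET / cache_size)

# Crude growth model: one admission attempt per distinct cache miss.
size = 0
for attempt in range(1_000_000):
    if should_admit(size):
        size += 1
print(size)  # about 435,000 admitted out of 1,000,000 attempts

This keeps the hot head of the chunk distribution cached while bounding growth on long tails, without any eviction bookkeeping.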