diff --git a/spacy/_hashing.pxd b/spacy/_hashing.pxd
index 44733451d..d87704c1a 100644
--- a/spacy/_hashing.pxd
+++ b/spacy/_hashing.pxd
@@ -6,6 +6,6 @@ cdef class FixedTable:
     cdef uint64_t* keys
     cdef size_t* values
 
-    cdef int insert(self, uint64_t key, size_t value) nogil
+    cdef size_t insert(self, uint64_t key, size_t value) nogil
     cdef size_t get(self, uint64_t key) nogil
     cdef int erase(self, uint64_t key) nogil
diff --git a/spacy/_hashing.pyx b/spacy/_hashing.pyx
index bd59e0469..f77cc5f7f 100644
--- a/spacy/_hashing.pyx
+++ b/spacy/_hashing.pyx
@@ -24,10 +24,16 @@ cdef class FixedTable:
     def bucket(self, uint64_t key):
         return _find(key, self.size)
 
-    cdef int insert(self, uint64_t key, size_t value) nogil:
+    cdef size_t insert(self, uint64_t key, size_t value) nogil:  # returns the displaced value, or 0
         cdef size_t bucket = _find(key, self.size)
+        cdef size_t clobbered
+        if self.values[bucket] == value:
+            clobbered = 0  # same value is already stored: nothing is displaced
+        else:
+            clobbered = self.values[bucket]  # capture the slot's old value before it is overwritten
         self.keys[bucket] = key
         self.values[bucket] = value
+        return clobbered
 
     cdef size_t get(self, uint64_t key) nogil:
         cdef size_t bucket = _find(key, self.size)
@@ -39,6 +45,7 @@ cdef class FixedTable:
     cdef int erase(self, uint64_t key) nogil:
         cdef size_t bucket = _find(key, self.size)
         self.keys[bucket] = 0
+        self.values[bucket] = 0  # zero the value as well, so a later insert reports 0 for this slot
 
 
 @cython.cdivision
diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx
index 30234fcba..aca8795fa 100644
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@@ -140,7 +140,9 @@ cdef class Language:
     cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
         cdef size_t i
         word = self.init_lexeme(string, hashed, split, length)
-        self.happax.insert(hashed, word)
+        cdef Lexeme* clobbered = self.happax.insert(hashed, word)
+        if clobbered != NULL:  # insert returned a non-zero displaced entry
+            free(clobbered)
         self.bacov[hashed] = string
         return word
 