From 5b6457e80eebfbe07e8df6858e09d8da4373279d Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 1 Aug 2014 07:37:50 +0100
Subject: [PATCH] * Free lexemes clobbered as happaxes

FixedTable.insert now returns the value that was stored in the target
bucket before the write (0 when that value equals the one being
inserted), so the caller can release whatever it displaced.
FixedTable.erase also zeroes the bucket's value, not just its key.
Language._add frees the pointer returned by happax.insert when it is
not NULL.

---
 spacy/_hashing.pxd | 2 +-
 spacy/_hashing.pyx | 9 ++++++++-
 spacy/spacy.pyx    | 4 +++-
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/spacy/_hashing.pxd b/spacy/_hashing.pxd
index 44733451d..d87704c1a 100644
--- a/spacy/_hashing.pxd
+++ b/spacy/_hashing.pxd
@@ -6,6 +6,6 @@ cdef class FixedTable:
     cdef uint64_t* keys
     cdef size_t* values
 
-    cdef int insert(self, uint64_t key, size_t value) nogil
+    cdef size_t insert(self, uint64_t key, size_t value) nogil
     cdef size_t get(self, uint64_t key) nogil
     cdef int erase(self, uint64_t key) nogil
diff --git a/spacy/_hashing.pyx b/spacy/_hashing.pyx
index bd59e0469..f77cc5f7f 100644
--- a/spacy/_hashing.pyx
+++ b/spacy/_hashing.pyx
@@ -24,10 +24,16 @@ cdef class FixedTable:
     def bucket(self, uint64_t key):
         return _find(key, self.size)
 
-    cdef int insert(self, uint64_t key, size_t value) nogil:
+    cdef size_t insert(self, uint64_t key, size_t value) nogil:
         cdef size_t bucket = _find(key, self.size)
+        cdef size_t clobbered
+        if self.values[bucket] == value:
+            clobbered = 0
+        else:
+            clobbered = self.values[bucket]
         self.keys[bucket] = key
         self.values[bucket] = value
+        return clobbered
 
     cdef size_t get(self, uint64_t key) nogil:
         cdef size_t bucket = _find(key, self.size)
@@ -39,6 +45,7 @@ cdef class FixedTable:
     cdef int erase(self, uint64_t key) nogil:
         cdef size_t bucket = _find(key, self.size)
         self.keys[bucket] = 0
+        self.values[bucket] = 0
 
 
 @cython.cdivision
diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx
index 30234fcba..aca8795fa 100644
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@@ -140,7 +140,9 @@ cdef class Language:
     cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
         cdef size_t i
         word = self.init_lexeme(string, hashed, split, length)
-        self.happax.insert(hashed, word)
+        cdef Lexeme* clobbered = <Lexeme*>self.happax.insert(hashed, <size_t>word)
+        if clobbered != NULL:
+            free(clobbered)
         self.bacov[hashed] = string
         return word
 