* Ugly but seemingly working fix to the token memory leak

This commit is contained in:
Matthew Honnibal 2014-08-01 09:43:19 +01:00
parent c7bb6b329c
commit fc7c10d7f8
2 changed files with 10 additions and 6 deletions

View File

@ -42,4 +42,6 @@ cdef class Language:
int split, size_t length) int split, size_t length)
cdef Orthography* init_orth(self, StringHash hashed, unicode lex) cdef Orthography* init_orth(self, StringHash hashed, unicode lex)
cdef int _happax_to_vocab(self, StringHash hashed, Lexeme_addr addr)

View File

@ -85,6 +85,7 @@ cdef class Language:
length = len(token_string) length = len(token_string)
hashed = self.hash_string(token_string, length) hashed = self.hash_string(token_string, length)
word.tail = self._add(hashed, lex, 0, len(lex)) word.tail = self._add(hashed, lex, 0, len(lex))
self._happax_to_vocab(hashed, <Lexeme_addr>word.tail)
word = word.tail word = word.tail
def load_clusters(self): def load_clusters(self):
@ -133,19 +134,19 @@ cdef class Language:
word_ptr = self._add(hashed, string, start, length) word_ptr = self._add(hashed, string, start, length)
else: else:
# Second time word seen, move to vocab # Second time word seen, move to vocab
self.vocab[0][hashed] = <Lexeme_addr>word_ptr self._happax_to_vocab(hashed, <Lexeme_addr>word_ptr)
self.happax.erase(hashed)
return <Lexeme_addr>word_ptr return <Lexeme_addr>word_ptr
# Move one entry from the happax table to the vocab table.
#   hashed   -- key under which the address is stored in self.vocab and
#               which is then erased from self.happax
#   word_ptr -- address written as the vocab value for that key
# The body has no explicit return statement.
# NOTE(review): erase() only removes the happax entry; nothing here frees the
# pointed-to memory, so ownership presumably passes to vocab -- confirm.
cdef int _happax_to_vocab(self, StringHash hashed, Lexeme_addr word_ptr):
self.vocab[0][hashed] = <Lexeme_addr>word_ptr
self.happax.erase(hashed)
# Build a Lexeme for `string` via init_lexeme, insert its address into
# self.happax under `hashed`, record `string` in self.bacov under the same
# key, and return the new Lexeme pointer.
# (Side-by-side diff view: unchanged lines show the old and new text twice.)
cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length): cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
cdef size_t i cdef size_t i
word = self.init_lexeme(string, hashed, split, length) word = self.init_lexeme(string, hashed, split, length)
# The value returned by insert() is treated as a Lexeme that the insert
# displaced (NULL when there was none) -- presumably; confirm against the
# happax table implementation.
cdef Lexeme* clobbered = <Lexeme*>self.happax.insert(hashed, <size_t>word) cdef Lexeme* clobbered = <Lexeme*>self.happax.insert(hashed, <size_t>word)
if clobbered != NULL: if clobbered != NULL:
# Old side: the free was left disabled because a .tail pointer might still
# reference the displaced Lexeme. New side: the displaced Lexeme is freed.
# NOTE(review): freeing is only safe if no other holder keeps this address;
# the call sites changed in this commit move .tail lexemes out of happax
# with _happax_to_vocab first -- other holders are not visible here, confirm.
# Can't do this --- we might be pointing to the Lexeme in .tail. free(clobbered)
# Fix that to reduce memory, probably.
#free(clobbered)
pass
# Remember the original string for this hash.
self.bacov[hashed] = string self.bacov[hashed] = string
return word return word
@ -210,6 +211,7 @@ cdef class Language:
# Now recurse, and deal with the tail # Now recurse, and deal with the tail
if tail_string: if tail_string:
word.tail = <Lexeme*>self.lookup(-1, tail_string, len(tail_string)) word.tail = <Lexeme*>self.lookup(-1, tail_string, len(tail_string))
self._happax_to_vocab(word.tail.sic, <Lexeme_addr>word.tail)
return word return word
cdef Orthography* init_orth(self, StringHash hashed, unicode lex): cdef Orthography* init_orth(self, StringHash hashed, unicode lex):