* Fix memory leak in tokenizer, caused by having a fixed vocab.

This commit is contained in:
Matthew Honnibal 2014-07-31 18:19:38 +01:00
parent 5b81ee716f
commit 4cb88c940b

View File

@@ -2,6 +2,8 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from libc.stdlib cimport calloc, free from libc.stdlib cimport calloc, free
from libcpp.pair cimport pair
from cython.operator cimport dereference as deref
from murmurhash cimport mrmr from murmurhash cimport mrmr
from spacy.lexeme cimport Lexeme from spacy.lexeme cimport Lexeme
@@ -68,6 +70,9 @@ cdef class Language:
self.vocab[0].set_empty_key(0) self.vocab[0].set_empty_key(0)
self.distri[0].set_empty_key(0) self.distri[0].set_empty_key(0)
self.ortho[0].set_empty_key(0) self.ortho[0].set_empty_key(0)
self.vocab[0].set_deleted_key(1)
self.distri[0].set_deleted_key(1)
self.ortho[0].set_deleted_key(1)
self.load_tokenization(util.read_tokenization(name)) self.load_tokenization(util.read_tokenization(name))
def load_tokenization(self, token_rules=None): def load_tokenization(self, token_rules=None):
@@ -136,9 +141,16 @@ cdef class Language:
cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length): cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
cdef size_t i cdef size_t i
cdef sparse_hash_map[StringHash, size_t].iterator it cdef sparse_hash_map[StringHash, size_t].iterator it
cdef pair[StringHash, size_t] last_elem
if self.happax[0].size() >= MAX_HAPPAX: if self.happax[0].size() >= MAX_HAPPAX:
# Delete last element. # Delete last element.
self.happax[0].erase(self.happax[0].end()) last_elem = deref(self.happax[0].end())
free(<Orthography*>self.ortho[0][last_elem.first])
free(<Distribution*>self.distri[0][last_elem.first])
free(<Lexeme*>last_elem.second)
self.happax[0].erase(last_elem.first)
self.ortho[0].erase(last_elem.first)
self.distri[0].erase(last_elem.first)
word = self.init_lexeme(string, hashed, split, length) word = self.init_lexeme(string, hashed, split, length)
self.happax[0][hashed] = <Lexeme_addr>word self.happax[0][hashed] = <Lexeme_addr>word
self.bacov[hashed] = string self.bacov[hashed] = string