* Switch to using a Python ref counted gateway to malloc/free, to prevent memory leaks

Matthew Honnibal 2014-09-17 20:02:26 +02:00
parent 5a20dfc03e
commit 6266cac593
10 changed files with 28 additions and 32 deletions
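
The .memory module cimported throughout the diff below (Address, Pool) is not itself part of this commit. Based on how it is used in the hunks, where Address(n, sizeof(Cell)) exposes a raw .addr pointer and Pool().alloc(n, size) returns pooled memory, a minimal sketch of such ref-counted wrappers might look like the following Cython. Only the names Address.addr and Pool.alloc come from the diff; the zeroed calloc buffer, the internal reference list and the error handling are assumptions, not the actual implementation.

# Sketch only: not the .memory module shipped with this commit.
# The interface (Address.addr, Pool.alloc) is inferred from the diff;
# everything else is an assumption.
from libc.stdlib cimport calloc, free

cdef class Address:
    # Owns a single zeroed buffer; freed when the Python object is collected.
    cdef void* addr

    def __cinit__(self, size_t number, size_t elem_size):
        self.addr = calloc(number, elem_size)
        if self.addr == NULL:
            raise MemoryError()

    def __dealloc__(self):
        if self.addr != NULL:
            free(self.addr)

cdef class Pool:
    # Keeps a Python reference to every buffer it hands out, so all of them
    # are freed together when the Pool itself is garbage-collected.
    cdef list _refs

    def __cinit__(self):
        self._refs = []

    cdef void* alloc(self, size_t number, size_t elem_size) except NULL:
        cdef Address mem = Address(number, elem_size)
        self._refs.append(mem)   # the Pool's reference keeps the buffer alive
        return mem.addr

Because both classes release their memory in __dealloc__, a C pointer obtained from them is only valid while the owning Python object is alive, which is why the classes changed in this commit keep the owner in a cdef attribute (self._mem).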

View File

@@ -1,4 +1,5 @@
 from libc.stdint cimport uint64_t
+from .memory cimport Address
 ctypedef uint64_t key_t
 ctypedef void* val_t
@@ -13,6 +14,7 @@ cdef class PointerHash:
     cdef size_t size
     cdef size_t filled
     cdef Cell* cells
+    cdef Address _mem
     cdef val_t get(self, key_t key) nogil
     cdef void set(self, key_t key, val_t value) except *

View File

@@ -1,5 +1,5 @@
 # cython: profile=True
-from libc.stdlib cimport calloc, free
+from .memory cimport Address
 cimport cython
@@ -10,10 +10,8 @@ cdef class PointerHash:
         # Size must be power of two
         assert self.size != 0
         assert self.size & (self.size - 1) == 0
-        self.cells = <Cell*>calloc(self.size, sizeof(Cell))
-    def __dealloc__(self):
-        free(self.cells)
+        self._mem = Address(self.size, sizeof(Cell))
+        self.cells = <Cell*>self._mem.addr
     def __getitem__(self, key_t key):
         assert key != 0
@@ -47,7 +45,8 @@ cdef class PointerHash:
         cdef size_t old_size = self.size
         self.size = new_size
-        self.cells = <Cell*>calloc(new_size, sizeof(Cell))
+        cdef Address new_mem = Address(new_size, sizeof(Cell))
+        self.cells = <Cell*>new_mem.addr
         self.filled = 0
         cdef size_t i
@@ -56,7 +55,7 @@ cdef class PointerHash:
             if old_cells[i].key != 0:
                 assert old_cells[i].value != NULL, i
                 self.set(old_cells[i].key, old_cells[i].value)
-        free(old_cells)
+        self._mem = new_mem
     @cython.cdivision
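
A note on the resize hunks above: instead of free(old_cells), the method builds a new Address, rehashes into it, and then rebinds self._mem. Dropping the last reference to the old Address runs its __dealloc__, which frees the previous cell array. Below is a self-contained toy showing the same rebind-to-free pattern; GrowableKeys is hypothetical and assumes the Address sketch given earlier.

# Sketch only: illustrates why resize needs no explicit free().
from libc.stdint cimport uint64_t

cdef class GrowableKeys:
    cdef Address _mem        # owns the buffer currently in use
    cdef uint64_t* keys
    cdef size_t size

    def __cinit__(self, size_t size):
        self.size = size
        self._mem = Address(size, sizeof(uint64_t))
        self.keys = <uint64_t*>self._mem.addr

    cdef void grow(self, size_t new_size):
        # assumes new_size >= self.size
        cdef Address new_mem = Address(new_size, sizeof(uint64_t))
        cdef uint64_t* new_keys = <uint64_t*>new_mem.addr
        cdef size_t i
        for i in range(self.size):
            new_keys[i] = self.keys[i]       # copy (or rehash) the old entries
        self.keys = new_keys
        self.size = new_size
        self._mem = new_mem    # old Address loses its last reference; old buffer is freed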

View File

@@ -37,7 +37,6 @@ provides a fully Penn Treebank 3-compliant tokenizer.
 from __future__ import unicode_literals
-from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 cimport lang

View File

@@ -5,6 +5,8 @@ from spacy.tokens cimport Tokens
 from spacy.lexeme cimport LexemeC
 from spacy._hashing cimport PointerHash
+from spacy.memory cimport Pool
 from libcpp.utility cimport pair
 from libcpp.vector cimport vector
 from libc.stdint cimport uint64_t, int64_t
@@ -22,6 +24,7 @@ cdef struct String:
 cdef class Lexicon:
+    cdef Pool _mem
     cpdef readonly size_t size
     cpdef Lexeme lookup(self, unicode string)
@@ -34,6 +37,7 @@ cdef class Lexicon:
 cdef class Language:
+    cdef Pool _mem
     cdef unicode name
     cdef PointerHash cache
     cdef PointerHash specials

View File

@@ -8,8 +8,6 @@ Special-case tokenization rules are read from data/<lang>/tokenization .
 """
 from __future__ import unicode_literals
-from libc.stdlib cimport calloc, free
 import json
 import random
 from os import path
@@ -18,8 +16,11 @@ from .util import read_lang_data
 from spacy.tokens import Tokens
 from spacy.lexeme cimport LexemeC, lexeme_init
 from murmurhash.mrmr cimport hash64
 from cpython.ref cimport Py_INCREF
+from .memory cimport Pool
 from cython.operator cimport preincrement as preinc
 from cython.operator cimport dereference as deref
@@ -127,6 +128,7 @@ cdef class Language:
     def __cinit__(self, name, user_string_features, user_flag_features):
         self.name = name
+        self._mem = Pool()
         self.cache = PointerHash(2 ** 25)
         self.specials = PointerHash(2 ** 16)
         lang_data = util.read_lang_data(name)
@@ -203,7 +205,7 @@ cdef class Language:
         if lexemes != NULL:
             i = 0
             while lexemes[i] != NULL:
-                tokens.push_back(lexemes[i])
+                tokens_v.push_back(lexemes[i])
                 i += 1
             return 0
@@ -292,7 +294,7 @@ cdef class Language:
     cdef int _save_cached(self, vector[LexemeC*] *tokens,
                           uint64_t key, size_t n) except -1:
         assert tokens.size() > n
-        lexemes = <LexemeC**>calloc((tokens.size() - n) + 1, sizeof(LexemeC**))
+        lexemes = <LexemeC**>self._mem.alloc((tokens.size() - n) + 1, sizeof(LexemeC**))
         cdef size_t i, j
         for i, j in enumerate(range(n, tokens.size())):
             lexemes[i] = tokens.at(j)
@@ -404,7 +406,7 @@ cdef class Language:
         cdef uint64_t hashed
         cdef String string
         for uni_string, substrings in token_rules:
-            lexemes = <LexemeC**>calloc(len(substrings) + 1, sizeof(LexemeC*))
+            lexemes = <LexemeC**>self._mem.alloc(len(substrings) + 1, sizeof(LexemeC*))
             for i, substring in enumerate(substrings):
                 string_from_unicode(&string, substring)
                 lexemes[i] = <LexemeC*>self.lexicon.get(&string)
@@ -417,6 +419,7 @@ cdef class Language:
 cdef class Lexicon:
     def __cinit__(self, words, probs, clusters, case_stats, tag_stats,
                   string_features, flag_features):
+        self._mem = Pool()
         self._flag_features = flag_features
         self._string_features = string_features
         self._dict = PointerHash(2 ** 20)
@@ -433,7 +436,7 @@ cdef class Lexicon:
            for i, flag_feature in enumerate(self._flag_features):
                if flag_feature(uni_string, prob, cluster, cases, tags):
                    flags.add(i)
-            lexeme = lexeme_init(uni_string, prob, cluster, views, flags)
+            lexeme = lexeme_init(self._mem, uni_string, prob, cluster, views, flags)
            string_from_unicode(&string, uni_string)
            self._dict.set(string.key, lexeme)
            self.size += 1
@@ -452,7 +455,7 @@ cdef class Lexicon:
            if flag_feature(uni_string, 0.0, {}, {}):
                flags.add(i)
-        lexeme = lexeme_init(self._mem, uni_string, 0, 0, views, flags)
+        lexeme = lexeme_init(self._mem, uni_string, 0, 0, views, flags)
         self._dict.set(string.key, lexeme)
         self.size += 1
         return lexeme

View File

@@ -1,4 +1,5 @@
 from .typedefs cimport hash_t, utf8_t, flag_t, id_t
+from .memory cimport Pool
 cdef struct LexemeC:
@@ -12,9 +13,8 @@ cdef struct LexemeC:
     flag_t flags
-cdef LexemeC* lexeme_init(unicode string, double prob, size_t cluster,
+cdef LexemeC* lexeme_init(Pool mem, unicode string, double prob, size_t cluster,
                           list views, set flags)
-cdef int lexeme_free(LexemeC* lexeme) except -1
 cdef bint lexeme_check_flag(LexemeC* lexeme, size_t flag_id)
 cdef unicode lexeme_string_view(LexemeC* lexeme, size_t view_id)

View File

@@ -1,14 +1,14 @@
-from libc.stdlib cimport calloc, free
 from cpython.ref cimport Py_INCREF
+from .memory cimport Pool
-cdef LexemeC* lexeme_init(unicode string, double prob, size_t cluster,
+cdef LexemeC* lexeme_init(Pool mem, unicode string, double prob, size_t cluster,
                           list views, set flags):
-    cdef LexemeC* lexeme = <LexemeC*>calloc(1, sizeof(LexemeC))
+    cdef LexemeC* lexeme = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
     lexeme.cluster = cluster
     lexeme.prob = prob
     lexeme.string = intern_and_encode(string, &lexeme.length)
-    lexeme.views = <char**>calloc(len(views), sizeof(char*))
+    lexeme.views = <char**>mem.alloc(len(views), sizeof(char*))
     cdef size_t length = 0
     for i, string in enumerate(views):
         lexeme.views[i] = intern_and_encode(string, &length)
@@ -18,11 +18,6 @@ cdef LexemeC* lexeme_init(unicode string, double prob, size_t cluster,
     return lexeme
-cdef int lexeme_free(LexemeC* lexeme) except -1:
-    free(lexeme.views)
-    free(lexeme)
 cdef char* intern_and_encode(unicode string, size_t* length):
     cdef bytes byte_string = string.encode('utf8')
     cdef bytes utf8_string = intern(byte_string)
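
With the Pool argument, a LexemeC and its views array are owned by whichever object created the pool (the Lexicon, in the hunks further up), so the per-lexeme lexeme_free helper disappears. An illustrative usage sketch follows; the literal values are placeholders and Pool is the assumed interface sketched near the top of this page.

# Sketch only: pool-owned lexeme lifetime.
from spacy.memory cimport Pool
from spacy.lexeme cimport LexemeC, lexeme_init

cdef Pool mem = Pool()
cdef LexemeC* lex = lexeme_init(mem, u'hello', -7.5, 42, [u'hello'], set())
# lex and lex.views stay valid for as long as mem is alive; there is no
# lexeme_free() call, because collecting mem releases everything it allocated.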

View File

@@ -5,7 +5,6 @@ boldly assume no collisions.
 from __future__ import unicode_literals
-from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t

View File

@@ -1,6 +1,4 @@
 # cython: profile=True
-from libc.stdlib cimport calloc, free, realloc
 from spacy.word cimport Lexeme
 from spacy.lexeme cimport lexeme_check_flag
 from spacy.lexeme cimport lexeme_string_view

View File

@@ -1,9 +1,6 @@
 # cython: profile=True
 # cython: embedsignature=True
-from libc.stdlib cimport calloc, free, realloc
-from spacy.lexeme cimport lexeme_free, lexeme_init
 from spacy.lexeme cimport lexeme_check_flag, lexeme_string_view