Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-12 10:16:27 +03:00)
* Switch to using a Python ref-counted gateway to malloc/free, to prevent memory leaks
This commit is contained in:
parent 5a20dfc03e
commit 6266cac593
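The `spacy.memory` module that these hunks import (`Pool`, `Address`) is not included in this diff. As a rough sketch of what such a ref-counted gateway to malloc/free looks like, assuming only the interface visible in the hunks below (`Address(number, size)` exposing an `.addr` pointer, and `Pool.alloc(number, size)`), something along these lines would do; everything beyond that interface is a guess, not the actual spaCy code:

    # memory.pyx (illustrative sketch only; not taken from this commit)
    from libc.stdlib cimport calloc, free

    cdef class Address:
        # Owns one zero-initialised C block. CPython's reference counting
        # decides when __dealloc__ runs, i.e. when the block is freed.
        cdef void* addr

        def __cinit__(self, size_t number, size_t size):
            self.addr = calloc(number, size)
            if self.addr == NULL:
                raise MemoryError()

        def __dealloc__(self):
            if self.addr != NULL:
                free(self.addr)

    cdef class Pool:
        # Keeps a reference to every Address it hands out, so all of its
        # allocations live exactly as long as the Pool itself.
        cdef list _blocks

        def __cinit__(self):
            self._blocks = []

        cdef void* alloc(self, size_t number, size_t size) except NULL:
            cdef Address block = Address(number, size)
            self._blocks.append(block)
            return block.addr

The point of routing allocation through Python objects is that a forgotten `free()`, or one skipped because an exception unwound the stack, can no longer leak: the C memory is reclaimed together with its owning object.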
@@ -1,4 +1,5 @@
 from libc.stdint cimport uint64_t
+from .memory cimport Address
 
 ctypedef uint64_t key_t
 ctypedef void* val_t
@@ -13,6 +14,7 @@ cdef class PointerHash:
     cdef size_t size
     cdef size_t filled
     cdef Cell* cells
+    cdef Address _mem
 
     cdef val_t get(self, key_t key) nogil
     cdef void set(self, key_t key, val_t value) except *
@@ -1,5 +1,5 @@
 # cython: profile=True
-from libc.stdlib cimport calloc, free
+from .memory cimport Address
 cimport cython
 
 
@@ -10,10 +10,8 @@ cdef class PointerHash:
         # Size must be power of two
        assert self.size != 0
        assert self.size & (self.size - 1) == 0
-        self.cells = <Cell*>calloc(self.size, sizeof(Cell))
-
-    def __dealloc__(self):
-        free(self.cells)
+        self._mem = Address(self.size, sizeof(Cell))
+        self.cells = <Cell*>self._mem.addr
 
     def __getitem__(self, key_t key):
         assert key != 0
@@ -47,7 +45,8 @@ cdef class PointerHash:
         cdef size_t old_size = self.size
 
         self.size = new_size
-        self.cells = <Cell*>calloc(new_size, sizeof(Cell))
+        cdef Address new_mem = Address(new_size, sizeof(Cell))
+        self.cells = <Cell*>new_mem.addr
 
         self.filled = 0
         cdef size_t i
@@ -56,7 +55,7 @@ cdef class PointerHash:
             if old_cells[i].key != 0:
                 assert old_cells[i].value != NULL, i
                 self.set(old_cells[i].key, old_cells[i].value)
-        free(old_cells)
+        self._mem = new_mem
 
 
    @cython.cdivision
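A note on the rehashing hunks just above: `free(old_cells)` can be replaced by `self._mem = new_mem` because rebinding `_mem` drops the last reference to the old `Address`, and its `__dealloc__` then releases the old cell array, but only after the surviving entries have been copied into the new table. Annotated excerpt of the new resize logic (comments added here, not in the commit):

    self.size = new_size
    cdef Address new_mem = Address(new_size, sizeof(Cell))
    self.cells = <Cell*>new_mem.addr   # the table now lives in new_mem's block
    # ... old_cells (still owned by the old self._mem) are re-inserted via self.set() ...
    self._mem = new_mem                # the old Address loses its last reference here,
                                       # so its __dealloc__ frees the old cells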
@@ -37,7 +37,6 @@ provides a fully Penn Treebank 3-compliant tokenizer.
 
 from __future__ import unicode_literals
 
-from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 
 cimport lang
@@ -5,6 +5,8 @@ from spacy.tokens cimport Tokens
 from spacy.lexeme cimport LexemeC
 from spacy._hashing cimport PointerHash
 
+from spacy.memory cimport Pool
+
 from libcpp.utility cimport pair
 from libcpp.vector cimport vector
 from libc.stdint cimport uint64_t, int64_t
@@ -22,6 +24,7 @@ cdef struct String:
 
 
 cdef class Lexicon:
+    cdef Pool _mem
     cpdef readonly size_t size
 
     cpdef Lexeme lookup(self, unicode string)
@@ -34,6 +37,7 @@ cdef class Lexicon:
 
 
 cdef class Language:
+    cdef Pool _mem
     cdef unicode name
     cdef PointerHash cache
     cdef PointerHash specials
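Giving `Lexicon` and `Language` a `cdef Pool _mem` attribute ties every C allocation made through it to the lifetime of the owning Python object. A minimal sketch of the pattern, assuming the `Pool` sketch above (the class body and helper here are illustrative stand-ins, not the real definitions):

    from spacy.memory cimport Pool
    from spacy.lexeme cimport LexemeC

    cdef class Lexicon:
        cdef Pool _mem

        def __cinit__(self):
            self._mem = Pool()

        cdef LexemeC* _new_lexeme(self) except NULL:
            # Allocated through the pool, so it is reclaimed automatically
            # when this Lexicon is garbage-collected.
            return <LexemeC*>self._mem.alloc(1, sizeof(LexemeC))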
@@ -8,8 +8,6 @@ Special-case tokenization rules are read from data/<lang>/tokenization .
 """
 from __future__ import unicode_literals
 
-from libc.stdlib cimport calloc, free
-
 import json
 import random
 from os import path
@@ -18,8 +16,11 @@ from .util import read_lang_data
 from spacy.tokens import Tokens
 from spacy.lexeme cimport LexemeC, lexeme_init
 from murmurhash.mrmr cimport hash64
+from cpython.ref cimport Py_INCREF
 
+from .memory cimport Pool
+
 from cython.operator cimport preincrement as preinc
 from cython.operator cimport dereference as deref
 
 
@@ -127,6 +128,7 @@ cdef class Language:
 
     def __cinit__(self, name, user_string_features, user_flag_features):
        self.name = name
+        self._mem = Pool()
        self.cache = PointerHash(2 ** 25)
        self.specials = PointerHash(2 ** 16)
        lang_data = util.read_lang_data(name)
@@ -203,7 +205,7 @@ cdef class Language:
         if lexemes != NULL:
             i = 0
             while lexemes[i] != NULL:
-                tokens.push_back(lexemes[i])
+                tokens_v.push_back(lexemes[i])
                 i += 1
             return 0
 
@@ -292,7 +294,7 @@ cdef class Language:
     cdef int _save_cached(self, vector[LexemeC*] *tokens,
                           uint64_t key, size_t n) except -1:
         assert tokens.size() > n
-        lexemes = <LexemeC**>calloc((tokens.size() - n) + 1, sizeof(LexemeC**))
+        lexemes = <LexemeC**>self._mem.alloc((tokens.size() - n) + 1, sizeof(LexemeC**))
         cdef size_t i, j
         for i, j in enumerate(range(n, tokens.size())):
             lexemes[i] = tokens.at(j)
@@ -404,7 +406,7 @@ cdef class Language:
         cdef uint64_t hashed
         cdef String string
         for uni_string, substrings in token_rules:
-            lexemes = <LexemeC**>calloc(len(substrings) + 1, sizeof(LexemeC*))
+            lexemes = <LexemeC**>self._mem.alloc(len(substrings) + 1, sizeof(LexemeC*))
             for i, substring in enumerate(substrings):
                 string_from_unicode(&string, substring)
                 lexemes[i] = <LexemeC*>self.lexicon.get(&string)
@@ -417,6 +419,7 @@ cdef class Language:
 cdef class Lexicon:
     def __cinit__(self, words, probs, clusters, case_stats, tag_stats,
                   string_features, flag_features):
+        self._mem = Pool()
        self._flag_features = flag_features
        self._string_features = string_features
        self._dict = PointerHash(2 ** 20)
@@ -433,7 +436,7 @@ cdef class Lexicon:
            for i, flag_feature in enumerate(self._flag_features):
                if flag_feature(uni_string, prob, cluster, cases, tags):
                    flags.add(i)
-            lexeme = lexeme_init(uni_string, prob, cluster, views, flags)
+            lexeme = lexeme_init(self._mem, uni_string, prob, cluster, views, flags)
            string_from_unicode(&string, uni_string)
            self._dict.set(string.key, lexeme)
            self.size += 1
@@ -452,7 +455,7 @@ cdef class Lexicon:
            if flag_feature(uni_string, 0.0, {}, {}):
                flags.add(i)
 
-        lexeme = lexeme_init(uni_string, 0, 0, views, flags)
+        lexeme = lexeme_init(self._mem, uni_string, 0, 0, views, flags)
        self._dict.set(string.key, lexeme)
        self.size += 1
        return lexeme
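In `_save_cached` and `_load_special_tokenization` above, the switch from `calloc` to `self._mem.alloc` also closes an exception-safety hole: a raw `calloc` block leaks if anything raises before its pointer is stored, whereas a pool-owned block is still referenced by the `Pool` and is reclaimed together with the `Language`. A condensed, hypothetical sketch of the difference (method name and body are illustrative, not from this commit):

    from spacy.memory cimport Pool
    from spacy.lexeme cimport LexemeC

    cdef class Language:
        cdef Pool _mem

        cdef int _save_cached_sketch(self, size_t n) except -1:
            cdef LexemeC** lexemes
            # Before: lexemes = <LexemeC**>calloc(n + 1, sizeof(LexemeC*))
            # If a later line raised, nothing freed the block -- a leak.
            lexemes = <LexemeC**>self._mem.alloc(n + 1, sizeof(LexemeC*))
            # After: the pool still references the block, so it is freed when
            # the Language is, whether or not an exception unwinds this method.
            return 0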
@@ -1,4 +1,5 @@
 from .typedefs cimport hash_t, utf8_t, flag_t, id_t
+from .memory cimport Pool
 
 
 cdef struct LexemeC:
@@ -12,9 +13,8 @@ cdef struct LexemeC:
     flag_t flags
 
 
-cdef LexemeC* lexeme_init(unicode string, double prob, size_t cluster,
+cdef LexemeC* lexeme_init(Pool mem, unicode string, double prob, size_t cluster,
                           list views, set flags)
-cdef int lexeme_free(LexemeC* lexeme) except -1
 
 cdef bint lexeme_check_flag(LexemeC* lexeme, size_t flag_id)
 cdef unicode lexeme_string_view(LexemeC* lexeme, size_t view_id)
@@ -1,14 +1,14 @@
-from libc.stdlib cimport calloc, free
 from cpython.ref cimport Py_INCREF
+from .memory cimport Pool
 
 
-cdef LexemeC* lexeme_init(unicode string, double prob, size_t cluster,
+cdef LexemeC* lexeme_init(Pool mem, unicode string, double prob, size_t cluster,
                           list views, set flags):
-    cdef LexemeC* lexeme = <LexemeC*>calloc(1, sizeof(LexemeC))
+    cdef LexemeC* lexeme = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
     lexeme.cluster = cluster
     lexeme.prob = prob
     lexeme.string = intern_and_encode(string, &lexeme.length)
-    lexeme.views = <char**>calloc(len(views), sizeof(char*))
+    lexeme.views = <char**>mem.alloc(len(views), sizeof(char*))
     cdef size_t length = 0
     for i, string in enumerate(views):
         lexeme.views[i] = intern_and_encode(string, &length)
@@ -18,11 +18,6 @@ cdef LexemeC* lexeme_init(unicode string, double prob, size_t cluster,
     return lexeme
 
 
-cdef int lexeme_free(LexemeC* lexeme) except -1:
-    free(lexeme.views)
-    free(lexeme)
-
-
 cdef char* intern_and_encode(unicode string, size_t* length):
     cdef bytes byte_string = string.encode('utf8')
     cdef bytes utf8_string = intern(byte_string)
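With lexemes allocated from a `Pool`, the explicit `lexeme_free` removed above becomes unnecessary: the `LexemeC` struct and its `views` array are released when the pool that allocated them goes away. A small usage sketch against the new signature (illustrative only):

    from spacy.memory cimport Pool
    from spacy.lexeme cimport LexemeC, lexeme_init

    cdef Pool mem = Pool()
    cdef LexemeC* lex = lexeme_init(mem, u'hello', 0.0, 0, [u'hello'], set())
    # No lexeme_free() needed: when `mem` is garbage-collected, the LexemeC
    # block and its views array are freed along with every other allocation
    # the pool made.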
@@ -5,7 +5,6 @@ boldly assume no collisions.
 from __future__ import unicode_literals
 
 
-from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 
 
@@ -1,6 +1,4 @@
 # cython: profile=True
-from libc.stdlib cimport calloc, free, realloc
-
 from spacy.word cimport Lexeme
 from spacy.lexeme cimport lexeme_check_flag
 from spacy.lexeme cimport lexeme_string_view
@@ -1,9 +1,6 @@
 # cython: profile=True
 # cython: embedsignature=True
 
-from libc.stdlib cimport calloc, free, realloc
-
-from spacy.lexeme cimport lexeme_free, lexeme_init
 from spacy.lexeme cimport lexeme_check_flag, lexeme_string_view
 
 