mirror of
https://github.com/explosion/spaCy.git
synced 2025-05-01 22:33:40 +03:00
prevent zero-length mem alloc (#4429)
* raise specific error when removing a matcher rule that doesn't exist
* rephrasing
* goldparse init: allocate fields only if doc is not empty
* avoid zero length alloc in saving tokenizer cache
* avoid allocating zero length mem in matcher
* asserts to avoid allocating zero length mem
* fix zero-length allocation in matcher
* bump cymem version
* revert cymem version bump
This commit is contained in:
parent
3dfc764577
commit
48886afc78
|
@ -546,7 +546,7 @@ cdef class GoldParse:
|
||||||
def __init__(self, doc, annot_tuples=None, words=None, tags=None, morphology=None,
|
def __init__(self, doc, annot_tuples=None, words=None, tags=None, morphology=None,
|
||||||
heads=None, deps=None, entities=None, make_projective=False,
|
heads=None, deps=None, entities=None, make_projective=False,
|
||||||
cats=None, links=None, **_):
|
cats=None, links=None, **_):
|
||||||
"""Create a GoldParse.
|
"""Create a GoldParse. The fields will not be initialized if len(doc) is zero.
|
||||||
|
|
||||||
doc (Doc): The document the annotations refer to.
|
doc (Doc): The document the annotations refer to.
|
||||||
words (iterable): A sequence of unicode word strings.
|
words (iterable): A sequence of unicode word strings.
|
||||||
|
@ -575,6 +575,15 @@ cdef class GoldParse:
|
||||||
negative examples respectively.
|
negative examples respectively.
|
||||||
RETURNS (GoldParse): The newly constructed object.
|
RETURNS (GoldParse): The newly constructed object.
|
||||||
"""
|
"""
|
||||||
|
self.mem = Pool()
|
||||||
|
self.loss = 0
|
||||||
|
self.length = len(doc)
|
||||||
|
|
||||||
|
self.cats = {} if cats is None else dict(cats)
|
||||||
|
self.links = links
|
||||||
|
|
||||||
|
# avoid allocating memory if the doc does not contain any tokens
|
||||||
|
if self.length > 0:
|
||||||
if words is None:
|
if words is None:
|
||||||
words = [token.text for token in doc]
|
words = [token.text for token in doc]
|
||||||
if tags is None:
|
if tags is None:
|
||||||
|
@ -596,9 +605,6 @@ cdef class GoldParse:
|
||||||
if not isinstance(entities[0], basestring):
|
if not isinstance(entities[0], basestring):
|
||||||
# Assume we have entities specified by character offset.
|
# Assume we have entities specified by character offset.
|
||||||
entities = biluo_tags_from_offsets(doc, entities)
|
entities = biluo_tags_from_offsets(doc, entities)
|
||||||
self.mem = Pool()
|
|
||||||
self.loss = 0
|
|
||||||
self.length = len(doc)
|
|
||||||
|
|
||||||
# These are filled by the tagger/parser/entity recogniser
|
# These are filled by the tagger/parser/entity recogniser
|
||||||
self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
|
self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
|
||||||
|
@ -608,8 +614,6 @@ cdef class GoldParse:
|
||||||
self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
|
self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
|
||||||
self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
|
self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
|
||||||
|
|
||||||
self.cats = {} if cats is None else dict(cats)
|
|
||||||
self.links = links
|
|
||||||
self.words = [None] * len(doc)
|
self.words = [None] * len(doc)
|
||||||
self.tags = [None] * len(doc)
|
self.tags = [None] * len(doc)
|
||||||
self.heads = [None] * len(doc)
|
self.heads = [None] * len(doc)
|
||||||
|
|
|
@ -254,6 +254,11 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None,
|
||||||
cdef PatternStateC state
|
cdef PatternStateC state
|
||||||
cdef int i, j, nr_extra_attr
|
cdef int i, j, nr_extra_attr
|
||||||
cdef Pool mem = Pool()
|
cdef Pool mem = Pool()
|
||||||
|
output = []
|
||||||
|
if doc.length == 0:
|
||||||
|
# avoid any processing or mem alloc if the document is empty
|
||||||
|
return output
|
||||||
|
if len(predicates) > 0:
|
||||||
predicate_cache = <char*>mem.alloc(doc.length * len(predicates), sizeof(char))
|
predicate_cache = <char*>mem.alloc(doc.length * len(predicates), sizeof(char))
|
||||||
if extensions is not None and len(extensions) >= 1:
|
if extensions is not None and len(extensions) >= 1:
|
||||||
nr_extra_attr = max(extensions.values()) + 1
|
nr_extra_attr = max(extensions.values()) + 1
|
||||||
|
@ -278,7 +283,6 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None,
|
||||||
predicate_cache += len(predicates)
|
predicate_cache += len(predicates)
|
||||||
# Handle matches that end in 0-width patterns
|
# Handle matches that end in 0-width patterns
|
||||||
finish_states(matches, states)
|
finish_states(matches, states)
|
||||||
output = []
|
|
||||||
seen = set()
|
seen = set()
|
||||||
for i in range(matches.size()):
|
for i in range(matches.size()):
|
||||||
match = (
|
match = (
|
||||||
|
@ -560,11 +564,13 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs)
|
||||||
for j, (attr, value) in enumerate(spec):
|
for j, (attr, value) in enumerate(spec):
|
||||||
pattern[i].attrs[j].attr = attr
|
pattern[i].attrs[j].attr = attr
|
||||||
pattern[i].attrs[j].value = value
|
pattern[i].attrs[j].value = value
|
||||||
|
if len(extensions) > 0:
|
||||||
pattern[i].extra_attrs = <IndexValueC*>mem.alloc(len(extensions), sizeof(IndexValueC))
|
pattern[i].extra_attrs = <IndexValueC*>mem.alloc(len(extensions), sizeof(IndexValueC))
|
||||||
for j, (index, value) in enumerate(extensions):
|
for j, (index, value) in enumerate(extensions):
|
||||||
pattern[i].extra_attrs[j].index = index
|
pattern[i].extra_attrs[j].index = index
|
||||||
pattern[i].extra_attrs[j].value = value
|
pattern[i].extra_attrs[j].value = value
|
||||||
pattern[i].nr_extra_attr = len(extensions)
|
pattern[i].nr_extra_attr = len(extensions)
|
||||||
|
if len(predicates) > 0:
|
||||||
pattern[i].py_predicates = <int32_t*>mem.alloc(len(predicates), sizeof(int32_t))
|
pattern[i].py_predicates = <int32_t*>mem.alloc(len(predicates), sizeof(int32_t))
|
||||||
for j, index in enumerate(predicates):
|
for j, index in enumerate(predicates):
|
||||||
pattern[i].py_predicates[j] = index
|
pattern[i].py_predicates[j] = index
|
||||||
|
|
|
@ -364,6 +364,9 @@ cdef class Parser:
|
||||||
|
|
||||||
cdef void c_transition_batch(self, StateC** states, const float* scores,
|
cdef void c_transition_batch(self, StateC** states, const float* scores,
|
||||||
int nr_class, int batch_size) nogil:
|
int nr_class, int batch_size) nogil:
|
||||||
|
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
|
||||||
|
with gil:
|
||||||
|
assert self.moves.n_moves > 0
|
||||||
is_valid = <int*>calloc(self.moves.n_moves, sizeof(int))
|
is_valid = <int*>calloc(self.moves.n_moves, sizeof(int))
|
||||||
cdef int i, guess
|
cdef int i, guess
|
||||||
cdef Transition action
|
cdef Transition action
|
||||||
|
@ -547,6 +550,10 @@ cdef class Parser:
|
||||||
cdef GoldParse gold
|
cdef GoldParse gold
|
||||||
cdef Pool mem = Pool()
|
cdef Pool mem = Pool()
|
||||||
cdef int i
|
cdef int i
|
||||||
|
|
||||||
|
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
|
||||||
|
assert self.moves.n_moves > 0
|
||||||
|
|
||||||
is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
|
is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
|
||||||
costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
|
costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
|
||||||
cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
|
cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
|
||||||
|
|
|
@ -83,6 +83,8 @@ cdef class TransitionSystem:
|
||||||
|
|
||||||
def get_oracle_sequence(self, doc, GoldParse gold):
|
def get_oracle_sequence(self, doc, GoldParse gold):
|
||||||
cdef Pool mem = Pool()
|
cdef Pool mem = Pool()
|
||||||
|
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
|
||||||
|
assert self.n_moves > 0
|
||||||
costs = <float*>mem.alloc(self.n_moves, sizeof(float))
|
costs = <float*>mem.alloc(self.n_moves, sizeof(float))
|
||||||
is_valid = <int*>mem.alloc(self.n_moves, sizeof(int))
|
is_valid = <int*>mem.alloc(self.n_moves, sizeof(int))
|
||||||
|
|
||||||
|
|
|
@ -331,6 +331,9 @@ cdef class Tokenizer:
|
||||||
cdef int _save_cached(self, const TokenC* tokens, hash_t key,
|
cdef int _save_cached(self, const TokenC* tokens, hash_t key,
|
||||||
int has_special, int n) except -1:
|
int has_special, int n) except -1:
|
||||||
cdef int i
|
cdef int i
|
||||||
|
if n <= 0:
|
||||||
|
# avoid mem alloc of zero length
|
||||||
|
return 0
|
||||||
for i in range(n):
|
for i in range(n):
|
||||||
if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
|
if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
|
||||||
return 0
|
return 0
|
||||||
|
|
|
@ -157,6 +157,9 @@ def _merge(Doc doc, merges):
|
||||||
cdef TokenC* token
|
cdef TokenC* token
|
||||||
cdef Pool mem = Pool()
|
cdef Pool mem = Pool()
|
||||||
cdef int merged_iob = 0
|
cdef int merged_iob = 0
|
||||||
|
|
||||||
|
# merges should not be empty, but make sure to avoid zero-length mem alloc
|
||||||
|
assert len(merges) > 0
|
||||||
tokens = <TokenC**>mem.alloc(len(merges), sizeof(TokenC))
|
tokens = <TokenC**>mem.alloc(len(merges), sizeof(TokenC))
|
||||||
spans = []
|
spans = []
|
||||||
|
|
||||||
|
|
|
@ -791,6 +791,8 @@ cdef class Doc:
|
||||||
# Get set up for fast loading
|
# Get set up for fast loading
|
||||||
cdef Pool mem = Pool()
|
cdef Pool mem = Pool()
|
||||||
cdef int n_attrs = len(attrs)
|
cdef int n_attrs = len(attrs)
|
||||||
|
# attrs should not be empty, but make sure to avoid zero-length mem alloc
|
||||||
|
assert n_attrs > 0
|
||||||
attr_ids = <attr_id_t*>mem.alloc(n_attrs, sizeof(attr_id_t))
|
attr_ids = <attr_id_t*>mem.alloc(n_attrs, sizeof(attr_id_t))
|
||||||
for i, attr_id in enumerate(attrs):
|
for i, attr_id in enumerate(attrs):
|
||||||
attr_ids[i] = attr_id
|
attr_ids[i] = attr_id
|
||||||
|
|
Loading…
Reference in New Issue
Block a user