prevent zero-length mem alloc (#4429)

* raise specific error when removing a matcher rule that doesn't exist

* rephrasing

* goldparse init: allocate fields only if doc is not empty

* avoid zero length alloc in saving tokenizer cache

* avoid allocating zero length mem in matcher

* asserts to avoid allocating zero length mem

* fix zero-length allocation in matcher

* bump cymem version

* revert cymem version bump
This commit is contained in:
Sofie Van Landeghem 2019-10-22 16:54:33 +02:00 committed by Matthew Honnibal
parent 3dfc764577
commit 48886afc78
7 changed files with 152 additions and 125 deletions

View File

@ -546,7 +546,7 @@ cdef class GoldParse:
def __init__(self, doc, annot_tuples=None, words=None, tags=None, morphology=None, def __init__(self, doc, annot_tuples=None, words=None, tags=None, morphology=None,
heads=None, deps=None, entities=None, make_projective=False, heads=None, deps=None, entities=None, make_projective=False,
cats=None, links=None, **_): cats=None, links=None, **_):
"""Create a GoldParse. """Create a GoldParse. The fields will not be initialized if len(doc) is zero.
doc (Doc): The document the annotations refer to. doc (Doc): The document the annotations refer to.
words (iterable): A sequence of unicode word strings. words (iterable): A sequence of unicode word strings.
@ -575,138 +575,142 @@ cdef class GoldParse:
negative examples respectively. negative examples respectively.
RETURNS (GoldParse): The newly constructed object. RETURNS (GoldParse): The newly constructed object.
""" """
if words is None:
words = [token.text for token in doc]
if tags is None:
tags = [None for _ in words]
if heads is None:
heads = [None for _ in words]
if deps is None:
deps = [None for _ in words]
if morphology is None:
morphology = [None for _ in words]
if entities is None:
entities = ["-" for _ in doc]
elif len(entities) == 0:
entities = ["O" for _ in doc]
else:
# Translate the None values to '-', to make processing easier.
# See Issue #2603
entities = [(ent if ent is not None else "-") for ent in entities]
if not isinstance(entities[0], basestring):
# Assume we have entities specified by character offset.
entities = biluo_tags_from_offsets(doc, entities)
self.mem = Pool() self.mem = Pool()
self.loss = 0 self.loss = 0
self.length = len(doc) self.length = len(doc)
# These are filled by the tagger/parser/entity recogniser
self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
self.cats = {} if cats is None else dict(cats) self.cats = {} if cats is None else dict(cats)
self.links = links self.links = links
self.words = [None] * len(doc)
self.tags = [None] * len(doc)
self.heads = [None] * len(doc)
self.labels = [None] * len(doc)
self.ner = [None] * len(doc)
self.morphology = [None] * len(doc)
# This needs to be done before we align the words # avoid allocating memory if the doc does not contain any tokens
if make_projective and heads is not None and deps is not None: if self.length > 0:
heads, deps = nonproj.projectivize(heads, deps) if words is None:
words = [token.text for token in doc]
# Do many-to-one alignment for misaligned tokens. if tags is None:
# If we over-segment, we'll have one gold word that covers a sequence tags = [None for _ in words]
# of predicted words if heads is None:
# If we under-segment, we'll have one predicted word that covers a heads = [None for _ in words]
# sequence of gold words. if deps is None:
# If we "mis-segment", we'll have a sequence of predicted words covering deps = [None for _ in words]
# a sequence of gold words. That's many-to-many -- we don't do that. if morphology is None:
cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words) morphology = [None for _ in words]
if entities is None:
self.cand_to_gold = [(j if j >= 0 else None) for j in i2j] entities = ["-" for _ in doc]
self.gold_to_cand = [(i if i >= 0 else None) for i in j2i] elif len(entities) == 0:
entities = ["O" for _ in doc]
annot_tuples = (range(len(words)), words, tags, heads, deps, entities)
self.orig_annot = list(zip(*annot_tuples))
for i, gold_i in enumerate(self.cand_to_gold):
if doc[i].text.isspace():
self.words[i] = doc[i].text
self.tags[i] = "_SP"
self.heads[i] = None
self.labels[i] = None
self.ner[i] = None
self.morphology[i] = set()
if gold_i is None:
if i in i2j_multi:
self.words[i] = words[i2j_multi[i]]
self.tags[i] = tags[i2j_multi[i]]
self.morphology[i] = morphology[i2j_multi[i]]
is_last = i2j_multi[i] != i2j_multi.get(i+1)
is_first = i2j_multi[i] != i2j_multi.get(i-1)
# Set next word in multi-token span as head, until last
if not is_last:
self.heads[i] = i+1
self.labels[i] = "subtok"
else:
self.heads[i] = self.gold_to_cand[heads[i2j_multi[i]]]
self.labels[i] = deps[i2j_multi[i]]
# Now set NER. This is awkward: if an entity word has been split
# into two or more tokens, we need to adjust the
# BILUO tags. We can't have BB or LL etc.
# Case 1: O -- easy.
ner_tag = entities[i2j_multi[i]]
if ner_tag == "O":
self.ner[i] = "O"
# Case 2: U. This has to become a B I* L sequence.
elif ner_tag.startswith("U-"):
if is_first:
self.ner[i] = ner_tag.replace("U-", "B-", 1)
elif is_last:
self.ner[i] = ner_tag.replace("U-", "L-", 1)
else:
self.ner[i] = ner_tag.replace("U-", "I-", 1)
# Case 3: L. If not last, change to I.
elif ner_tag.startswith("L-"):
if is_last:
self.ner[i] = ner_tag
else:
self.ner[i] = ner_tag.replace("L-", "I-", 1)
# Case 4: I. Stays correct
elif ner_tag.startswith("I-"):
self.ner[i] = ner_tag
else: else:
self.words[i] = words[gold_i] # Translate the None values to '-', to make processing easier.
self.tags[i] = tags[gold_i] # See Issue #2603
self.morphology[i] = morphology[gold_i] entities = [(ent if ent is not None else "-") for ent in entities]
if heads[gold_i] is None: if not isinstance(entities[0], basestring):
# Assume we have entities specified by character offset.
entities = biluo_tags_from_offsets(doc, entities)
# These are filled by the tagger/parser/entity recogniser
self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
self.words = [None] * len(doc)
self.tags = [None] * len(doc)
self.heads = [None] * len(doc)
self.labels = [None] * len(doc)
self.ner = [None] * len(doc)
self.morphology = [None] * len(doc)
# This needs to be done before we align the words
if make_projective and heads is not None and deps is not None:
heads, deps = nonproj.projectivize(heads, deps)
# Do many-to-one alignment for misaligned tokens.
# If we over-segment, we'll have one gold word that covers a sequence
# of predicted words
# If we under-segment, we'll have one predicted word that covers a
# sequence of gold words.
# If we "mis-segment", we'll have a sequence of predicted words covering
# a sequence of gold words. That's many-to-many -- we don't do that.
cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)
self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
annot_tuples = (range(len(words)), words, tags, heads, deps, entities)
self.orig_annot = list(zip(*annot_tuples))
for i, gold_i in enumerate(self.cand_to_gold):
if doc[i].text.isspace():
self.words[i] = doc[i].text
self.tags[i] = "_SP"
self.heads[i] = None self.heads[i] = None
self.labels[i] = None
self.ner[i] = None
self.morphology[i] = set()
if gold_i is None:
if i in i2j_multi:
self.words[i] = words[i2j_multi[i]]
self.tags[i] = tags[i2j_multi[i]]
self.morphology[i] = morphology[i2j_multi[i]]
is_last = i2j_multi[i] != i2j_multi.get(i+1)
is_first = i2j_multi[i] != i2j_multi.get(i-1)
# Set next word in multi-token span as head, until last
if not is_last:
self.heads[i] = i+1
self.labels[i] = "subtok"
else:
self.heads[i] = self.gold_to_cand[heads[i2j_multi[i]]]
self.labels[i] = deps[i2j_multi[i]]
# Now set NER. This is awkward: if an entity word has been split
# into two or more tokens, we need to adjust the
# BILUO tags. We can't have BB or LL etc.
# Case 1: O -- easy.
ner_tag = entities[i2j_multi[i]]
if ner_tag == "O":
self.ner[i] = "O"
# Case 2: U. This has to become a B I* L sequence.
elif ner_tag.startswith("U-"):
if is_first:
self.ner[i] = ner_tag.replace("U-", "B-", 1)
elif is_last:
self.ner[i] = ner_tag.replace("U-", "L-", 1)
else:
self.ner[i] = ner_tag.replace("U-", "I-", 1)
# Case 3: L. If not last, change to I.
elif ner_tag.startswith("L-"):
if is_last:
self.ner[i] = ner_tag
else:
self.ner[i] = ner_tag.replace("L-", "I-", 1)
# Case 4: I. Stays correct
elif ner_tag.startswith("I-"):
self.ner[i] = ner_tag
else: else:
self.heads[i] = self.gold_to_cand[heads[gold_i]] self.words[i] = words[gold_i]
self.labels[i] = deps[gold_i] self.tags[i] = tags[gold_i]
self.ner[i] = entities[gold_i] self.morphology[i] = morphology[gold_i]
if heads[gold_i] is None:
self.heads[i] = None
else:
self.heads[i] = self.gold_to_cand[heads[gold_i]]
self.labels[i] = deps[gold_i]
self.ner[i] = entities[gold_i]
# Prevent whitespace that isn't within entities from being tagged as # Prevent whitespace that isn't within entities from being tagged as
# an entity. # an entity.
for i in range(len(self.ner)): for i in range(len(self.ner)):
if self.tags[i] == "_SP": if self.tags[i] == "_SP":
prev_ner = self.ner[i-1] if i >= 1 else None prev_ner = self.ner[i-1] if i >= 1 else None
next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None
if prev_ner == "O" or next_ner == "O": if prev_ner == "O" or next_ner == "O":
self.ner[i] = "O" self.ner[i] = "O"
cycle = nonproj.contains_cycle(self.heads) cycle = nonproj.contains_cycle(self.heads)
if cycle is not None: if cycle is not None:
raise ValueError(Errors.E069.format(cycle=cycle, raise ValueError(Errors.E069.format(cycle=cycle,
cycle_tokens=" ".join(["'{}'".format(self.words[tok_id]) for tok_id in cycle]), cycle_tokens=" ".join(["'{}'".format(self.words[tok_id]) for tok_id in cycle]),
doc_tokens=" ".join(words[:50]))) doc_tokens=" ".join(words[:50])))
def __len__(self): def __len__(self):
"""Get the number of gold-standard tokens. """Get the number of gold-standard tokens.

View File

@ -254,7 +254,12 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None,
cdef PatternStateC state cdef PatternStateC state
cdef int i, j, nr_extra_attr cdef int i, j, nr_extra_attr
cdef Pool mem = Pool() cdef Pool mem = Pool()
predicate_cache = <char*>mem.alloc(doc.length * len(predicates), sizeof(char)) output = []
if doc.length == 0:
# avoid any processing or mem alloc if the document is empty
return output
if len(predicates) > 0:
predicate_cache = <char*>mem.alloc(doc.length * len(predicates), sizeof(char))
if extensions is not None and len(extensions) >= 1: if extensions is not None and len(extensions) >= 1:
nr_extra_attr = max(extensions.values()) + 1 nr_extra_attr = max(extensions.values()) + 1
extra_attr_values = <attr_t*>mem.alloc(doc.length * nr_extra_attr, sizeof(attr_t)) extra_attr_values = <attr_t*>mem.alloc(doc.length * nr_extra_attr, sizeof(attr_t))
@ -278,7 +283,6 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None,
predicate_cache += len(predicates) predicate_cache += len(predicates)
# Handle matches that end in 0-width patterns # Handle matches that end in 0-width patterns
finish_states(matches, states) finish_states(matches, states)
output = []
seen = set() seen = set()
for i in range(matches.size()): for i in range(matches.size()):
match = ( match = (
@ -560,12 +564,14 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs)
for j, (attr, value) in enumerate(spec): for j, (attr, value) in enumerate(spec):
pattern[i].attrs[j].attr = attr pattern[i].attrs[j].attr = attr
pattern[i].attrs[j].value = value pattern[i].attrs[j].value = value
pattern[i].extra_attrs = <IndexValueC*>mem.alloc(len(extensions), sizeof(IndexValueC)) if len(extensions) > 0:
pattern[i].extra_attrs = <IndexValueC*>mem.alloc(len(extensions), sizeof(IndexValueC))
for j, (index, value) in enumerate(extensions): for j, (index, value) in enumerate(extensions):
pattern[i].extra_attrs[j].index = index pattern[i].extra_attrs[j].index = index
pattern[i].extra_attrs[j].value = value pattern[i].extra_attrs[j].value = value
pattern[i].nr_extra_attr = len(extensions) pattern[i].nr_extra_attr = len(extensions)
pattern[i].py_predicates = <int32_t*>mem.alloc(len(predicates), sizeof(int32_t)) if len(predicates) > 0:
pattern[i].py_predicates = <int32_t*>mem.alloc(len(predicates), sizeof(int32_t))
for j, index in enumerate(predicates): for j, index in enumerate(predicates):
pattern[i].py_predicates[j] = index pattern[i].py_predicates[j] = index
pattern[i].nr_py = len(predicates) pattern[i].nr_py = len(predicates)

View File

@ -364,6 +364,9 @@ cdef class Parser:
cdef void c_transition_batch(self, StateC** states, const float* scores, cdef void c_transition_batch(self, StateC** states, const float* scores,
int nr_class, int batch_size) nogil: int nr_class, int batch_size) nogil:
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
with gil:
assert self.moves.n_moves > 0
is_valid = <int*>calloc(self.moves.n_moves, sizeof(int)) is_valid = <int*>calloc(self.moves.n_moves, sizeof(int))
cdef int i, guess cdef int i, guess
cdef Transition action cdef Transition action
@ -547,6 +550,10 @@ cdef class Parser:
cdef GoldParse gold cdef GoldParse gold
cdef Pool mem = Pool() cdef Pool mem = Pool()
cdef int i cdef int i
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
assert self.moves.n_moves > 0
is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int)) is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float)) costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves), cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),

View File

@ -83,6 +83,8 @@ cdef class TransitionSystem:
def get_oracle_sequence(self, doc, GoldParse gold): def get_oracle_sequence(self, doc, GoldParse gold):
cdef Pool mem = Pool() cdef Pool mem = Pool()
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
assert self.n_moves > 0
costs = <float*>mem.alloc(self.n_moves, sizeof(float)) costs = <float*>mem.alloc(self.n_moves, sizeof(float))
is_valid = <int*>mem.alloc(self.n_moves, sizeof(int)) is_valid = <int*>mem.alloc(self.n_moves, sizeof(int))

View File

@ -331,6 +331,9 @@ cdef class Tokenizer:
cdef int _save_cached(self, const TokenC* tokens, hash_t key, cdef int _save_cached(self, const TokenC* tokens, hash_t key,
int has_special, int n) except -1: int has_special, int n) except -1:
cdef int i cdef int i
if n <= 0:
# avoid mem alloc of zero length
return 0
for i in range(n): for i in range(n):
if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL: if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
return 0 return 0

View File

@ -157,6 +157,9 @@ def _merge(Doc doc, merges):
cdef TokenC* token cdef TokenC* token
cdef Pool mem = Pool() cdef Pool mem = Pool()
cdef int merged_iob = 0 cdef int merged_iob = 0
# merges should not be empty, but make sure to avoid zero-length mem alloc
assert len(merges) > 0
tokens = <TokenC**>mem.alloc(len(merges), sizeof(TokenC)) tokens = <TokenC**>mem.alloc(len(merges), sizeof(TokenC))
spans = [] spans = []

View File

@ -791,6 +791,8 @@ cdef class Doc:
# Get set up for fast loading # Get set up for fast loading
cdef Pool mem = Pool() cdef Pool mem = Pool()
cdef int n_attrs = len(attrs) cdef int n_attrs = len(attrs)
# attrs should not be empty, but make sure to avoid zero-length mem alloc
assert n_attrs > 0
attr_ids = <attr_id_t*>mem.alloc(n_attrs, sizeof(attr_id_t)) attr_ids = <attr_id_t*>mem.alloc(n_attrs, sizeof(attr_id_t))
for i, attr_id in enumerate(attrs): for i, attr_id in enumerate(attrs):
attr_ids[i] = attr_id attr_ids[i] = attr_id