From c49e44349aec773bc1b0aafea08e4953d71065b1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 8 May 2018 02:53:24 +0200 Subject: [PATCH] Fix beam parsing --- spacy/syntax/_beam_utils.pyx | 8 +++- spacy/syntax/_parser_model.pyx | 7 ++-- spacy/syntax/nn_parser.pyx | 74 ++++++++++++++++++++++++---------- 3 files changed, 62 insertions(+), 27 deletions(-) diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx index 20f6bdc5e..b8ca338ee 100644 --- a/spacy/syntax/_beam_utils.pyx +++ b/spacy/syntax/_beam_utils.pyx @@ -59,7 +59,7 @@ cdef class ParserBeam(object): cdef public object dones def __init__(self, TransitionSystem moves, states, golds, - int width, float density): + int width, float density=0.): self.moves = moves self.states = states self.golds = golds @@ -133,8 +133,12 @@ cdef class ParserBeam(object): self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold) if follow_gold: + min_cost = 0 for j in range(beam.nr_class): - if beam.costs[i][j] >= 1: + if beam.costs[i][j] < min_cost: + min_cost = beam.costs[i][j] + for j in range(beam.nr_class): + if beam.costs[i][j] > min_cost: beam.is_valid[i][j] = 0 diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx index 5a7d3609b..962461417 100644 --- a/spacy/syntax/_parser_model.pyx +++ b/spacy/syntax/_parser_model.pyx @@ -266,13 +266,14 @@ class ParserStepModel(Model): def get_token_ids(self, batch): states = _beam_utils.collect_states(batch) + cdef StateClass state + states = [state for state in states if not state.is_final()] cdef np.ndarray ids = numpy.zeros((len(states), self.state2vec.nF), dtype='i', order='C') + ids.fill(-1) c_ids = ids.data - cdef StateClass state for state in states: - if not state.c.is_final(): - state.c.set_context_tokens(c_ids, ids.shape[1]) + state.c.set_context_tokens(c_ids, ids.shape[1]) c_ids += ids.shape[1] return ids diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index d7f64d937..9fad36f46 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -208,31 +208,62 @@ cdef class Parser: for doc in batch_in_order: yield doc - def predict(self, docs, beam_width=1): + def predict(self, docs, beam_width=1, drop=0.): if isinstance(docs, Doc): docs = [docs] + if beam_width < 2: + return self.greedy_parse(docs, drop=drop) + else: + return self.beam_parse(docs, beam_width=beam_width, drop=drop) + def greedy_parse(self, docs, drop=0.): cdef vector[StateC*] states cdef StateClass state model = self.model(docs) - if beam_width == 1: - batch = self.moves.init_batch(docs) - weights = get_c_weights(model) - for state in batch: - if not state.is_final(): - states.push_back(state.c) - sizes = get_c_sizes(model, states.size()) - with nogil: - self._parseC(&states[0], - weights, sizes) - else: - batch = self.moves.init_beams(docs, beam_width) - unfinished = list(batch) - while unfinished: - scores = model.predict(unfinished) - unfinished = self.transition_beams(batch, scores) + batch = self.moves.init_batch(docs) + weights = get_c_weights(model) + for state in batch: + if not state.is_final(): + states.push_back(state.c) + sizes = get_c_sizes(model, states.size()) + with nogil: + self._parseC(&states[0], + weights, sizes) return batch + def beam_parse(self, docs, int beam_width=3, float drop=0.): + cdef Beam beam + cdef Doc doc + cdef np.ndarray token_ids + model = self.model(docs) + beams = self.moves.init_beams(docs, beam_width) + token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature), + dtype='i', order='C') + cdef int* c_ids + cdef int nr_feature = self.nr_feature + cdef int n_states + model = self.model(docs) + todo = [beam for beam in beams if not beam.is_done] + while todo: + token_ids.fill(-1) + c_ids = token_ids.data + n_states = 0 + for beam in todo: + for i in range(beam.size): + state = beam.at(i) + # This way we avoid having to score finalized states + # We do have to take care to keep indexes aligned, though + if not state.is_final(): + state.set_context_tokens(c_ids, nr_feature) + c_ids += nr_feature + n_states += 1 + if n_states == 0: + break + vectors = model.state2vec(token_ids[:n_states]) + scores = model.vec2scores(vectors) + todo = self.transition_beams(todo, scores) + return beams + cdef void _parseC(self, StateC** states, WeightsC weights, SizesC sizes) nogil: cdef int i, j @@ -325,7 +356,7 @@ cdef class Parser: beam_update_prob = 1-self.cfg.get('beam_update_prob', 0.5) if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= beam_update_prob: return self.update_beam(docs, golds, - self.cfg['beam_width'], self.cfg['beam_density'], + self.cfg['beam_width'], drop=drop, sgd=sgd, losses=losses) # Chop sequences into lengths of this many transitions, to make the # batch uniform length. @@ -352,12 +383,11 @@ cdef class Parser: def update_beam(self, docs, golds, width, drop=0., sgd=None, losses=None): lengths = [len(d) for d in docs] - states = self.moves.init_batch(docs) - for gold in golds: - self.moves.preprocess_gold(gold) + cut_gold = numpy.random.choice(range(20, 100)) + states, golds, max_steps = self._init_gold_batch(docs, golds, max_length=cut_gold) model, finish_update = self.model.begin_update(docs, drop=drop) states_d_scores, backprops, beams = _beam_utils.update_beam( - self.moves, self.nr_feature, 500, states, golds, model.state2vec, + self.moves, self.nr_feature, max_steps, states, golds, model.state2vec, model.vec2scores, width, drop=drop, losses=losses) for i, d_scores in enumerate(states_d_scores): ids, bp_vectors, bp_scores = backprops[i]