diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 78c26d22a..5c5ab5356 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -52,13 +52,12 @@ cdef inline bint _can_break_shift(const State* s) nogil: cdef int i if not USE_BREAK: return False - elif not _can_shift(s): + elif at_eol(s): return False else: # P. 757 # In UPP, if Shift(F) or RightArc(F) fail to result in a single parsing # tree, they cannot be performed as well. - seen_headless = False for i in range(s.stack_len): if s.sent[s.stack[i]].head == 0: return False @@ -92,12 +91,18 @@ cdef int _shift_cost(const State* s, const int* gold) except -1: cost += children_in_stack(s, s.i, gold) if NON_MONOTONIC: cost += gold[s.stack[0]] == s.i + # If we can break, and there's no cost to doing so, we should + if _can_break_shift(s) and _break_shift_cost(s, gold) == 0: + cost += 1 return cost cdef int _right_cost(const State* s, const int* gold) except -1: assert s.stack_len >= 1 cost = 0 + # If we can break, and there's no cost to doing so, we should + if _can_break_right(s) and _break_right_cost(s, gold) == 0: + cost += 1 if gold[s.i] == s.stack[0]: return cost cost += head_in_buffer(s, s.i, gold) @@ -130,24 +135,48 @@ cdef int _reduce_cost(const State* s, const int* gold) except -1: cdef int _break_shift_cost(const State* s, const int* gold) except -1: - cdef int cost = _shift_cost(s, gold) - # When we break, we Reduce all of the words on the stack. So, the Break - # cost is the sum of the Reduce costs - for i in range(s.stack_len): - cost += children_in_buffer(s, s.stack[i], gold) - if NON_MONOTONIC: - cost += head_in_buffer(s, s.stack[i], gold) + # When we break, we Reduce all of the words on the stack. We also remove + # the first word from the buffer. + # + # n0_cost: + cdef int cost = 0 + # number of head/child deps between n0 and N1...Nn + cost += children_in_buffer(s, s.i, gold) + cost += head_in_buffer(s, s.i, gold) + # Don't count self-deps + if gold[s.i] == s.i: + cost -= 2 + # number of child deps from N0 into stack + cost += children_in_stack(s, s.i, gold) + # number of head deps to N0 from stack + cost += head_in_stack(s, s.i, gold) + # Number of deps between S0...Sn and N1...Nn + for i in range(s.i+1, s.sent_len): + cost += children_in_stack(s, i, gold) + cost += head_in_stack(s, i, gold) return cost cdef int _break_right_cost(const State* s, const int* gold) except -1: - cdef int cost = _right_cost(s, gold) - # When we break, we Reduce all of the words on the stack. So, the Break - # cost is the sum of the Reduce costs - for i in range(s.stack_len): - cost += children_in_buffer(s, s.stack[i], gold) - if NON_MONOTONIC: - cost += head_in_buffer(s, s.stack[i], gold) + cdef int cost = 0 + assert s.stack_len >= 1 + cdef int i + # When we break, we Reduce all of the words on the stack. We also remove + # the first word from the buffer. + # + # n0_cost: + # number of head/child deps between n0 and N0...Nn + cost += children_in_buffer(s, s.i, gold) + cost += head_in_buffer(s, s.i, gold) + # number of child deps from N0 into stack + cost += children_in_stack(s, s.i, gold) + # number of head deps to N0 from S1..Sn + for i in range(1, s.stack_len): + cost += s.stack[-i] == gold[s.i] + # Number of deps between S0...Sn and N1...Nn + for i in range(s.i+1, s.sent_len): + cost += children_in_stack(s, i, gold) + cost += head_in_stack(s, i, gold) return cost @@ -213,14 +242,14 @@ cdef class TransitionSystem: add_dep(s, s.stack[0], s.i, t.label) push_stack(s) elif t.move == REDUCE: + # TODO: Huh? Is this some weirdness from the non-monotonic? add_dep(s, s.stack[-1], s.stack[0], get_s0(s).dep) pop_stack(s) elif t.move == BREAK_RIGHT: add_dep(s, s.stack[0], s.i, t.label) push_stack(s) while s.stack_len != 0: - if not has_head(get_s0(s)): - get_s0(s).dep = 0 + #add_dep(s, s.stack[-1], s.stack[0], get_s0(s).dep) s.stack -= 1 s.stack_len -= 1 if not at_eol(s): @@ -228,8 +257,9 @@ cdef class TransitionSystem: elif t.move == BREAK_SHIFT: push_stack(s) get_s0(s).dep = 0 - s.stack -= s.stack_len - s.stack_len = 0 + while s.stack_len != 0: + s.stack -= 1 + s.stack_len -= 1 if not at_eol(s): push_stack(s) else: @@ -289,10 +319,11 @@ cdef class TransitionSystem: elif gold_heads[s.i] == s.stack[0]: target_label = gold_labels[s.i] if guess.move == RIGHT or guess.move == BREAK_RIGHT: - guess.cost += guess.label != target_label + if unl_costs[guess.move] != 0: + guess.cost += guess.label != target_label for i in range(self.n_moves): t = self._moves[i] - if (t.move == RIGHT or t.move == BREAK_RIGHT) and t.label == target_label: + if t.label == target_label and unl_costs[t.move] == 0: return t cdef int best = -1 diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index c93bc4cf2..61324f69c 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -41,11 +41,12 @@ def set_debug(val): cdef unicode print_state(State* s, list words): words = list(words) + ['EOL'] - top = words[s.stack[0]] - second = words[s.stack[-1]] + top = words[s.stack[0]] + '_%d' % s.sent[s.stack[0]].head + second = words[s.stack[-1]] + '_%d' % s.sent[s.stack[-1]].head + third = words[s.stack[-2]] + '_%d' % s.sent[s.stack[-2]].head n0 = words[s.i] n1 = words[s.i + 1] - return ' '.join((second, top, '|', n0, n1)) + return ' '.join((str(s.stack_len), third, second, top, '|', n0, n1)) def get_templates(name): @@ -86,7 +87,8 @@ cdef class GreedyParser: tokens.is_parsed = True return 0 - def train_sent(self, Tokens tokens, list gold_heads, list gold_labels): + def train_sent(self, Tokens tokens, list gold_heads, list gold_labels, + force_gold=False): cdef: const Feature* feats const weight_t* scores @@ -104,15 +106,30 @@ cdef class GreedyParser: labels_array[i] = self.moves.label_ids[gold_labels[i]] py_words = [t.orth_ for t in tokens] + py_moves = ['S', 'D', 'L', 'R', 'BS', 'BR'] + history = [] + #print py_words cdef State* state = init_state(mem, tokens.data, tokens.length) while not is_final(state): fill_context(context, state) scores = self.model.score(context) guess = self.moves.best_valid(scores, state) best = self.moves.best_gold(&guess, scores, state, heads_array, labels_array) + history.append((py_moves[best.move], print_state(state, py_words))) self.model.update(context, guess.clas, best.clas, guess.cost) - self.moves.transition(state, &guess) + if force_gold: + self.moves.transition(state, &best) + else: + self.moves.transition(state, &guess) cdef int n_corr = 0 for i in range(tokens.length): n_corr += (i + state.sent[i].head) == gold_heads[i] + if force_gold and n_corr != tokens.length: + print py_words + print gold_heads + for move, state_str in history: + print move, state_str + for i in range(tokens.length): + print py_words[i], py_words[i + state.sent[i].head], py_words[gold_heads[i]] + raise Exception return n_corr