From b34a1325d398b96a8ac870e6c76d7ef0207fc51d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 21 Dec 2014 05:42:23 +1100 Subject: [PATCH] * Everything compiling after reorg. About to start testing. --- spacy/syntax/_state.pxd | 26 +++++-- spacy/syntax/_state.pyx | 49 ++++++++++---- spacy/syntax/arc_eager.pxd | 10 ++- spacy/syntax/arc_eager.pyx | 135 ++++++++++++++++++++++--------------- 4 files changed, 143 insertions(+), 77 deletions(-) diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd index ab8ce3962..d54cd28d6 100644 --- a/spacy/syntax/_state.pxd +++ b/spacy/syntax/_state.pxd @@ -20,8 +20,7 @@ cdef int pop_stack(State *s) except -1 cdef int push_stack(State *s) except -1 -cdef inline bint has_head(const TokenC* t) nogil: - return t.head != 0 +cdef bint has_head(const TokenC* t) nogil cdef inline int get_idx(const State* s, const TokenC* t) nogil: @@ -71,14 +70,29 @@ cdef inline bint is_final(const State *s) nogil: return at_eol(s) # The stack will be attached to root anyway -cdef int children_in_buffer(const State *s, const int head, int* gold) except -1 -cdef int head_in_buffer(const State *s, const int child, int* gold) except -1 -cdef int children_in_stack(const State *s, const int head, int* gold) except -1 -cdef int head_in_stack(const State *s, const int child, int* gold) except -1 +cdef int children_in_buffer(const State *s, const int head, const int* gold) except -1 +cdef int head_in_buffer(const State *s, const int child, const int* gold) except -1 +cdef int children_in_stack(const State *s, const int head, const int* gold) except -1 +cdef int head_in_stack(const State *s, const int child, const int* gold) except -1 cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL +cdef int count_left_kids(const TokenC* head) nogil + + +cdef int count_right_kids(const TokenC* head) nogil + + +# From https://en.wikipedia.org/wiki/Hamming_weight +cdef inline uint32_t _popcount(uint32_t x) nogil: + """Find number of non-zero bits.""" + cdef int count = 0 + while x != 0: + x &= x - 1 + count += 1 + return count + cdef inline uint32_t _nth_significant_bit(uint32_t bits, int n) nogil: cdef int i diff --git a/spacy/syntax/_state.pyx b/spacy/syntax/_state.pyx index 6bdfdea3e..144b9b9b0 100644 --- a/spacy/syntax/_state.pyx +++ b/spacy/syntax/_state.pyx @@ -3,24 +3,32 @@ from libc.string cimport memmove from cymem.cymem cimport Pool from ..lexeme cimport EMPTY_LEXEME +from ..structs cimport TokenC + + +DEF PADDING = 5 +DEF NON_MONOTONIC = True cdef int add_dep(State *s, int head, int child, int label) except -1: - s.sent[child].head = head - child + cdef int dist = head - child + s.sent[child].head = dist s.sent[child].dep_tag = label # Keep a bit-vector tracking child dependencies. If a word has a child at # offset i from it, set that bit (tracking left and right separately) if child > head: - s.sent[head].r_kids |= 1 << (-s.sent[child].head) + s.sent[head].r_kids |= 1 << (-dist) else: - s.sent[head].l_kids |= 1 << s.sent[child].head + s.sent[head].l_kids |= 1 << dist cdef int pop_stack(State *s) except -1: assert s.stack_len >= 1 s.stack_len -= 1 s.stack -= 1 - + if s.stack_len == 0 and not at_eol(s): + push_stack(s) + cdef int push_stack(State *s) except -1: assert s.i < s.sent_len @@ -28,9 +36,14 @@ cdef int push_stack(State *s) except -1: s.stack[0] = s.i s.stack_len += 1 s.i += 1 + if at_eol(s): + while s.stack_len != 0: + if not has_head(get_s0(s)): + get_s0(s).dep_tag = 0 + pop_stack(s) -cdef int children_in_buffer(const State *s, int head, int* gold) except -1: +cdef int children_in_buffer(const State *s, int head, const int* gold) except -1: # Golds holds an array of head offsets --- the head of word i is i - golds[i] # Iterate over the tokens of the queue, and check whether their gold head is # our target @@ -42,20 +55,21 @@ cdef int children_in_buffer(const State *s, int head, int* gold) except -1: return n -cdef int head_in_buffer(const State *s, const int child, int* gold) except -1: +cdef int head_in_buffer(const State *s, const int child, const int* gold) except -1: return gold[child] >= s.i -cdef int children_in_stack(const State *s, const int head, int* gold) except -1: +cdef int children_in_stack(const State *s, const int head, const int* gold) except -1: cdef int i cdef int n = 0 for i in range(s.stack_len): if gold[s.stack[-i]] == head: - n += 1 + if NON_MONOTONIC or not has_head(get_s0(s)): + n += 1 return n -cdef int head_in_stack(const State *s, const int child, int* gold) except -1: +cdef int head_in_stack(const State *s, const int child, const int* gold) except -1: cdef int i for i in range(s.stack_len): if gold[child] == s.stack[-i]: @@ -72,7 +86,7 @@ cdef const TokenC* get_left(const State* s, const TokenC* head, const int idx) n if child >= s.sent: return child else: - return s.sent - 1 + return NULL cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx) nogil: @@ -84,10 +98,20 @@ cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx) if child < (s.sent + s.sent_len): return child else: - return s.sent - 1 + return NULL -DEF PADDING = 5 +cdef bint has_head(const TokenC* t) nogil: + return t.head != 0 + + +cdef int count_left_kids(const TokenC* head) nogil: + return _popcount(head.l_kids) + + +cdef int count_right_kids(const TokenC* head) nogil: + return _popcount(head.r_kids) + cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL: @@ -102,4 +126,5 @@ cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NUL s.stack_len = 0 s.i = 0 s.sent_len = sent_length + push_stack(s) return s diff --git a/spacy/syntax/arc_eager.pxd b/spacy/syntax/arc_eager.pxd index ee9d7b9a8..da8163e51 100644 --- a/spacy/syntax/arc_eager.pxd +++ b/spacy/syntax/arc_eager.pxd @@ -7,8 +7,11 @@ from ._state cimport State cdef struct Transition: + int clas int move int label + int cost + weight_t score cdef class TransitionSystem: @@ -18,7 +21,8 @@ cdef class TransitionSystem: cdef const Transition* _moves - cdef Transition best_valid(self, const weight_t* scores, const State* s) except -1 - cdef Transition best_gold(self, const weight_t* scores, const State* s, - int* gold_heads, int* gold_labels) except -1 + cdef Transition best_valid(self, const weight_t* scores, const State* s) except * + cdef Transition best_gold(self, Transition* guess, const weight_t* scores, + const State* s, + const int* gold_heads, const int* gold_labels) except * cdef int transition(self, State *s, const Transition* t) except -1 diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 2883aa403..25790bacd 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -5,7 +5,9 @@ from ._state cimport is_final, at_eol, pop_stack, push_stack, add_dep from ._state cimport head_in_buffer, children_in_buffer from ._state cimport head_in_stack, children_in_stack -from ..tokens cimport TokenC +from ..structs cimport TokenC + +DEF NON_MONOTONIC = True cdef enum: @@ -25,22 +27,30 @@ cdef inline bint _can_right(const State* s) nogil: cdef inline bint _can_left(const State* s) nogil: - return s.stack_len >= 1 and not has_head(get_s0(s)) + if NON_MONOTONIC: + return s.stack_len >= 1 + else: + return s.stack_len >= 1 and not has_head(get_s0(s)) cdef inline bint _can_reduce(const State* s) nogil: - return s.stack_len >= 2 and has_head(get_s0(s)) + if NON_MONOTONIC: + return s.stack_len >= 2 + else: + return s.stack_len >= 2 and has_head(get_s0(s)) -cdef int _shift_cost(const State* s, int* gold) except -1: +cdef int _shift_cost(const State* s, const int* gold) except -1: assert not at_eol(s) cost = 0 cost += head_in_stack(s, s.i, gold) cost += children_in_stack(s, s.i, gold) + if NON_MONOTONIC: + cost += gold[s.stack[0]] == s.i return cost -cdef int _right_cost(const State* s, int* gold) except -1: +cdef int _right_cost(const State* s, const int* gold) except -1: assert s.stack_len >= 1 cost = 0 if gold[s.i] == s.stack[0]: @@ -48,10 +58,12 @@ cdef int _right_cost(const State* s, int* gold) except -1: cost += head_in_buffer(s, s.i, gold) cost += children_in_stack(s, s.i, gold) cost += head_in_stack(s, s.i, gold) + if NON_MONOTONIC: + cost += gold[s.stack[0]] == s.i return cost -cdef int _left_cost(const State* s, int* gold) except -1: +cdef int _left_cost(const State* s, const int* gold) except -1: assert s.stack_len >= 1 cost = 0 if gold[s.stack[0]] == s.i: @@ -59,11 +71,17 @@ cdef int _left_cost(const State* s, int* gold) except -1: cost += head_in_buffer(s, s.stack[0], gold) cost += children_in_buffer(s, s.stack[0], gold) + if NON_MONOTONIC and s.stack_len >= 2: + cost += gold[s.stack[0]] == s.stack[-1] return cost -cdef int _reduce_cost(const State* s, int* gold) except -1: - return children_in_buffer(s, s.stack[0], gold) +cdef int _reduce_cost(const State* s, const int* gold) except -1: + cdef int cost = 0 + cost += children_in_buffer(s, s.stack[0], gold) + if NON_MONOTONIC: + cost += head_in_buffer(s, s.stack[0], gold) + return cost cdef class TransitionSystem: @@ -73,38 +91,40 @@ cdef class TransitionSystem: right_labels.sort() if 'ROOT' in right_labels: right_labels.pop(right_labels.index('ROOT')) - if 'dep' in right_labels: - right_labels.pop(right_labels.index('dep')) if 'ROOT' in left_labels: left_labels.pop(left_labels.index('ROOT')) - if 'dep' in left_labels: - left_labels.pop(left_labels.index('dep')) self.n_moves = 2 + len(left_labels) + len(right_labels) moves = self.mem.alloc(self.n_moves, sizeof(Transition)) cdef int i = 0 moves[i].move = SHIFT moves[i].label = 0 + moves[i].clas = i i += 1 moves[i].move = REDUCE moves[i].label = 0 + moves[i].clas = i i += 1 - self.label_ids = {'ROOT': 0, 'dep': -1} + self.label_ids = {'ROOT': 0} cdef int label_id for label_str in left_labels: label_id = self.label_ids.setdefault(label_str, len(self.label_ids)) moves[i].move = LEFT moves[i].label = label_id + moves[i].clas = i i += 1 for label_str in right_labels: label_id = self.label_ids.setdefault(label_str, len(self.label_ids)) moves[i].move = RIGHT moves[i].label = label_id + moves[i].clas = i i += 1 self._moves = moves - cdef int transition(self, State *s, const int clas) except -1: - cdef const Transition* t = &self._moves[clas] + cdef int transition(self, State *s, const Transition* t) except -1: if t.move == SHIFT: + # Set the dep label, in case we need it after we reduce + if NON_MONOTONIC: + get_s0(s).dep_tag = t.label push_stack(s) elif t.move == LEFT: add_dep(s, s.i, s.stack[0], t.label) @@ -113,11 +133,12 @@ cdef class TransitionSystem: add_dep(s, s.stack[0], s.i, t.label) push_stack(s) elif t.move == REDUCE: + add_dep(s, s.stack[-1], s.stack[0], get_s0(s).dep_tag) pop_stack(s) else: raise StandardError(t.move) - cdef int best_valid(self, const weight_t* scores, const State* s) except -1: + cdef Transition best_valid(self, const weight_t* scores, const State* s) except *: cdef bint[N_MOVES] valid valid[SHIFT] = _can_shift(s) valid[LEFT] = _can_left(s) @@ -126,59 +147,61 @@ cdef class TransitionSystem: cdef int best = -1 cdef weight_t score = 0 + cdef weight_t best_r_score = -9000 + cdef int best_r_label = -1 cdef int i for i in range(self.n_moves): if valid[self._moves[i].move] and (best == -1 or scores[i] > score): best = i score = scores[i] + if self._moves[i].move == RIGHT and scores[i] > best_r_score: + best_r_label = self._moves[i].label assert best >= 0 - return best + cdef Transition t = self._moves[best] + t.score = score + if t.move == SHIFT: + t.label = best_r_label + return t - cdef int best_gold(self, const weight_t* scores, const State* s, - int* gold_heads, int* gold_labels) except -1: + cdef Transition best_gold(self, Transition* guess, const weight_t* scores, + const State* s, + const int* gold_heads, const int* gold_labels) except *: + # If we can create a gold dependency, only one action can be correct cdef int[N_MOVES] unl_costs unl_costs[SHIFT] = _shift_cost(s, gold_heads) if _can_shift(s) else -1 unl_costs[LEFT] = _left_cost(s, gold_heads) if _can_left(s) else -1 unl_costs[RIGHT] = _right_cost(s, gold_heads) if _can_right(s) else -1 unl_costs[REDUCE] = _reduce_cost(s, gold_heads) if _can_reduce(s) else -1 - cdef int cost - cdef int move - cdef int label + guess.cost = unl_costs[guess.move] + cdef Transition t + cdef int target_label + cdef int i + if gold_heads[s.stack[0]] == s.i: + target_label = gold_labels[s.stack[0]] + if guess.move == LEFT: + guess.cost += guess.label != target_label + for i in range(self.n_moves): + t = self._moves[i] + if t.move == LEFT and t.label == target_label: + return t + elif gold_heads[s.i] == s.stack[0]: + target_label = gold_labels[s.i] + if guess.move == RIGHT: + guess.cost += guess.label != target_label + for i in range(self.n_moves): + t = self._moves[i] + if t.move == RIGHT and t.label == target_label: + return t + cdef int best = -1 cdef weight_t score = -9000 - cdef int i for i in range(self.n_moves): - move = self._moves[i].move - label = self._moves[i].label - if unl_costs[move] == 0: - if move == SHIFT or move == REDUCE: - cost = 0 - elif move == LEFT: - if gold_heads[s.stack[0]] == s.i and gold_labels[s.stack[0]] != -1: - cost = label != gold_labels[s.stack[0]] - else: - cost = 0 - elif move == RIGHT: - if gold_heads[s.i] == s.stack[0] and gold_labels[s.i] != -1: - cost = label != gold_labels[s.i] - else: - cost = 0 - else: - raise StandardError("Unknown Move") - if cost == 0 and (best == -1 or scores[i] > score): - best = i - score = scores[i] - - if best < 0: - print unl_costs[SHIFT], unl_costs[REDUCE], unl_costs[LEFT], unl_costs[RIGHT] - print s.stack_len - print has_head(get_s0(s)) - print s.sent[s.stack[0]].head - print s.stack[0], s.i - print gold_heads[s.stack[0]], gold_heads[s.i] - print gold_labels[s.i] - print children_in_buffer(s, s.stack[0], gold_heads) - print head_in_buffer(s, s.stack[0], gold_heads) - raise StandardError - return best + t = self._moves[i] + if unl_costs[t.move] == 0 and (best == -1 or scores[i] > score): + best = i + score = scores[i] + t = self._moves[best] + t.score = score + assert best >= 0 + return t