From 04b1cd9b8c44ef73d533900eed800193ac36bc51 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Jun 2015 04:20:23 +0200 Subject: [PATCH] * Greedy parsing working with new StateClass. Beam parsing broken --- spacy/syntax/arc_eager.pyx | 10 ++++- spacy/syntax/parser.pyx | 63 ++++++++++++++++++------------ spacy/syntax/stateclass.pxd | 9 +++++ spacy/syntax/stateclass.pyx | 5 +-- spacy/syntax/transition_system.pxd | 3 +- spacy/syntax/transition_system.pyx | 3 +- 6 files changed, 58 insertions(+), 35 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 546ea5281..99835e106 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -366,8 +366,8 @@ cdef class ArcEager(TransitionSystem): raise Exception(move) return t - cdef int initialize_state(self, State* state) except -1: - push_stack(state) + cdef int initialize_state(self, StateClass st) except -1: + st.push() cdef int finalize_state(self, StateClass st) except -1: cdef int root_label = self.strings['ROOT'] @@ -383,8 +383,11 @@ cdef class ArcEager(TransitionSystem): is_valid[RIGHT] = RightArc.is_valid(stcls, -1) is_valid[BREAK] = Break.is_valid(stcls, -1) cdef int i + n_valid = 0 for i in range(self.n_moves): output[i] = is_valid[self.c[i].move] + n_valid += output[i] + assert n_valid >= 1 cdef int set_costs(self, int* output, StateClass stcls, GoldParse gold) except -1: cdef int i, move, label @@ -409,6 +412,7 @@ cdef class ArcEager(TransitionSystem): cdef int* heads = gold.c.heads self.set_valid(self._is_valid, stcls) + n_gold = 0 for i in range(self.n_moves): if not self._is_valid[i]: output[i] = 9000 @@ -418,6 +422,8 @@ cdef class ArcEager(TransitionSystem): if move_costs[move] == -1: move_costs[move] = move_cost_funcs[move](stcls, &gold.c) output[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label) + n_gold += output[i] == 0 + assert n_gold >= 1 cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *: cdef bint[N_MOVES] is_valid diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 93fdff043..b860425cd 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -5,6 +5,9 @@ MALT-style dependency parser """ from __future__ import unicode_literals cimport cython + +from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF + from libc.stdint cimport uint32_t, uint64_t from libc.string cimport memset, memcpy import random @@ -42,7 +45,6 @@ from ._parse_features cimport CONTEXT_SIZE from ._parse_features cimport fill_context from .stateclass cimport StateClass -from cpython.ref cimport PyObject DEBUG = False def set_debug(val): @@ -108,9 +110,9 @@ cdef class Parser: while not beam.is_done: self._advance_beam(beam, None, False) state = beam.at(0) - #self.moves.finalize_state(state) - #tokens.set_parse(state.sent) - raise Exception + self.moves.finalize_state(state) + tokens.set_parse(state._sent) + _cleanup(beam) def _greedy_train(self, Tokens tokens, GoldParse gold): cdef Pool mem = Pool() @@ -156,6 +158,8 @@ cdef class Parser: else: counts = {} self.model._model.update(counts) + _cleanup(pred) + _cleanup(gold) return pred.loss def _advance_beam(self, Beam beam, GoldParse gold, bint follow_gold): @@ -163,22 +167,23 @@ cdef class Parser: cdef int i, j, cost cdef bint is_valid cdef const Transition* move - cdef StateClass stcls = StateClass(gold.length) for i in range(beam.size): stcls = beam.at(i) if not stcls.is_final(): fill_context(context, stcls) self.model.set_scores(beam.scores[i], context) self.moves.set_valid(beam.is_valid[i], stcls) - if gold is not None: for i in range(beam.size): stcls = beam.at(i) self.moves.set_costs(beam.costs[i], stcls, gold) if follow_gold: + n_true = 0 for j in range(self.moves.n_moves): beam.is_valid[i][j] *= beam.costs[i][j] == 0 - beam.advance(_transition_state, NULL, self.moves.c) + n_true += beam.is_valid[i][j] + assert n_true >= 1 + beam.advance(_transition_state, _hash_state, self.moves.c) beam.check_done(_check_final_state, NULL) def _count_feats(self, dict counts, Tokens tokens, list hist, int inc): @@ -208,6 +213,7 @@ cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) cdef void* _init_state(Pool mem, int length, void* tokens) except NULL: cdef StateClass st = StateClass.init(tokens, length) + Py_INCREF(st) return st @@ -215,23 +221,28 @@ cdef int _check_final_state(void* _state, void* extra_args) except -1: return (_state).is_final() -""" -cdef hash_t _hash_state(void* _state, void* _) except 0: - state = _state - cdef atom_t[10] rep +def _cleanup(Beam beam): + for i in range(beam.width): + Py_XDECREF(beam._states[i].content) + Py_XDECREF(beam._parents[i].content) - rep[0] = state.stack[0] if state.stack_len >= 1 else 0 - rep[1] = state.stack[-1] if state.stack_len >= 2 else 0 - rep[2] = state.stack[-2] if state.stack_len >= 3 else 0 - rep[3] = state.i - rep[4] = state.sent[state.stack[0]].l_kids if state.stack_len >= 1 else 0 - rep[5] = state.sent[state.stack[0]].r_kids if state.stack_len >= 1 else 0 - rep[6] = state.sent[state.stack[0]].dep if state.stack_len >= 1 else 0 - rep[7] = state.sent[state.stack[-1]].dep if state.stack_len >= 2 else 0 - if get_left(state, get_n0(state), 1) != NULL: - rep[8] = get_left(state, get_n0(state), 1).dep - else: - rep[8] = 0 - rep[9] = state.sent[state.i].l_kids - return hash64(rep, sizeof(atom_t) * 10, 0) -""" +cdef hash_t _hash_state(void* _state, void* _) except 0: + return _state + + #state = _state + #cdef atom_t[10] rep + + #rep[0] = state.stack[0] if state.stack_len >= 1 else 0 + #rep[1] = state.stack[-1] if state.stack_len >= 2 else 0 + #rep[2] = state.stack[-2] if state.stack_len >= 3 else 0 + #rep[3] = state.i + #rep[4] = state.sent[state.stack[0]].l_kids if state.stack_len >= 1 else 0 + #rep[5] = state.sent[state.stack[0]].r_kids if state.stack_len >= 1 else 0 + #rep[6] = state.sent[state.stack[0]].dep if state.stack_len >= 1 else 0 + #rep[7] = state.sent[state.stack[-1]].dep if state.stack_len >= 2 else 0 + #if get_left(state, get_n0(state), 1) != NULL: + # rep[8] = get_left(state, get_n0(state), 1).dep + #else: + # rep[8] = 0 + #rep[9] = state.sent[state.i].l_kids + #return hash64(rep, sizeof(atom_t) * 10, 0) diff --git a/spacy/syntax/stateclass.pxd b/spacy/syntax/stateclass.pxd index c5b9dfa47..1d6a58d29 100644 --- a/spacy/syntax/stateclass.pxd +++ b/spacy/syntax/stateclass.pxd @@ -21,6 +21,15 @@ cdef class StateClass: cdef int _b_i cdef int _e_i + @staticmethod + cdef inline StateClass init(const TokenC* sent, int length): + cdef StateClass self = StateClass(length) + cdef int i + for i in range(length): + self._sent[i] = sent[i] + self._buffer[i] = i + return self + cdef int from_struct(self, const State* state) except -1 cdef int S(self, int i) nogil diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index 81227db26..c7568f7d0 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -1,6 +1,7 @@ from libc.string cimport memcpy, memset from libc.stdint cimport uint32_t from ..vocab cimport EMPTY_LEXEME +from ..structs cimport Entity cdef class StateClass: @@ -203,7 +204,7 @@ cdef class StateClass: memcpy(self._sent, src._sent, self.length * sizeof(TokenC)) memcpy(self._stack, src._stack, self.length * sizeof(int)) memcpy(self._buffer, src._buffer, self.length * sizeof(int)) - memcpy(self._ents, src._ents, self.length * sizeof(int)) + memcpy(self._ents, src._ents, self.length * sizeof(Entity)) self._b_i = src._b_i self._s_i = src._s_i self._e_i = src._e_i @@ -216,8 +217,6 @@ cdef class StateClass: n0 = words[self.B(0)] n1 = words[self.B(1)] return ' '.join((str(self.stack_depth()), third, second, top, '|', n0, n1)) - - # From https://en.wikipedia.org/wiki/Hamming_weight diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd index f144d282e..adb093969 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/syntax/transition_system.pxd @@ -2,7 +2,6 @@ from cymem.cymem cimport Pool from thinc.typedefs cimport weight_t from ..structs cimport TokenC -from ._state cimport State from ..gold cimport GoldParse from ..gold cimport GoldParseC from ..strings cimport StringStore @@ -36,7 +35,7 @@ cdef class TransitionSystem: cdef bint* _is_valid cdef readonly int n_moves - cdef int initialize_state(self, State* state) except -1 + cdef int initialize_state(self, StateClass state) except -1 cdef int finalize_state(self, StateClass state) except -1 cdef int preprocess_gold(self, GoldParse gold) except -1 diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 6d972bcf9..927498cba 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -1,5 +1,4 @@ from cymem.cymem cimport Pool -from ._state cimport State from ..structs cimport TokenC from thinc.typedefs cimport weight_t @@ -29,7 +28,7 @@ cdef class TransitionSystem: i += 1 self.c = moves - cdef int initialize_state(self, State* state) except -1: + cdef int initialize_state(self, StateClass state) except -1: pass cdef int finalize_state(self, StateClass state) except -1: