Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-31 07:57:35 +03:00)
			
		
		
		
	* Greedy parsing working with new StateClass. Beam parsing broken
This commit is contained in:
		
parent 6a94b64eca
commit 04b1cd9b8c
					
				|  | @ -366,8 +366,8 @@ cdef class ArcEager(TransitionSystem): | ||||||
|             raise Exception(move) |             raise Exception(move) | ||||||
|         return t |         return t | ||||||
| 
 | 
 | ||||||
|     cdef int initialize_state(self, State* state) except -1: |     cdef int initialize_state(self, StateClass st) except -1: | ||||||
|         push_stack(state) |         st.push() | ||||||
| 
 | 
 | ||||||
|     cdef int finalize_state(self, StateClass st) except -1: |     cdef int finalize_state(self, StateClass st) except -1: | ||||||
|         cdef int root_label = self.strings['ROOT'] |         cdef int root_label = self.strings['ROOT'] | ||||||
|  | @ -383,8 +383,11 @@ cdef class ArcEager(TransitionSystem): | ||||||
|         is_valid[RIGHT] = RightArc.is_valid(stcls, -1) |         is_valid[RIGHT] = RightArc.is_valid(stcls, -1) | ||||||
|         is_valid[BREAK] = Break.is_valid(stcls, -1) |         is_valid[BREAK] = Break.is_valid(stcls, -1) | ||||||
|         cdef int i |         cdef int i | ||||||
|  |         n_valid = 0 | ||||||
|         for i in range(self.n_moves): |         for i in range(self.n_moves): | ||||||
|             output[i] = is_valid[self.c[i].move] |             output[i] = is_valid[self.c[i].move] | ||||||
|  |             n_valid += output[i] | ||||||
|  |         assert n_valid >= 1 | ||||||
| 
 | 
 | ||||||
|     cdef int set_costs(self, int* output, StateClass stcls, GoldParse gold) except -1: |     cdef int set_costs(self, int* output, StateClass stcls, GoldParse gold) except -1: | ||||||
|         cdef int i, move, label |         cdef int i, move, label | ||||||
|  | @ -409,6 +412,7 @@ cdef class ArcEager(TransitionSystem): | ||||||
|         cdef int* heads = gold.c.heads |         cdef int* heads = gold.c.heads | ||||||
| 
 | 
 | ||||||
|         self.set_valid(self._is_valid, stcls) |         self.set_valid(self._is_valid, stcls) | ||||||
|  |         n_gold = 0 | ||||||
|         for i in range(self.n_moves): |         for i in range(self.n_moves): | ||||||
|             if not self._is_valid[i]: |             if not self._is_valid[i]: | ||||||
|                 output[i] = 9000 |                 output[i] = 9000 | ||||||
|  | @ -418,6 +422,8 @@ cdef class ArcEager(TransitionSystem): | ||||||
|                 if move_costs[move] == -1: |                 if move_costs[move] == -1: | ||||||
|                     move_costs[move] = move_cost_funcs[move](stcls, &gold.c) |                     move_costs[move] = move_cost_funcs[move](stcls, &gold.c) | ||||||
|                 output[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label) |                 output[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label) | ||||||
|  |                 n_gold += output[i] == 0 | ||||||
|  |         assert n_gold >= 1 | ||||||
| 
 | 
 | ||||||
|     cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *: |     cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *: | ||||||
|         cdef bint[N_MOVES] is_valid |         cdef bint[N_MOVES] is_valid | ||||||
|  |  | ||||||
|  | @ -5,6 +5,9 @@ MALT-style dependency parser | ||||||
| """ | """ | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| cimport cython | cimport cython | ||||||
|  | 
 | ||||||
|  | from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF | ||||||
|  | 
 | ||||||
| from libc.stdint cimport uint32_t, uint64_t | from libc.stdint cimport uint32_t, uint64_t | ||||||
| from libc.string cimport memset, memcpy | from libc.string cimport memset, memcpy | ||||||
| import random | import random | ||||||
|  | @ -42,7 +45,6 @@ from ._parse_features cimport CONTEXT_SIZE | ||||||
| from ._parse_features cimport fill_context | from ._parse_features cimport fill_context | ||||||
| from .stateclass cimport StateClass | from .stateclass cimport StateClass | ||||||
| 
 | 
 | ||||||
| from cpython.ref cimport PyObject |  | ||||||
| 
 | 
 | ||||||
| DEBUG = False | DEBUG = False | ||||||
| def set_debug(val): | def set_debug(val): | ||||||
|  | @ -108,9 +110,9 @@ cdef class Parser: | ||||||
|         while not beam.is_done: |         while not beam.is_done: | ||||||
|             self._advance_beam(beam, None, False) |             self._advance_beam(beam, None, False) | ||||||
|         state = <StateClass>beam.at(0) |         state = <StateClass>beam.at(0) | ||||||
|         #self.moves.finalize_state(state) |         self.moves.finalize_state(state) | ||||||
|         #tokens.set_parse(state.sent) |         tokens.set_parse(state._sent) | ||||||
|         raise Exception |         _cleanup(beam) | ||||||
| 
 | 
 | ||||||
|     def _greedy_train(self, Tokens tokens, GoldParse gold): |     def _greedy_train(self, Tokens tokens, GoldParse gold): | ||||||
|         cdef Pool mem = Pool() |         cdef Pool mem = Pool() | ||||||
|  | @ -156,6 +158,8 @@ cdef class Parser: | ||||||
|         else: |         else: | ||||||
|             counts = {} |             counts = {} | ||||||
|         self.model._model.update(counts) |         self.model._model.update(counts) | ||||||
|  |         _cleanup(pred) | ||||||
|  |         _cleanup(gold) | ||||||
|         return pred.loss |         return pred.loss | ||||||
| 
 | 
 | ||||||
|     def _advance_beam(self, Beam beam, GoldParse gold, bint follow_gold): |     def _advance_beam(self, Beam beam, GoldParse gold, bint follow_gold): | ||||||
|  | @ -163,22 +167,23 @@ cdef class Parser: | ||||||
|         cdef int i, j, cost |         cdef int i, j, cost | ||||||
|         cdef bint is_valid |         cdef bint is_valid | ||||||
|         cdef const Transition* move |         cdef const Transition* move | ||||||
|         cdef StateClass stcls = StateClass(gold.length) |  | ||||||
|         for i in range(beam.size): |         for i in range(beam.size): | ||||||
|             stcls = <StateClass>beam.at(i) |             stcls = <StateClass>beam.at(i) | ||||||
|             if not stcls.is_final(): |             if not stcls.is_final(): | ||||||
|                 fill_context(context, stcls) |                 fill_context(context, stcls) | ||||||
|                 self.model.set_scores(beam.scores[i], context) |                 self.model.set_scores(beam.scores[i], context) | ||||||
|                 self.moves.set_valid(beam.is_valid[i], stcls) |                 self.moves.set_valid(beam.is_valid[i], stcls) | ||||||
|         |  | ||||||
|         if gold is not None: |         if gold is not None: | ||||||
|             for i in range(beam.size): |             for i in range(beam.size): | ||||||
|                 stcls = <StateClass>beam.at(i) |                 stcls = <StateClass>beam.at(i) | ||||||
|                 self.moves.set_costs(beam.costs[i], stcls, gold) |                 self.moves.set_costs(beam.costs[i], stcls, gold) | ||||||
|                 if follow_gold: |                 if follow_gold: | ||||||
|  |                     n_true = 0 | ||||||
|                     for j in range(self.moves.n_moves): |                     for j in range(self.moves.n_moves): | ||||||
|                         beam.is_valid[i][j] *= beam.costs[i][j] == 0 |                         beam.is_valid[i][j] *= beam.costs[i][j] == 0 | ||||||
|         beam.advance(_transition_state, NULL, <void*>self.moves.c) |                         n_true += beam.is_valid[i][j] | ||||||
|  |                     assert n_true >= 1 | ||||||
|  |         beam.advance(_transition_state, _hash_state, <void*>self.moves.c) | ||||||
|         beam.check_done(_check_final_state, NULL) |         beam.check_done(_check_final_state, NULL) | ||||||
| 
 | 
 | ||||||
|     def _count_feats(self, dict counts, Tokens tokens, list hist, int inc): |     def _count_feats(self, dict counts, Tokens tokens, list hist, int inc): | ||||||
|  | @ -208,6 +213,7 @@ cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) | ||||||
| 
 | 
 | ||||||
| cdef void* _init_state(Pool mem, int length, void* tokens) except NULL: | cdef void* _init_state(Pool mem, int length, void* tokens) except NULL: | ||||||
|     cdef StateClass st = StateClass.init(<const TokenC*>tokens, length) |     cdef StateClass st = StateClass.init(<const TokenC*>tokens, length) | ||||||
|  |     Py_INCREF(st) | ||||||
|     return <void*>st |     return <void*>st | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -215,23 +221,28 @@ cdef int _check_final_state(void* _state, void* extra_args) except -1: | ||||||
|     return (<StateClass>_state).is_final() |     return (<StateClass>_state).is_final() | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| """ | def _cleanup(Beam beam): | ||||||
| cdef hash_t _hash_state(void* _state, void* _) except 0: |     for i in range(beam.width): | ||||||
|     state = <const State*>_state |         Py_XDECREF(<PyObject*>beam._states[i].content) | ||||||
|     cdef atom_t[10] rep |         Py_XDECREF(<PyObject*>beam._parents[i].content) | ||||||
| 
 | 
 | ||||||
|     rep[0] = state.stack[0] if state.stack_len >= 1 else 0 | cdef hash_t _hash_state(void* _state, void* _) except 0: | ||||||
|     rep[1] = state.stack[-1] if state.stack_len >= 2 else 0 |     return <hash_t>_state | ||||||
|     rep[2] = state.stack[-2] if state.stack_len >= 3 else 0 |      | ||||||
|     rep[3] = state.i |     #state = <const State*>_state | ||||||
|     rep[4] = state.sent[state.stack[0]].l_kids if state.stack_len >= 1 else 0 |     #cdef atom_t[10] rep | ||||||
|     rep[5] = state.sent[state.stack[0]].r_kids if state.stack_len >= 1 else 0 | 
 | ||||||
|     rep[6] = state.sent[state.stack[0]].dep if state.stack_len >= 1 else 0 |     #rep[0] = state.stack[0] if state.stack_len >= 1 else 0 | ||||||
|     rep[7] = state.sent[state.stack[-1]].dep if state.stack_len >= 2 else 0 |     #rep[1] = state.stack[-1] if state.stack_len >= 2 else 0 | ||||||
|     if get_left(state, get_n0(state), 1) != NULL: |     #rep[2] = state.stack[-2] if state.stack_len >= 3 else 0 | ||||||
|         rep[8] = get_left(state, get_n0(state), 1).dep  |     #rep[3] = state.i | ||||||
|     else: |     #rep[4] = state.sent[state.stack[0]].l_kids if state.stack_len >= 1 else 0 | ||||||
|         rep[8] = 0 |     #rep[5] = state.sent[state.stack[0]].r_kids if state.stack_len >= 1 else 0 | ||||||
|     rep[9] = state.sent[state.i].l_kids |     #rep[6] = state.sent[state.stack[0]].dep if state.stack_len >= 1 else 0 | ||||||
|     return hash64(rep, sizeof(atom_t) * 10, 0) |     #rep[7] = state.sent[state.stack[-1]].dep if state.stack_len >= 2 else 0 | ||||||
| """ |     #if get_left(state, get_n0(state), 1) != NULL: | ||||||
|  |     #    rep[8] = get_left(state, get_n0(state), 1).dep  | ||||||
|  |     #else: | ||||||
|  |     #    rep[8] = 0 | ||||||
|  |     #rep[9] = state.sent[state.i].l_kids | ||||||
|  |     #return hash64(rep, sizeof(atom_t) * 10, 0) | ||||||
|  |  | ||||||
|  | @ -21,6 +21,15 @@ cdef class StateClass: | ||||||
|     cdef int _b_i |     cdef int _b_i | ||||||
|     cdef int _e_i |     cdef int _e_i | ||||||
| 
 | 
 | ||||||
|  |     @staticmethod | ||||||
|  |     cdef inline StateClass init(const TokenC* sent, int length): | ||||||
|  |         cdef StateClass self = StateClass(length) | ||||||
|  |         cdef int i | ||||||
|  |         for i in range(length): | ||||||
|  |             self._sent[i] = sent[i] | ||||||
|  |             self._buffer[i] = i | ||||||
|  |         return self | ||||||
|  | 
 | ||||||
|     cdef int from_struct(self, const State* state) except -1 |     cdef int from_struct(self, const State* state) except -1 | ||||||
| 
 | 
 | ||||||
|     cdef int S(self, int i) nogil |     cdef int S(self, int i) nogil | ||||||
|  |  | ||||||
|  | @ -1,6 +1,7 @@ | ||||||
| from libc.string cimport memcpy, memset | from libc.string cimport memcpy, memset | ||||||
| from libc.stdint cimport uint32_t | from libc.stdint cimport uint32_t | ||||||
| from ..vocab cimport EMPTY_LEXEME | from ..vocab cimport EMPTY_LEXEME | ||||||
|  | from ..structs cimport Entity | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef class StateClass: | cdef class StateClass: | ||||||
|  | @ -203,7 +204,7 @@ cdef class StateClass: | ||||||
|         memcpy(self._sent, src._sent, self.length * sizeof(TokenC)) |         memcpy(self._sent, src._sent, self.length * sizeof(TokenC)) | ||||||
|         memcpy(self._stack, src._stack, self.length * sizeof(int)) |         memcpy(self._stack, src._stack, self.length * sizeof(int)) | ||||||
|         memcpy(self._buffer, src._buffer, self.length * sizeof(int)) |         memcpy(self._buffer, src._buffer, self.length * sizeof(int)) | ||||||
|         memcpy(self._ents, src._ents, self.length * sizeof(int)) |         memcpy(self._ents, src._ents, self.length * sizeof(Entity)) | ||||||
|         self._b_i = src._b_i |         self._b_i = src._b_i | ||||||
|         self._s_i = src._s_i |         self._s_i = src._s_i | ||||||
|         self._e_i = src._e_i |         self._e_i = src._e_i | ||||||
|  | @ -216,8 +217,6 @@ cdef class StateClass: | ||||||
|         n0 = words[self.B(0)]  |         n0 = words[self.B(0)]  | ||||||
|         n1 = words[self.B(1)]  |         n1 = words[self.B(1)]  | ||||||
|         return ' '.join((str(self.stack_depth()), third, second, top, '|', n0, n1)) |         return ' '.join((str(self.stack_depth()), third, second, top, '|', n0, n1)) | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|   |   | ||||||
| 
 | 
 | ||||||
| # From https://en.wikipedia.org/wiki/Hamming_weight | # From https://en.wikipedia.org/wiki/Hamming_weight | ||||||
|  |  | ||||||
|  | @ -2,7 +2,6 @@ from cymem.cymem cimport Pool | ||||||
| from thinc.typedefs cimport weight_t | from thinc.typedefs cimport weight_t | ||||||
| 
 | 
 | ||||||
| from ..structs cimport TokenC | from ..structs cimport TokenC | ||||||
| from ._state cimport State |  | ||||||
| from ..gold cimport GoldParse | from ..gold cimport GoldParse | ||||||
| from ..gold cimport GoldParseC | from ..gold cimport GoldParseC | ||||||
| from ..strings cimport StringStore | from ..strings cimport StringStore | ||||||
|  | @ -36,7 +35,7 @@ cdef class TransitionSystem: | ||||||
|     cdef bint* _is_valid |     cdef bint* _is_valid | ||||||
|     cdef readonly int n_moves |     cdef readonly int n_moves | ||||||
| 
 | 
 | ||||||
|     cdef int initialize_state(self, State* state) except -1 |     cdef int initialize_state(self, StateClass state) except -1 | ||||||
|     cdef int finalize_state(self, StateClass state) except -1 |     cdef int finalize_state(self, StateClass state) except -1 | ||||||
| 
 | 
 | ||||||
|     cdef int preprocess_gold(self, GoldParse gold) except -1 |     cdef int preprocess_gold(self, GoldParse gold) except -1 | ||||||
|  |  | ||||||
|  | @ -1,5 +1,4 @@ | ||||||
| from cymem.cymem cimport Pool | from cymem.cymem cimport Pool | ||||||
| from ._state cimport State |  | ||||||
| from ..structs cimport TokenC | from ..structs cimport TokenC | ||||||
| from thinc.typedefs cimport weight_t | from thinc.typedefs cimport weight_t | ||||||
| 
 | 
 | ||||||
|  | @ -29,7 +28,7 @@ cdef class TransitionSystem: | ||||||
|                 i += 1 |                 i += 1 | ||||||
|         self.c = moves |         self.c = moves | ||||||
| 
 | 
 | ||||||
|     cdef int initialize_state(self, State* state) except -1: |     cdef int initialize_state(self, StateClass state) except -1: | ||||||
|         pass |         pass | ||||||
| 
 | 
 | ||||||
|     cdef int finalize_state(self, StateClass state) except -1: |     cdef int finalize_state(self, StateClass state) except -1: | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user