mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-14 03:26:24 +03:00
* Greedy parsing is working with the new StateClass. Beam parsing is broken.
This commit is contained in:
parent
6a94b64eca
commit
04b1cd9b8c
|
@ -366,8 +366,8 @@ cdef class ArcEager(TransitionSystem):
|
||||||
raise Exception(move)
|
raise Exception(move)
|
||||||
return t
|
return t
|
||||||
|
|
||||||
cdef int initialize_state(self, State* state) except -1:
|
cdef int initialize_state(self, StateClass st) except -1:
|
||||||
push_stack(state)
|
st.push()
|
||||||
|
|
||||||
cdef int finalize_state(self, StateClass st) except -1:
|
cdef int finalize_state(self, StateClass st) except -1:
|
||||||
cdef int root_label = self.strings['ROOT']
|
cdef int root_label = self.strings['ROOT']
|
||||||
|
@ -383,8 +383,11 @@ cdef class ArcEager(TransitionSystem):
|
||||||
is_valid[RIGHT] = RightArc.is_valid(stcls, -1)
|
is_valid[RIGHT] = RightArc.is_valid(stcls, -1)
|
||||||
is_valid[BREAK] = Break.is_valid(stcls, -1)
|
is_valid[BREAK] = Break.is_valid(stcls, -1)
|
||||||
cdef int i
|
cdef int i
|
||||||
|
n_valid = 0
|
||||||
for i in range(self.n_moves):
|
for i in range(self.n_moves):
|
||||||
output[i] = is_valid[self.c[i].move]
|
output[i] = is_valid[self.c[i].move]
|
||||||
|
n_valid += output[i]
|
||||||
|
assert n_valid >= 1
|
||||||
|
|
||||||
cdef int set_costs(self, int* output, StateClass stcls, GoldParse gold) except -1:
|
cdef int set_costs(self, int* output, StateClass stcls, GoldParse gold) except -1:
|
||||||
cdef int i, move, label
|
cdef int i, move, label
|
||||||
|
@ -409,6 +412,7 @@ cdef class ArcEager(TransitionSystem):
|
||||||
cdef int* heads = gold.c.heads
|
cdef int* heads = gold.c.heads
|
||||||
|
|
||||||
self.set_valid(self._is_valid, stcls)
|
self.set_valid(self._is_valid, stcls)
|
||||||
|
n_gold = 0
|
||||||
for i in range(self.n_moves):
|
for i in range(self.n_moves):
|
||||||
if not self._is_valid[i]:
|
if not self._is_valid[i]:
|
||||||
output[i] = 9000
|
output[i] = 9000
|
||||||
|
@ -418,6 +422,8 @@ cdef class ArcEager(TransitionSystem):
|
||||||
if move_costs[move] == -1:
|
if move_costs[move] == -1:
|
||||||
move_costs[move] = move_cost_funcs[move](stcls, &gold.c)
|
move_costs[move] = move_cost_funcs[move](stcls, &gold.c)
|
||||||
output[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label)
|
output[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label)
|
||||||
|
n_gold += output[i] == 0
|
||||||
|
assert n_gold >= 1
|
||||||
|
|
||||||
cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *:
|
cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *:
|
||||||
cdef bint[N_MOVES] is_valid
|
cdef bint[N_MOVES] is_valid
|
||||||
|
|
|
@ -5,6 +5,9 @@ MALT-style dependency parser
|
||||||
"""
|
"""
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
cimport cython
|
cimport cython
|
||||||
|
|
||||||
|
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
||||||
|
|
||||||
from libc.stdint cimport uint32_t, uint64_t
|
from libc.stdint cimport uint32_t, uint64_t
|
||||||
from libc.string cimport memset, memcpy
|
from libc.string cimport memset, memcpy
|
||||||
import random
|
import random
|
||||||
|
@ -42,7 +45,6 @@ from ._parse_features cimport CONTEXT_SIZE
|
||||||
from ._parse_features cimport fill_context
|
from ._parse_features cimport fill_context
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
|
|
||||||
from cpython.ref cimport PyObject
|
|
||||||
|
|
||||||
DEBUG = False
|
DEBUG = False
|
||||||
def set_debug(val):
|
def set_debug(val):
|
||||||
|
@ -108,9 +110,9 @@ cdef class Parser:
|
||||||
while not beam.is_done:
|
while not beam.is_done:
|
||||||
self._advance_beam(beam, None, False)
|
self._advance_beam(beam, None, False)
|
||||||
state = <StateClass>beam.at(0)
|
state = <StateClass>beam.at(0)
|
||||||
#self.moves.finalize_state(state)
|
self.moves.finalize_state(state)
|
||||||
#tokens.set_parse(state.sent)
|
tokens.set_parse(state._sent)
|
||||||
raise Exception
|
_cleanup(beam)
|
||||||
|
|
||||||
def _greedy_train(self, Tokens tokens, GoldParse gold):
|
def _greedy_train(self, Tokens tokens, GoldParse gold):
|
||||||
cdef Pool mem = Pool()
|
cdef Pool mem = Pool()
|
||||||
|
@ -156,6 +158,8 @@ cdef class Parser:
|
||||||
else:
|
else:
|
||||||
counts = {}
|
counts = {}
|
||||||
self.model._model.update(counts)
|
self.model._model.update(counts)
|
||||||
|
_cleanup(pred)
|
||||||
|
_cleanup(gold)
|
||||||
return pred.loss
|
return pred.loss
|
||||||
|
|
||||||
def _advance_beam(self, Beam beam, GoldParse gold, bint follow_gold):
|
def _advance_beam(self, Beam beam, GoldParse gold, bint follow_gold):
|
||||||
|
@ -163,22 +167,23 @@ cdef class Parser:
|
||||||
cdef int i, j, cost
|
cdef int i, j, cost
|
||||||
cdef bint is_valid
|
cdef bint is_valid
|
||||||
cdef const Transition* move
|
cdef const Transition* move
|
||||||
cdef StateClass stcls = StateClass(gold.length)
|
|
||||||
for i in range(beam.size):
|
for i in range(beam.size):
|
||||||
stcls = <StateClass>beam.at(i)
|
stcls = <StateClass>beam.at(i)
|
||||||
if not stcls.is_final():
|
if not stcls.is_final():
|
||||||
fill_context(context, stcls)
|
fill_context(context, stcls)
|
||||||
self.model.set_scores(beam.scores[i], context)
|
self.model.set_scores(beam.scores[i], context)
|
||||||
self.moves.set_valid(beam.is_valid[i], stcls)
|
self.moves.set_valid(beam.is_valid[i], stcls)
|
||||||
|
|
||||||
if gold is not None:
|
if gold is not None:
|
||||||
for i in range(beam.size):
|
for i in range(beam.size):
|
||||||
stcls = <StateClass>beam.at(i)
|
stcls = <StateClass>beam.at(i)
|
||||||
self.moves.set_costs(beam.costs[i], stcls, gold)
|
self.moves.set_costs(beam.costs[i], stcls, gold)
|
||||||
if follow_gold:
|
if follow_gold:
|
||||||
|
n_true = 0
|
||||||
for j in range(self.moves.n_moves):
|
for j in range(self.moves.n_moves):
|
||||||
beam.is_valid[i][j] *= beam.costs[i][j] == 0
|
beam.is_valid[i][j] *= beam.costs[i][j] == 0
|
||||||
beam.advance(_transition_state, NULL, <void*>self.moves.c)
|
n_true += beam.is_valid[i][j]
|
||||||
|
assert n_true >= 1
|
||||||
|
beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
|
||||||
beam.check_done(_check_final_state, NULL)
|
beam.check_done(_check_final_state, NULL)
|
||||||
|
|
||||||
def _count_feats(self, dict counts, Tokens tokens, list hist, int inc):
|
def _count_feats(self, dict counts, Tokens tokens, list hist, int inc):
|
||||||
|
@ -208,6 +213,7 @@ cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves)
|
||||||
|
|
||||||
cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
|
cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
|
||||||
cdef StateClass st = StateClass.init(<const TokenC*>tokens, length)
|
cdef StateClass st = StateClass.init(<const TokenC*>tokens, length)
|
||||||
|
Py_INCREF(st)
|
||||||
return <void*>st
|
return <void*>st
|
||||||
|
|
||||||
|
|
||||||
|
@ -215,23 +221,28 @@ cdef int _check_final_state(void* _state, void* extra_args) except -1:
|
||||||
return (<StateClass>_state).is_final()
|
return (<StateClass>_state).is_final()
|
||||||
|
|
||||||
|
|
||||||
"""
|
def _cleanup(Beam beam):
|
||||||
cdef hash_t _hash_state(void* _state, void* _) except 0:
|
for i in range(beam.width):
|
||||||
state = <const State*>_state
|
Py_XDECREF(<PyObject*>beam._states[i].content)
|
||||||
cdef atom_t[10] rep
|
Py_XDECREF(<PyObject*>beam._parents[i].content)
|
||||||
|
|
||||||
rep[0] = state.stack[0] if state.stack_len >= 1 else 0
|
cdef hash_t _hash_state(void* _state, void* _) except 0:
|
||||||
rep[1] = state.stack[-1] if state.stack_len >= 2 else 0
|
return <hash_t>_state
|
||||||
rep[2] = state.stack[-2] if state.stack_len >= 3 else 0
|
|
||||||
rep[3] = state.i
|
#state = <const State*>_state
|
||||||
rep[4] = state.sent[state.stack[0]].l_kids if state.stack_len >= 1 else 0
|
#cdef atom_t[10] rep
|
||||||
rep[5] = state.sent[state.stack[0]].r_kids if state.stack_len >= 1 else 0
|
|
||||||
rep[6] = state.sent[state.stack[0]].dep if state.stack_len >= 1 else 0
|
#rep[0] = state.stack[0] if state.stack_len >= 1 else 0
|
||||||
rep[7] = state.sent[state.stack[-1]].dep if state.stack_len >= 2 else 0
|
#rep[1] = state.stack[-1] if state.stack_len >= 2 else 0
|
||||||
if get_left(state, get_n0(state), 1) != NULL:
|
#rep[2] = state.stack[-2] if state.stack_len >= 3 else 0
|
||||||
rep[8] = get_left(state, get_n0(state), 1).dep
|
#rep[3] = state.i
|
||||||
else:
|
#rep[4] = state.sent[state.stack[0]].l_kids if state.stack_len >= 1 else 0
|
||||||
rep[8] = 0
|
#rep[5] = state.sent[state.stack[0]].r_kids if state.stack_len >= 1 else 0
|
||||||
rep[9] = state.sent[state.i].l_kids
|
#rep[6] = state.sent[state.stack[0]].dep if state.stack_len >= 1 else 0
|
||||||
return hash64(rep, sizeof(atom_t) * 10, 0)
|
#rep[7] = state.sent[state.stack[-1]].dep if state.stack_len >= 2 else 0
|
||||||
"""
|
#if get_left(state, get_n0(state), 1) != NULL:
|
||||||
|
# rep[8] = get_left(state, get_n0(state), 1).dep
|
||||||
|
#else:
|
||||||
|
# rep[8] = 0
|
||||||
|
#rep[9] = state.sent[state.i].l_kids
|
||||||
|
#return hash64(rep, sizeof(atom_t) * 10, 0)
|
||||||
|
|
|
@ -21,6 +21,15 @@ cdef class StateClass:
|
||||||
cdef int _b_i
|
cdef int _b_i
|
||||||
cdef int _e_i
|
cdef int _e_i
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
cdef inline StateClass init(const TokenC* sent, int length):
|
||||||
|
cdef StateClass self = StateClass(length)
|
||||||
|
cdef int i
|
||||||
|
for i in range(length):
|
||||||
|
self._sent[i] = sent[i]
|
||||||
|
self._buffer[i] = i
|
||||||
|
return self
|
||||||
|
|
||||||
cdef int from_struct(self, const State* state) except -1
|
cdef int from_struct(self, const State* state) except -1
|
||||||
|
|
||||||
cdef int S(self, int i) nogil
|
cdef int S(self, int i) nogil
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
from libc.string cimport memcpy, memset
|
from libc.string cimport memcpy, memset
|
||||||
from libc.stdint cimport uint32_t
|
from libc.stdint cimport uint32_t
|
||||||
from ..vocab cimport EMPTY_LEXEME
|
from ..vocab cimport EMPTY_LEXEME
|
||||||
|
from ..structs cimport Entity
|
||||||
|
|
||||||
|
|
||||||
cdef class StateClass:
|
cdef class StateClass:
|
||||||
|
@ -203,7 +204,7 @@ cdef class StateClass:
|
||||||
memcpy(self._sent, src._sent, self.length * sizeof(TokenC))
|
memcpy(self._sent, src._sent, self.length * sizeof(TokenC))
|
||||||
memcpy(self._stack, src._stack, self.length * sizeof(int))
|
memcpy(self._stack, src._stack, self.length * sizeof(int))
|
||||||
memcpy(self._buffer, src._buffer, self.length * sizeof(int))
|
memcpy(self._buffer, src._buffer, self.length * sizeof(int))
|
||||||
memcpy(self._ents, src._ents, self.length * sizeof(int))
|
memcpy(self._ents, src._ents, self.length * sizeof(Entity))
|
||||||
self._b_i = src._b_i
|
self._b_i = src._b_i
|
||||||
self._s_i = src._s_i
|
self._s_i = src._s_i
|
||||||
self._e_i = src._e_i
|
self._e_i = src._e_i
|
||||||
|
@ -218,8 +219,6 @@ cdef class StateClass:
|
||||||
return ' '.join((str(self.stack_depth()), third, second, top, '|', n0, n1))
|
return ' '.join((str(self.stack_depth()), third, second, top, '|', n0, n1))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# From https://en.wikipedia.org/wiki/Hamming_weight
|
# From https://en.wikipedia.org/wiki/Hamming_weight
|
||||||
cdef inline uint32_t _popcount(uint32_t x) nogil:
|
cdef inline uint32_t _popcount(uint32_t x) nogil:
|
||||||
"""Find number of non-zero bits."""
|
"""Find number of non-zero bits."""
|
||||||
|
|
|
@ -2,7 +2,6 @@ from cymem.cymem cimport Pool
|
||||||
from thinc.typedefs cimport weight_t
|
from thinc.typedefs cimport weight_t
|
||||||
|
|
||||||
from ..structs cimport TokenC
|
from ..structs cimport TokenC
|
||||||
from ._state cimport State
|
|
||||||
from ..gold cimport GoldParse
|
from ..gold cimport GoldParse
|
||||||
from ..gold cimport GoldParseC
|
from ..gold cimport GoldParseC
|
||||||
from ..strings cimport StringStore
|
from ..strings cimport StringStore
|
||||||
|
@ -36,7 +35,7 @@ cdef class TransitionSystem:
|
||||||
cdef bint* _is_valid
|
cdef bint* _is_valid
|
||||||
cdef readonly int n_moves
|
cdef readonly int n_moves
|
||||||
|
|
||||||
cdef int initialize_state(self, State* state) except -1
|
cdef int initialize_state(self, StateClass state) except -1
|
||||||
cdef int finalize_state(self, StateClass state) except -1
|
cdef int finalize_state(self, StateClass state) except -1
|
||||||
|
|
||||||
cdef int preprocess_gold(self, GoldParse gold) except -1
|
cdef int preprocess_gold(self, GoldParse gold) except -1
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from ._state cimport State
|
|
||||||
from ..structs cimport TokenC
|
from ..structs cimport TokenC
|
||||||
from thinc.typedefs cimport weight_t
|
from thinc.typedefs cimport weight_t
|
||||||
|
|
||||||
|
@ -29,7 +28,7 @@ cdef class TransitionSystem:
|
||||||
i += 1
|
i += 1
|
||||||
self.c = moves
|
self.c = moves
|
||||||
|
|
||||||
cdef int initialize_state(self, State* state) except -1:
|
cdef int initialize_state(self, StateClass state) except -1:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
cdef int finalize_state(self, StateClass state) except -1:
|
cdef int finalize_state(self, StateClass state) except -1:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user