spaCy/spacy/syntax/_beam_utils.pyx

290 lines
11 KiB
Cython
Raw Normal View History

2017-08-12 22:47:45 +03:00
# cython: infer_types=True
# cython: profile=True
2017-08-12 22:47:45 +03:00
cimport numpy as np
import numpy
2017-10-27 20:45:57 +03:00
from cpython.ref cimport PyObject, Py_XDECREF
2017-08-12 22:47:45 +03:00
from thinc.extra.search cimport Beam
from thinc.extra.search import MaxViolation
from thinc.typedefs cimport hash_t, class_t
from thinc.extra.search cimport MaxViolation
2017-08-12 22:47:45 +03:00
from .transition_system cimport TransitionSystem, Transition
from ..gold cimport GoldParse
2017-11-14 04:11:40 +03:00
from .stateclass cimport StateC, StateClass
2017-08-12 22:47:45 +03:00
# These are passed as callbacks to thinc.extra.search.Beam
cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
    # Beam callback: overwrite the destination state with a copy of the
    # source state, apply transition number `clas` to it, and record that
    # transition in the destination's history.
    #
    # All arguments arrive as untyped pointers; they are cast back to
    # StateC* / const Transition* here.
    cdef StateC* target = <StateC*>_dest
    cdef StateC* origin = <StateC*>_src
    cdef const Transition* actions = <const Transition*>_moves
    target.clone(origin)
    actions[clas].do(target, actions[clas].label)
    target.push_hist(clas)
    return 0
2017-08-12 22:47:45 +03:00
cdef int _check_final_state(void* _state, void* extra_args) except -1:
    # Beam callback: report whether the given state is final.
    # `extra_args` is not used.
    return (<StateC*>_state).is_final()
2017-08-12 22:47:45 +03:00
cdef hash_t _hash_state(void* _state, void* _) except 0:
    # Beam callback: hash a state. Every final state hashes to the same
    # constant (1); any other state uses its own hash() value.
    # The second argument is not used.
    cdef StateC* st = <StateC*>_state
    if st.is_final():
        return 1
    return st.hash()
2017-08-12 22:47:45 +03:00
cdef class ParserBeam(object):
    """Hold one search Beam per input state, so that a batch of beams can
    be scored and advanced together.

    moves: The transition system used to initialise states, mark valid
        actions and compute costs.
    states: The input states, stored as given.
    golds: Gold-standard parses, indexed in parallel with the beams, or
        None (in which case no costs are set).
    beams: List with one Beam per input state.
    dones: List of per-beam flags, initialised to False. Nothing in this
        class sets them to True; a True entry makes the beam be skipped.
    """
    cdef public TransitionSystem moves
    cdef public object states
    cdef public object golds
    cdef public object beams
    cdef public object dones

    def __init__(self, TransitionSystem moves, states, golds,
                 int width, float density):
        """Create and initialise a Beam of the given width and density for
        each state in `states`."""
        cdef Beam beam
        cdef StateClass state
        cdef StateC* st
        self.moves = moves
        self.states = states
        self.golds = golds
        self.beams = []
        for state in states:
            beam = Beam(self.moves.n_moves, width, density)
            beam.initialize(self.moves.init_beam_state, state.c.length,
                            state.c._sent)
            # Copy the input state's offset onto every slot of the beam.
            for i in range(beam.width):
                st = <StateC*>beam.at(i)
                st.offset = state.c.offset
            self.beams.append(beam)
        self.dones = [False] * len(self.beams)

    @property
    def is_done(self):
        """True when every beam is either done or flagged in `dones`."""
        for i, b in enumerate(self.beams):
            if not (b.is_done or self.dones[i]):
                return False
        return True

    def __getitem__(self, i):
        """Return the i-th Beam."""
        return self.beams[i]

    def __len__(self):
        """Return the number of beams."""
        return len(self.beams)

    def advance(self, scores, follow_gold=False):
        """Advance every unfinished beam by one step.

        scores: Sequence indexed like the beams; scores[i] holds the score
            rows for beam i. Beams whose scores[i] is empty are skipped.
        follow_gold: Passed through to `_set_costs`.
        """
        cdef Beam beam
        have_gold = self.golds is not None
        for i, beam in enumerate(self.beams):
            if beam.is_done or not scores[i].size or self.dones[i]:
                continue
            self._set_scores(beam, scores[i])
            if have_gold:
                self._set_costs(beam, self.golds[i], follow_gold=follow_gold)
            beam.advance(_transition_state, NULL, <void*>self.moves.c)
            beam.check_done(_check_final_state, NULL)
            if not (have_gold and beam.is_done):
                continue
            # This handles the non-monotonic stuff for the parser: once the
            # beam is finished, final states that are gold parses get loss
            # 0, and final states that are not but had loss 0 get loss 1.
            for j in range(beam.size):
                state = StateClass.borrow(<StateC*>beam.at(j))
                if not state.is_final():
                    continue
                try:
                    if self.moves.is_gold_parse(state, self.golds[i]):
                        beam._states[j].loss = 0.0
                    elif beam._states[j].loss == 0.0:
                        beam._states[j].loss = 1.0
                except NotImplementedError:
                    # The transition system has no gold-parse check; stop
                    # adjusting losses for this beam.
                    break

    def _set_scores(self, Beam beam, float[:, ::1] scores):
        """Copy rows of `scores` into the beam and mark valid actions.

        Final states get zeroed scores and costs instead.
        """
        cdef float* flat = &scores[0, 0]
        cdef int nr_state = min(scores.shape[0], beam.size)
        cdef int nr_class = scores.shape[1]
        cdef float* row
        for i in range(nr_state):
            state = <StateC*>beam.at(i)
            if state.is_final():
                for j in range(beam.nr_class):
                    beam.scores[i][j] = 0
                    beam.costs[i][j] = 0
            else:
                # Row i of the C-contiguous scores array.
                row = &flat[i * nr_class]
                for j in range(nr_class):
                    beam.scores[i][j] = row[j]
                self.moves.set_valid(beam.is_valid[i], state)

    def _set_costs(self, Beam beam, GoldParse gold, int follow_gold=False):
        """Set action costs for every non-final state in the beam.

        When `follow_gold` is true, actions with cost >= 1 are additionally
        marked invalid.
        """
        for i in range(beam.size):
            state = StateClass.borrow(<StateC*>beam.at(i))
            if state.is_final():
                continue
            self.moves.set_costs(beam.is_valid[i], beam.costs[i],
                                 state, gold)
            if not follow_gold:
                continue
            for j in range(beam.nr_class):
                if beam.costs[i][j] >= 1:
                    beam.is_valid[i][j] = 0
2017-08-12 22:47:45 +03:00
def get_token_ids(states, int n_tokens):
    """Build an int32 array with one row of `n_tokens` context-token ids
    per state.

    Rows for final states are filled with -1; other rows are filled in by
    the state's `set_context_tokens`.
    """
    cdef StateClass state
    cdef np.ndarray ids = numpy.zeros((len(states), n_tokens),
                                      dtype='int32', order='C')
    # Pointer to the start of the current row in the array's buffer.
    cdef int* row = <int*>ids.data
    for i, state in enumerate(states):
        if state.is_final():
            ids[i] = -1
        else:
            state.c.set_context_tokens(row, n_tokens)
        row += ids.shape[1]
    return ids
2017-10-27 20:45:57 +03:00
# Module-level count of how many times update_beam has been called.
nr_update = 0


def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
                states, golds,
                state2vec, vec2scores,
                int width, float density, int hist_feats,
                losses=None, drop=0.):
    """Run a prediction beam and a gold-constrained beam side by side and
    compute the gradient of the maximum-violation loss.

    moves: Transition system shared by both beams.
    nr_feature: Number of context tokens extracted per state.
    max_steps: Upper bound on the number of search steps.
    states: Initial states, one per example.
    golds: Gold parses, parallel to `states`.
    state2vec: Model with `begin_update(token_ids, drop=...)`.
    vec2scores: Model with `begin_update(vectors, drop=...)`; receives
        `(vectors, hists)` instead when `hist_feats` is non-zero.
    width, density: Beam parameters passed to ParserBeam.
    hist_feats: If non-zero, the first `hist_feats` entries of each state's
        history are passed to `vec2scores`.
    losses: Accepted for interface compatibility; not used by this function.
    drop: Dropout rate passed to both models.
    RETURNS: Tuple of (per-step gradient arrays, the matching backprop
        callbacks, all beams from the prediction and gold searches).
    """
    global nr_update
    cdef MaxViolation violn
    nr_update += 1
    pbeam = ParserBeam(moves, states, golds,
                       width=width, density=density)
    gbeam = ParserBeam(moves, states, golds,
                       width=width, density=density)
    beam_maps = []
    backprops = []
    violns = [MaxViolation() for _ in range(len(states))]
    for t in range(max_steps):
        if pbeam.is_done and gbeam.is_done:
            break
        # The beam maps let us find the right row in the flattened scores
        # arrays for each state. States are identified by (example id,
        # history). We keep a different beam map for each step (since we'll
        # have a flat scores array for each step). The beam map will let us
        # take the per-state losses, and compute the gradient for each (step,
        # state, class).
        beam_maps.append({})
        # Gather all states from the two beams in a list. Some states may
        # occur in both beams. To figure out which beam each state belonged
        # to, we keep two lists of indices, p_indices and g_indices
        step_states, p_indices, g_indices = get_states(pbeam, gbeam,
                                                       beam_maps[-1],
                                                       nr_update)
        if not step_states:
            break
        # Now that we have our flat list of states, feed them through the model
        token_ids = get_token_ids(step_states, nr_feature)
        vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop)
        if hist_feats:
            hists = numpy.asarray([st.history[:hist_feats]
                                   for st in step_states], dtype='i')
            scores, bp_scores = vec2scores.begin_update((vectors, hists),
                                                        drop=drop)
        else:
            scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)

        # Store the callbacks for the backward pass
        backprops.append((token_ids, bp_vectors, bp_scores))

        # Unpack the flat scores into lists for the two beams. The indices
        # arrays tell us which example and state the scores-row refers to.
        p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')
                    for indices in p_indices]
        g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')
                    for indices in g_indices]
        # Now advance the states in the beams. The gold beam is constrained
        # to follow only gold analyses.
        pbeam.advance(p_scores)
        gbeam.advance(g_scores, follow_gold=True)
        # Track the "maximum violation", to use in the update.
        for i, violn in enumerate(violns):
            violn.check_crf(pbeam[i], gbeam[i])
    histories = []
    # Kept under its own name so the `losses` argument is not overwritten.
    violn_losses = []
    for violn in violns:
        if violn.p_hist:
            histories.append(violn.p_hist + violn.g_hist)
            violn_losses.append(violn.p_probs + violn.g_probs)
        else:
            histories.append([])
            violn_losses.append([])
    states_d_scores = get_gradient(moves.n_moves, beam_maps, histories,
                                   violn_losses)
    beams = list(pbeam.beams) + list(gbeam.beams)
    # Only as many steps as received a gradient have a backprop to run.
    return states_d_scores, backprops[:len(states_d_scores)], beams
2017-08-12 22:47:45 +03:00
def get_states(pbeams, gbeams, beam_map, nr_update):
    """Collect the non-final states of the prediction and gold beams into
    one flat list.

    pbeams, gbeams: Parallel sequences of Beam objects (equal length).
    beam_map: Dict updated in place; maps the key (example id, *history)
        to the state's index in the returned list.
    nr_update: Not used by this function.
    RETURNS: Tuple (states, p_idx, g_idx). p_idx / g_idx hold one integer
        array per example, giving the indices into `states` of the states
        belonging to that example's prediction / gold beam. A gold state
        whose key was already seen in a prediction beam reuses that index.
    """
    cdef Beam pbeam, gbeam
    seen = {}
    states = []
    p_indices = []
    g_indices = []
    assert len(pbeams) == len(gbeams)
    for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)):
        p_rows = []
        g_rows = []
        p_indices.append(p_rows)
        g_indices.append(g_rows)
        for i in range(pbeam.size):
            state = StateClass.borrow(<StateC*>pbeam.at(i))
            if state.is_final():
                continue
            key = tuple([eg_id] + pbeam.histories[i])
            assert key not in seen, (key, seen)
            seen[key] = len(states)
            p_rows.append(len(states))
            states.append(state)
        beam_map.update(seen)
        for i in range(gbeam.size):
            state = StateClass.borrow(<StateC*>gbeam.at(i))
            if state.is_final():
                continue
            key = tuple([eg_id] + gbeam.histories[i])
            if key not in seen:
                # Gold-only state: gets a fresh row. It is recorded in
                # beam_map but not in `seen`.
                g_rows.append(len(states))
                beam_map[key] = len(states)
                states.append(state)
            else:
                g_rows.append(seen[key])
    p_idx = [numpy.asarray(rows, dtype='i') for rows in p_indices]
    g_idx = [numpy.asarray(rows, dtype='i') for rows in g_indices]
    return states, p_idx, g_idx
2017-08-12 22:47:45 +03:00
2017-08-13 01:15:16 +03:00
def get_gradient(nr_class, beam_maps, histories, losses):
    """The global model assigns a loss to each parse. The beam scores
    are additive, so the same gradient is applied to each action
    in the history. This gives the gradient of a single *action*
    for a beam state -- so we have "the gradient of loss for taking
    action i given history H."

    Histories: Each history is a list of actions
               Each candidate has a history
               Each beam has multiple candidates
               Each batch has multiple beams
               So history is list of lists of lists of ints

    nr_class: Number of action classes (columns of each gradient array).
    beam_maps: One dict per step, mapping (example id, *history so far)
        to the row index of that state in the step's scores array.
    histories: Per example, the list of candidate histories.
    losses: Per example, the list of candidate losses, parallel to
        `histories`.
    RETURNS: List with one float32 array per step, of shape
        (largest row index in that step's beam map + 1, nr_class).
        Candidates whose loss is 0 or NaN, or whose history is empty,
        contribute nothing.
    """
    assert len(histories) == len(losses)
    # Collect the candidates that actually contribute a gradient. An empty
    # history has no action to assign a gradient to, so it is skipped too
    # (this also avoids dividing by zero in the length adjustment below).
    candidates = []
    nr_step = 0
    for eg_id, hists in enumerate(histories):
        for loss, hist in zip(losses[eg_id], hists):
            if loss == 0.0 or numpy.isnan(loss) or len(hist) == 0:
                continue
            candidates.append((eg_id, loss, hist))
            nr_step = max(nr_step, len(hist))
    grads = [numpy.zeros((max(beam_maps[i].values()) + 1, nr_class),
                         dtype='f')
             for i in range(nr_step)]
    for eg_id, loss, hist in candidates:
        # Adjust loss for length: histories shorter than the longest one
        # are scaled up by their average per-step loss.
        avg_loss = loss / len(hist)
        loss += avg_loss * (nr_step - len(hist))
        key = (eg_id,)
        for j, clas in enumerate(hist):
            # In step j, at state i action clas resulted in loss
            i = beam_maps[j][key]
            grads[j][i, clas] += loss
            key = key + (clas,)
    return grads