mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 18:56:36 +03:00
Keep working through errors
This commit is contained in:
parent
c538eaf1c8
commit
b67dd0cf89
|
@ -133,13 +133,14 @@ def forward(model, docs_moves: Tuple[List[Doc], TransitionSystem], is_train: boo
|
||||||
next_states = [s for s in states if not s.is_final()]
|
next_states = [s for s in states if not s.is_final()]
|
||||||
unseen_mask = _get_unseen_mask(model)
|
unseen_mask = _get_unseen_mask(model)
|
||||||
ids = numpy.zeros((len(states), nF), dtype="i")
|
ids = numpy.zeros((len(states), nF), dtype="i")
|
||||||
|
arange = model.ops.xp.arange(nF)
|
||||||
while next_states:
|
while next_states:
|
||||||
ids = ids[: len(next_states)]
|
ids = ids[: len(next_states)]
|
||||||
for i, state in enumerate(next_states):
|
for i, state in enumerate(next_states):
|
||||||
state.set_context_tokens(ids, i, nF)
|
state.set_context_tokens(ids, i, nF)
|
||||||
# Sum the state features, add the bias and apply the activation (maxout)
|
# Sum the state features, add the bias and apply the activation (maxout)
|
||||||
# to create the state vectors.
|
# to create the state vectors.
|
||||||
preacts = _sum_state_features(ops, feats, ids)
|
preacts = feats[ids, arange].sum(axis=1) # type: ignore
|
||||||
preacts += lower_b
|
preacts += lower_b
|
||||||
statevecs, which = ops.maxout(preacts)
|
statevecs, which = ops.maxout(preacts)
|
||||||
# Multiply the state-vector by the scores weights and add the bias,
|
# Multiply the state-vector by the scores weights and add the bias,
|
||||||
|
@ -152,7 +153,7 @@ def forward(model, docs_moves: Tuple[List[Doc], TransitionSystem], is_train: boo
|
||||||
all_scores.append(scores)
|
all_scores.append(scores)
|
||||||
if is_train:
|
if is_train:
|
||||||
# Remember intermediate results for the backprop.
|
# Remember intermediate results for the backprop.
|
||||||
all_ids.append(ids)
|
all_ids.append(ids.copy())
|
||||||
all_statevecs.append(statevecs)
|
all_statevecs.append(statevecs)
|
||||||
all_which.append(which)
|
all_which.append(which)
|
||||||
|
|
||||||
|
@ -175,7 +176,7 @@ def forward(model, docs_moves: Tuple[List[Doc], TransitionSystem], is_train: boo
|
||||||
# Now calculate d_statevecs, by backproping through the upper linear layer.
|
# Now calculate d_statevecs, by backproping through the upper linear layer.
|
||||||
d_statevecs = model.ops.gemm(d_scores, upper_W)
|
d_statevecs = model.ops.gemm(d_scores, upper_W)
|
||||||
# Backprop through the maxout activation
|
# Backprop through the maxout activation
|
||||||
d_preacts = model.ops.backprop_maxount(d_statevecs, which, model.get_dim("nP"))
|
d_preacts = model.ops.backprop_maxout(d_statevecs, which, model.get_dim("nP"))
|
||||||
# We don't need to backprop the summation, because we pass back the IDs instead
|
# We don't need to backprop the summation, because we pass back the IDs instead
|
||||||
d_tokvecs = backprop_feats((d_preacts, ids))
|
d_tokvecs = backprop_feats((d_preacts, ids))
|
||||||
return (backprop_tok2vec(d_tokvecs), None)
|
return (backprop_tok2vec(d_tokvecs), None)
|
||||||
|
@ -191,23 +192,6 @@ def _get_unseen_mask(model: Model) -> Floats1d:
|
||||||
return mask
|
return mask
|
||||||
|
|
||||||
|
|
||||||
def _sum_state_features(ops: Ops, feats: Floats3d, ids: Ints2d, _arange=[]) -> Floats2d:
|
|
||||||
# Here's what we're trying to implement here:
|
|
||||||
#
|
|
||||||
# for i in range(ids.shape[0]):
|
|
||||||
# for j in range(ids.shape[1]):
|
|
||||||
# output[i] += feats[ids[i, j], j]
|
|
||||||
#
|
|
||||||
# The arange thingy here is highly weird to me, but apparently
|
|
||||||
# it's how it works. If you squint a bit at the loop above I guess
|
|
||||||
# it makes sense?
|
|
||||||
if not _arange:
|
|
||||||
_arange.append(ops.xp.arange(ids.shape[1]))
|
|
||||||
if _arange[0].size != ids.shape[1]:
|
|
||||||
_arange[0] = ops.xp.arange(ids.shape[1])
|
|
||||||
return feats[ids, _arange[0]].sum(axis=1) # type: ignore
|
|
||||||
|
|
||||||
|
|
||||||
def _forward_precomputable_affine(model, X: Floats2d, is_train: bool):
|
def _forward_precomputable_affine(model, X: Floats2d, is_train: bool):
|
||||||
|
|
||||||
W: Floats4d = model.get_param("lower_W")
|
W: Floats4d = model.get_param("lower_W")
|
||||||
|
@ -265,7 +249,7 @@ def _backprop_precomputable_affine_padding(model, dY, ids):
|
||||||
nB = dY.shape[0]
|
nB = dY.shape[0]
|
||||||
nF = model.get_dim("nF")
|
nF = model.get_dim("nF")
|
||||||
nP = model.get_dim("nP")
|
nP = model.get_dim("nP")
|
||||||
nO = model.get_dim("nO")
|
nH = model.get_dim("nH")
|
||||||
# Backprop the "padding", used as a filler for missing values.
|
# Backprop the "padding", used as a filler for missing values.
|
||||||
# Values that are missing are set to -1, and each state vector could
|
# Values that are missing are set to -1, and each state vector could
|
||||||
# have multiple missing values. The padding has different values for
|
# have multiple missing values. The padding has different values for
|
||||||
|
@ -280,8 +264,8 @@ def _backprop_precomputable_affine_padding(model, dY, ids):
|
||||||
#
|
#
|
||||||
# (ids < 0).T @ dY
|
# (ids < 0).T @ dY
|
||||||
mask = model.ops.asarray(ids < 0, dtype="f")
|
mask = model.ops.asarray(ids < 0, dtype="f")
|
||||||
d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True)
|
d_pad = model.ops.gemm(mask, dY.reshape(nB, nH * nP), trans1=True)
|
||||||
return d_pad.reshape((1, nF, nO, nP))
|
return d_pad.reshape((1, nF, nH, nP))
|
||||||
|
|
||||||
|
|
||||||
def _infer_nO(Y: Optional[Tuple[List[State], List[Floats2d]]]) -> Optional[int]:
|
def _infer_nO(Y: Optional[Tuple[List[State], List[Floats2d]]]) -> Optional[int]:
|
||||||
|
|
|
@ -279,6 +279,7 @@ cdef void c_transition_batch(TransitionSystem moves, StateC** states, const floa
|
||||||
else:
|
else:
|
||||||
action = moves.c[guess]
|
action = moves.c[guess]
|
||||||
action.do(states[i], action.label)
|
action.do(states[i], action.label)
|
||||||
|
states[i].history.push_back(guess)
|
||||||
free(is_valid)
|
free(is_valid)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -239,8 +239,10 @@ class Parser(TrainablePipe):
|
||||||
set_dropout_rate(self.model, drop)
|
set_dropout_rate(self.model, drop)
|
||||||
docs = [eg.x for eg in examples]
|
docs = [eg.x for eg in examples]
|
||||||
(states, scores), backprop_scores = self.model.begin_update((docs, self.moves))
|
(states, scores), backprop_scores = self.model.begin_update((docs, self.moves))
|
||||||
|
if sum(s.shape[0] for s in scores) == 0:
|
||||||
|
return losses
|
||||||
d_scores = self.get_loss((states, scores), examples)
|
d_scores = self.get_loss((states, scores), examples)
|
||||||
backprop_scores(d_scores)
|
backprop_scores((states, d_scores))
|
||||||
if sgd not in (None, False):
|
if sgd not in (None, False):
|
||||||
self.finish_update(sgd)
|
self.finish_update(sgd)
|
||||||
losses[self.name] += (d_scores**2).sum()
|
losses[self.name] += (d_scores**2).sum()
|
||||||
|
@ -252,22 +254,24 @@ class Parser(TrainablePipe):
|
||||||
|
|
||||||
def get_loss(self, states_scores, examples):
|
def get_loss(self, states_scores, examples):
|
||||||
states, scores = states_scores
|
states, scores = states_scores
|
||||||
|
scores = self.model.ops.xp.vstack(scores)
|
||||||
costs = self._get_costs_from_histories(
|
costs = self._get_costs_from_histories(
|
||||||
examples,
|
examples,
|
||||||
[list(state.history) for state in states]
|
[list(state.history) for state in states]
|
||||||
)
|
)
|
||||||
xp = get_array_module(scores)
|
xp = get_array_module(scores)
|
||||||
best_costs = costs.min(axis=1, keepdims=True)
|
best_costs = costs.min(axis=1, keepdims=True)
|
||||||
is_gold = costs <= costs.min(axis=1, keepdims=True)
|
gscores = scores.copy()
|
||||||
gscores = scores[is_gold]
|
min_score = scores.min()
|
||||||
max_ = scores.max(axis=1)
|
gscores[costs > best_costs] = min_score
|
||||||
|
max_ = scores.max(axis=1, keepdims=True)
|
||||||
gmax = gscores.max(axis=1, keepdims=True)
|
gmax = gscores.max(axis=1, keepdims=True)
|
||||||
exp_scores = xp.exp(scores - max_)
|
exp_scores = xp.exp(scores - max_)
|
||||||
exp_gscores = xp.exp(gscores - gmax)
|
exp_gscores = xp.exp(gscores - gmax)
|
||||||
Z = exp_scores.sum(axis=1, keepdims=True)
|
Z = exp_scores.sum(axis=1, keepdims=True)
|
||||||
gZ = exp_gscores.sum(axis=1, keepdims=True)
|
gZ = exp_gscores.sum(axis=1, keepdims=True)
|
||||||
d_scores = exp_scores / Z
|
d_scores = exp_scores / Z
|
||||||
d_scores[is_gold] -= exp_gscores / gZ
|
d_scores -= (costs <= best_costs) * (exp_gscores / gZ)
|
||||||
return d_scores
|
return d_scores
|
||||||
|
|
||||||
def _get_costs_from_histories(self, examples, histories):
|
def _get_costs_from_histories(self, examples, histories):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user