mirror of https://github.com/explosion/spaCy.git (synced 2025-07-04 03:43:09 +03:00)
WIP on rewrite parser

commit b456929bfd
parent cda3b08dd1

@@ -350,9 +350,9 @@ cdef class Begin:
         elif st.B_(1).ent_iob == 3:
             # If the next word is B, we can't B now
             return False
-        elif st.B_(1).sent_start == 1:
-            # Don't allow entities to extend across sentence boundaries
-            return False
+        #elif st.B_(1).sent_start == 1:
+        #    # Don't allow entities to extend across sentence boundaries
+        #    return False
         # Don't allow entities to start on whitespace
         elif Lexeme.get_struct_attr(st.B_(0).lex, IS_SPACE):
             return False

@@ -418,9 +418,9 @@ cdef class In:
             # Otherwise, force acceptance, even if we're across a sentence
             # boundary or the token is whitespace.
             return True
-        elif st.B(1) != -1 and st.B_(1).sent_start == 1:
-            # Don't allow entities to extend across sentence boundaries
-            return False
+        #elif st.B(1) != -1 and st.B_(1).sent_start == 1:
+        #    # Don't allow entities to extend across sentence boundaries
+        #    return False
         else:
             return True

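Both hunks above relax the same constraint in the NER transition system: the clauses that stop an entity from being predicted across a sentence boundary are commented out of the Begin and In validity logic. A minimal pure-Python paraphrase of the Begin rule, with hypothetical token attributes standing in for the real Cython StateC and Lexeme calls:

    def begin_is_valid(tok0, tok1):
        # tok0 and tok1 stand for the first two buffer tokens, st.B_(0) and st.B_(1)
        if tok1.ent_iob == 3:       # the next token is B, so we can't open B here
            return False
        # This is the clause the commit disables, so an entity may now be
        # opened even when the next token starts a new sentence:
        # if tok1.sent_start == 1:
        #     return False
        if tok0.is_space:           # entities may not start on whitespace
            return False
        return True
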
@@ -10,7 +10,7 @@ import random
 from typing import Optional

 import srsly
-from thinc.api import set_dropout_rate, CupyOps
+from thinc.api import set_dropout_rate, CupyOps, get_array_module
 from thinc.extra.search cimport Beam
 import numpy.random
 import numpy

@@ -338,58 +338,79 @@ cdef class Parser(TrainablePipe):
                 losses=losses,
                 beam_density=self.cfg["beam_density"]
             )
-        model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples])
-        final_states = self.moves.init_batch([eg.x for eg in examples])
-        self._predict_states(model, final_states)
-        histories = [list(state.history) for state in final_states]
-        #oracle_histories = [self.moves.get_oracle_sequence(eg) for eg in examples]
-        max_moves = self.cfg["update_with_oracle_cut_size"]
-        if max_moves >= 1:
-            # Chop sequences into lengths of this many words, to make the
-            # batch uniform length.
-            max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
-            states, golds, _ = self._init_gold_batch(
-                examples,
-                histories,
-                max_length=max_moves
-            )
-        else:
-            states, golds, _ = self.moves.init_gold_batch(examples)
-        if not states:
-            return losses
-
-        all_states = list(states)
-        states_golds = list(zip(states, golds))
-        n_moves = 0
-        while states_golds:
-            states, golds = zip(*states_golds)
-            scores, backprop = model.begin_update(states)
-            d_scores = self.get_batch_loss(states, golds, scores, losses)
-            # Note that the gradient isn't normalized by the batch size
-            # here, because our "samples" are really the states...But we
-            # can't normalize by the number of states either, as then we'd
-            # be getting smaller gradients for states in long sequences.
-            backprop(d_scores)
-            # Follow the predicted action
-            self.transition_states(states, scores)
-            states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final()]
-            if max_moves >= 1 and n_moves >= max_moves:
-                break
-            n_moves += 1
-
-        backprop_tok2vec(golds)
+        docs = [eg.x for eg in examples]
+        model, backprop_tok2vec = self.model.begin_update(docs)
+        states = self.moves.init_batch(docs)
+        self._predict_states(states)
+        # I've separated the prediction from getting the batch because
+        # I like the idea of trying to store the histories or maybe compute
+        # them in another process or something. Just walking the states
+        # and transitioning isn't expensive anyway.
+        ids, costs = self._get_ids_and_costs_from_histories(
+            examples,
+            [list(state.history) for state in states]
+        )
+        scores, backprop_states = model.begin_update(ids)
+        d_scores = self.get_loss(scores, costs)
+        d_tokvecs = backprop_states(d_scores)
+        backprop_tok2vec(d_tokvecs)
         if sgd not in (None, False):
             self.finish_update(sgd)
-        self.set_annotations([eg.x for eg in examples], final_states)
+        self.set_annotations(docs, states)
+        losses[self.name] += (d_scores**2).sum()
         # Ugh, this is annoying. If we're working on GPU, we want to free the
         # memory ASAP. It seems that Python doesn't necessarily get around to
         # removing these in time if we don't explicitly delete? It's confusing.
-        del backprop
+        del backprop_states
         del backprop_tok2vec
         model.clear_memory()
         del model
         return losses

+    def _get_ids_and_costs_from_histories(self, examples, histories):
+        cdef StateClass state
+        cdef int clas
+        cdef int nF = self.model.state2vec.nF
+        cdef int nO = self.moves.n_moves
+        cdef int nS = sum([len(history) for history in histories])
+        # ids and costs have one row per state in the whole batch.
+        cdef np.ndarray ids = numpy.zeros((nS, nF), dtype="i")
+        cdef np.ndarray costs = numpy.zeros((nS, nO), dtype="f")
+        cdef Pool mem = Pool()
+        is_valid = <int*>mem.alloc(nO, sizeof(int))
+        c_ids = <int*>ids.data
+        c_costs = <float*>costs.data
+        states = self.moves.init_states([eg.x for eg in examples])
+        cdef int i = 0
+        for eg, state, history in zip(examples, states, histories):
+            gold = self.moves.init_gold(state, eg)
+            for clas in history:
+                # Set a row into the C data of the arrays (which we return)
+                state.c.set_context_tokens(&c_ids[i*nF], nF)
+                self.moves.set_costs(is_valid, &c_costs[i*nO], state.c, gold)
+                action = self.moves.c[clas]
+                action.do(state.c, action.label)
+                state.c.history.push_back(clas)
+                i += 1
+        # If the model is on GPU, copy the costs to device.
+        costs = self.model.ops.asarray(costs)
+        return ids, costs
+
+    def get_loss(self, scores, costs):
+        xp = get_array_module(scores)
+        best_costs = costs.min(axis=1, keepdims=True)
+        is_gold = costs <= costs.min(axis=1, keepdims=True)
+        gscores = scores[is_gold]
+        max_ = scores.max(axis=1)
+        gmax = gscores.max(axis=1, keepdims=True)
+        exp_scores = xp.exp(scores - max_)
+        exp_gscores = xp.exp(gscores - gmax)
+        Z = exp_scores.sum(axis=1, keepdims=True)
+        gZ = exp_gscores.sum(axis=1, keepdims=True)
+        d_scores = exp_scores / Z
+        d_scores[is_gold] -= exp_gscores / gZ
+        return d_scores
+
     def rehearse(self, examples, sgd=None, losses=None, **cfg):
         """Perform a "rehearsal" update, to prevent catastrophic forgetting."""
         if losses is None:

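Read as a whole, the rewritten update() trains on the parser's own greedy trajectories in a single batched step: decode every state to completion, replay the recorded action histories to collect per-step feature ids and oracle costs, score all of those steps at once, and backpropagate one gradient. A simplified Python paraphrase of that flow (the method names are taken from the diff above; this is a sketch, not the actual Cython implementation):

    def update_sketch(parser, examples):
        docs = [eg.x for eg in examples]
        # tok2vec forward pass over the whole docs
        model, backprop_tok2vec = parser.model.begin_update(docs)
        # greedy decoding; each state records the sequence of actions it took
        states = parser.moves.init_batch(docs)
        parser._predict_states(states)
        histories = [list(state.history) for state in states]
        # replay each history step by step, collecting feature ids and oracle costs
        ids, costs = parser._get_ids_and_costs_from_histories(examples, histories)
        # one batched scoring pass over every (state, step) row, then one loss
        scores, backprop_states = model.begin_update(ids)
        d_scores = parser.get_loss(scores, costs)
        d_tokvecs = backprop_states(d_scores)
        backprop_tok2vec(d_tokvecs)
        return (d_scores ** 2).sum()  # the quantity added to losses[parser.name]
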
@@ -460,36 +481,6 @@ cdef class Parser(TrainablePipe):
         if sgd is not None:
             self.finish_update(sgd)

-    def get_batch_loss(self, states, golds, float[:, ::1] scores, losses):
-        cdef StateClass state
-        cdef Pool mem = Pool()
-        cdef int i
-
-        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
-        assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
-
-        is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
-        costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
-        cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
-                                               dtype='f', order='C')
-        c_d_scores = <float*>d_scores.data
-        unseen_classes = self.model.attrs["unseen_classes"]
-        for i, (state, gold) in enumerate(zip(states, golds)):
-            memset(is_valid, 0, self.moves.n_moves * sizeof(int))
-            memset(costs, 0, self.moves.n_moves * sizeof(float))
-            self.moves.set_costs(is_valid, costs, state.c, gold)
-            for j in range(self.moves.n_moves):
-                if costs[j] <= 0.0 and j in unseen_classes:
-                    unseen_classes.remove(j)
-            cpu_log_loss(c_d_scores,
-                costs, is_valid, &scores[i, 0], d_scores.shape[1])
-            c_d_scores += d_scores.shape[1]
-        # Note that we don't normalize this. See comment in update() for why.
-        if losses is not None:
-            losses.setdefault(self.name, 0.)
-            losses[self.name] += (d_scores**2).sum()
-        return d_scores
-
     def set_output(self, nO):
         self.model.attrs["resize_output"](self.model, nO)

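The deleted get_batch_loss computed the gradient one state at a time in C via cpu_log_loss; the get_loss method added above does the equivalent work with batched array operations. For one step with action scores s and gold set G (the actions with minimal, typically zero, cost), it corresponds roughly to the gradient of the negative log of the probability mass assigned to the gold actions:

    L = -log( sum_{a in G} exp(s_a) / sum_{b} exp(s_b) )
    dL/ds_a = exp(s_a) / sum_{b} exp(s_b) - [a in G] * exp(s_a) / sum_{g in G} exp(s_g)

which is what d_scores = exp_scores / Z, with exp_gscores / gZ subtracted on the gold entries, computes; the max subtractions are only for numerical stability.
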
@@ -586,42 +577,3 @@ cdef class Parser(TrainablePipe):
         except AttributeError:
             raise ValueError(Errors.E149) from None
         return self
-
-    def _init_gold_batch(self, examples, oracle_histories, max_length):
-        """Make a square batch, of length equal to the shortest transition
-        sequence or a cap. A long
-        doc will get multiple states. Let's say we have a doc of length 2*N,
-        where N is the shortest doc. We'll make two states, one representing
-        long_doc[:N], and another representing long_doc[N:]."""
-        cdef:
-            StateClass start_state
-            StateClass state
-            Transition action
-        all_states = self.moves.init_batch([eg.predicted for eg in examples])
-        assert len(all_states) == len(examples) == len(oracle_histories)
-        states = []
-        golds = []
-        for state, eg, history in zip(all_states, examples, oracle_histories):
-            if not history:
-                continue
-            if not self.moves.has_gold(eg):
-                continue
-            gold = self.moves.init_gold(state, eg)
-            if len(history) < max_length:
-                states.append(state)
-                golds.append(gold)
-                continue
-            for i in range(0, len(history), max_length):
-                if state.is_final():
-                    break
-                start_state = state.copy()
-                for clas in history[i:i+max_length]:
-                    action = self.moves.c[clas]
-                    action.do(state.c, action.label)
-                    state.c.history.push_back(clas)
-                    if state.is_final():
-                        break
-                if self.moves.has_gold(eg, start_state.B(0), state.B(0)):
-                    states.append(start_state)
-                    golds.append(gold)
-        return states, golds, max_length

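The deleted _init_gold_batch implemented the "oracle cut" batching its docstring describes: a long transition history was chopped into segments of at most max_length actions, and each segment became its own training state, started from a copy of the parser state reached at that point in the history. A small illustration of the chopping with made-up numbers (a 12-action history and max_length of 5):

    history = list(range(12))   # 12 oracle actions for one long doc
    max_length = 5
    segments = [history[i:i + max_length] for i in range(0, len(history), max_length)]
    # segments == [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11]]
    # one training state per segment, so a doc much longer than the cap
    # contributes several short states rather than one long sequence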