mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
569cc98982
* Add load_from_config function * Add train_from_config script * Merge configs and expose via spacy.config * Fix script * Suggest create_evaluation_callback * Hard-code for NER * Fix errors * Register command * Add TODO * Update train-from-config todos * Fix imports * Allow delayed setting of parser model nr_class * Get train-from-config working * Tidy up and fix scores and printing * Hide traceback if cancelled * Fix weighted score formatting * Fix score formatting * Make output_path optional * Add Tok2Vec component * Tidy up and add tok2vec_tensors * Add option to copy docs in nlp.update * Copy docs in nlp.update * Adjust nlp.update() for set_annotations * Don't shuffle pipes in nlp.update, decruft * Support set_annotations arg in component update * Support set_annotations in parser update * Add get_gradients method * Add get_gradients to parser * Update errors.py * Fix problems caused by merge * Add _link_components method in nlp * Add concept of 'listeners' and ControlledModel * Support optional attributes arg in ControlledModel * Try having tok2vec component in pipeline * Fix tok2vec component * Fix config * Fix tok2vec * Update for Example * Update for Example * Update config * Add eg2doc util * Update and add schemas/types * Update schemas * Fix nlp.update * Fix tagger * Remove hacks from train-from-config * Remove hard-coded config str * Calculate loss in tok2vec component * Tidy up and use function signatures instead of models * Support union types for registry models * Minor cleaning in Language.update * Make ControlledModel specifically Tok2VecListener * Fix train_from_config * Fix tok2vec * Tidy up * Add function for bilstm tok2vec * Fix type * Fix syntax * Fix pytorch optimizer * Add example configs * Update for thinc describe changes * Update for Thinc changes * Update for dropout/sgd changes * Update for dropout/sgd changes * Unhack gradient update * Work on refactoring _ml * Remove _ml.py module * WIP upgrade cli scripts for thinc * Move some _ml 
stuff to util * Import link_vectors from util * Update train_from_config * Import from util * Import from util * Temporarily add ml.component_models module * Move ml methods * Move typedefs * Update load vectors * Update gitignore * Move imports * Add PrecomputableAffine * Fix imports * Fix imports * Fix imports * Fix missing imports * Update CLI scripts * Update spacy.language * Add stubs for building the models * Update model definition * Update create_default_optimizer * Fix import * Fix comment * Update imports in tests * Update imports in spacy.cli * Fix import * fix obsolete thinc imports * update srsly pin * from thinc to ml_datasets for example data such as imdb * update ml_datasets pin * using STATE.vectors * small fix * fix Sentencizer.pipe * black formatting * rename Affine to Linear as in thinc * set validate explicitely to True * rename with_square_sequences to with_list2padded * rename with_flatten to with_list2array * chaining layernorm * small fixes * revert Optimizer import * build_nel_encoder with new thinc style * fixes using model's get and set methods * Tok2Vec in component models, various fixes * fix up legacy tok2vec code * add model initialize calls * add in build_tagger_model * small fixes * setting model dims * fixes for ParserModel * various small fixes * initialize thinc Models * fixes * consistent naming of window_size * fixes, removing set_dropout * work around Iterable issue * remove legacy tok2vec * util fix * fix forward function of tok2vec listener * more fixes * trying to fix PrecomputableAffine (not succesful yet) * alloc instead of allocate * add morphologizer * rename residual * rename fixes * Fix predict function * Update parser and parser model * fixing few more tests * Fix precomputable affine * Update component model * Update parser model * Move backprop padding to own function, for test * Update test * Fix p. 
affine * Update NEL * build_bow_text_classifier and extract_ngrams * Fix parser init * Fix test add label * add build_simple_cnn_text_classifier * Fix parser init * Set gpu off by default in example * Fix tok2vec listener * Fix parser model * Small fixes * small fix for PyTorchLSTM parameters * revert my_compounding hack (iterable fixed now) * fix biLSTM * Fix uniqued * PyTorchRNNWrapper fix * small fixes * use helper function to calculate cosine loss * small fixes for build_simple_cnn_text_classifier * putting dropout default at 0.0 to ensure the layer gets built * using thinc util's set_dropout_rate * moving layer normalization inside of maxout definition to optimize dropout * temp debugging in NEL * fixed NEL model by using init defaults ! * fixing after set_dropout_rate refactor * proper fix * fix test_update_doc after refactoring optimizers in thinc * Add CharacterEmbed layer * Construct tagger Model * Add missing import * Remove unused stuff * Work on textcat * fix test (again :)) after optimizer refactor * fixes to allow reading Tagger from_disk without overwriting dimensions * don't build the tok2vec prematuraly * fix CharachterEmbed init * CharacterEmbed fixes * Fix CharacterEmbed architecture * fix imports * renames from latest thinc update * one more rename * add initialize calls where appropriate * fix parser initialization * Update Thinc version * Fix errors, auto-format and tidy up imports * Fix validation * fix if bias is cupy array * revert for now * ensure it's a numpy array before running bp in ParserStepModel * no reason to call require_gpu twice * use CupyOps.to_numpy instead of cupy directly * fix initialize of ParserModel * remove unnecessary import * fixes for CosineDistance * fix device renaming * use refactored loss functions (Thinc PR 251) * overfitting test for tagger * experimental settings for the tagger: avoid zero-init and subword normalization * clean up tagger overfitting test * use previous default value for nP * remove toy config 
* bringing layernorm back (had a bug - fixed in thinc) * revert setting nP explicitly * remove setting default in constructor * restore values as they used to be * add overfitting test for NER * add overfitting test for dep parser * add overfitting test for textcat * fixing init for linear (previously affine) * larger eps window for textcat * ensure doc is not None * Require newer thinc * Make float check vaguer * Slop the textcat overfit test more * Fix textcat test * Fix exclusive classes for textcat * fix after renaming of alloc methods * fixing renames and mandatory arguments (staticvectors WIP) * upgrade to thinc==8.0.0.dev3 * refer to vocab.vectors directly instead of its name * rename alpha to learn_rate * adding hashembed and staticvectors dropout * upgrade to thinc 8.0.0.dev4 * add name back to avoid warning W020 * thinc dev4 * update srsly * using thinc 8.0.0a0 ! Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com> Co-authored-by: Ines Montani <ines@ines.io>
331 lines
12 KiB
Cython
331 lines
12 KiB
Cython
# cython: infer_types=True
|
|
# cython: profile=True
|
|
cimport numpy as np
|
|
import numpy
|
|
from cpython.ref cimport PyObject, Py_XDECREF
|
|
from thinc.extra.search cimport Beam
|
|
from thinc.extra.search import MaxViolation
|
|
from thinc.extra.search cimport MaxViolation
|
|
|
|
from ..typedefs cimport hash_t, class_t
|
|
from .transition_system cimport TransitionSystem, Transition
|
|
from ..gold cimport GoldParse
|
|
from ..errors import Errors
|
|
from .stateclass cimport StateC, StateClass
|
|
|
|
|
|
# These are passed as callbacks to thinc.search.Beam
|
|
cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
    """Beam callback: clone the source state into the destination slot and
    apply transition `clas` to the clone.

    _dest, _src: Untyped pointers that are cast to StateC* here.
    clas: Index of the transition to apply.
    _moves: Untyped pointer that is cast to a const Transition array and
        indexed by `clas`.
    """
    target = <StateC*>_dest
    origin = <StateC*>_src
    transitions = <const Transition*>_moves
    target.clone(origin)
    # Apply the selected action using its own label, then record the action
    # in the destination state's history.
    transitions[clas].do(target, transitions[clas].label)
    target.push_hist(clas)
|
|
|
|
|
|
cdef int check_final_state(void* _state, void* extra_args) except -1:
    """Beam callback: return the result of is_final() for the StateC that
    `_state` points to.

    extra_args is part of the callback signature but is not read here.
    """
    return (<StateC*>_state).is_final()
|
|
|
|
|
|
cdef hash_t hash_state(void* _state, void* _) except 0:
    """Beam callback: return a hash for the StateC that `_state` points to.

    Non-final states return their own hash(); every final state returns the
    constant 1. The second argument is unused.
    """
    st = <StateC*>_state
    if not st.is_final():
        return st.hash()
    # 0 is the declared error value (`except 0`), so final states use 1.
    return 1
|
|
|
|
|
|
def collect_states(beams):
    """Return a list with one state per entry of `beams`.

    Entries that are already StateClass instances are passed through
    unchanged. Any other entry is treated as a Beam, and a StateClass
    borrowed from its item at index 0 is used instead.
    """
    cdef StateClass borrowed
    cdef Beam as_beam
    collected = []
    for entry in beams:
        if not isinstance(entry, StateClass):
            as_beam = entry
            borrowed = StateClass.borrow(<StateC*>as_beam.at(0))
            collected.append(borrowed)
        else:
            collected.append(entry)
    return collected
|
|
|
|
|
|
cdef class ParserBeam(object):
    """A batch of beams, one per input state, that are advanced together.

    moves: The TransitionSystem used to validate, cost and apply actions.
    states: The initial states the beams were created from.
    golds: Gold-standard parses, indexed like `states`, or None.
    beams: The list of Beam objects, one per entry of `states`.
    dones: Per-beam flags; a beam whose flag is true is skipped by advance().
    """
    cdef public TransitionSystem moves
    cdef public object states
    cdef public object golds
    cdef public object beams
    cdef public object dones

    def __init__(self, TransitionSystem moves, states, golds,
                 int width, float density=0.):
        """Create and initialize one Beam of the given width per state.

        density is forwarded to each Beam as `min_density`.
        """
        cdef Beam beam
        cdef StateClass state
        cdef StateC* st
        self.moves = moves
        self.states = states
        self.golds = golds
        self.beams = []
        for state in states:
            beam = Beam(self.moves.n_moves, width, min_density=density)
            beam.initialize(self.moves.init_beam_state,
                            self.moves.del_beam_state, state.c.length,
                            state.c._sent)
            # Copy the source state's offset onto every slot of the new beam.
            for i in range(beam.width):
                st = <StateC*>beam.at(i)
                st.offset = state.c.offset
            self.beams.append(beam)
        self.dones = [False] * len(self.beams)

    @property
    def is_done(self):
        """True when every beam is done or has its `dones` flag set."""
        for idx, beam in enumerate(self.beams):
            if not (beam.is_done or self.dones[idx]):
                return False
        return True

    def __getitem__(self, i):
        """Return the beam at position i."""
        return self.beams[i]

    def __len__(self):
        """Return the number of beams."""
        return len(self.beams)

    def advance(self, scores, follow_gold=False):
        """Advance every unfinished beam by one step.

        scores: Per-beam score arrays, indexed like `self.beams`. Beams whose
            score array is empty are skipped.
        follow_gold: Forwarded to _set_costs when gold parses are available.
        """
        cdef Beam beam
        for eg_id, beam in enumerate(self.beams):
            if beam.is_done or not scores[eg_id].size or self.dones[eg_id]:
                continue
            self._set_scores(beam, scores[eg_id])
            if self.golds is not None:
                self._set_costs(beam, self.golds[eg_id], follow_gold=follow_gold)
            beam.advance(transition_state, hash_state, <void*>self.moves.c)
            beam.check_done(check_final_state, NULL)
            # This handles the non-monotonic stuff for the parser.
            if not beam.is_done or self.golds is None:
                continue
            for slot in range(beam.size):
                candidate = StateClass.borrow(<StateC*>beam.at(slot))
                if not candidate.is_final():
                    continue
                try:
                    # A finished candidate that matches the gold parse gets
                    # its loss reset to zero.
                    if self.moves.is_gold_parse(candidate, self.golds[eg_id]):
                        beam._states[slot].loss = 0.0
                except NotImplementedError:
                    # The transition system does not implement the check;
                    # stop looking at this beam.
                    break

    def _set_scores(self, Beam beam, float[:, ::1] scores):
        """Copy rows of `scores` into the beam and refresh action validity.

        Only the first min(scores.shape[0], beam.size) slots are touched.
        Slots holding a final state get their scores and costs zeroed instead.
        """
        cdef float* c_scores = &scores[0, 0]
        cdef int nr_state = min(scores.shape[0], beam.size)
        cdef int nr_class = scores.shape[1]
        for i in range(nr_state):
            state = <StateC*>beam.at(i)
            if state.is_final():
                for j in range(beam.nr_class):
                    beam.scores[i][j] = 0
                    beam.costs[i][j] = 0
            else:
                # `scores` is C-contiguous, so row i starts at i * nr_class.
                for j in range(nr_class):
                    beam.scores[i][j] = c_scores[i * nr_class + j]
                self.moves.set_valid(beam.is_valid[i], state)

    def _set_costs(self, Beam beam, GoldParse gold, int follow_gold=False):
        """Fill in action costs against `gold` for each non-final beam slot.

        With follow_gold set, every action whose cost is above the lowest
        cost found among valid actions (starting from 0) is marked invalid.
        """
        for i in range(beam.size):
            state = StateClass.borrow(<StateC*>beam.at(i))
            if state.is_final():
                continue
            self.moves.set_costs(beam.is_valid[i], beam.costs[i],
                                 state, gold)
            if not follow_gold:
                continue
            min_cost = 0
            for j in range(beam.nr_class):
                if beam.is_valid[i][j] and beam.costs[i][j] < min_cost:
                    min_cost = beam.costs[i][j]
            for j in range(beam.nr_class):
                if beam.costs[i][j] > min_cost:
                    beam.is_valid[i][j] = 0
|
|
|
|
|
|
def get_token_ids(states, int n_tokens):
    """Build a C-ordered int32 array of shape (len(states), n_tokens).

    Rows belonging to non-final states are filled in by the state's
    set_context_tokens(); rows belonging to final states are set to -1.
    """
    cdef StateClass state
    cdef np.ndarray ids = numpy.zeros((len(states), n_tokens),
                                      dtype='int32', order='C')
    # Raw pointer to the start of the current row of `ids`.
    row_ptr = <int*>ids.data
    for i, state in enumerate(states):
        if state.is_final():
            ids[i] = -1
        else:
            state.c.set_context_tokens(row_ptr, n_tokens)
        # Step the pointer forward by one row.
        row_ptr += ids.shape[1]
    return ids
|
|
|
|
|
|
# Module-level counter; update_beam() increments it once per call and passes
# the current value on to get_states().
nr_update = 0
|
|
|
|
|
|
def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
                states, golds,
                state2vec, vec2scores,
                int width, losses=None, drop=0.,
                early_update=True, beam_density=0.0):
    """Run a predicted beam and a gold-constrained beam side by side and
    compute per-step score gradients from the tracked violations.

    At most `max_steps` steps are taken. On each step the states of both
    beams are flattened, scored via `state2vec` and `vec2scores`, and both
    beams are advanced; the gold beam is advanced with follow_gold=True.

    The `losses` and `early_update` arguments are accepted but are not read
    by this function.

    RETURNS: A tuple (states_d_scores, backprops, beams), where
        states_d_scores is the output of get_gradient(), backprops holds the
        (token_ids, bp_vectors, bp_scores) triples truncated to the same
        length, and beams is the predicted beams followed by the gold beams.
    """
    global nr_update
    cdef MaxViolation violn
    cdef StateClass state
    nr_update += 1
    pbeam = ParserBeam(moves, states, golds, width=width, density=beam_density)
    gbeam = ParserBeam(moves, states, golds, width=width, density=beam_density)
    beam_maps = []
    backprops = []
    violns = [MaxViolation() for _ in range(len(states))]
    for t in range(max_steps):
        if pbeam.is_done and gbeam.is_done:
            break
        # The beam maps let us find the right row in the flattened scores
        # arrays for each state. States are identified by (example id,
        # history). We keep a different beam map for each step (since we'll
        # have a flat scores array for each step). The beam map will let us
        # take the per-state losses, and compute the gradient for each (step,
        # state, class).
        step_map = {}
        beam_maps.append(step_map)
        # Gather all states from the two beams in a list. Some states may
        # occur in both beams. To figure out which beam each state belonged
        # to, we keep two lists of indices, p_indices and g_indices.
        flat_states, p_indices, g_indices = get_states(pbeam, gbeam, step_map,
                                                       nr_update)
        if not flat_states:
            break
        # Now that we have our flat list of states, feed them through the model
        token_ids = get_token_ids(flat_states, nr_feature)
        vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop)
        scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)

        # Store the callbacks for the backward pass
        backprops.append((token_ids, bp_vectors, bp_scores))

        # Unpack the flat scores into lists for the two beams. The indices
        # arrays tell us which example and state the scores-row refers to.
        p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')
                    for indices in p_indices]
        g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')
                    for indices in g_indices]
        # Now advance the states in the beams. The gold beam is constrained
        # to follow only gold analyses.
        pbeam.advance(p_scores)
        gbeam.advance(g_scores, follow_gold=True)
        # Track the "maximum violation", to use in the update.
        for i, violn in enumerate(violns):
            violn.check_crf(pbeam[i], gbeam[i])
    histories = []
    violn_losses = []
    for violn in violns:
        if not violn.p_hist:
            # No violation recorded for this example: contribute nothing.
            histories.append([])
            violn_losses.append([])
        else:
            histories.append(violn.p_hist + violn.g_hist)
            violn_losses.append(violn.p_probs + violn.g_probs)
    states_d_scores = get_gradient(moves.n_moves, beam_maps, histories,
                                   violn_losses)
    beams = list(pbeam.beams) + list(gbeam.beams)
    return states_d_scores, backprops[:len(states_d_scores)], beams
|
|
|
|
|
|
def get_states(pbeams, gbeams, beam_map, nr_update):
    """Flatten the non-final states of paired predicted/gold beams.

    Each state is keyed by (example id, *history). Keys from the predicted
    beam must be unique. A gold-beam state whose key was already seen in the
    predicted beam reuses that row instead of being added again. `beam_map`
    is filled in place with key -> row index. `nr_update` is not read.

    RETURNS: A tuple (states, p_idx, g_idx): the flat list of states, and for
        each example an integer array of the rows used by the predicted and
        the gold beam respectively.
    RAISES: ValueError if the two beam collections differ in length, or if
        a predicted-beam key occurs twice.
    """
    cdef Beam pbeam, gbeam
    if len(pbeams) != len(gbeams):
        raise ValueError(Errors.E079.format(pbeams=len(pbeams), gbeams=len(gbeams)))
    row_of_key = {}
    flat_states = []
    pred_rows = []
    gold_rows = []
    for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)):
        eg_pred_rows = []
        eg_gold_rows = []
        pred_rows.append(eg_pred_rows)
        gold_rows.append(eg_gold_rows)
        for i in range(pbeam.size):
            state = StateClass.borrow(<StateC*>pbeam.at(i))
            if state.is_final():
                continue
            key = tuple([eg_id] + pbeam.histories[i])
            if key in row_of_key:
                raise ValueError(Errors.E080.format(key=key))
            row_of_key[key] = len(flat_states)
            eg_pred_rows.append(len(flat_states))
            flat_states.append(state)
        beam_map.update(row_of_key)
        for i in range(gbeam.size):
            state = StateClass.borrow(<StateC*>gbeam.at(i))
            if state.is_final():
                continue
            key = tuple([eg_id] + gbeam.histories[i])
            if key in row_of_key:
                # Same state as in the predicted beam: share its row.
                eg_gold_rows.append(row_of_key[key])
            else:
                eg_gold_rows.append(len(flat_states))
                beam_map[key] = len(flat_states)
                flat_states.append(state)
    p_idx = [numpy.asarray(rows, dtype='i') for rows in pred_rows]
    g_idx = [numpy.asarray(rows, dtype='i') for rows in gold_rows]
    return flat_states, p_idx, g_idx
|
|
|
|
|
|
def get_gradient(nr_class, beam_maps, histories, losses):
    """The global model assigns a loss to each parse. The beam scores
    are additive, so the same gradient is applied to each action
    in the history. This gives the gradient of a single *action*
    for a beam state -- so we have "the gradient of loss for taking
    action i given history H."

    Histories: Each history is a list of actions
    Each candidate has a history
    Each beam has multiple candidates
    Each batch has multiple beams
    So history is list of lists of lists of ints

    nr_class: Number of columns of every gradient array.
    beam_maps: One dict per step, mapping (example id, *history prefix) to
        the row of that state in the step's gradient array.
    histories: Per example, the list of candidate histories.
    losses: Per example, the list of candidate losses, aligned with
        `histories`. Candidates whose loss is 0.0 or NaN are ignored.

    RETURNS: A list with one float32 array per step, of shape
        (highest row index in that step's beam map + 1, nr_class). The list
        is empty when there are no examples or no candidate has a usable
        loss.
    RAISES: ValueError if `histories` and `losses` differ in length.
    """
    # Validate before anything indexes `losses` by example id, so that a
    # mismatch is reported as the intended error rather than an IndexError.
    if len(histories) != len(losses):
        raise ValueError(Errors.E081.format(n_hist=len(histories), losses=len(losses)))
    # For each example, the length of the longest history with a usable loss.
    nr_steps = []
    for eg_id, hists in enumerate(histories):
        nr_step = 0
        for loss, hist in zip(losses[eg_id], hists):
            if loss != 0.0 and not numpy.isnan(loss):
                nr_step = max(nr_step, len(hist))
        nr_steps.append(nr_step)
    # An empty batch has no steps; max() on an empty list would raise.
    total_steps = max(nr_steps) if nr_steps else 0
    grads = []
    for i in range(total_steps):
        grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class),
                                 dtype='f'))
    for eg_id, hists in enumerate(histories):
        for loss, hist in zip(losses[eg_id], hists):
            if loss == 0.0 or numpy.isnan(loss):
                continue
            key = tuple([eg_id])
            # Adjust loss for length
            # We need to do this because each state in a short path is scored
            # multiple times, as we add in the average cost when we run out
            # of actions.
            avg_loss = loss / len(hist)
            loss += avg_loss * (nr_steps[eg_id] - len(hist))
            for j, clas in enumerate(hist):
                i = beam_maps[j][key]
                # In step j, at state i action clas
                # resulted in loss
                grads[j][i, clas] += loss
                # Extend the key with the action just taken, so that it
                # identifies the state in the next step's beam map.
                key = key + tuple([clas])
    return grads
|
|
|
|
|
|
def cleanup_beam(Beam beam):
    """Delete the StateC objects held in the beam's parent and state slots.

    For each of the beam's `width` slots, the parent's content and then the
    state's content are deleted.

    RAISES: ValueError (E023) if the same address is encountered twice.
    """
    cdef StateC* state
    # Once parsing has finished, states in beam may not be unique. Is this
    # correct?
    freed = set()
    for i in range(beam.width):
        parent_addr = <size_t>beam._parents[i].content
        if parent_addr in freed:
            raise ValueError(Errors.E023.format(addr=parent_addr, i=i))
        state = <StateC*>parent_addr
        del state
        freed.add(parent_addr)
        state_addr = <size_t>beam._states[i].content
        if state_addr in freed:
            raise ValueError(Errors.E023.format(addr=state_addr, i=i))
        state = <StateC*>state_addr
        del state
        freed.add(state_addr)
|
|
|
|
|