diff --git a/spacy/_ml.py b/spacy/_ml.py
index a5dcf56df..bea6a3251 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -118,6 +118,29 @@ class PrecomputableMaxouts(Model):
         return dXf
     return Yfp, backward
 
+def Tok2Vec(width, embed_size, preprocess=None):
+    cols = [LOWER, PREFIX, SUFFIX, SHAPE]
+    with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
+        lower = get_col(cols.index(LOWER)) >> HashEmbed(width, embed_size)
+        prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2)
+        suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2)
+        shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2)
+
+        tok2vec = (
+            flatten
+            >> (lower | prefix | suffix | shape)
+            >> Maxout(width, width*4, pieces=3)
+            >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
+            >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
+            >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
+            >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
+        )
+    if preprocess is not None:
+        tok2vec = preprocess >> tok2vec
+    # Work around thinc API limitations :(. TODO: Revise in Thinc 7
+    tok2vec.nO = width
+    return tok2vec
+
 
 def get_col(idx):
     def forward(X, drop=0.):
@@ -125,7 +148,6 @@ def get_col(idx):
             ops = NumpyOps()
         else:
             ops = CupyOps()
-        assert len(X.shape) <= 3
         output = ops.xp.ascontiguousarray(X[:, idx])
         def backward(y, sgd=None):
             dX = ops.allocate(X.shape)
@@ -171,8 +193,10 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.):
 def flatten(seqs, drop=0.):
     if isinstance(seqs[0], numpy.ndarray):
         ops = NumpyOps()
-    else:
+    elif hasattr(CupyOps.xp, 'ndarray') and isinstance(seqs[0], CupyOps.xp.ndarray):
         ops = CupyOps()
+    else:
+        raise ValueError("Unable to flatten sequence of type %s" % type(seqs[0]))
     lengths = [len(seq) for seq in seqs]
     def finish_update(d_X, sgd=None):
         return ops.unflatten(d_X, lengths)
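For context, a minimal sketch of how the new `Tok2Vec` factory is meant to be called. The widths, the `doc2feats()` preprocessor and the `docs` variable are illustrative assumptions, not code from this patch:

```python
from spacy._ml import Tok2Vec, doc2feats

# Composing doc2feats() in front lets the model accept Doc objects directly;
# with preprocess=None it expects the per-token feature arrays instead.
tok2vec = Tok2Vec(width=128, embed_size=5000, preprocess=doc2feats())
vectors = tok2vec(docs)                           # (total tokens, 128), flattened
vectors, bp_vectors = tok2vec.begin_update(docs)  # training-time call
```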
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 3d5c1513e..cc3698b49 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -64,10 +64,15 @@ def train_model(Language, train_data, dev_data, output_path, tagger_cfg, parser_
     with Language.train(output_path, train_data,
                         pos=tagger_cfg, deps=parser_cfg, ner=entity_cfg) as trainer:
+        for itn, epoch in enumerate(trainer.epochs(n_iter, augment_data=None)):
-            for doc, gold in epoch:
-                trainer.update(doc, gold)
-            dev_scores = trainer.evaluate(dev_data).scores if dev_data else defaultdict(float)
+            for batch in partition_all(12, epoch):
+                docs, golds = zip(*batch)
+                trainer.update(docs, golds)
+
+            if dev_data:
+                dev_scores = trainer.evaluate(dev_data).scores
+            else:
+                dev_scores = defaultdict(float)
             print_progress(itn, trainer.nlp.parser.model.nr_weight,
                            trainer.nlp.parser.model.nr_active_feat, **dev_scores)
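The `partition_all`/`zip(*batch)` pattern above groups the epoch's `(doc, gold)` stream into fixed-size chunks, then transposes each chunk into the parallel sequences that `Trainer.update()` now expects. A toy, self-contained illustration (the strings stand in for real `Doc`/`GoldParse` pairs):

```python
from cytoolz import partition_all

stream = [('doc%d' % i, 'gold%d' % i) for i in range(5)]
for batch in partition_all(2, stream):
    docs, golds = zip(*batch)
    print(docs, golds)
# ('doc0', 'doc1') ('gold0', 'gold1')
# ('doc2', 'doc3') ('gold2', 'gold3')
# ('doc4',) ('gold4',)
```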
diff --git a/spacy/language.py b/spacy/language.py
index 473381159..7f7d2c745 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -247,6 +247,7 @@ class Language(object):
         self.tokenizer = self.Defaults.create_tokenizer(self) \
             if 'tokenizer' not in overrides \
             else overrides['tokenizer']
+
         self.tagger = self.Defaults.create_tagger(self) \
             if 'tagger' not in overrides \
             else overrides['tagger']
diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index fd03efe4c..4976ff317 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -27,40 +27,26 @@ from thinc.neural._classes.resnet import Residual
 from thinc.neural._classes.batchnorm import BatchNorm as BN
 
 from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP
-from ._ml import flatten, get_col, doc2feats
+from ._ml import Tok2Vec, flatten, get_col, doc2feats
 
 
 class TokenVectorEncoder(object):
     '''Assign position-sensitive vectors to tokens, using a CNN or RNN.'''
-    def __init__(self, vocab, token_vector_width, **cfg):
+
+    @classmethod
+    def Model(cls, width=128, embed_size=5000, **cfg):
+        return Tok2Vec(width, embed_size, preprocess=None)
+
+    def __init__(self, vocab, model=True, **cfg):
         self.vocab = vocab
         self.doc2feats = doc2feats()
-        self.model = self.build_model(vocab.lang, token_vector_width, **cfg)
-        self.tagger = chain(
-            self.model,
-            Softmax(self.vocab.morphology.n_tags,
-                    token_vector_width))
-
-    def build_model(self, lang, width, embed_size=5000, **cfg):
-        cols = self.doc2feats.cols
-        with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
-            lower = get_col(cols.index(LOWER)) >> (HashEmbed(width, embed_size)
-                                                   +HashEmbed(width, embed_size))
-            prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2)
-            suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2)
-            shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2)
-
-            tok2vec = (
-                flatten
-                >> (lower | prefix | suffix | shape )
-                >> Maxout(width, pieces=3)
-                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
-                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
-                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
-                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
-            )
-        return tok2vec
+        self.model = self.Model() if model is True else model
+        if self.model not in (None, False):
+            self.tagger = chain(
+                self.model,
+                Softmax(self.vocab.morphology.n_tags,
+                        self.model.nO))
 
     def pipe(self, docs):
         docs = list(docs)
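The `model=True` convention introduced here (and mirrored by `Parser.__init__` below) distinguishes three cases. A rough illustration, with `my_tok2vec` standing in for any user-supplied network:

```python
from spacy.vocab import Vocab
from spacy.pipeline import TokenVectorEncoder

vocab = Vocab()
tve = TokenVectorEncoder(vocab)              # True (default): build Model()
tve = TokenVectorEncoder(vocab, model=None)  # no network yet, attach later
# tve = TokenVectorEncoder(vocab, model=my_tok2vec)  # inject a custom one
```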
diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd
index bfe3a6bea..f27580de5 100644
--- a/spacy/syntax/_state.pxd
+++ b/spacy/syntax/_state.pxd
@@ -23,6 +23,7 @@ cdef cppclass StateC:
     Entity* _ents
     TokenC _empty_token
     int length
+    int offset
     int _s_i
     int _b_i
     int _e_i
diff --git a/spacy/syntax/nn_parser.pxd b/spacy/syntax/nn_parser.pxd
index 42088a9ff..7ff4b9f9f 100644
--- a/spacy/syntax/nn_parser.pxd
+++ b/spacy/syntax/nn_parser.pxd
@@ -10,9 +10,8 @@ from ._state cimport StateC
 
 cdef class Parser:
     cdef readonly Vocab vocab
-    cdef readonly object model
+    cdef public object model
     cdef readonly TransitionSystem moves
     cdef readonly object cfg
-    cdef public object feature_maps
 
     #cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 915cce706..b3a5e9ca1 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -1,5 +1,7 @@
 # cython: infer_types=True
 # cython: profile=True
+# cython: cdivision=True
+# cython: boundscheck=False
 # coding: utf-8
 from __future__ import unicode_literals, print_function
 
@@ -30,11 +32,12 @@ from preshed.maps cimport MapStruct
 from preshed.maps cimport map_get
 
 from thinc.api import layerize, chain
-from thinc.neural import BatchNorm, Model, Affine, ELU, ReLu, Maxout
+from thinc.neural import Model, Affine, ELU, ReLu, Maxout
 from thinc.neural.ops import NumpyOps
 
-from ..util import get_cuda_stream
+from ..util import get_async, get_cuda_stream
 from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
+from .._ml import Tok2Vec, doc2feats
 from . import _parse_features
 from ._parse_features cimport CONTEXT_SIZE
@@ -61,8 +64,7 @@ def set_debug(val):
     DEBUG = val
 
 
-def get_greedy_model_for_batch(batch_size, tokvecs, lower_model, cuda_stream=None,
-        drop=0.):
+cdef class precompute_hiddens:
     '''Allow a model to be "primed" by pre-computing input features in bulk.
 
     This is used for the parser, where we want to take a batch of documents,
@@ -79,95 +81,88 @@ def get_greedy_model_for_batch(batch_size, tokvecs, lower_model, cuda_stream=Non
     we can do all our hard maths up front, packed into large multiplications,
     and do the hard-to-program parsing on the CPU.
     '''
-    gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=drop)
-    cdef np.ndarray cached
-    if not isinstance(gpu_cached, numpy.ndarray):
-        cached = gpu_cached.get(stream=cuda_stream)
-    else:
-        cached = gpu_cached
-    nF = gpu_cached.shape[1]
-    nO = gpu_cached.shape[2]
-    nP = gpu_cached.shape[3]
-    ops = lower_model.ops
-    features = numpy.zeros((batch_size, nO, nP), dtype='f')
-    synchronized = False
+    cdef int nF, nO, nP
+    cdef bint _is_synchronized
+    cdef public object ops
+    cdef np.ndarray _features
+    cdef np.ndarray _cached
+    cdef object _cuda_stream
+    cdef object _bp_hiddens
 
-    def forward(token_ids, drop=0.):
-        nonlocal synchronized
-        if not synchronized and cuda_stream is not None:
-            cuda_stream.synchronize()
-            synchronized = True
-        # This is tricky, but:
+    def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None, drop=0.):
+        gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=drop)
+        cdef np.ndarray cached
+        if not isinstance(gpu_cached, numpy.ndarray):
+            # Note the passing of cuda_stream here: it lets
+            # cupy make the copy asynchronously.
+            # We then have to block before first use.
+            cached = gpu_cached.get(stream=cuda_stream)
+        else:
+            cached = gpu_cached
+        self.nF = cached.shape[1]
+        self.nO = cached.shape[2]
+        self.nP = cached.shape[3]
+        self.ops = lower_model.ops
+        self._features = numpy.zeros((batch_size, self.nO, self.nP), dtype='f')
+        self._is_synchronized = False
+        self._cuda_stream = cuda_stream
+        self._cached = cached
+        self._bp_hiddens = bp_features
+
+    def __call__(self, X):
+        return self.begin_update(X)[0]
+
+    def begin_update(self, token_ids, drop=0.):
+        self._features.fill(0)
+        if not self._is_synchronized \
+        and self._cuda_stream is not None:
+            self._cuda_stream.synchronize()
+            self._is_synchronized = True
+        # This is tricky, but (assuming a GPU is available):
         # - Input to forward on CPU
         # - Output from forward on CPU
         # - Input to backward on GPU!
         # - Output from backward on GPU
-        nonlocal features
-        features = features[:len(token_ids)]
-        features.fill(0)
-        cdef float[:, :, ::1] feats = features
+        cdef np.ndarray state_vector = self._features[:len(token_ids)]
+        cdef np.ndarray hiddens = self._cached
+        bp_hiddens = self._bp_hiddens
+
         cdef int[:, ::1] ids = token_ids
-        _sum_features(&feats[0,0,0],
-            <float*>cached.data, &ids[0,0],
-            token_ids.shape[0], nF, nO*nP)
+        self._sum_features(<float*>state_vector.data,
+            <float*>hiddens.data, &ids[0,0],
+            token_ids.shape[0], self.nF, self.nO*self.nP)
 
-        if nP >= 2:
-            best, which = ops.maxout(features)
-        else:
-            best = features.reshape((features.shape[0], features.shape[1]))
-            which = None
+        output, bp_output = self._apply_nonlinearity(state_vector)
 
-        def backward(d_best, sgd=None):
+        def backward(d_output, sgd=None):
             # This will usually be on GPU
-            if isinstance(d_best, numpy.ndarray):
-                d_best = ops.xp.array(d_best)
-            if nP >= 2:
-                d_features = ops.backprop_maxout(d_best, which, nP)
-            else:
-                d_features = d_best.reshape((d_best.shape[0], d_best.shape[1], 1))
-            d_tokens = bp_features((d_features, token_ids), sgd)
+            if isinstance(d_output, numpy.ndarray):
+                d_output = self.ops.xp.array(d_output)
+            d_state_vector = bp_output(d_output, sgd)
+            d_tokens = bp_hiddens((d_state_vector, token_ids), sgd)
             return d_tokens
+        return output, backward
 
-        return best, backward
+    def _apply_nonlinearity(self, X):
+        if self.nP < 2:
+            return X.reshape(X.shape[:2]), lambda dX, sgd=None: dX.reshape(X.shape)
+        best, which = self.ops.maxout(X)
+        return best, lambda dX, sgd=None: self.ops.backprop_maxout(dX, which, self.nP)
 
-    return forward
-
-
-cdef void _sum_features(float* output,
-        const float* cached, const int* token_ids, int B, int F, int O) nogil:
-    cdef int idx, b, f, i
-    cdef const float* feature
-    for b in range(B):
-        for f in range(F):
-            if token_ids[f] < 0:
-                continue
-            idx = token_ids[f] * F * O + f*O
-            feature = &cached[idx]
-            for i in range(O):
-                output[i] += feature[i]
-        output += O
-        token_ids += F
-
-
-def get_batch_loss(TransitionSystem moves, states, golds, float[:, ::1] scores):
-    cdef StateClass state
-    cdef GoldParse gold
-    cdef Pool mem = Pool()
-    cdef int i
-    is_valid = <int*>mem.alloc(moves.n_moves, sizeof(int))
-    costs = <float*>mem.alloc(moves.n_moves, sizeof(float))
-    cdef np.ndarray d_scores = numpy.zeros((len(states), moves.n_moves), dtype='f',
-                                           order='c')
-    c_d_scores = <float*>d_scores.data
-    for i, (state, gold) in enumerate(zip(states, golds)):
-        memset(is_valid, 0, moves.n_moves * sizeof(int))
-        memset(costs, 0, moves.n_moves * sizeof(float))
-        moves.set_costs(is_valid, costs, state, gold)
-        cpu_log_loss(c_d_scores,
-            costs, is_valid, &scores[i, 0], d_scores.shape[1])
-        #cpu_regression_loss(c_d_scores,
-        #    costs, is_valid, &scores[i, 0], d_scores.shape[1])
-        c_d_scores += d_scores.shape[1]
-    return d_scores
+    cdef void _sum_features(self, float* output,
+            const float* cached, const int* token_ids, int B, int F, int O) nogil:
+        cdef int idx, b, f, i
+        cdef const float* feature
+        for b in range(B):
+            for f in range(F):
+                if token_ids[f] < 0:
+                    continue
+                idx = token_ids[f] * F * O + f*O
+                feature = &cached[idx]
+                for i in range(O):
+                    output[i] += feature[i]
+            output += O
+            token_ids += F
 
 
 cdef void cpu_log_loss(float* d_scores,
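The precomputation trick in `precompute_hiddens` is worth spelling out. For a single maxout piece the lower layer is linear in the token vectors, so its weights can be applied to every token once per batch; each parse state is then scored by summing `nF` cached rows instead of running `nF` matrix products. A standalone numpy check of that identity (all names and shapes here are assumptions for the sketch, not patch code):

```python
import numpy

nr_token, width, nF, nO = 10, 4, 3, 5
tokvecs = numpy.random.uniform(-1., 1., (nr_token, width)).astype('f')
W = numpy.random.uniform(-1., 1., (nF, nO, width)).astype('f')

# Done once per batch: apply W to every token, for every feature slot.
cached = numpy.einsum('foi,ti->tfo', W, tokvecs)    # (nr_token, nF, nO)

# Done per state: gather and sum nF precomputed rows.
token_ids = numpy.array([1, 4, 7])
hidden = sum(cached[token_ids[f], f] for f in range(nF))

direct = sum(W[f].dot(tokvecs[token_ids[f]]) for f in range(nF))
assert numpy.allclose(hidden, direct, atol=1e-4)
```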
@@ -217,121 +212,62 @@ cdef void cpu_regression_loss(float* d_scores,
             d_scores[i] = diff
 
 
-def init_states(TransitionSystem moves, docs):
-    cdef Doc doc
-    cdef StateClass state
-    offsets = []
-    states = []
-    offset = 0
-    for i, doc in enumerate(docs):
-        state = StateClass.init(doc.c, doc.length)
-        moves.initialize_state(state.c)
-        states.append(state)
-        offsets.append(offset)
-        offset += len(doc)
-    return states, offsets
-
-
-def extract_token_ids(states, offsets=None, nF=1, nB=0, nS=2, nL=0, nR=0):
-    cdef StateClass state
-    cdef int n_tokens = states[0].nr_context_tokens(nF, nB, nS, nL, nR)
-    ids = numpy.zeros((len(states), n_tokens), dtype='i', order='c')
-    if offsets is None:
-        offsets = [0] * len(states)
-    for i, (state, offset) in enumerate(zip(states, offsets)):
-        state.set_context_tokens(ids[i], nF, nB, nS, nL, nR)
-        ids[i] += (ids[i] >= 0) * offset
-    return ids
-
-
-_n_iter = 0
-@layerize
-def print_mean_variance(X, drop=0.):
-    global _n_iter
-    _n_iter += 1
-    fwd_iter = _n_iter
-    means = X.mean(axis=0)
-    variance = X.var(axis=0)
-    print(fwd_iter, "M", ', '.join(('%.2f' % m) for m in means))
-    print(fwd_iter, "V", ', '.join(('%.2f' % m) for m in variance))
-    def backward(dX, sgd=None):
-        means = dX.mean(axis=0)
-        variance = dX.var(axis=0)
-        print(fwd_iter, "dM", ', '.join(('%.2f' % m) for m in means))
-        print(fwd_iter, "dV", ', '.join(('%.2f' % m) for m in variance))
-    return X, backward
-
-
 cdef class Parser:
     """
     Base class of the DependencyParser and EntityRecognizer.
     """
     @classmethod
-    def load(cls, path, Vocab vocab, TransitionSystem=None, require=False, **cfg):
-        """
-        Load the statistical model from the supplied path.
+    def Model(cls, nr_class, tok2vec=None, hidden_width=128, **cfg):
+        if tok2vec is None:
+            tok2vec = Tok2Vec(hidden_width, 5000, preprocess=doc2feats())
+        token_vector_width = tok2vec.nO
+        nr_context_tokens = StateClass.nr_context_tokens()
+        lower = PrecomputableMaxouts(hidden_width,
+                    nF=nr_context_tokens,
+                    nI=token_vector_width,
+                    pieces=cfg.get('maxout_pieces', 1))
 
-        Arguments:
-            path (Path):
-                The path to load from.
-            vocab (Vocab):
-                The vocabulary. Must be shared by the documents to be processed.
-            require (bool):
-                Whether to raise an error if the files are not found.
-        Returns (Parser):
-            The newly constructed object.
-        """
-        with (path / 'config.json').open() as file_:
-            cfg = ujson.load(file_)
-        self = cls(vocab, TransitionSystem=TransitionSystem, model=None, **cfg)
-        if (path / 'model').exists():
-            self.model.load(str(path / 'model'))
-        elif require:
-            raise IOError(
-                "Required file %s/model not found when loading" % str(path))
-        return self
+        with Model.use_device('cpu'):
+            upper = chain(
+                        Maxout(hidden_width),
+                        zero_init(Affine(nr_class))
+                    )
+        # TODO: This is an unfortunate hack atm!
+        # Used to set input dimensions in network.
+        lower.begin_training(lower.ops.allocate((500, token_vector_width)))
+        upper.begin_training(upper.ops.allocate((500, hidden_width)))
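+        # Roughly: tok2vec embeds the whole batch once, `lower` is wrapped
+        # by precompute_hiddens to cache per-token contributions, and
+        # `upper` scores each state on the CPU. The zero-initialised output
+        # layer means every class starts with an equal score, so early
+        # updates are driven by the transition costs, not random preferences.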
+        return tok2vec, lower, upper
 
-    def __init__(self, Vocab vocab, TransitionSystem=None, model=None, **cfg):
+    @classmethod
+    def Moves(cls, vocab):
+        return TransitionSystem(vocab.strings, {})
+
+    def __init__(self, Vocab vocab, moves=True, model=True, **cfg):
         """
         Create a Parser.
 
         Arguments:
             vocab (Vocab):
                 The vocabulary object. Must be shared with documents to be processed.
-            model (thinc Model):
-                The statistical model.
-        Returns (Parser):
-            The newly constructed object.
+                The value is set to the .vocab attribute.
+            moves (TransitionSystem):
+                Defines how the parse-state is created, updated and evaluated.
+                The value is set to the .moves attribute unless True (default),
+                in which case a new instance is created with Parser.Moves().
+            model (object):
+                The statistical model to use.
+                The value is set to the .model attribute unless True (default),
+                in which case a new instance is created with Parser.Model().
+            **cfg:
+                Arbitrary configuration parameters. Set to the .cfg attribute.
         """
-        if TransitionSystem is None:
-            TransitionSystem = self.TransitionSystem
         self.vocab = vocab
-        cfg['actions'] = TransitionSystem.get_actions(**cfg)
-        self.moves = TransitionSystem(vocab.strings, cfg['actions'])
-        if model is None:
-            self.model, self.feature_maps = self.build_model(**cfg)
-        else:
-            self.model, self.feature_maps = model
+        self.moves = self.Moves(self.vocab) if moves is True else moves
+        self.model = self.Model(self.moves.n_moves) if model is True else model
         self.cfg = cfg
 
     def __reduce__(self):
-        return (Parser, (self.vocab, self.moves, self.model), None, None)
-
-    def build_model(self,
-                    hidden_width=128, token_vector_width=96, nr_vector=1000,
-                    nF=1, nB=1, nS=1, nL=1, nR=1, **cfg):
-        nr_context_tokens = StateClass.nr_context_tokens(nF, nB, nS, nL, nR)
-        with Model.use_device('cpu'):
-            upper = chain(
-                        Maxout(hidden_width, hidden_width),
-                        #print_mean_variance,
-                        zero_init(Affine(self.moves.n_moves, hidden_width)))
-        assert isinstance(upper.ops, NumpyOps)
-        lower = PrecomputableMaxouts(hidden_width, nF=nr_context_tokens, nI=token_vector_width,
-                                     pieces=cfg.get('maxout_pieces', 1))
-        lower.begin_training(lower.ops.allocate((500, token_vector_width)))
-        upper.begin_training(upper.ops.allocate((500, hidden_width)))
-        return upper, lower
+        return (Parser, (self.vocab, self.moves, self.model, self.cfg), None, None)
 
     def __call__(self, Doc tokens):
         """
@@ -356,168 +292,145 @@ cdef class Parser:
             The number of threads with which to work on the buffer in parallel.
         Yields (Doc):
             Documents, in order.
         """
-        queue = []
-        for doc in stream:
-            queue.append(doc)
-            if len(queue) == batch_size:
-                self.parse_batch(queue)
-                for doc in queue:
-                    self.moves.finalize_doc(doc)
-                    yield doc
-                queue = []
-        if queue:
-            self.parse_batch(queue)
-            for doc in queue:
-                self.moves.finalize_doc(doc)
-                yield doc
+        for docs in cytoolz.partition_all(batch_size, stream):
+            docs = list(docs)
+            # parse_batch() finalizes the states and docs itself.
+            self.parse_batch(docs)
+            for doc in docs:
+                yield doc
 
-    def parse_batch(self, docs_tokvecs):
-        cdef:
-            int nC
-            Doc doc
-            StateClass state
-            np.ndarray py_scores
-            int[500] is_valid # Hacks for now
+    def parse_batch(self, docs):
+        cuda_stream = get_cuda_stream()
 
-        cuda_stream = get_cuda_stream()
-        docs, tokvecs = docs_tokvecs
-        lower_model = get_greedy_model_for_batch(len(docs), tokvecs, self.feature_maps,
-                        cuda_stream)
-        upper_model = self.model
+        tokvecs = self.model[0](docs)
+        states = self.moves.init_batch(docs)
+        state2vec, vec2scores = self.get_batch_model(len(states), tokvecs,
+                                                     cuda_stream, 0.0)
 
-        states, offsets = init_states(self.moves, docs)
-        all_states = list(states)
-        todo = [st for st in zip(states, offsets) if not st[0].py_is_final()]
+        todo = [st for st in states if not st.is_final()]
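+        # Greedy decoding: score all unfinished states as one batch, apply
+        # the best valid action to each, and repeat until every state is
+        # final. No gradients are needed, hence __call__ not begin_update.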
+        while todo:
+            token_ids = self.get_token_ids(todo)
+            vectors = state2vec(token_ids)
+            scores = vec2scores(vectors)
+            self.transition_batch(todo, scores)
+            todo = [st for st in states if not st.is_final()]
+        self.finish_batch(states, docs)
+
+    def update(self, docs, golds, drop=0., sgd=None):
+        if isinstance(docs, Doc) and isinstance(golds, GoldParse):
+            return self.update([docs], [golds], drop=drop, sgd=sgd)
 
         cuda_stream = get_cuda_stream()
-        docs, tokvecs = docs_tokvecs
-        lower_model = get_greedy_model_for_batch(len(docs),
-                        tokvecs, self.feature_maps, cuda_stream=cuda_stream,
-                        drop=drop)
+        for gold in golds:
+            self.moves.preprocess_gold(gold)
 
-        states, offsets = init_states(self.moves, docs)
-        all_states = list(states)
-        todo = [st for st in zip(states, offsets) if not st[0].py_is_final()]
+        tokvecs, bp_tokvecs = self.model[0].begin_update(docs, drop=drop)
+        states = self.moves.init_batch(docs)
+        state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream,
+                                                     drop)
+        todo = [(s, g) for s, g in zip(states, golds) if not s.is_final()]
+
+        backprops = []
+        cdef float loss = 0.
         while todo:
-            states, offsets = zip(*todo)
-            token_ids = extract_token_ids(states, offsets=offsets)
+            states, golds = zip(*todo)
 
-            py_scores = upper_model(lower_model(token_ids)[0])
-            scores = <float*>py_scores.data
-            nC = py_scores.shape[1]
-            for state, offset in zip(states, offsets):
-                self.moves.set_valid(is_valid, state.c)
-                guess = arg_max_if_valid(scores, is_valid, nC)
-                action = self.moves.c[guess]
-                action.do(state.c, action.label)
-                scores += nC
-            todo = [st for st in todo if not st[0].py_is_final()]
+            token_ids = self.get_token_ids(states)
+            vector, bp_vector = state2vec.begin_update(token_ids, drop=drop)
+            scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
 
-        for state, doc in zip(all_states, docs):
+            d_scores = self.get_batch_loss(states, golds, scores)
+            d_vector = bp_scores(d_scores, sgd=sgd)
+            loss += (d_scores**2).sum()
+
+            if not isinstance(tokvecs, state2vec.ops.xp.ndarray):
+                backprops.append((token_ids, d_vector, bp_vector))
+            else:
+                # Move token_ids and d_vector to the GPU, asynchronously
+                backprops.append((
+                    get_async(cuda_stream, token_ids),
+                    get_async(cuda_stream, d_vector),
+                    bp_vector
+                ))
+            self.transition_batch(states, scores)
+            todo = [st for st in todo if not st[0].is_final()]
+        # Tells CUDA to block, so our async copies complete.
+        if cuda_stream is not None:
+            cuda_stream.synchronize()
+        d_tokvecs = state2vec.ops.allocate(tokvecs.shape)
+        xp = state2vec.ops.xp  # Handle for numpy/cupy
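+        # Each row of token_ids records which tokens a state's features were
+        # read from; the same token can appear in many rows, so gradients
+        # must be accumulated (scatter_add / add.at) rather than assigned,
+        # and padding ids (< 0) are masked to contribute nothing.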
+        for token_ids, d_vector, bp_vector in backprops:
+            d_state_features = bp_vector(d_vector, sgd=sgd)
+            mask = (token_ids >= 0)
+            mask = mask.reshape((token_ids.shape[0], token_ids.shape[1], 1))
+            if hasattr(xp, 'scatter_add'):
+                xp.scatter_add(d_tokvecs,
+                    token_ids, d_state_features * mask)
+            else:
+                xp.add.at(d_tokvecs,
+                    token_ids, d_state_features * mask)
+        bp_tokvecs(d_tokvecs, sgd)
+        return loss
+
+    def get_batch_model(self, batch_size, tokvecs, stream, dropout):
+        state2vec = precompute_hiddens(batch_size, tokvecs,
+                        self.model[1], stream, drop=dropout)
+        return state2vec, self.model[-1]
+
+    def get_token_ids(self, states):
+        cdef StateClass state
+        cdef int n_tokens = states[0].nr_context_tokens()
+        ids = numpy.zeros((len(states), n_tokens), dtype='i', order='c')
+        for i, state in enumerate(states):
+            state.set_context_tokens(ids[i])
+        return ids
+
+    def transition_batch(self, states, float[:, ::1] scores):
+        cdef StateClass state
+        cdef int[500] is_valid  # TODO: Unhack
+        cdef float* c_scores = &scores[0, 0]
+        for state in states:
+            self.moves.set_valid(is_valid, state.c)
+            guess = arg_max_if_valid(c_scores, is_valid, scores.shape[1])
+            action = self.moves.c[guess]
+            action.do(state.c, action.label)
+            c_scores += scores.shape[1]
+
+    def get_batch_loss(self, states, golds, float[:, ::1] scores):
+        cdef StateClass state
+        cdef GoldParse gold
+        cdef Pool mem = Pool()
+        cdef int i
+        is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
+        costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
+        cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
+                                               dtype='f', order='C')
+        c_d_scores = <float*>d_scores.data
+        for i, (state, gold) in enumerate(zip(states, golds)):
+            memset(is_valid, 0, self.moves.n_moves * sizeof(int))
+            memset(costs, 0, self.moves.n_moves * sizeof(float))
+            self.moves.set_costs(is_valid, costs, state, gold)
+            cpu_log_loss(c_d_scores,
+                costs, is_valid, &scores[i, 0], d_scores.shape[1])
+            c_d_scores += d_scores.shape[1]
+        return d_scores
+
+    def finish_batch(self, states, docs):
+        cdef StateClass state
+        cdef Doc doc
+        for state, doc in zip(states, docs):
             self.moves.finalize_state(state.c)
             for i in range(doc.length):
                 doc.c[i] = state.c._sent[i]
             self.moves.finalize_doc(doc)
 
-    def update(self, docs_tokvecs, golds, drop=0., sgd=None):
-        cdef:
-            int nC
-            Doc doc
-            StateClass state
-            np.ndarray scores
-
-        docs, tokvecs = docs_tokvecs
-        cuda_stream = get_cuda_stream()
-        lower_model = get_greedy_model_for_batch(len(docs),
-                        tokvecs, self.feature_maps, cuda_stream=cuda_stream,
-                        drop=drop)
-        if isinstance(docs, Doc) and isinstance(golds, GoldParse):
-            return self.update(([docs], tokvecs), [golds], drop=drop)
-        for gold in golds:
-            self.moves.preprocess_gold(gold)
-
-        states, offsets = init_states(self.moves, docs)
-
-        todo = zip(states, offsets, golds)
-        todo = filter(lambda sp: not sp[0].py_is_final(), todo)
-
-        cdef Pool mem = Pool()
-        is_valid = <int*>mem.alloc(len(states) * self.moves.n_moves, sizeof(int))
-        costs = <float*>mem.alloc(len(states) * self.moves.n_moves, sizeof(float))
-
-        upper_model = self.model
-        d_tokens = self.feature_maps.ops.allocate(tokvecs.shape)
-        backprops = []
-        n_tokens = tokvecs.shape[0]
-        nF = self.feature_maps.nF
-        loss = 0.
-        total = 1e-4
-        follow_gold = False
-        cupy = self.feature_maps.ops.xp
-        while len(todo) >= 4:
-            states, offsets, golds = zip(*todo)
-
-            token_ids = extract_token_ids(states, offsets=offsets)
-            lower, bp_lower = lower_model(token_ids, drop=drop)
-            scores, bp_scores = upper_model.begin_update(lower, drop=drop)
-
-            d_scores = get_batch_loss(self.moves, states, golds, scores)
-            loss += numpy.abs(d_scores).sum()
-            total += d_scores.shape[0]
-            d_lower = bp_scores(d_scores, sgd=sgd)
-
-            if isinstance(tokvecs, cupy.ndarray):
-                gpu_tok_ids = cupy.ndarray(token_ids.shape, dtype='i', order='C')
-                gpu_d_lower = cupy.ndarray(d_lower.shape, dtype='f', order='C')
-                gpu_tok_ids.set(token_ids, stream=cuda_stream)
-                gpu_d_lower.set(d_lower, stream=cuda_stream)
-                backprops.append((gpu_tok_ids, gpu_d_lower, bp_lower))
-            else:
-                backprops.append((token_ids, d_lower, bp_lower))
-
-            c_scores = <float*>scores.data
-            for state, gold in zip(states, golds):
-                if follow_gold:
-                    self.moves.set_costs(is_valid, costs, state, gold)
-                    guess = arg_max_if_gold(c_scores, costs, is_valid, scores.shape[1])
-                else:
-                    self.moves.set_valid(is_valid, state.c)
-                    guess = arg_max_if_valid(c_scores, is_valid, scores.shape[1])
-                action = self.moves.c[guess]
-                action.do(state.c, action.label)
-                c_scores += scores.shape[1]
-
-            todo = filter(lambda sp: not sp[0].py_is_final(), todo)
-        # This tells CUDA to block --- so we know our copies are complete.
-        cuda_stream.synchronize()
-        for token_ids, d_lower, bp_lower in backprops:
-            d_state_features = bp_lower(d_lower, sgd=sgd)
-            active_feats = token_ids * (token_ids >= 0)
-            active_feats = active_feats.reshape((token_ids.shape[0], token_ids.shape[1], 1))
-            if hasattr(self.feature_maps.ops.xp, 'scatter_add'):
-                self.feature_maps.ops.xp.scatter_add(d_tokens,
-                    token_ids, d_state_features * active_feats)
-            else:
-                self.model.ops.xp.add.at(d_tokens,
-                    token_ids, d_state_features * active_feats)
-        return d_tokens, loss / total
-
-    def step_through(self, Doc doc, GoldParse gold=None):
-        """
-        Set up a stepwise state, to introspect and control the transition sequence.
- - Arguments: - doc (Doc): The document to step through. - gold (GoldParse): Optional gold parse - Returns (StepwiseState): - A state object, to step through the annotation process. - """ - return StepwiseState(self, doc, gold=gold) - - def from_transition_sequence(self, Doc doc, sequence): - """Control the annotations on a document by specifying a transition sequence - to follow. - - Arguments: - doc (Doc): The document to annotate. - sequence: A sequence of action names, as unicode strings. - Returns: None - """ - with self.step_through(doc) as stepwise: - for transition in sequence: - stepwise.transition(transition) - def add_label(self, label): # Doesn't set label into serializer -- subclasses override it to do that. for action in self.moves.action_types: @@ -528,108 +441,6 @@ cdef class Parser: self.cfg.setdefault('extra_labels', []).append(label) -cdef class StepwiseState: - cdef readonly StateClass stcls - cdef readonly Example eg - cdef readonly Doc doc - cdef readonly GoldParse gold - cdef readonly Parser parser - - def __init__(self, Parser parser, Doc doc, GoldParse gold=None): - self.parser = parser - self.doc = doc - if gold is not None: - self.gold = gold - self.parser.moves.preprocess_gold(self.gold) - else: - self.gold = GoldParse(doc) - self.stcls = StateClass.init(doc.c, doc.length) - self.parser.moves.initialize_state(self.stcls.c) - self.eg = Example( - nr_class=self.parser.moves.n_moves, - nr_atom=CONTEXT_SIZE, - nr_feat=self.parser.model.nr_feat) - - def __enter__(self): - return self - - def __exit__(self, type, value, traceback): - self.finish() - - @property - def is_final(self): - return self.stcls.is_final() - - @property - def stack(self): - return self.stcls.stack - - @property - def queue(self): - return self.stcls.queue - - @property - def heads(self): - return [self.stcls.H(i) for i in range(self.stcls.c.length)] - - @property - def deps(self): - return [self.doc.vocab.strings[self.stcls.c._sent[i].dep] - for i in range(self.stcls.c.length)] - - @property - def costs(self): - """ - Find the action-costs for the current state. 
- """ - if not self.gold: - raise ValueError("Can't set costs: No GoldParse provided") - self.parser.moves.set_costs(self.eg.c.is_valid, self.eg.c.costs, - self.stcls, self.gold) - costs = {} - for i in range(self.parser.moves.n_moves): - if not self.eg.c.is_valid[i]: - continue - transition = self.parser.moves.c[i] - name = self.parser.moves.move_name(transition.move, transition.label) - costs[name] = self.eg.c.costs[i] - return costs - - def predict(self): - self.eg.reset() - #self.eg.c.nr_feat = self.parser.model.set_featuresC(self.eg.c.atoms, self.eg.c.features, - # self.stcls.c) - self.parser.moves.set_valid(self.eg.c.is_valid, self.stcls.c) - #self.parser.model.set_scoresC(self.eg.c.scores, - # self.eg.c.features, self.eg.c.nr_feat) - - cdef Transition action = self.parser.moves.c[self.eg.guess] - return self.parser.moves.move_name(action.move, action.label) - - def transition(self, action_name=None): - if action_name is None: - action_name = self.predict() - moves = {'S': 0, 'D': 1, 'L': 2, 'R': 3} - if action_name == '_': - action_name = self.predict() - action = self.parser.moves.lookup_transition(action_name) - elif action_name == 'L' or action_name == 'R': - self.predict() - move = moves[action_name] - clas = _arg_max_clas(self.eg.c.scores, move, self.parser.moves.c, - self.eg.c.nr_class) - action = self.parser.moves.c[clas] - else: - action = self.parser.moves.lookup_transition(action_name) - action.do(self.stcls.c, action.label) - - def finish(self): - if self.stcls.is_final(): - self.parser.moves.finalize_state(self.stcls.c) - self.doc.set_parse(self.stcls.c._sent) - self.parser.moves.finalize_doc(self.doc) - - class ParserStateError(ValueError): def __init__(self, doc): ValueError.__init__(self, diff --git a/spacy/syntax/stateclass.pxd b/spacy/syntax/stateclass.pxd index 0eaf1fe55..62fda5ade 100644 --- a/spacy/syntax/stateclass.pxd +++ b/spacy/syntax/stateclass.pxd @@ -9,17 +9,24 @@ from ..vocab cimport EMPTY_LEXEME from ._state cimport StateC -@cython.final cdef class StateClass: cdef Pool mem cdef StateC* c @staticmethod cdef inline StateClass init(const TokenC* sent, int length): - cdef StateClass self = StateClass(length) + cdef StateClass self = StateClass() self.c = new StateC(sent, length) return self + @staticmethod + cdef inline StateClass init_offset(const TokenC* sent, int length, int + offset): + cdef StateClass self = StateClass() + self.c = new StateC(sent, length) + self.c.offset = offset + return self + cdef inline int S(self, int i) nogil: return self.c.S(i) @@ -68,9 +75,6 @@ cdef class StateClass: cdef inline bint at_break(self) nogil: return self.c.at_break() - cdef inline bint is_final(self) nogil: - return self.c.is_final() - cdef inline bint has_head(self, int i) nogil: return self.c.has_head(i) @@ -97,22 +101,22 @@ cdef class StateClass: cdef inline void pop(self) nogil: self.c.pop() - + cdef inline void unshift(self) nogil: self.c.unshift() cdef inline void add_arc(self, int head, int child, int label) nogil: self.c.add_arc(head, child, label) - + cdef inline void del_arc(self, int head, int child) nogil: self.c.del_arc(head, child) cdef inline void open_ent(self, int label) nogil: self.c.open_ent(label) - + cdef inline void close_ent(self) nogil: self.c.close_ent() - + cdef inline void set_ent_tag(self, int i, int ent_iob, int ent_type) nogil: self.c.set_ent_tag(i, ent_iob, ent_type) diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index 0c160bd2a..fd38710e7 100644 --- a/spacy/syntax/stateclass.pyx +++ 
b/spacy/syntax/stateclass.pyx @@ -12,12 +12,16 @@ from ..symbols cimport punct from ..attrs cimport IS_SPACE from ..attrs cimport attr_id_t from ..tokens.token cimport Token +from ..tokens.doc cimport Doc cdef class StateClass: - def __init__(self, int length): + def __init__(self, Doc doc=None, int offset=0): cdef Pool mem = Pool() self.mem = mem + if doc is not None: + self.c = new StateC(doc.c, doc.length) + self.c.offset = offset def __dealloc__(self): del self.c @@ -34,7 +38,7 @@ cdef class StateClass: def token_vector_lenth(self): return self.doc.tensor.shape[1] - def py_is_final(self): + def is_final(self): return self.c.is_final() def print_state(self, words): @@ -47,11 +51,10 @@ cdef class StateClass: return ' '.join((third, second, top, '|', n0, n1)) @classmethod - def nr_context_tokens(cls, int nF, int nB, int nS, int nL, int nR): + def nr_context_tokens(cls): return 13 - def set_context_tokens(self, int[:] output, nF=1, nB=0, nS=2, - nL=2, nR=2): + def set_context_tokens(self, int[::1] output): output[0] = self.B(0) output[1] = self.B(1) output[2] = self.S(0) @@ -67,21 +70,6 @@ cdef class StateClass: output[11] = self.R(self.S(1), 1) output[12] = self.R(self.S(1), 2) - def set_attributes(self, uint64_t[:, :] vals, int[:] tokens, int[:] names): - cdef int i, j, tok_i - for i in range(tokens.shape[0]): - tok_i = tokens[i] - if tok_i >= 0: - token = &self.c._sent[tok_i] - for j in range(names.shape[0]): - vals[i, j] = Token.get_struct_attr(token, names[j]) - else: - vals[i] = 0 - - def set_token_vectors(self, tokvecs, - all_tokvecs, int[:] indices): - for i in range(indices.shape[0]): - if indices[i] >= 0: - tokvecs[i] = all_tokvecs[indices[i]] - else: - tokvecs[i] = 0 + for i in range(13): + if output[i] != -1: + output[i] += self.c.offset diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 5009bae6e..45ff5b5c9 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -58,6 +58,17 @@ cdef class TransitionSystem: (self.strings, labels_by_action, self.freqs), None, None) + def init_batch(self, docs): + cdef StateClass state + states = [] + offset = 0 + for doc in docs: + state = StateClass(doc, offset=offset) + self.initialize_state(state.c) + states.append(state) + offset += len(doc) + return states + cdef int initialize_state(self, StateC* state) nogil: pass diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py new file mode 100644 index 000000000..ea7bd3cf6 --- /dev/null +++ b/spacy/tests/parser/test_neural_parser.py @@ -0,0 +1,70 @@ +from thinc.neural import Model +from mock import Mock +import pytest +import numpy + +from ..._ml import chain, Tok2Vec, doc2feats +from ...vocab import Vocab +from ...pipeline import TokenVectorEncoder +from ...syntax.arc_eager import ArcEager +from ...syntax.nn_parser import Parser +from ...tokens.doc import Doc +from ...gold import GoldParse + + +@pytest.fixture +def vocab(): + return Vocab() + + +@pytest.fixture +def arc_eager(vocab): + actions = ArcEager.get_actions(left_labels=['L'], right_labels=['R']) + return ArcEager(vocab.strings, actions) + + +@pytest.fixture +def tok2vec(): + return Tok2Vec(8, 100, preprocess=doc2feats()) + + +@pytest.fixture +def parser(vocab, arc_eager): + return Parser(vocab, moves=arc_eager, model=None) + +@pytest.fixture +def model(arc_eager, tok2vec): + return Parser.Model(arc_eager.n_moves, tok2vec) + +@pytest.fixture +def doc(vocab): + return Doc(vocab, words=['a', 'b', 'c']) + +@pytest.fixture 
+def gold(doc):
+    return GoldParse(doc, heads=[1, 1, 1], deps=['L', 'ROOT', 'R'])
+
+
+def test_can_init_nn_parser(parser):
+    assert parser.model is None
+
+
+def test_build_model(parser, tok2vec):
+    parser.model = Parser.Model(parser.moves.n_moves, tok2vec)
+    assert parser.model is not None
+
+
+def test_predict_doc(parser, model, doc):
+    parser.model = model
+    parser(doc)
+
+
+def test_update_doc(parser, model, doc, gold):
+    parser.model = model
+    loss1 = parser.update(doc, gold)
+    assert loss1 > 0
+    loss2 = parser.update(doc, gold)
+    assert loss2 == loss1
+
+    def optimize(weights, gradient, key=None):
+        weights -= 0.001 * gradient
+
+    loss3 = parser.update(doc, gold, sgd=optimize)
+    loss4 = parser.update(doc, gold, sgd=optimize)
+    assert loss3 < loss2
diff --git a/spacy/train.py b/spacy/train.py
index 13d7e1d84..87b262333 100644
--- a/spacy/train.py
+++ b/spacy/train.py
@@ -3,6 +3,10 @@ from __future__ import absolute_import, unicode_literals
 
 import random
 import tqdm
+
+from thinc.neural.optimizers import Adam
+from thinc.neural.ops import NumpyOps, CupyOps
+
 from .gold import GoldParse, merge_sents
 from .scorer import Scorer
 
@@ -44,10 +48,12 @@ class Trainer(object):
             yield _epoch(indices)
             self.nr_epoch += 1
 
-    def update(self, doc, gold):
+    def update(self, docs, golds, drop=0.):
         for process in self.nlp.pipeline:
             if hasattr(process, 'update'):
-                loss = process.update(doc, gold, itn=self.nr_epoch)
+                loss = process.update(docs, golds, sgd=self.sgd, drop=drop,
+                    itn=self.nr_epoch)
+                self.sgd.finish_update()
             else:
-                process(doc)
-        return doc
+                for doc in docs:
+                    process(doc)
+        return docs
diff --git a/spacy/util.py b/spacy/util.py
index bbb563754..50d0ab73d 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -15,7 +15,15 @@ from .compat import path2str, basestring_, input_, unicode_
 LANGUAGES = {}
 _data_path = Path(__file__).parent / 'data'
 
+try:
+    from cupy.cuda.stream import Stream as CudaStream
+except ImportError:
+    CudaStream = None
+
+try:
+    import cupy
+except ImportError:
+    cupy = None
+
 
 def set_lang_class(name, cls):
     global LANGUAGES
@@ -152,11 +160,14 @@ def parse_package_meta(package_path, require=True):
 
 def get_cuda_stream(require=False):
     # TODO: Error and tell to install chainer if not found
     # Requires GPU
-    try:
-        from cupy.cuda.stream import Stream
-    except ImportError:
-        return None
-    return Stream()
+    return CudaStream() if CudaStream is not None else None
+
+
+def get_async(stream, numpy_array):
+    if cupy is None:
+        return numpy_array
+    else:
+        array = cupy.ndarray(numpy_array.shape, order='C',
+                             dtype=numpy_array.dtype)
+        array.set(numpy_array, stream=stream)
+        return array
 
 
 def read_regex(path):
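A usage sketch for the two helpers above. The array shape is made up, and on a CPU-only install (no `cupy`) both helpers degrade gracefully, returning `None` and the original numpy array respectively:

```python
import numpy
from spacy.util import get_cuda_stream, get_async

stream = get_cuda_stream()               # None unless cupy is importable
ids = numpy.zeros((8, 13), dtype='i')
ids_on_device = get_async(stream, ids)   # starts an async host-to-GPU copy
# ... queue up more work on the stream ...
if stream is not None:
    stream.synchronize()                 # block before first use of the copy
```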