from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
from thinc.neural import Model, Maxout, Softmax, Affine
from thinc.neural._classes.hash_embed import HashEmbed
from thinc.neural._classes.convolution import ExtractWindow
from thinc.neural._classes.static_vectors import StaticVectors
from thinc.neural._classes.batchnorm import BatchNorm

from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP


def get_col(idx):
    '''Return a layer that extracts a single column from a feature array.'''
    def forward(X, drop=0.):
        assert len(X.shape) <= 3
        output = Model.ops.xp.ascontiguousarray(X[:, idx])
        def backward(y, sgd=None):
            # Scatter the gradient back into the column it was taken from.
            dX = Model.ops.allocate(X.shape)
            dX[:, idx] += y
            return dX
        return output, backward
    return layerize(forward)


def build_tok2vec(lang, width, depth=2, embed_size=1000):
    cols = [ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG]
    with Model.define_operators({'>>': chain, '|': concatenate, '**': clone}):
        #static = get_col(cols.index(ID)) >> StaticVectors(lang, width)
        lower = get_col(cols.index(LOWER)) >> HashEmbed(width, embed_size)
        prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width//4, embed_size)
        suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width//4, embed_size)
        shape = get_col(cols.index(SHAPE)) >> HashEmbed(width//4, embed_size)
        tag = get_col(cols.index(TAG)) >> HashEmbed(width//2, embed_size)
        tok2vec = (
            doc2feats(cols)
            >> with_flatten(
                #(static | prefix | suffix | shape)
                (lower | prefix | suffix | shape | tag)
                >> Maxout(width)
                >> (ExtractWindow(nW=1) >> Maxout(width, width*3))
                >> (ExtractWindow(nW=1) >> Maxout(width, width*3))
            )
        )
    return tok2vec


def doc2feats(cols):
    '''Return a layer mapping a batch of Doc objects to feature arrays.'''
    def forward(docs, drop=0.):
        feats = [doc.to_array(cols) for doc in docs]
        feats = [model.ops.asarray(f, dtype='uint64') for f in feats]
        return feats, None
    # `model` is bound after definition, but before `forward` can be called.
    model = layerize(forward)
    return model
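# A usage sketch for build_tok2vec, not part of the original module. The
# spacy import, the 'en' model name, and the sample texts are assumptions
# for illustration; any pipeline producing Doc objects with the attributes
# in `cols` would work the same way.
def _tok2vec_usage_sketch():
    import spacy
    nlp = spacy.load('en')
    docs = [nlp(u'Give it back'), nlp(u'He pleaded')]
    tok2vec = build_tok2vec('en', width=128)
    # begin_update returns (output, backprop). Because of with_flatten, the
    # output is a list of (n_tokens, width) arrays, one per Doc,
    # contextualized by the two ExtractWindow >> Maxout layers.
    vector_lists, backprop = tok2vec.begin_update(docs)
    return vector_lists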
def build_feature_precomputer(model, feat_maps):
    '''Allow a model to be "primed" by pre-computing input features in bulk.

    This is used for the parser, where we want to take a batch of documents,
    and compute vectors for each (token, position) pair. These vectors can
    then be reused, especially for beam-search.

    Let's say we're using 12 features for each state, e.g. word at start of
    buffer, three words on the stack, their children, etc. In the normal
    arc-eager system, a document of length N is processed in 2*N states. This
    means we'll create 2*N*12 feature vectors --- but if we pre-compute, we
    only need N*12 vector computations. The saving for beam-search is much
    better: with a beam of k, we'd normally make 2*N*12*k computations, so
    pre-computing saves the factor k.

    This also gives a nice CPU/GPU division: we can do all our hard maths up
    front, packed into large multiplications, and do the hard-to-program
    parsing on the CPU.
    '''
    def precompute(input_vectors):
        cached, backprops = zip(*[lyr.begin_update(input_vectors)
                                  for lyr in feat_maps])
        def forward(batch_token_ids, drop=0.):
            batch_size = len(batch_token_ids)
            output_width = cached[0].shape[1]
            output = model.ops.allocate((batch_size, output_width))
            # i: batch index
            # j: position index (i.e. N0, S0, etc.)
            # tok_i: index of the token within its document
            for i, token_ids in enumerate(batch_token_ids):
                for j, tok_i in enumerate(token_ids):
                    output[i] += cached[j][tok_i]
            def backward(d_vector, sgd=None):
                n_feat = len(feat_maps)
                vec_width = input_vectors.shape[1]
                d_inputs = model.ops.allocate((batch_size, n_feat, vec_width))
                for i, token_ids in enumerate(batch_token_ids):
                    for j in range(len(token_ids)):
                        # Assumes each feat_map's backprop maps the gradient
                        # for one output row to one input-vector gradient.
                        d_inputs[i, j] = backprops[j](d_vector[i], sgd)
                # Return the IDs, so the caller can associate the gradients
                # with the correct tokens.
                return (batch_token_ids, d_inputs)
            return output, backward
        return chain(layerize(forward), model)
    return precompute


def print_shape(prefix):
    '''Debugging helper: an identity layer that prints its input's shape.'''
    def forward(X, drop=0.):
        print(prefix, getattr(X, 'shape', type(X)))
        return X, lambda dX, **kwargs: dX
    return layerize(forward)


@layerize
def get_token_vectors(tokens_attrs_vectors, drop=0.):
    ops = Model.ops
    tokens, attrs, vectors = tokens_attrs_vectors
    def backward(d_output, sgd=None):
        # Pass the tokens through so the caller can route the gradient.
        return (tokens, d_output)
    return vectors, backward


@layerize
def flatten(seqs, drop=0.):
    ops = Model.ops
    def finish_update(d_X, sgd=None):
        # The gradient is returned flat; callers must unflatten it themselves.
        return d_X
    X = ops.xp.concatenate([ops.asarray(seq) for seq in seqs])
    return X, finish_update
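# A behavioural sketch for flatten, not part of the original module: it
# concatenates a batch of sequences into one contiguous array, while its
# finish_update hands the gradient back still flat. The example inputs are
# assumptions for illustration.
def _flatten_usage_sketch():
    X, finish_update = flatten.begin_update([[1., 2.], [3.]])
    # X is a single array [1., 2., 3.]; the per-sequence lengths (2 and 1)
    # must be tracked by the caller to split gradients back out.
    return X, finish_update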