spaCy/spacy/_ml.py

from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
from thinc.neural import Model, Maxout, Softmax, Affine
from thinc.neural._classes.hash_embed import HashEmbed

from thinc.neural._classes.convolution import ExtractWindow
from thinc.neural._classes.static_vectors import StaticVectors
from thinc.neural._classes.batchnorm import BatchNorm

from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP


def get_col(idx):
    '''Create a layer that selects column `idx` of its input array.

    Forward: returns `X[:, idx]` as a contiguous array.
    Backward: allocates an array with the input's shape and adds the
    incoming gradient into column `idx`; no other column is written to.
    '''
    def pick_column(X, drop=0.):
        assert len(X.shape) <= 3
        xp = Model.ops.xp
        column = xp.ascontiguousarray(X[:, idx])

        def scatter_gradient(y, sgd=None):
            # Only the selected column receives gradient.
            d_input = Model.ops.allocate(X.shape)
            d_input[:, idx] += y
            return d_input

        return column, scatter_gradient

    return layerize(pick_column)


def build_tok2vec(lang, width, depth=2, embed_size=1000):
    '''Assemble the model that maps documents to per-token vectors.

    Pipeline, as built below:
      1. `doc2feats(cols)` turns each doc into an array of attribute columns.
      2. Inside `with_flatten`, the LOWER, PREFIX, SUFFIX, SHAPE and TAG
         columns are each hash-embedded and the embeddings concatenated.
      3. A `Maxout(width)` layer mixes the concatenated embeddings.
      4. Two `ExtractWindow(nW=1) >> Maxout(width, width*3)` stages follow.

    width: Output width of the Maxout layers; also sizes the embeddings
        (`width` for LOWER, `width//4` for PREFIX/SUFFIX/SHAPE,
        `width//2` for TAG).
    embed_size: Second argument passed to every `HashEmbed`.

    NOTE: `lang` and `depth` are accepted but not used here; the number of
    window stages is fixed at two. The ID column is extracted by
    `doc2feats` but no layer reads it (the static-vectors variant that used
    it is disabled).
    '''
    cols = [ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG]

    def embed(attr, size):
        # Select the attribute's column, then hash-embed it to `size` dims.
        return get_col(cols.index(attr)) >> HashEmbed(size, embed_size)

    def window_stage():
        # One ExtractWindow(nW=1) step followed by a Maxout back to `width`.
        return ExtractWindow(nW=1) >> Maxout(width, width*3)

    # The helpers rely on the overloaded operators, so they must only be
    # called inside this context.
    with Model.define_operators({'>>': chain, '|': concatenate, '**': clone}):
        lower = embed(LOWER, width)
        prefix = embed(PREFIX, width//4)
        suffix = embed(SUFFIX, width//4)
        shape = embed(SHAPE, width//4)
        tag = embed(TAG, width//2)

        featurize = doc2feats(cols)
        embedded = lower | prefix | suffix | shape | tag
        hidden = embedded >> Maxout(width)
        hidden = hidden >> window_stage()
        hidden = hidden >> window_stage()
        tok2vec = featurize >> with_flatten(hidden)
    return tok2vec


def doc2feats(cols):
    '''Create a layer that converts docs into uint64 attribute arrays.

    cols: Attribute IDs passed to each doc's `to_array`.

    The layer's forward pass returns one array per doc and `None` as the
    backward callback, so no gradient flows back past this layer.
    '''
    def forward(docs, drop=0.):
        # First pass: pull the raw attribute arrays out of every doc.
        raw_arrays = []
        for doc in docs:
            raw_arrays.append(doc.to_array(cols))
        # Second pass: hand them to the layer's ops as uint64 arrays.
        feats = []
        for raw in raw_arrays:
            feats.append(model.ops.asarray(raw, dtype='uint64'))
        return feats, None

    # `forward` closes over `model`, which is bound here before any call.
    model = layerize(forward)
    return model


def build_feature_precomputer(model, feat_maps):
    '''Allow a model to be "primed" by pre-computing input features in bulk.

    This is used for the parser, where we want to take a batch of documents,
    and compute vectors for each (token, position) pair. These vectors can then
    be reused, especially for beam-search.

    Let's say we're using 12 features for each state, e.g. word at start of
    buffer, three words on stack, their children, etc. In the normal arc-eager
    system, a document of length N is processed in 2*N states. This means we'll
    create 2*N*12 feature vectors --- but if we pre-compute, we only need
    N*12 vector computations. The saving for beam-search is much better:
    if we have a beam of k, we'll normally make 2*N*12*k computations --
    so we can save the factor k. This also gives a nice CPU/GPU division:
    we can do all our hard maths up front, packed into large multiplications,
    and do the hard-to-program parsing on the CPU.

    model: Layer that is chained after the feature-summing layer.
    feat_maps: Sequence of layers, one per feature position; each is run
        once over the input vectors via `begin_update`.

    Returns a function `precompute(input_vectors)`. Calling it runs every
    feature map over `input_vectors` and returns
    `chain(layerize(forward), model)`, where `forward` takes a batch of
    token-ID sequences and sums the cached rows they point at.
    '''
    def precompute(input_vectors):
        ops = model.ops
        cached, backprops = zip(*[lyr.begin_update(input_vectors)
                                  for lyr in feat_maps])
        n_feat = len(feat_maps)
        # Each output row is a sum of rows `cached[j][tok_i]`, so it has the
        # width of a cached row.
        # NOTE(review): assumes every feature map produces the same width
        # and that the last axis is the feature axis -- confirm.
        output_width = cached[0].shape[-1]
        # NOTE(review): assumes the last axis of `input_vectors` is the
        # vector width -- confirm against the caller.
        vec_width = input_vectors.shape[-1]

        def forward(batch_token_ids, drop=0.):
            batch_size = len(batch_token_ids)
            output = ops.allocate((batch_size, output_width))
            # i: batch index
            # j: position index (i.e. N0, S0, etc.)
            # tok_i: Index of the token within its document
            for i, token_ids in enumerate(batch_token_ids):
                for j, tok_i in enumerate(token_ids):
                    output[i] += cached[j][tok_i]

            def backward(d_vector, sgd=None):
                d_inputs = ops.allocate((batch_size, n_feat, vec_width))
                for i, token_ids in enumerate(batch_token_ids):
                    for j in range(len(token_ids)):
                        # NOTE(review): kept as originally written. This
                        # passes the whole batch gradient to the feature
                        # map's callback once per (i, j) pair; the intended
                        # per-token gradient routing should be confirmed.
                        d_inputs[i][j] = backprops[j](d_vector, sgd)
                # Return the IDs, so caller can associate to correct token
                return (batch_token_ids, d_inputs)

            return output, backward

        return chain(layerize(forward), model)
    return precompute


def print_shape(prefix):
    '''Create a pass-through layer.

    The forward pass returns its input unchanged, and the backward callback
    returns the gradient unchanged (ignoring any keyword arguments).
    NOTE: despite the name, nothing is printed and `prefix` is not used.
    '''
    def forward(X, drop=0.):
        def passthrough(dX, **kwargs):
            return dX
        return X, passthrough

    return layerize(forward)


@layerize
def get_token_vectors(tokens_attrs_vectors, drop=0.):
    '''Layer that picks the vectors out of a (tokens, attrs, vectors) triple.

    Forward: returns the third element of the input triple.
    Backward: returns `(tokens, d_output)`, pairing the gradient with the
    first element of the triple. The attrs element is not used.
    '''
    tokens, _attrs, vectors = tokens_attrs_vectors

    def pair_with_tokens(d_output, sgd=None):
        return (tokens, d_output)

    return vectors, pair_with_tokens


@layerize
def flatten(seqs, drop=0.):
    '''Layer that concatenates a list of sequences into a single array.

    Forward: converts each sequence with `ops.asarray` and concatenates them.
    Backward: returns the incoming gradient as-is (it is not split back into
    per-sequence pieces).
    '''
    ops = Model.ops
    arrays = []
    for seq in seqs:
        arrays.append(ops.asarray(seq))
    flat = ops.xp.concatenate(arrays)

    def finish_update(d_X, sgd=None):
        return d_X

    return flat, finish_update
Learning smoothly 2017-05-06 21:38:12 +03:00			`from thinc.api import add, layerize, chain, clone, concatenate, with_flatten`
Gradients look correct 2017-05-06 17:47:15 +03:00			`from thinc.neural import Model, Maxout, Softmax, Affine`
Draft up Parser model 2017-05-04 14:31:40 +03:00			`from thinc.neural._classes.hash_embed import HashEmbed`
Restore tok2vec function 2017-05-05 21:12:03 +03:00
			`from thinc.neural._classes.convolution import ExtractWindow`
			`from thinc.neural._classes.static_vectors import StaticVectors`
Learns things 2017-05-06 19:24:38 +03:00			`from thinc.neural._classes.batchnorm import BatchNorm`
Restore tok2vec function 2017-05-05 21:12:03 +03:00
Learns things 2017-05-06 18:37:36 +03:00			`from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP`
Draft up Parser model 2017-05-04 14:31:40 +03:00

			`def get_col(idx):`
			`def forward(X, drop=0.):`
Learning smoothly 2017-05-06 21:38:12 +03:00			`assert len(X.shape) <= 3`
Data running through, likely errors in model 2017-05-06 15:22:20 +03:00			`output = Model.ops.xp.ascontiguousarray(X[:, idx])`
Learning smoothly 2017-05-06 21:38:12 +03:00			`def backward(y, sgd=None):`
			`dX = Model.ops.allocate(X.shape)`
			`dX[:, idx] += y`
			`return dX`
			`return output, backward`
Draft up Parser model 2017-05-04 14:31:40 +03:00			`return layerize(forward)`


WIP on refactor, with hidde pre-computing 2017-05-07 03:02:43 +03:00			`def build_tok2vec(lang, width, depth=2, embed_size=1000):`
			`cols = [ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG]`
			`with Model.define_operators({'>>': chain, '\|': concatenate, '**': clone}):`
			`#static = get_col(cols.index(ID)) >> StaticVectors(lang, width)`
			`lower = get_col(cols.index(LOWER)) >> HashEmbed(width, embed_size)`
			`prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width//4, embed_size)`
			`suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width//4, embed_size)`
			`shape = get_col(cols.index(SHAPE)) >> HashEmbed(width//4, embed_size)`
			`tag = get_col(cols.index(TAG)) >> HashEmbed(width//2, embed_size)`
			`tok2vec = (`
			`doc2feats(cols)`
			`>> with_flatten(`
			`#(static \| prefix \| suffix \| shape)`
			`(lower \| prefix \| suffix \| shape \| tag)`
			`>> Maxout(width)`
			`>> (ExtractWindow(nW=1) >> Maxout(width, width*3))`
			`>> (ExtractWindow(nW=1) >> Maxout(width, width*3))`
			`)`
Gradients look correct 2017-05-06 17:47:15 +03:00			`)`
WIP on refactor, with hidde pre-computing 2017-05-07 03:02:43 +03:00			`return tok2vec`
Gradients look correct 2017-05-06 17:47:15 +03:00

WIP on refactor, with hidde pre-computing 2017-05-07 03:02:43 +03:00			`def doc2feats(cols):`
			`def forward(docs, drop=0.):`
			`feats = [doc.to_array(cols) for doc in docs]`
			`feats = [model.ops.asarray(f, dtype='uint64') for f in feats]`
			`return feats, None`
Gradients look correct 2017-05-06 17:47:15 +03:00			`model = layerize(forward)`
			`return model`


WIP on refactor, with hidde pre-computing 2017-05-07 03:02:43 +03:00			`def build_feature_precomputer(model, feat_maps):`
			`'''Allow a model to be "primed" by pre-computing input features in bulk.`
Learning smoothly 2017-05-06 21:38:12 +03:00
WIP on refactor, with hidde pre-computing 2017-05-07 03:02:43 +03:00			`This is used for the parser, where we want to take a batch of documents,`
			`and compute vectors for each (token, position) pair. These vectors can then`
			`be reused, especially for beam-search.`

			`Let's say we're using 12 features for each state, e.g. word at start of`
			`buffer, three words on stack, their children, etc. In the normal arc-eager`
			`system, a document of length N is processed in 2*N states. This means we'll`
			`create 2*N*12 feature vectors --- but if we pre-compute, we only need`
			`N*12 vector computations. The saving for beam-search is much better:`
			`if we have a beam of k, we'll normally make 2*N*12*K computations --`
			`so we can save the factor k. This also gives a nice CPU/GPU division:`
			`we can do all our hard maths up front, packed into large multiplications,`
			`and do the hard-to-program parsing on the CPU.`
			`'''`
			`def precompute(input_vectors):`
			`cached, backprops = zip(*[lyr.begin_update(input_vectors)`
			`for lyr in feat_maps)`
			`def forward(batch_token_ids, drop=0.):`
			`output = ops.allocate((batch_size, output_width))`
			`# i: batch index`
			`# j: position index (i.e. N0, S0, etc`
			`# tok_i: Index of the token within its document`
			`for i, token_ids in enumerate(batch_token_ids):`
			`for j, tok_i in enumerate(token_ids):`
			`output[i] += cached[j][tok_i]`
			`def backward(d_vector, sgd=None):`
			`d_inputs = ops.allocate((batch_size, n_feat, vec_width))`
			`for i, token_ids in enumerate(batch_token_ids):`
			`for j in range(len(token_ids)):`
			`d_inputs[i][j] = backprops[j](d_vector, sgd)`
			`# Return the IDs, so caller can associate to correct token`
			`return (batch_token_ids, d_inputs)`
			`return vector, backward`
			`return chain(layerize(forward), model)`
			`return precompute`
Learning smoothly 2017-05-06 21:38:12 +03:00

			`def print_shape(prefix):`
			`def forward(X, drop=0.):`
			`return X, lambda dX, **kwargs: dX`
			`return layerize(forward)`
WIP on refactor, with hidde pre-computing 2017-05-07 03:02:43 +03:00
Learning smoothly 2017-05-06 21:38:12 +03:00
			`@layerize`
			`def get_token_vectors(tokens_attrs_vectors, drop=0.):`
			`ops = Model.ops`
			`tokens, attrs, vectors = tokens_attrs_vectors`
			`def backward(d_output, sgd=None):`
			`return (tokens, d_output)`
			`return vectors, backward`


Data running through, likely errors in model 2017-05-06 15:22:20 +03:00			`@layerize`
			`def flatten(seqs, drop=0.):`
			`ops = Model.ops`
			`def finish_update(d_X, sgd=None):`
			`return d_X`
			`X = ops.xp.concatenate([ops.asarray(seq) for seq in seqs])`
			`return X, finish_update`