From e33b7e0b3c8f7a205e093ff481a8d6bc6b402eb9 Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 27 Oct 2017 14:39:30 +0200 Subject: [PATCH] Tidy up parser and ML --- spacy/_ml.py | 295 +++++++++---------------------------- spacy/syntax/nn_parser.pyx | 59 +++----- 2 files changed, 94 insertions(+), 260 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 4c4e36412..89324b3b3 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -1,47 +1,42 @@ -import ujson -from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU +# coding: utf8 +from __future__ import unicode_literals + +import numpy +from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu from thinc.i2v import HashEmbed, StaticVectors from thinc.t2t import ExtractWindow, ParametricAttention -from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool +from thinc.t2v import Pooling, sum_pool from thinc.misc import Residual -from thinc.misc import BatchNorm as BN from thinc.misc import LayerNorm as LN from thinc.api import add, layerize, chain, clone, concatenate, with_flatten -from thinc.api import FeatureExtracter, with_getitem -from thinc.api import uniqued, wrap, flatten_add_lengths, noop +from thinc.api import FeatureExtracter, with_getitem, flatten_add_lengths +from thinc.api import uniqued, wrap, noop from thinc.linear.linear import LinearModel from thinc.neural.ops import NumpyOps, CupyOps from thinc.neural.util import get_array_module -import random -import cytoolz - from thinc import describe from thinc.describe import Dimension, Synapses, Biases, Gradient from thinc.neural._classes.affine import _set_dimensions_if_needed import thinc.extra.load_nlp -from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER -from .tokens.doc import Doc +from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE from . import util -import numpy -import io - -# TODO: Unset this once we don't want to support models previous models. -import thinc.neural._classes.layernorm -thinc.neural._classes.layernorm.set_compat_six_eight(False) VECTORS_KEY = 'spacy_pretrained_vectors' + @layerize def _flatten_add_lengths(seqs, pad=0, drop=0.): ops = Model.ops lengths = ops.asarray([len(seq) for seq in seqs], dtype='i') + def finish_update(d_X, sgd=None): return ops.unflatten(d_X, lengths, pad=pad) + X = ops.flatten(seqs, pad=pad) return (X, lengths), finish_update @@ -55,33 +50,14 @@ def _logistic(X, drop=0.): X = xp.minimum(X, 10., X) X = xp.maximum(X, -10., X) Y = 1. / (1. + xp.exp(-X)) + def logistic_bwd(dY, sgd=None): dX = dY * (Y * (1-Y)) return dX + return Y, logistic_bwd -@layerize -def add_tuples(X, drop=0.): - """Give inputs of sequence pairs, where each sequence is (vals, length), - sum the values, returning a single sequence. - - If input is: - ((vals1, length), (vals2, length) - Output is: - (vals1+vals2, length) - - vals are a single tensor for the whole batch. 
- """ - (vals1, length1), (vals2, length2) = X - assert length1 == length2 - - def add_tuples_bwd(dY, sgd=None): - return (dY, dY) - - return (vals1+vals2, length), add_tuples_bwd - - def _zero_init(model): def _zero_init_impl(self, X, y): self.W.fill(0) @@ -115,13 +91,12 @@ def _init_for_precomputed(W, ops): nF=Dimension("Number of features"), nO=Dimension("Output size"), W=Synapses("Weights matrix", - lambda obj: (obj.nF, obj.nO, obj.nI), - lambda W, ops: _init_for_precomputed(W, ops)), + lambda obj: (obj.nF, obj.nO, obj.nI), + lambda W, ops: _init_for_precomputed(W, ops)), b=Biases("Bias vector", - lambda obj: (obj.nO,)), + lambda obj: (obj.nO,)), d_W=Gradient("W"), - d_b=Gradient("b") -) + d_b=Gradient("b")) class PrecomputableAffine(Model): def __init__(self, nO=None, nI=None, nF=None, **kwargs): Model.__init__(self, **kwargs) @@ -134,18 +109,19 @@ class PrecomputableAffine(Model): # Yf: (b, f, i) # dY: (b, o) # dYf: (b, f, o) - #Yf = numpy.einsum('bi,foi->bfo', X, self.W) + # Yf = numpy.einsum('bi,foi->bfo', X, self.W) Yf = self.ops.xp.tensordot( X, self.W, axes=[[1], [2]]) Yf += self.b + def backward(dY_ids, sgd=None): tensordot = self.ops.xp.tensordot dY, ids = dY_ids Xf = X[ids] - #dXf = numpy.einsum('bo,foi->bfi', dY, self.W) + # dXf = numpy.einsum('bo,foi->bfi', dY, self.W) dXf = tensordot(dY, self.W, axes=[[1], [1]]) - #dW = numpy.einsum('bo,bfi->ofi', dY, Xf) + # dW = numpy.einsum('bo,bfi->ofi', dY, Xf) dW = tensordot(dY, Xf, axes=[[0], [0]]) # ofi -> foi self.d_W += dW.transpose((1, 0, 2)) @@ -154,6 +130,7 @@ class PrecomputableAffine(Model): if sgd is not None: sgd(self._mem.weights, self._mem.gradient, key=self.id) return dXf + return Yf, backward @@ -164,13 +141,12 @@ class PrecomputableAffine(Model): nP=Dimension("Number of pieces"), nO=Dimension("Output size"), W=Synapses("Weights matrix", - lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI), - lambda W, ops: ops.xavier_uniform_init(W)), + lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI), + lambda W, ops: ops.xavier_uniform_init(W)), b=Biases("Bias vector", - lambda obj: (obj.nO, obj.nP)), + lambda obj: (obj.nO, obj.nP)), d_W=Gradient("W"), - d_b=Gradient("b") -) + d_b=Gradient("b")) class PrecomputableMaxouts(Model): def __init__(self, nO=None, nI=None, nF=None, nP=3, **kwargs): Model.__init__(self, **kwargs) @@ -186,114 +162,26 @@ class PrecomputableMaxouts(Model): # dYp: (b, o, p) # W: (f, o, p, i) # b: (o, p) - # bi,opfi->bfop # bop,fopi->bfi # bop,fbi->opfi : fopi - tensordot = self.ops.xp.tensordot - ascontiguous = self.ops.xp.ascontiguousarray - Yfp = tensordot(X, self.W, axes=[[1], [3]]) Yfp += self.b def backward(dYp_ids, sgd=None): dYp, ids = dYp_ids Xf = X[ids] - - dXf = tensordot(dYp, self.W, axes=[[1, 2], [1,2]]) + dXf = tensordot(dYp, self.W, axes=[[1, 2], [1, 2]]) dW = tensordot(dYp, Xf, axes=[[0], [0]]) - self.d_W += dW.transpose((2, 0, 1, 3)) self.d_b += dYp.sum(axis=0) - if sgd is not None: sgd(self._mem.weights, self._mem.gradient, key=self.id) return dXf + return Yfp, backward -# Thinc's Embed class is a bit broken atm, so drop this here. 
-from thinc import describe -from thinc.neural._classes.embed import _uniform_init - - -@describe.attributes( - nV=describe.Dimension("Number of vectors"), - nO=describe.Dimension("Size of output"), - vectors=describe.Weights("Embedding table", - lambda obj: (obj.nV, obj.nO), - _uniform_init(-0.1, 0.1) - ), - d_vectors=describe.Gradient("vectors") -) -class Embed(Model): - name = 'embed' - - def __init__(self, nO, nV=None, **kwargs): - if nV is not None: - nV += 1 - Model.__init__(self, **kwargs) - if 'name' in kwargs: - self.name = kwargs['name'] - self.column = kwargs.get('column', 0) - self.nO = nO - self.nV = nV - - def predict(self, ids): - if ids.ndim == 2: - ids = ids[:, self.column] - return self.ops.xp.ascontiguousarray(self.vectors[ids], dtype='f') - - def begin_update(self, ids, drop=0.): - if ids.ndim == 2: - ids = ids[:, self.column] - vectors = self.ops.xp.ascontiguousarray(self.vectors[ids], dtype='f') - def backprop_embed(d_vectors, sgd=None): - n_vectors = d_vectors.shape[0] - self.ops.scatter_add(self.d_vectors, ids, d_vectors) - if sgd is not None: - sgd(self._mem.weights, self._mem.gradient, key=self.id) - return None - return vectors, backprop_embed - - -def HistoryFeatures(nr_class, hist_size=8, nr_dim=8): - '''Wrap a model, adding features representing action history.''' - if hist_size == 0: - return layerize(noop()) - embed_tables = [Embed(nr_dim, nr_class, column=i, name='embed%d') - for i in range(hist_size)] - embed = chain(concatenate(*embed_tables), - LN(Maxout(hist_size*nr_dim, hist_size*nr_dim))) - ops = embed.ops - def add_history_fwd(vectors_hists, drop=0.): - vectors, hist_ids = vectors_hists - hist_feats, bp_hists = embed.begin_update(hist_ids, drop=drop) - outputs = ops.xp.hstack((vectors, hist_feats)) - - def add_history_bwd(d_outputs, sgd=None): - d_vectors = d_outputs[:, :vectors.shape[1]] - d_hists = d_outputs[:, vectors.shape[1]:] - bp_hists(d_hists, sgd=sgd) - return embed.ops.xp.ascontiguousarray(d_vectors) - return outputs, add_history_bwd - return wrap(add_history_fwd, embed) - - -def drop_layer(layer, factor=2.): - def drop_layer_fwd(X, drop=0.): - if drop <= 0.: - return layer.begin_update(X, drop=drop) - else: - coinflip = layer.ops.xp.random.random() - if (coinflip / factor) >= drop: - return layer.begin_update(X, drop=drop) - else: - return X, lambda dX, sgd=None: dX - - model = wrap(drop_layer_fwd, layer) - model.predict = layer - return model def link_vectors_to_models(vocab): vectors = vocab.vectors @@ -308,16 +196,21 @@ def link_vectors_to_models(vocab): # (unideal, I know) thinc.extra.load_nlp.VECTORS[(ops.device, VECTORS_KEY)] = data + def Tok2Vec(width, embed_size, **kwargs): pretrained_dims = kwargs.get('pretrained_dims', 0) cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2) cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] - with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add, - '*': reapply}): - norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm') - prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix') - suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix') - shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape') + with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, + '+': add, '*': reapply}): + norm = HashEmbed(width, embed_size, column=cols.index(NORM), + name='embed_norm') + prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), + 
name='embed_prefix') + suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), + name='embed_suffix') + shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), + name='embed_shape') if pretrained_dims is not None and pretrained_dims >= 1: glove = StaticVectors(VECTORS_KEY, width, column=cols.index(ID)) @@ -329,7 +222,6 @@ def Tok2Vec(width, embed_size, **kwargs): (norm | prefix | suffix | shape) >> LN(Maxout(width, width*4, pieces=3)), column=5) - convolution = Residual( ExtractWindow(nW=1) >> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces)) @@ -354,6 +246,7 @@ def reapply(layer, n_times): Y, backprop = layer.begin_update(X, drop=drop) X = Y backprops.append(backprop) + def reapply_bwd(dY, sgd=None): dX = None for backprop in reversed(backprops): @@ -363,39 +256,20 @@ def reapply(layer, n_times): else: dX += dY return dX + return Y, reapply_bwd return wrap(reapply_fwd, layer) - - def asarray(ops, dtype): def forward(X, drop=0.): return ops.asarray(X, dtype=dtype), None return layerize(forward) -def foreach(layer): - def forward(Xs, drop=0.): - results = [] - backprops = [] - for X in Xs: - result, bp = layer.begin_update(X, drop=drop) - results.append(result) - backprops.append(bp) - def backward(d_results, sgd=None): - dXs = [] - for d_result, backprop in zip(d_results, backprops): - dXs.append(backprop(d_result, sgd)) - return dXs - return results, backward - model = layerize(forward) - model._layers.append(layer) - return model - - def rebatch(size, layer): ops = layer.ops + def forward(X, drop=0.): if X.shape[0] < size: return layer.begin_update(X) @@ -403,6 +277,7 @@ def rebatch(size, layer): results, bp_results = zip(*[layer.begin_update(p, drop=drop) for p in parts]) y = ops.flatten(results) + def backward(dy, sgd=None): d_parts = [bp(y, sgd=sgd) for bp, y in zip(bp_results, _divide_array(dy, size))] @@ -413,6 +288,7 @@ def rebatch(size, layer): except ValueError: dX = None return dX + return y, backward model = layerize(forward) model._layers.append(layer) @@ -423,13 +299,14 @@ def _divide_array(X, size): parts = [] index = 0 while index < len(X): - parts.append(X[index : index + size]) + parts.append(X[index:index + size]) index += size return parts def get_col(idx): assert idx >= 0, idx + def forward(X, drop=0.): assert idx >= 0, idx if isinstance(X, numpy.ndarray): @@ -437,30 +314,28 @@ def get_col(idx): else: ops = CupyOps() output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype) + def backward(y, sgd=None): assert idx >= 0, idx dX = ops.allocate(X.shape) dX[:, idx] += y return dX + return output, backward + return layerize(forward) -def zero_init(model): - def _hook(self, X, y=None): - self.W.fill(0) - model.on_data_hooks.append(_hook) - return model - - def doc2feats(cols=None): if cols is None: cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] + def forward(docs, drop=0.): feats = [] for doc in docs: feats.append(doc.to_array(cols)) return feats, None + model = layerize(forward) model.cols = cols return model @@ -474,28 +349,14 @@ def print_shape(prefix): @layerize def get_token_vectors(tokens_attrs_vectors, drop=0.): - ops = Model.ops tokens, attrs, vectors = tokens_attrs_vectors + def backward(d_output, sgd=None): return (tokens, d_output) + return vectors, backward -@layerize -def flatten(seqs, drop=0.): - if isinstance(seqs[0], numpy.ndarray): - ops = NumpyOps() - elif hasattr(CupyOps.xp, 'ndarray') and isinstance(seqs[0], CupyOps.xp.ndarray): - ops = CupyOps() - else: - raise ValueError("Unable to flatten sequence of type %s" % type(seqs[0])) 
- lengths = [len(seq) for seq in seqs] - def finish_update(d_X, sgd=None): - return ops.unflatten(d_X, lengths) - X = ops.xp.vstack(seqs) - return X, finish_update - - @layerize def logistic(X, drop=0.): xp = get_array_module(X) @@ -505,9 +366,11 @@ def logistic(X, drop=0.): X = xp.minimum(X, 10., X) X = xp.maximum(X, -10., X) Y = 1. / (1. + xp.exp(-X)) + def logistic_bwd(dY, sgd=None): dX = dY * (Y * (1-Y)) return dX + return Y, logistic_bwd @@ -517,6 +380,7 @@ def zero_init(model): model.on_data_hooks.append(_zero_init_impl) return model + @layerize def preprocess_doc(docs, drop=0.): keys = [doc.to_array([LOWER]) for doc in docs] @@ -526,11 +390,13 @@ def preprocess_doc(docs, drop=0.): vals = ops.allocate(keys.shape[0]) + 1 return (keys, vals, lengths), None + def getitem(i): def getitem_fwd(X, drop=0.): return X[i], None return layerize(getitem_fwd) + def build_tagger_model(nr_class, **cfg): embed_size = util.env_opt('embed_size', 7000) if 'token_vector_width' in cfg: @@ -555,8 +421,6 @@ def build_tagger_model(nr_class, **cfg): @layerize def SpacyVectors(docs, drop=0.): - xp = get_array_module(docs[0].vocab.vectors.data) - width = docs[0].vocab.vectors.data.shape[1] batch = [] for doc in docs: indices = numpy.zeros((len(doc),), dtype='i') @@ -570,29 +434,6 @@ def SpacyVectors(docs, drop=0.): return batch, None -def foreach(layer, drop_factor=1.0): - '''Map a layer across elements in a list''' - def foreach_fwd(Xs, drop=0.): - drop *= drop_factor - ys = [] - backprops = [] - for X in Xs: - y, bp_y = layer.begin_update(X, drop=drop) - ys.append(y) - backprops.append(bp_y) - def foreach_bwd(d_ys, sgd=None): - d_Xs = [] - for d_y, bp_y in zip(d_ys, backprops): - if bp_y is not None and bp_y is not None: - d_Xs.append(d_y, sgd=sgd) - else: - d_Xs.append(None) - return d_Xs - return ys, foreach_bwd - model = wrap(foreach_fwd, layer) - return model - - def build_text_classifier(nr_class, width=64, **cfg): nr_vector = cfg.get('nr_vector', 5000) pretrained_dims = cfg.get('pretrained_dims', 0) @@ -602,9 +443,7 @@ def build_text_classifier(nr_class, width=64, **cfg): model = ( SpacyVectors >> flatten_add_lengths - >> with_getitem(0, - Affine(width, pretrained_dims) - ) + >> with_getitem(0, Affine(width, pretrained_dims)) >> ParametricAttention(width) >> Pooling(sum_pool) >> Residual(ReLu(width, width)) ** 2 @@ -613,7 +452,6 @@ def build_text_classifier(nr_class, width=64, **cfg): ) return model - lower = HashEmbed(width, nr_vector, column=1) prefix = HashEmbed(width//2, nr_vector, column=2) suffix = HashEmbed(width//2, nr_vector, column=3) @@ -671,33 +509,40 @@ def build_text_classifier(nr_class, width=64, **cfg): model.lsuv = False return model + @layerize def flatten(seqs, drop=0.): ops = Model.ops lengths = ops.asarray([len(seq) for seq in seqs], dtype='i') + def finish_update(d_X, sgd=None): return ops.unflatten(d_X, lengths, pad=0) + X = ops.flatten(seqs, pad=0) return X, finish_update -def concatenate_lists(*layers, **kwargs): # pragma: no cover - '''Compose two or more models `f`, `g`, etc, such that their outputs are +def concatenate_lists(*layers, **kwargs): # pragma: no cover + """Compose two or more models `f`, `g`, etc, such that their outputs are concatenated, i.e. 
`concatenate(f, g)(x)` computes `hstack(f(x), g(x))` - ''' + """ if not layers: return noop() drop_factor = kwargs.get('drop_factor', 1.0) ops = layers[0].ops layers = [chain(layer, flatten) for layer in layers] concat = concatenate(*layers) + def concatenate_lists_fwd(Xs, drop=0.): drop *= drop_factor lengths = ops.asarray([len(X) for X in Xs], dtype='i') flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop) ys = ops.unflatten(flat_y, lengths) + def concatenate_lists_bwd(d_ys, sgd=None): return bp_flat_y(ops.flatten(d_ys), sgd=sgd) + return ys, concatenate_lists_bwd + model = wrap(concatenate_lists_fwd, concat) return model diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index c592cdc22..12332ab25 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -49,9 +49,8 @@ from .. import util from ..util import get_async, get_cuda_stream from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts from .._ml import Tok2Vec, doc2feats, rebatch -from .._ml import Residual, drop_layer, flatten +from .._ml import Residual, flatten from .._ml import link_vectors_to_models -from .._ml import HistoryFeatures from ..compat import json_dumps, copy_array from .stateclass cimport StateClass @@ -77,7 +76,7 @@ def set_debug(val): cdef class precompute_hiddens: - '''Allow a model to be "primed" by pre-computing input features in bulk. + """Allow a model to be "primed" by pre-computing input features in bulk. This is used for the parser, where we want to take a batch of documents, and compute vectors for each (token, position) pair. These vectors can then @@ -92,7 +91,7 @@ cdef class precompute_hiddens: so we can save the factor k. This also gives a nice CPU/GPU division: we can do all our hard maths up front, packed into large multiplications, and do the hard-to-program parsing on the CPU. - ''' + """ cdef int nF, nO, nP cdef bint _is_synchronized cdef public object ops @@ -280,23 +279,19 @@ cdef class Parser: return (tok2vec, lower, upper), cfg def __init__(self, Vocab vocab, moves=True, model=True, **cfg): - """ - Create a Parser. + """Create a Parser. - Arguments: - vocab (Vocab): - The vocabulary object. Must be shared with documents to be processed. - The value is set to the .vocab attribute. - moves (TransitionSystem): - Defines how the parse-state is created, updated and evaluated. - The value is set to the .moves attribute unless True (default), - in which case a new instance is created with Parser.Moves(). - model (object): - Defines how the parse-state is created, updated and evaluated. - The value is set to the .model attribute unless True (default), - in which case a new instance is created with Parser.Model(). - **cfg: - Arbitrary configuration parameters. Set to the .cfg attribute + vocab (Vocab): The vocabulary object. Must be shared with documents + to be processed. The value is set to the `.vocab` attribute. + moves (TransitionSystem): Defines how the parse-state is created, + updated and evaluated. The value is set to the .moves attribute + unless True (default), in which case a new instance is created with + `Parser.Moves()`. + model (object): Defines how the parse-state is created, updated and + evaluated. The value is set to the .model attribute unless True + (default), in which case a new instance is created with + `Parser.Model()`. + **cfg: Arbitrary configuration parameters. 
Set to the `.cfg` attribute """ self.vocab = vocab if moves is True: @@ -322,13 +317,10 @@ cdef class Parser: return (Parser, (self.vocab, self.moves, self.model), None, None) def __call__(self, Doc doc, beam_width=None, beam_density=None): - """ - Apply the parser or entity recognizer, setting the annotations onto the Doc object. + """Apply the parser or entity recognizer, setting the annotations onto + the `Doc` object. - Arguments: - doc (Doc): The document to be processed. - Returns: - None + doc (Doc): The document to be processed. """ if beam_width is None: beam_width = self.cfg.get('beam_width', 1) @@ -350,16 +342,13 @@ cdef class Parser: def pipe(self, docs, int batch_size=256, int n_threads=2, beam_width=None, beam_density=None): - """ - Process a stream of documents. + """Process a stream of documents. - Arguments: - stream: The sequence of documents to process. - batch_size (int): - The number of documents to accumulate into a working set. - n_threads (int): - The number of threads with which to work on the buffer in parallel. - Yields (Doc): Documents, in order. + stream: The sequence of documents to process. + batch_size (int): Number of documents to accumulate into a working set. + n_threads (int): The number of threads with which to work on the buffer + in parallel. + YIELDS (Doc): Documents, in order. """ if beam_width is None: beam_width = self.cfg.get('beam_width', 1)
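
Editor's note on the tensordot calls in `PrecomputableAffine.begin_update`: the einsum expressions kept in the comments describe the same contractions as the `axes` arguments. The snippet below is a standalone sanity check in plain numpy with made-up small dimensions (`b`, `f`, `o`, `i` are illustrative only, not spaCy API) confirming that correspondence.

# Check that the tensordot calls in PrecomputableAffine.begin_update match
# the einsum forms left in the comments (shapes follow the docstring there).
import numpy

b, f, o, i = 4, 3, 5, 6
X = numpy.random.randn(b, i)       # input vectors
W = numpy.random.randn(f, o, i)    # weights: (features, outputs, inputs)
dY = numpy.random.randn(b, o)      # gradient w.r.t. outputs
Xf = numpy.random.randn(b, f, i)   # gathered input features

# Forward: Yf = numpy.einsum('bi,foi->bfo', X, W)
assert numpy.allclose(numpy.tensordot(X, W, axes=[[1], [2]]),
                      numpy.einsum('bi,foi->bfo', X, W))
# Backward, inputs: dXf = numpy.einsum('bo,foi->bfi', dY, W)
assert numpy.allclose(numpy.tensordot(dY, W, axes=[[1], [1]]),
                      numpy.einsum('bo,foi->bfi', dY, W))
# Backward, weights: dW = numpy.einsum('bo,bfi->ofi', dY, Xf);
# the code then transposes (1, 0, 2) to reorder ofi -> foi.
assert numpy.allclose(numpy.tensordot(dY, Xf, axes=[[0], [0]]),
                      numpy.einsum('bo,bfi->ofi', dY, Xf))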
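
Several helpers in `_ml.py` (`_flatten_add_lengths` and the `flatten` layer) rely on the same flatten-with-lengths pattern: variable-length sequences are packed into one array for the forward pass, and the stored lengths split the gradient back out in the backward pass. A small self-contained sketch of that contract, in plain numpy rather than thinc's ops and with the `pad` handling omitted:

# Minimal sketch of the flatten/unflatten pattern (plain numpy, no padding).
import numpy

def flatten_add_lengths(seqs):
    lengths = numpy.asarray([len(seq) for seq in seqs], dtype='i')
    X = numpy.vstack(seqs)                      # (sum(lengths), width)

    def finish_update(d_X):
        # Split the flat gradient back into one piece per input sequence.
        starts = numpy.cumsum(lengths)[:-1]
        return numpy.split(d_X, starts, axis=0)

    return (X, lengths), finish_update

seqs = [numpy.ones((n, 4), dtype='f') for n in (2, 3, 1)]
(X, lengths), backprop = flatten_add_lengths(seqs)
assert X.shape == (6, 4) and list(lengths) == [2, 3, 1]
d_seqs = backprop(numpy.zeros_like(X))
assert [d.shape[0] for d in d_seqs] == [2, 3, 1]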
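
The `precompute_hiddens` docstring describes doing the expensive input-layer multiplication once per batch of tokens, so that each parser state only needs a cheap gather-and-sum. A minimal sketch of that idea, assuming toy shapes and plain numpy; names like `n_tokens` and `state_ids` are illustrative and stand in for the parser's real state machinery:

# Sketch of the precomputation trick from the precompute_hiddens docstring.
import numpy

n_tokens, n_feats, n_in, n_out = 10, 3, 8, 16
X = numpy.random.randn(n_tokens, n_in).astype('f')        # token vectors
W = numpy.random.randn(n_feats, n_out, n_in).astype('f')  # per-slot weights
b = numpy.zeros((n_out,), dtype='f')

# Do the expensive multiplication once, up front:
# Yf[t, f] holds token t's contribution when it fills feature slot f.
Yf = numpy.tensordot(X, W, axes=[[1], [2]])               # (n_tokens, n_feats, n_out)

# At parse time, a state just names which token fills each slot; its hidden
# vector is a cheap gather-and-sum, with no further matrix multiplication.
state_ids = numpy.array([2, 5, 7])                        # one token id per slot
hidden = Yf[state_ids, numpy.arange(n_feats)].sum(axis=0) + b

# Equivalent direct computation, for comparison:
direct = sum(W[f] @ X[state_ids[f]] for f in range(n_feats)) + b
assert numpy.allclose(hidden, direct, atol=1e-4)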