mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-27 10:26:35 +03:00
491 lines
17 KiB
Cython
491 lines
17 KiB
Cython
"""Feed-forward neural network, using Thenao."""
|
|
|
|
import os
|
|
import sys
|
|
import time
|
|
|
|
import numpy
|
|
|
|
import theano
|
|
import theano.tensor as T
|
|
import gzip
|
|
import cPickle
|
|
|
|
|
|
def load_data(dataset):
|
|
''' Loads the dataset
|
|
|
|
:type dataset: string
|
|
:param dataset: the path to the dataset (here MNIST)
|
|
'''
|
|
|
|
#############
|
|
# LOAD DATA #
|
|
#############
|
|
|
|
# Download the MNIST dataset if it is not present
|
|
data_dir, data_file = os.path.split(dataset)
|
|
if data_dir == "" and not os.path.isfile(dataset):
|
|
# Check if dataset is in the data directory.
|
|
new_path = os.path.join(
|
|
os.path.split(__file__)[0],
|
|
"..",
|
|
"data",
|
|
dataset
|
|
)
|
|
if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz':
|
|
dataset = new_path
|
|
|
|
if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz':
|
|
import urllib
|
|
origin = (
|
|
'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'
|
|
)
|
|
print 'Downloading data from %s' % origin
|
|
urllib.urlretrieve(origin, dataset)
|
|
|
|
print '... loading data'
|
|
|
|
# Load the dataset
|
|
f = gzip.open(dataset, 'rb')
|
|
train_set, valid_set, test_set = cPickle.load(f)
|
|
f.close()
|
|
#train_set, valid_set, test_set format: tuple(input, target)
|
|
#input is an numpy.ndarray of 2 dimensions (a matrix),
|
|
#each row corresponding to an example. target is a
|
|
#numpy.ndarray of 1 dimension (vector)) that have the same length as
|
|
#the number of rows in the input. It should give the target
|
|
#target to the example with the same index in the input.
|
|
|
|
def shared_dataset(data_xy, borrow=True):
|
|
""" Function that loads the dataset into shared variables
|
|
|
|
The reason we store our dataset in shared variables is to allow
|
|
Theano to copy it into the GPU memory (when code is run on GPU).
|
|
Since copying data into the GPU is slow, copying a minibatch everytime
|
|
is needed (the default behaviour if the data is not in a shared
|
|
variable) would lead to a large decrease in performance.
|
|
"""
|
|
data_x, data_y = data_xy
|
|
shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX),
|
|
borrow=borrow)
|
|
shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX),
|
|
borrow=borrow)
|
|
# When storing data on the GPU it has to be stored as floats
|
|
# therefore we will store the labels as ``floatX`` as well
|
|
# (``shared_y`` does exactly that). But during our computations
|
|
# we need them as ints (we use labels as index, and if they are
|
|
# floats it doesn't make sense) therefore instead of returning
|
|
# ``shared_y`` we will have to cast it to int. This little hack
|
|
# lets ous get around this issue
|
|
return shared_x, T.cast(shared_y, 'int32')
|
|
|
|
test_set_x, test_set_y = shared_dataset(test_set)
|
|
valid_set_x, valid_set_y = shared_dataset(valid_set)
|
|
train_set_x, train_set_y = shared_dataset(train_set)
|
|
|
|
rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y),
|
|
(test_set_x, test_set_y)]
|
|
return rval
|
|
|
|
|
|
class LogisticRegression(object):
|
|
"""Multi-class Logistic Regression Class
|
|
|
|
The logistic regression is fully described by a weight matrix :math:`W`
|
|
and bias vector :math:`b`. Classification is done by projecting data
|
|
points onto a set of hyperplanes, the distance to which is used to
|
|
determine a class membership probability.
|
|
"""
|
|
|
|
def __init__(self, input, n_in, n_out):
|
|
""" Initialize the parameters of the logistic regression
|
|
|
|
:type input: theano.tensor.TensorType
|
|
:param input: symbolic variable that describes the input of the
|
|
architecture (one minibatch)
|
|
|
|
:type n_in: int
|
|
:param n_in: number of input units, the dimension of the space in
|
|
which the datapoints lie
|
|
|
|
:type n_out: int
|
|
:param n_out: number of output units, the dimension of the space in
|
|
which the labels lie
|
|
|
|
"""
|
|
# start-snippet-1
|
|
# initialize with 0 the weights W as a matrix of shape (n_in, n_out)
|
|
self.W = theano.shared(
|
|
value=numpy.zeros((n_in, n_out),
|
|
dtype=theano.config.floatX
|
|
),
|
|
name='W',
|
|
borrow=True
|
|
)
|
|
# initialize the baises b as a vector of n_out 0s
|
|
self.b = theano.shared(
|
|
value=numpy.zeros(
|
|
(n_out,),
|
|
dtype=theano.config.floatX
|
|
),
|
|
name='b',
|
|
borrow=True
|
|
)
|
|
|
|
# symbolic expression for computing the matrix of class-membership
|
|
# probabilities
|
|
# Where:
|
|
# W is a matrix where column-k represent the separation hyper plain for
|
|
# class-k
|
|
# x is a matrix where row-j represents input training sample-j
|
|
# b is a vector where element-k represent the free parameter of hyper
|
|
# plain-k
|
|
self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)
|
|
|
|
# symbolic description of how to compute prediction as class whose
|
|
# probability is maximal
|
|
self.y_pred = T.argmax(self.p_y_given_x, axis=1)
|
|
# end-snippet-1
|
|
|
|
# parameters of the model
|
|
self.params = [self.W, self.b]
|
|
|
|
def neg_ll(self, y):
|
|
"""Return the mean of the negative log-likelihood of the prediction
|
|
of this model under a given target distribution.
|
|
|
|
.. math::
|
|
|
|
\frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
|
|
\frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|}
|
|
\log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
|
|
\ell (\theta=\{W,b\}, \mathcal{D})
|
|
|
|
:type y: theano.tensor.TensorType
|
|
:param y: corresponds to a vector that gives for each example the
|
|
correct label
|
|
|
|
Note: we use the mean instead of the sum so that
|
|
the learning rate is less dependent on the batch size
|
|
"""
|
|
# start-snippet-2
|
|
# y.shape[0] is (symbolically) the number of rows in y, i.e.,
|
|
# number of examples (call it n) in the minibatch
|
|
# T.arange(y.shape[0]) is a symbolic vector which will contain
|
|
# [0,1,2,... n-1] T.log(self.p_y_given_x) is a matrix of
|
|
# Log-Probabilities (call it LP) with one row per example and
|
|
# one column per class LP[T.arange(y.shape[0]),y] is a vector
|
|
# v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ...,
|
|
# LP[n-1,y[n-1]]] and T.mean(LP[T.arange(y.shape[0]),y]) is
|
|
# the mean (across minibatch examples) of the elements in v,
|
|
# i.e., the mean log-likelihood across the minibatch.
|
|
return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
|
|
# end-snippet-2
|
|
|
|
def errors(self, y):
|
|
"""Return a float representing the number of errors in the minibatch
|
|
over the total number of examples of the minibatch ; zero one
|
|
loss over the size of the minibatch
|
|
|
|
:type y: theano.tensor.TensorType
|
|
:param y: corresponds to a vector that gives for each example the
|
|
correct label
|
|
"""
|
|
|
|
# check if y has same dimension of y_pred
|
|
if y.ndim != self.y_pred.ndim:
|
|
raise TypeError(
|
|
'y should have the same shape as self.y_pred',
|
|
('y', y.type, 'y_pred', self.y_pred.type)
|
|
)
|
|
# check if y is of the correct datatype
|
|
if y.dtype.startswith('int'):
|
|
# the T.neq operator returns a vector of 0s and 1s, where 1
|
|
# represents a mistake in prediction
|
|
return T.mean(T.neq(self.y_pred, y))
|
|
else:
|
|
raise NotImplementedError()
|
|
|
|
|
|
# start-snippet-1
|
|
class HiddenLayer(object):
|
|
def __init__(self, rng, input, n_in, n_out, W=None, b=None,
|
|
activation=T.tanh):
|
|
"""
|
|
Typical hidden layer of a MLP: units are fully-connected and have
|
|
sigmoidal activation function. Weight matrix W is of shape (n_in,n_out)
|
|
and the bias vector b is of shape (n_out,).
|
|
|
|
NOTE : The nonlinearity used here is tanh
|
|
|
|
Hidden unit activation is given by: tanh(dot(input,W) + b)
|
|
|
|
:type rng: numpy.random.RandomState
|
|
:param rng: a random number generator used to initialize weights
|
|
|
|
:type input: theano.tensor.dmatrix
|
|
:param input: a symbolic tensor of shape (n_examples, n_in)
|
|
|
|
:type n_in: int
|
|
:param n_in: dimensionality of input
|
|
|
|
:type n_out: int
|
|
:param n_out: number of hidden units
|
|
|
|
:type activation: theano.Op or function
|
|
:param activation: Non linearity to be applied in the hidden
|
|
layer
|
|
"""
|
|
self.input = input
|
|
# end-snippet-1
|
|
|
|
# `W` is initialized with `W_values` which is uniformely sampled
|
|
# from sqrt(-6./(n_in+n_hidden)) and sqrt(6./(n_in+n_hidden))
|
|
# for tanh activation function
|
|
# the output of uniform if converted using asarray to dtype
|
|
# theano.config.floatX so that the code is runable on GPU
|
|
# Note : optimal initialization of weights is dependent on the
|
|
# activation function used (among other things).
|
|
# For example, results presented in [Xavier10] suggest that you
|
|
# should use 4 times larger initial weights for sigmoid
|
|
# compared to tanh
|
|
# We have no info for other function, so we use the same as
|
|
# tanh.
|
|
if W is None:
|
|
W_values = numpy.asarray(
|
|
rng.uniform(
|
|
low=-numpy.sqrt(6. / (n_in + n_out)),
|
|
high=numpy.sqrt(6. / (n_in + n_out)),
|
|
size=(n_in, n_out)
|
|
),
|
|
dtype=theano.config.floatX
|
|
)
|
|
if activation == theano.tensor.nnet.sigmoid:
|
|
W_values *= 4
|
|
|
|
W = theano.shared(value=W_values, name='W', borrow=True)
|
|
|
|
if b is None:
|
|
b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
|
|
b = theano.shared(value=b_values, name='b', borrow=True)
|
|
|
|
self.W = W
|
|
self.b = b
|
|
|
|
lin_output = T.dot(input, self.W) + self.b
|
|
self.output = (
|
|
lin_output if activation is None
|
|
else activation(lin_output)
|
|
)
|
|
# parameters of the model
|
|
self.params = [self.W, self.b]
|
|
|
|
|
|
# start-snippet-2
|
|
class MLP(object):
|
|
"""Multi-Layer Perceptron Class
|
|
|
|
A multilayer perceptron is a feedforward artificial neural network model
|
|
that has one layer or more of hidden units and nonlinear activations.
|
|
Intermediate layers usually have as activation function tanh or the
|
|
sigmoid function (defined here by a ``HiddenLayer`` class) while the
|
|
top layer is a softmax layer (defined here by a ``LogisticRegression``
|
|
class).
|
|
"""
|
|
|
|
def __init__(self, rng, input, n_in, n_hidden, n_out):
|
|
"""Initialize the parameters for the multilayer perceptron
|
|
|
|
:type rng: numpy.random.RandomState
|
|
:param rng: a random number generator used to initialize weights
|
|
|
|
:type input: theano.tensor.TensorType
|
|
:param input: symbolic variable that describes the input of the
|
|
architecture (one minibatch)
|
|
|
|
:type n_in: int
|
|
:param n_in: number of input units, the dimension of the space in
|
|
which the datapoints lie
|
|
|
|
:type n_hidden: int
|
|
:param n_hidden: number of hidden units
|
|
|
|
:type n_out: int
|
|
:param n_out: number of output units, the dimension of the space in
|
|
which the labels lie
|
|
|
|
"""
|
|
|
|
# Since we are dealing with a one hidden layer MLP, this will translate
|
|
# into a HiddenLayer with a tanh activation function connected to the
|
|
# LogisticRegression layer; the activation function can be replaced by
|
|
# sigmoid or any other nonlinear function
|
|
self.hidden = HiddenLayer(
|
|
rng=rng,
|
|
input=input,
|
|
n_in=n_in,
|
|
n_out=n_hidden,
|
|
activation=T.tanh
|
|
)
|
|
|
|
# The logistic regression layer gets as input the hidden units
|
|
# of the hidden layer
|
|
self.maxent = LogisticRegression(
|
|
input=self.hidden.output,
|
|
n_in=n_hidden,
|
|
n_out=n_out
|
|
)
|
|
# L1 norm ; one regularization option is to enforce L1 norm to
|
|
# be small
|
|
self.L1 = abs(self.hidden.W).sum() + abs(self.maxent.W).sum()
|
|
|
|
# square of L2 norm ; one regularization option is to enforce
|
|
# square of L2 norm to be small
|
|
self.L2_sqr = (self.hidden.W ** 2).sum() + (self.maxent.W ** 2).sum()
|
|
|
|
# negative log likelihood of the MLP is given by the negative
|
|
# log likelihood of the output of the model, computed in the
|
|
# logistic regression layer
|
|
self.neg_ll = self.maxent.neg_ll
|
|
# same holds for the function computing the number of errors
|
|
self.errors = self.maxent.errors
|
|
|
|
# the parameters of the model are the parameters of the two layer it is
|
|
# made out of
|
|
self.params = self.hidden.params + self.maxent.params
|
|
|
|
|
|
|
|
|
|
def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
|
|
dataset='mnist.pkl.gz', batch_size=1, n_hidden=500):
|
|
"""
|
|
Demonstrate stochastic gradient descent optimization for a multilayer
|
|
perceptron
|
|
|
|
This is demonstrated on MNIST.
|
|
|
|
:type learning_rate: float
|
|
:param learning_rate: learning rate used (factor for the stochastic
|
|
gradient
|
|
|
|
:type L1_reg: float
|
|
:param L1_reg: L1-norm's weight when added to the cost (see
|
|
regularization)
|
|
|
|
:type L2_reg: float
|
|
:param L2_reg: L2-norm's weight when added to the cost (see
|
|
regularization)
|
|
|
|
:type n_epochs: int
|
|
:param n_epochs: maximal number of epochs to run the optimizer
|
|
|
|
:type dataset: string
|
|
:param dataset: the path of the MNIST dataset file from
|
|
http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
|
|
"""
|
|
datasets = load_data(dataset)
|
|
|
|
train_set_x, train_set_y = datasets[0]
|
|
valid_set_x, valid_set_y = datasets[1]
|
|
test_set_x, test_set_y = datasets[2]
|
|
|
|
######################
|
|
# BUILD ACTUAL MODEL #
|
|
######################
|
|
print '... building the model'
|
|
|
|
# allocate symbolic variables for the data
|
|
index = T.lscalar() # index to a [mini]batch
|
|
x = T.matrix('x') # the data is presented as rasterized images
|
|
y = T.ivector('y') # the labels are presented as 1D vector of
|
|
# [int] labels
|
|
|
|
rng = numpy.random.RandomState(1234)
|
|
|
|
# construct the MLP class
|
|
mlp = MLP(
|
|
rng=rng,
|
|
input=x,
|
|
n_in=28 * 28,
|
|
n_hidden=n_hidden,
|
|
n_out=10
|
|
)
|
|
|
|
# the cost we minimize during training is the negative log likelihood of
|
|
# the model plus the regularization terms (L1 and L2); cost is expressed
|
|
# here symbolically
|
|
|
|
# compiling a Theano function that computes the mistakes that are made
|
|
# by the model on a minibatch
|
|
test_model = theano.function(
|
|
inputs=[index],
|
|
outputs=mlp.maxent.errors(y),
|
|
givens={
|
|
x: test_set_x[index:index+1],
|
|
y: test_set_y[index:index+1]
|
|
}
|
|
)
|
|
|
|
validate_model = theano.function(
|
|
inputs=[index],
|
|
outputs=mlp.maxent.errors(y),
|
|
givens={
|
|
x: valid_set_x[index:index+1],
|
|
y: valid_set_y[index:index+1]
|
|
}
|
|
)
|
|
|
|
# compute the gradient of cost with respect to theta (sotred in params)
|
|
# the resulting gradients will be stored in a list gparams
|
|
cost = mlp.neg_ll(y) + L1_reg * mlp.L1 + L2_reg * mlp.L2_sqr
|
|
gparams = [T.grad(cost, param) for param in mlp.params]
|
|
|
|
# specify how to update the parameters of the model as a list of
|
|
# (variable, update expression) pairs
|
|
|
|
updates = [(mlp.params[i], mlp.params[i] - (learning_rate * gparams[i]))
|
|
for i in xrange(len(gparams))]
|
|
|
|
# compiling a Theano function `train_model` that returns the cost, but
|
|
# in the same time updates the parameter of the model based on the rules
|
|
# defined in `updates`
|
|
train_model = theano.function(
|
|
inputs=[index],
|
|
outputs=cost,
|
|
updates=updates,
|
|
givens={
|
|
x: train_set_x[index:index+1],
|
|
y: train_set_y[index:index+1]
|
|
}
|
|
)
|
|
# end-snippet-5
|
|
|
|
###############
|
|
# TRAIN MODEL #
|
|
###############
|
|
print '... training'
|
|
|
|
start_time = time.clock()
|
|
|
|
n_examples = train_set_x.get_value(borrow=True).shape[0]
|
|
n_dev_examples = valid_set_x.get_value(borrow=True).shape[0]
|
|
n_test_examples = test_set_x.get_value(borrow=True).shape[0]
|
|
|
|
for epoch in range(1, n_epochs+1):
|
|
for idx in xrange(n_examples):
|
|
train_model(idx)
|
|
# compute zero-one loss on validation set
|
|
error = numpy.mean(map(validate_model, xrange(n_dev_examples)))
|
|
print('epoch %i, validation error %f %%' % (epoch, error * 100))
|
|
|
|
end_time = time.clock()
|
|
print >> sys.stderr, ('The code for file ' +
|
|
os.path.split(__file__)[1] +
|
|
' ran for %.2fm' % ((end_time - start_time) / 60.))
|
|
|
|
|
|
if __name__ == '__main__':
|
|
test_mlp()
|