mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-25 00:34:20 +03:00
* Remove cruft module
This commit is contained in:
parent
fb0a641a2d
commit
f8d6d319f4
490
spacy/_bu_nn.pyx
490
spacy/_bu_nn.pyx
|
@ -1,490 +0,0 @@
|
|||
"""Feed-forward neural network, using Theano."""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
import numpy
|
||||
|
||||
import theano
|
||||
import theano.tensor as T
|
||||
import gzip
|
||||
import cPickle
|
||||
|
||||
|
||||
def load_data(dataset):
    """Load a gzipped, pickled dataset into Theano shared variables.

    A bare file name (no directory part) that is not a file in the current
    directory is looked up in ``../data`` relative to this module.  If the
    file is named ``mnist.pkl.gz`` and still cannot be found, it is
    downloaded to that path first.

    :type dataset: string
    :param dataset: the path to the dataset (here MNIST)

    :return: ``[(train_x, train_y), (valid_x, valid_y), (test_x, test_y)]``
        where every ``*_x`` is a ``theano.shared`` variable and every
        ``*_y`` is the shared label variable cast to ``int32``.
    """
    data_dir, data_file = os.path.split(dataset)
    is_mnist = data_file == 'mnist.pkl.gz'

    if data_dir == "" and not os.path.isfile(dataset):
        # Fall back to the data directory that sits next to this package.
        new_path = os.path.join(
            os.path.split(__file__)[0], "..", "data", dataset
        )
        if os.path.isfile(new_path) or is_mnist:
            dataset = new_path

    # Download the MNIST dataset if it is not present.
    if is_mnist and not os.path.isfile(dataset):
        import urllib
        origin = (
            'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'
        )
        print('Downloading data from %s' % origin)
        urllib.urlretrieve(origin, dataset)

    print('... loading data')

    # NOTE(security): unpickling executes arbitrary code embedded in the
    # file, and the MNIST archive above is fetched over plain HTTP.  Only
    # point this at files you trust.
    # The ``with`` block closes the gzip handle even if unpickling fails.
    with gzip.open(dataset, 'rb') as f:
        train_set, valid_set, test_set = cPickle.load(f)
    # train_set, valid_set, test_set format: tuple(input, target)
    # input is a numpy.ndarray of 2 dimensions (a matrix), each row
    # corresponding to an example.  target is a numpy.ndarray of
    # 1 dimension (a vector) that has the same length as the number of rows
    # in the input.  It gives the target for the example with the same
    # index in the input.

    def shared_dataset(data_xy, borrow=True):
        """Load one ``(input, target)`` split into shared variables.

        The reason we store our dataset in shared variables is to allow
        Theano to copy it into the GPU memory (when code is run on GPU).
        Since copying data into the GPU is slow, copying a minibatch every
        time it is needed (the default behaviour if the data is not in a
        shared variable) would lead to a large decrease in performance.
        """
        data_x, data_y = data_xy
        float_x = theano.config.floatX
        shared_x = theano.shared(numpy.asarray(data_x, dtype=float_x),
                                 borrow=borrow)
        shared_y = theano.shared(numpy.asarray(data_y, dtype=float_x),
                                 borrow=borrow)
        # When storing data on the GPU it has to be stored as floats,
        # therefore the labels are stored as ``floatX`` as well.  During
        # computation they are needed as ints (labels are used as indices),
        # so the shared labels are cast to int before being returned.
        return shared_x, T.cast(shared_y, 'int32')

    test_set_x, test_set_y = shared_dataset(test_set)
    valid_set_x, valid_set_y = shared_dataset(valid_set)
    train_set_x, train_set_y = shared_dataset(train_set)

    return [
        (train_set_x, train_set_y),
        (valid_set_x, valid_set_y),
        (test_set_x, test_set_y),
    ]
|
||||
|
||||
|
||||
class LogisticRegression(object):
    """Multi-class Logistic Regression Class

    The logistic regression is fully described by a weight matrix :math:`W`
    and bias vector :math:`b`. Classification is done by projecting data
    points onto a set of hyperplanes, the distance to which is used to
    determine a class membership probability.
    """

    def __init__(self, input, n_in, n_out):
        """Initialize the parameters of the logistic regression.

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the labels lie
        """
        float_x = theano.config.floatX

        # Weights W: an (n_in, n_out) matrix initialised with zeros.
        self.W = theano.shared(
            value=numpy.zeros((n_in, n_out), dtype=float_x),
            name='W',
            borrow=True
        )
        # Biases b: a vector of n_out zeros.
        self.b = theano.shared(
            value=numpy.zeros((n_out,), dtype=float_x),
            name='b',
            borrow=True
        )

        # Symbolic expression for the matrix of class-membership
        # probabilities, where:
        #   W is a matrix whose column k is the separating hyperplane for
        #     class k,
        #   x is a matrix whose row j is input training sample j,
        #   b is a vector whose element k is the free parameter of
        #     hyperplane k.
        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)

        # Symbolic prediction: the class whose probability is maximal.
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)

        # Parameters of the model.
        self.params = [self.W, self.b]

    def neg_ll(self, y):
        r"""Return the mean of the negative log-likelihood of the prediction
        of this model under a given target distribution.

        .. math::

            \ell (\theta=\{W,b\}, \mathcal{D}) =
                -\frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|-1}
                \log(P(Y=y^{(i)}|x^{(i)}, W,b))

        :type y: theano.tensor.TensorType
        :param y: corresponds to a vector that gives for each example the
                  correct label

        Note: we use the mean instead of the sum so that
              the learning rate is less dependent on the batch size
        """
        # y.shape[0] is (symbolically) the number of examples n in the
        # minibatch, so T.arange(y.shape[0]) is the vector [0, 1, ..., n-1].
        # T.log(self.p_y_given_x) is the matrix LP of log-probabilities with
        # one row per example and one column per class, hence
        # LP[T.arange(n), y] is [LP[0, y[0]], ..., LP[n-1, y[n-1]]] and its
        # mean is the mean log-likelihood across the minibatch.
        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])

    def errors(self, y):
        """Return a float representing the number of errors in the minibatch
        over the total number of examples of the minibatch; zero-one
        loss over the size of the minibatch.

        :type y: theano.tensor.TensorType
        :param y: corresponds to a vector that gives for each example the
                  correct label

        :raises TypeError: if ``y`` does not have the same number of
            dimensions as ``self.y_pred``
        :raises NotImplementedError: if the dtype of ``y`` is not an
            integer type
        """
        # y must have the same number of dimensions as y_pred.
        if y.ndim != self.y_pred.ndim:
            raise TypeError(
                'y should have the same shape as self.y_pred',
                ('y', y.type, 'y_pred', self.y_pred.type)
            )
        # Only integer labels are supported.
        if not y.dtype.startswith('int'):
            raise NotImplementedError()
        # T.neq returns a vector of 0s and 1s, where 1 represents a mistake
        # in prediction.
        return T.mean(T.neq(self.y_pred, y))
|
||||
|
||||
|
||||
# start-snippet-1
|
||||
class HiddenLayer(object):
    def __init__(self, rng, input, n_in, n_out, W=None, b=None,
                 activation=T.tanh):
        """
        Typical hidden layer of a MLP: units are fully-connected and have
        a sigmoidal activation function. Weight matrix W is of shape
        (n_in, n_out) and the bias vector b is of shape (n_out,).

        NOTE : The default nonlinearity is tanh, in which case the hidden
        unit activation is given by: tanh(dot(input, W) + b)

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.dmatrix
        :param input: a symbolic tensor of shape (n_examples, n_in)

        :type n_in: int
        :param n_in: dimensionality of input

        :type n_out: int
        :param n_out: number of hidden units

        :param W: existing shared weights to use; when None, fresh weights
                  are sampled with ``rng``

        :param b: existing shared biases to use; when None, zero biases
                  are created

        :type activation: theano.Op or function
        :param activation: Non linearity to be applied in the hidden
                           layer; None leaves the layer linear
        """
        self.input = input
        # end-snippet-1

        float_x = theano.config.floatX

        if W is None:
            # Sample W uniformly from
            # [-sqrt(6. / (n_in + n_out)), sqrt(6. / (n_in + n_out))], the
            # range used for the tanh activation function.  The samples are
            # converted to theano.config.floatX so that the code is
            # runnable on GPU.
            # Note : optimal initialization of weights is dependent on the
            #        activation function used (among other things).
            #        For example, results presented in [Xavier10] suggest
            #        that you should use 4 times larger initial weights for
            #        sigmoid compared to tanh.  We have no info for other
            #        functions, so we use the same as tanh.
            bound = numpy.sqrt(6. / (n_in + n_out))
            initial_W = numpy.asarray(
                rng.uniform(low=-bound, high=bound, size=(n_in, n_out)),
                dtype=float_x
            )
            if activation == theano.tensor.nnet.sigmoid:
                initial_W *= 4

            W = theano.shared(value=initial_W, name='W', borrow=True)

        if b is None:
            initial_b = numpy.zeros((n_out,), dtype=float_x)
            b = theano.shared(value=initial_b, name='b', borrow=True)

        self.W = W
        self.b = b

        pre_activation = T.dot(input, self.W) + self.b
        if activation is None:
            self.output = pre_activation
        else:
            self.output = activation(pre_activation)

        # parameters of the model
        self.params = [self.W, self.b]
|
||||
|
||||
|
||||
# start-snippet-2
|
||||
class MLP(object):
    """Multi-Layer Perceptron Class

    A multilayer perceptron is a feedforward artificial neural network model
    that has one layer or more of hidden units and nonlinear activations.
    Intermediate layers usually have as activation function tanh or the
    sigmoid function (defined here by a ``HiddenLayer`` class) while the
    top layer is a softmax layer (defined here by a ``LogisticRegression``
    class).
    """

    def __init__(self, rng, input, n_in, n_hidden, n_out):
        """Initialize the parameters for the multilayer perceptron

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :type n_hidden: int
        :param n_hidden: number of hidden units

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the labels lie
        """
        # A one-hidden-layer MLP is a HiddenLayer with a tanh activation
        # feeding a LogisticRegression layer; the activation function could
        # be replaced by sigmoid or any other nonlinear function.
        hidden = HiddenLayer(
            rng=rng,
            input=input,
            n_in=n_in,
            n_out=n_hidden,
            activation=T.tanh
        )
        # The logistic regression layer takes the hidden units as input.
        maxent = LogisticRegression(
            input=hidden.output,
            n_in=n_hidden,
            n_out=n_out
        )
        self.hidden = hidden
        self.maxent = maxent

        # L1 norm of the weights; one regularization option is to keep it
        # small.
        self.L1 = abs(hidden.W).sum() + abs(maxent.W).sum()

        # Square of the L2 norm of the weights; the other regularization
        # option is to keep this small.
        self.L2_sqr = (hidden.W ** 2).sum() + (maxent.W ** 2).sum()

        # The negative log likelihood and the error rate of the MLP are
        # those computed by the logistic regression (output) layer.
        self.neg_ll = maxent.neg_ll
        self.errors = maxent.errors

        # The parameters of the model are the parameters of its two layers.
        self.params = hidden.params + maxent.params
|
||||
|
||||
|
||||
|
||||
|
||||
def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
             dataset='mnist.pkl.gz', batch_size=1, n_hidden=500):
    """
    Demonstrate stochastic gradient descent optimization for a multilayer
    perceptron.

    This is demonstrated on MNIST.  After every epoch the mean zero-one
    loss over the validation minibatches is printed; the test split is
    loaded but not evaluated.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see
                   regularization)

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see
                   regularization)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                    http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    :type batch_size: int
    :param batch_size: number of examples per minibatch.  Examples beyond
                       the last full minibatch of a split are skipped.

    :type n_hidden: int
    :param n_hidden: number of hidden units

    :raises ValueError: if ``batch_size`` is smaller than 1
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    # datasets[2] holds the test split; it is not used by this routine.

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels

    rng = numpy.random.RandomState(1234)

    # construct the MLP class
    mlp = MLP(
        rng=rng,
        input=x,
        n_in=28 * 28,
        n_hidden=n_hidden,
        n_out=10
    )

    # Symbolic bounds of minibatch number ``index`` within a split.
    batch_start = index * batch_size
    batch_stop = (index + 1) * batch_size

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a validation minibatch
    validate_model = theano.function(
        inputs=[index],
        outputs=mlp.maxent.errors(y),
        givens={
            x: valid_set_x[batch_start:batch_stop],
            y: valid_set_y[batch_start:batch_stop]
        }
    )

    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); cost is expressed
    # here symbolically
    cost = mlp.neg_ll(y) + L1_reg * mlp.L1 + L2_reg * mlp.L2_sqr

    # gradient of the cost with respect to every parameter of the model
    gparams = [T.grad(cost, param) for param in mlp.params]

    # how to update the parameters of the model, as a list of
    # (variable, update expression) pairs
    updates = [(param, param - (learning_rate * gparam))
               for param, gparam in zip(mlp.params, gparams)]

    # compiling a Theano function `train_model` that returns the cost, but
    # at the same time updates the parameters of the model based on the
    # rules defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[batch_start:batch_stop],
            y: train_set_y[batch_start:batch_stop]
        }
    )

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    start_time = time.clock()

    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size

    for epoch in range(1, n_epochs + 1):
        for batch_idx in xrange(n_train_batches):
            train_model(batch_idx)
        # compute zero-one loss on validation set
        error = numpy.mean([validate_model(batch_idx)
                            for batch_idx in xrange(n_valid_batches)])
        print('epoch %i, validation error %f %%' % (epoch, error * 100))

    end_time = time.clock()
    sys.stderr.write('The code for file ' +
                     os.path.split(__file__)[1] +
                     ' ran for %.2fm' % ((end_time - start_time) / 60.) +
                     '\n')
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Executed as a script: run the demo with its default hyper-parameters.
    test_mlp()
|
Loading…
Reference in New Issue
Block a user