mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
* Merge changes, and adjust Example to use memoryview
This commit is contained in:
parent
9282a8e72c
commit
897dd0dd0b
255
bin/parser/nn_train.py
Executable file
255
bin/parser/nn_train.py
Executable file
|
@ -0,0 +1,255 @@
|
|||
#!/usr/bin/env python
|
||||
from __future__ import division
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import os
|
||||
from os import path
|
||||
import shutil
|
||||
import codecs
|
||||
import random
|
||||
|
||||
import plac
|
||||
import cProfile
|
||||
import pstats
|
||||
import re
|
||||
|
||||
import spacy.util
|
||||
from spacy.en import English
|
||||
from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir
|
||||
|
||||
from spacy.syntax.util import Config
|
||||
from spacy.gold import read_json_file
|
||||
from spacy.gold import GoldParse
|
||||
|
||||
from spacy.scorer import Scorer
|
||||
|
||||
from thinc.theano_nn import compile_theano_model
|
||||
|
||||
from spacy.syntax.parser import Parser
|
||||
from spacy._theano import TheanoModel
|
||||
|
||||
|
||||
def _corrupt(c, noise_level):
|
||||
if random.random() >= noise_level:
|
||||
return c
|
||||
elif c == ' ':
|
||||
return '\n'
|
||||
elif c == '\n':
|
||||
return ' '
|
||||
elif c in ['.', "'", "!", "?"]:
|
||||
return ''
|
||||
else:
|
||||
return c.lower()
|
||||
|
||||
|
||||
def add_noise(orig, noise_level):
    """Randomly corrupt a training input.

    With probability ``1 - noise_level`` the input is returned untouched.
    Otherwise every element is passed through ``_corrupt``:

    * a list of words yields a new list, with items that were corrupted
      to the empty string removed;
    * any other input is treated as a string and rebuilt character by
      character.

    ``isinstance`` is used rather than an exact type comparison so that
    list subclasses are handled as word lists instead of being joined
    into a single string.
    """
    if random.random() >= noise_level:
        return orig
    if isinstance(orig, list):
        corrupted = [_corrupt(word, noise_level) for word in orig]
        # Dropped punctuation comes back as '' and must not become a token.
        return [word for word in corrupted if word]
    return ''.join(_corrupt(char, noise_level) for char in orig)
|
||||
|
||||
|
||||
def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
    """Run the full pipeline on one example and add the result to ``scorer``.

    When ``raw_text`` is None the gold word list (``annot_tuples[1]``) is
    used as the tokenization; otherwise the raw text is tokenized.  The
    tagger, entity recognizer and parser are then applied in that order
    and the tokens are scored against a ``GoldParse`` built from
    ``annot_tuples``.
    """
    if raw_text is not None:
        tokens = nlp.tokenizer(raw_text)
    else:
        gold_words = annot_tuples[1]
        tokens = nlp.tokenizer.tokens_from_list(gold_words)
    nlp.tagger(tokens)
    nlp.entity(tokens)
    nlp.parser(tokens)
    scorer.score(tokens, GoldParse(tokens, annot_tuples), verbose=verbose)
|
||||
|
||||
|
||||
def _merge_sents(sents):
|
||||
m_deps = [[], [], [], [], [], []]
|
||||
m_brackets = []
|
||||
i = 0
|
||||
for (ids, words, tags, heads, labels, ner), brackets in sents:
|
||||
m_deps[0].extend(id_ + i for id_ in ids)
|
||||
m_deps[1].extend(words)
|
||||
m_deps[2].extend(tags)
|
||||
m_deps[3].extend(head + i for head in heads)
|
||||
m_deps[4].extend(labels)
|
||||
m_deps[5].extend(ner)
|
||||
m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets)
|
||||
i += len(ids)
|
||||
return [(m_deps, m_brackets)]
|
||||
|
||||
|
||||
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          seed=0, gold_preproc=False, n_sents=0, corruption_level=0,
          verbose=False,
          eta=0.01, mu=0.9, n_hidden=100, word_vec_len=10, pos_vec_len=10):
    """Train the tagger, entity recognizer and a Theano-backed parser.

    Fresh ``deps``, ``pos`` and ``ner`` directories are created under
    ``model_dir`` (existing ones are deleted), their configs are written,
    and a ``Language`` pipeline is loaded from ``model_dir``.  Its parser
    is then replaced by a ``Parser`` whose model is built by the nested
    ``make_model`` factory.

    For each of ``n_iter`` epochs every example is first scored with the
    current models, then (optionally corrupted by ``add_noise``) used to
    update the parser, entity recognizer and tagger.  One progress row is
    printed per epoch.

    :param Language: pipeline class; must provide ``ParserTransitionSystem``
        and ``EntityTransitionSystem``.
    :param gold_tuples: list of ``(raw_text, sents)`` pairs.  It is shuffled
        in place after every epoch (when ``n_sents`` is 0 this is the
        caller's own list).
    :param gold_preproc: if true, train on gold sentence splits and gold
        tokenization; otherwise sentences are merged and raw text is used.
    :param n_sents: if positive, only the first ``n_sents`` entries are used.
    :param corruption_level: probability passed to ``add_noise``.
    :returns: the trained pipeline.
    :raises Exception: if a gold parse is still non-projective after
        ``make_projective=True``.

    NOTE(review): ``eta``, ``mu``, ``word_vec_len`` and ``pos_vec_len`` are
    accepted but not used anywhere in this function -- confirm whether they
    should be forwarded to the model factory.
    """
    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    ner_model_dir = path.join(model_dir, 'ner')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    if path.exists(ner_model_dir):
        shutil.rmtree(ner_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)
    os.mkdir(ner_model_dir)
    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)

    # Labels are collected from the full data set, before any truncation.
    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=Language.ParserTransitionSystem.get_labels(gold_tuples))
    Config.write(ner_model_dir, 'config', features='ner', seed=seed,
                 labels=Language.EntityTransitionSystem.get_labels(gold_tuples),
                 beam_width=0)

    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]

    nlp = Language(data_dir=model_dir)

    def make_model(n_classes, input_spec, model_dir):
        """Compile the Theano functions and wrap them in a ``TheanoModel``."""
        print(input_spec)
        # Total input width: one slot of ``n_cols`` values per listed field.
        n_in = sum(n_cols * len(fields) for (n_cols, fields) in input_spec)
        print('Compiling')
        debug, train_func, predict_func = compile_theano_model(n_classes, n_hidden,
                                                               n_in, 0.0, 0.0)
        print('Done')
        return TheanoModel(
            n_classes,
            input_spec,
            train_func,
            predict_func,
            model_loc=model_dir,
            debug=debug)

    nlp._parser = Parser(nlp.vocab.strings, dep_model_dir, nlp.ParserTransitionSystem,
                         make_model)

    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for raw_text, sents in gold_tuples:
            if gold_preproc:
                raw_text = None
            else:
                sents = _merge_sents(sents)
            for annot_tuples, ctnt in sents:
                # Single-token examples are skipped entirely.
                if len(annot_tuples[1]) == 1:
                    continue
                # Score with the current weights before updating on the example;
                # verbose error reports only start from the third epoch.
                score_model(scorer, nlp, raw_text, annot_tuples,
                            verbose=verbose if itn >= 2 else False)
                if raw_text is None:
                    words = add_noise(annot_tuples[1], corruption_level)
                    tokens = nlp.tokenizer.tokens_from_list(words)
                else:
                    raw_text = add_noise(raw_text, corruption_level)
                    tokens = nlp.tokenizer(raw_text)
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples, make_projective=True)
                if not gold.is_projective:
                    # Wrap in a 1-tuple: a bare tuple on the right of ``%`` would
                    # be unpacked as several arguments and raise TypeError.
                    raise Exception(
                        "Non-projective sentence in training, after we should "
                        "have enforced projectivity: %s" % (annot_tuples,)
                    )
                loss += nlp.parser.train(tokens, gold)
                nlp.entity.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
                                                   scorer.tags_acc,
                                                   scorer.token_acc))
    nlp.parser.model.end_training()
    nlp.entity.model.end_training()
    nlp.tagger.model.end_training()
    nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))
    return nlp
|
||||
|
||||
|
||||
def evaluate(nlp, gold_tuples, gold_preproc=True):
    """Score ``nlp`` against ``gold_tuples`` and return the ``Scorer``.

    With ``gold_preproc`` the gold sentence splits and gold tokens are
    used, and the tagger, entity recognizer and parser are applied one by
    one.  Otherwise the sentences of each entry are merged and the whole
    pipeline is run on the raw text (``merge_mwes=False``).
    """
    scorer = Scorer()
    for raw_text, sents in gold_tuples:
        text = None if gold_preproc else raw_text
        examples = sents if gold_preproc else _merge_sents(sents)
        for annot_tuples, _ in examples:
            if text is not None:
                tokens = nlp(text, merge_mwes=False)
            else:
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                nlp.entity(tokens)
                nlp.parser(tokens)
            scorer.score(tokens, GoldParse(tokens, annot_tuples))
    return scorer
|
||||
|
||||
|
||||
def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None):
    """Parse the development data, write the parses to ``out_loc`` and score them.

    One tab-separated line is written per token: surface form, tag, surface
    form of the head, and dependency label.

    :param Language: pipeline class, instantiated with ``data_dir=model_dir``.
    :param dev_loc: location passed to ``read_json_file``.
    :param out_loc: path of the UTF-8 output file (overwritten).
    :param beam_width: if not None, overrides the parser's configured beam width.
    :returns: a ``Scorer`` holding the accumulated scores.
    """
    nlp = Language(data_dir=model_dir)
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    gold_tuples = read_json_file(dev_loc)
    scorer = Scorer()
    # The context manager closes (and flushes) the file even if parsing or
    # scoring raises; the handle was previously never closed.
    with codecs.open(out_loc, 'w', 'utf8') as out_file:
        for raw_text, sents in gold_tuples:
            sents = _merge_sents(sents)
            for annot_tuples, brackets in sents:
                if raw_text is None:
                    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                    nlp.tagger(tokens)
                    nlp.entity(tokens)
                    nlp.parser(tokens)
                else:
                    tokens = nlp(raw_text, merge_mwes=False)
                gold = GoldParse(tokens, annot_tuples)
                scorer.score(tokens, gold, verbose=False)
                for t in tokens:
                    out_file.write(
                        '%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_)
                    )
    return scorer
|
||||
|
||||
|
||||
@plac.annotations(
    train_loc=("Location of training file or directory"),
    dev_loc=("Location of development file or directory"),
    model_dir=("Location of output model directory",),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    corruption_level=("Amount of noise to add to training data", "option", "c", float),
    gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool),
    out_loc=("Out location", "option", "o", str),
    n_sents=("Number of training sentences", "option", "n", int),
    n_iter=("Number of training iterations", "option", "i", int),
    verbose=("Verbose error reporting", "flag", "v", bool),
    debug=("Debug mode", "flag", "d", bool),
)
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
         debug=False, corruption_level=0.0, gold_preproc=False, beam_width=1,
         eval_only=False):
    """Command-line entry point: train on ``train_loc``, evaluate on ``dev_loc``.

    The model is always trained with the ``'embed'`` feature set and the
    pipeline returned by ``train`` is evaluated directly.  ``eval_only``,
    ``out_loc``, ``beam_width`` and ``debug`` are accepted but not used
    here; writing parses to ``out_loc`` is currently disabled.
    """
    training_data = list(read_json_file(train_loc))
    nlp = train(English, training_data, model_dir,
                feat_set='embed',
                gold_preproc=gold_preproc, n_sents=n_sents,
                corruption_level=corruption_level, n_iter=n_iter,
                verbose=verbose)
    #if out_loc:
    #    write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width)
    dev_data = list(read_json_file(dev_loc))
    scorer = evaluate(nlp, dev_data, gold_preproc=gold_preproc)

    report = (
        ('TOK', 100 - scorer.token_acc),
        ('POS', scorer.tags_acc),
        ('UAS', scorer.uas),
        ('LAS', scorer.las),
        ('NER P', scorer.ents_p),
        ('NER R', scorer.ents_r),
        ('NER F', scorer.ents_f),
    )
    for label, value in report:
        print('%s %s' % (label, value))
|
||||
|
||||
|
||||
# Script entry point: plac builds the command-line parser from main's
# signature and annotations, then calls it.
if __name__ == '__main__':
    plac.call(main)
|
490
spacy/_bu_nn.pyx
Normal file
490
spacy/_bu_nn.pyx
Normal file
|
@ -0,0 +1,490 @@
|
|||
"""Feed-forward neural network, using Thenao."""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
import numpy
|
||||
|
||||
import theano
|
||||
import theano.tensor as T
|
||||
import gzip
|
||||
import cPickle
|
||||
|
||||
|
||||
def load_data(dataset):
    """Load a pickled, gzipped dataset into Theano shared variables.

    :type dataset: string
    :param dataset: path to the dataset (here MNIST).  A bare file name
        that does not exist in the working directory is looked up in
        ``../data`` relative to this file; ``mnist.pkl.gz`` is downloaded
        if it cannot be found.

    :returns: ``[(train_x, train_y), (valid_x, valid_y), (test_x, test_y)]``
        where each ``x`` is a shared variable and each ``y`` is a shared
        variable cast to ``int32``.
    """
    # Download the MNIST dataset if it is not present
    data_dir, data_file = os.path.split(dataset)
    if data_dir == "" and not os.path.isfile(dataset):
        # Check if dataset is in the data directory.
        new_path = os.path.join(
            os.path.split(__file__)[0],
            "..",
            "data",
            dataset
        )
        if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz':
            dataset = new_path

    if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz':
        import urllib
        origin = (
            'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'
        )
        print('Downloading data from %s' % origin)
        urllib.urlretrieve(origin, dataset)

    print('... loading data')

    # NOTE(review): unpickling executes arbitrary code from the file, and the
    # file may have just been fetched over plain HTTP -- only use with a
    # trusted source.
    f = gzip.open(dataset, 'rb')
    try:
        train_set, valid_set, test_set = cPickle.load(f)
    finally:
        # Close the handle even when the file is corrupt or truncated.
        f.close()
    # train_set, valid_set, test_set format: tuple(input, target).
    # input is a 2-dimensional numpy.ndarray (a matrix) with one row per
    # example; target is a 1-dimensional numpy.ndarray with as many entries
    # as input has rows, giving the label of the example at the same index.

    def shared_dataset(data_xy, borrow=True):
        """Load one ``(inputs, targets)`` pair into shared variables.

        Shared variables let Theano copy the whole dataset to GPU memory
        once, instead of copying a minibatch every time one is needed,
        which would be much slower.
        """
        data_x, data_y = data_xy
        shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX),
                                 borrow=borrow)
        shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX),
                                 borrow=borrow)
        # Data stored on the GPU has to be float, so the labels are stored
        # as ``floatX`` too.  They are used as indices during computation,
        # hence the cast back to int on the way out.
        return shared_x, T.cast(shared_y, 'int32')

    test_set_x, test_set_y = shared_dataset(test_set)
    valid_set_x, valid_set_y = shared_dataset(valid_set)
    train_set_x, train_set_y = shared_dataset(train_set)

    rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y),
            (test_set_x, test_set_y)]
    return rval
|
||||
|
||||
|
||||
class LogisticRegression(object):
    """Multi-class logistic regression (softmax) layer.

    The layer is fully described by a weight matrix ``W`` and a bias
    vector ``b``.  Inputs are projected onto one hyperplane per class and
    the result is normalized into class-membership probabilities.
    """

    def __init__(self, input, n_in, n_out):
        """Create zero-initialized parameters and the symbolic outputs.

        :type input: theano.tensor.TensorType
        :param input: symbolic variable describing the input of the
            architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units

        :type n_out: int
        :param n_out: number of output units (classes)
        """
        float_type = theano.config.floatX
        # W has shape (n_in, n_out): column k is the separating hyperplane
        # for class k.  b has one entry per class.
        zero_weights = numpy.zeros((n_in, n_out), dtype=float_type)
        zero_bias = numpy.zeros((n_out,), dtype=float_type)
        self.W = theano.shared(value=zero_weights, name='W', borrow=True)
        self.b = theano.shared(value=zero_bias, name='b', borrow=True)

        # Matrix of class-membership probabilities, one row per input row.
        class_scores = T.dot(input, self.W) + self.b
        self.p_y_given_x = T.nnet.softmax(class_scores)

        # Predicted class: the one with maximal probability.
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)

        # parameters of the model
        self.params = [self.W, self.b]

    def neg_ll(self, y):
        r"""Return the mean negative log-likelihood of the targets ``y``.

        .. math::

            \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
            \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|}
                \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
            \ell (\theta=\{W,b\}, \mathcal{D})

        :type y: theano.tensor.TensorType
        :param y: vector giving the correct label for each example

        Note: the mean is used instead of the sum so that the learning
        rate is less dependent on the batch size.
        """
        # One row of log-probabilities per example, one column per class.
        log_probs = T.log(self.p_y_given_x)
        # Picks log_probs[i, y[i]] for every example i of the minibatch.
        example_rows = T.arange(y.shape[0])
        return -T.mean(log_probs[example_rows, y])

    def errors(self, y):
        """Return the zero-one loss over the minibatch as a symbolic float.

        :type y: theano.tensor.TensorType
        :param y: vector giving the correct label for each example

        :raises TypeError: if ``y`` does not have as many dimensions as
            ``y_pred``
        :raises NotImplementedError: if ``y`` is not of an integer dtype
        """
        if y.ndim != self.y_pred.ndim:
            raise TypeError(
                'y should have the same shape as self.y_pred',
                ('y', y.type, 'y_pred', self.y_pred.type)
            )
        if not y.dtype.startswith('int'):
            raise NotImplementedError()
        # T.neq yields a vector of 0s and 1s, where 1 marks a mistake.
        return T.mean(T.neq(self.y_pred, y))
|
||||
|
||||
|
||||
# start-snippet-1
|
||||
class HiddenLayer(object):
    def __init__(self, rng, input, n_in, n_out, W=None, b=None,
                 activation=T.tanh):
        """Fully-connected hidden layer of an MLP.

        The weight matrix W has shape (n_in, n_out) and the bias vector b
        has shape (n_out,).  The output is ``activation(dot(input, W) + b)``,
        or the linear term itself when ``activation`` is None.

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.dmatrix
        :param input: a symbolic tensor of shape (n_examples, n_in)

        :type n_in: int
        :param n_in: dimensionality of input

        :type n_out: int
        :param n_out: number of hidden units

        :param W: optional existing shared weight matrix to use
        :param b: optional existing shared bias vector to use

        :type activation: theano.Op or function
        :param activation: non-linearity applied in the hidden layer
            (tanh by default)
        """
        self.input = input

        float_type = theano.config.floatX
        if W is None:
            # Uniform in [-sqrt(6/(n_in+n_out)), +sqrt(6/(n_in+n_out))], the
            # range used for tanh.  Converted to floatX so the code can run
            # on GPU.
            bound = numpy.sqrt(6. / (n_in + n_out))
            W_values = numpy.asarray(
                rng.uniform(low=-bound, high=bound, size=(n_in, n_out)),
                dtype=float_type
            )
            # [Xavier10] suggests 4 times larger initial weights for
            # sigmoid than for tanh; other activations reuse the tanh range.
            if activation == theano.tensor.nnet.sigmoid:
                W_values *= 4
            W = theano.shared(value=W_values, name='W', borrow=True)

        if b is None:
            b = theano.shared(value=numpy.zeros((n_out,), dtype=float_type),
                              name='b', borrow=True)

        self.W = W
        self.b = b

        lin_output = T.dot(input, self.W) + self.b
        if activation is None:
            self.output = lin_output
        else:
            self.output = activation(lin_output)
        # parameters of the model
        self.params = [self.W, self.b]
|
||||
|
||||
|
||||
# start-snippet-2
|
||||
class MLP(object):
    """Multi-layer perceptron with a single hidden layer.

    A tanh ``HiddenLayer`` feeds a softmax ``LogisticRegression`` layer.
    """

    def __init__(self, rng, input, n_in, n_hidden, n_out):
        """Build both layers and the regularization terms.

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.TensorType
        :param input: symbolic variable describing the input of the
            architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units

        :type n_hidden: int
        :param n_hidden: number of hidden units

        :type n_out: int
        :param n_out: number of output units (classes)
        """
        hidden = HiddenLayer(
            rng=rng,
            input=input,
            n_in=n_in,
            n_out=n_hidden,
            activation=T.tanh
        )
        self.hidden = hidden

        # The softmax layer takes the hidden units as its input.
        maxent = LogisticRegression(
            input=hidden.output,
            n_in=n_hidden,
            n_out=n_out
        )
        self.maxent = maxent

        # L1 norm of the weights; one regularization option is to keep it small.
        self.L1 = abs(hidden.W).sum() + abs(maxent.W).sum()

        # Squared L2 norm of the weights; the other regularization option.
        self.L2_sqr = (hidden.W ** 2).sum() + (maxent.W ** 2).sum()

        # Loss and error count are those of the output layer.
        self.neg_ll = maxent.neg_ll
        self.errors = maxent.errors

        # Parameters of the model: those of both layers, hidden layer first.
        self.params = hidden.params + maxent.params
|
||||
|
||||
|
||||
|
||||
|
||||
def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
             dataset='mnist.pkl.gz', batch_size=1, n_hidden=500):
    """Train the MLP on MNIST with one-example stochastic gradient descent.

    :type learning_rate: float
    :param learning_rate: factor applied to the stochastic gradient

    :type L1_reg: float
    :param L1_reg: weight of the L1 norm in the cost

    :type L2_reg: float
    :param L2_reg: weight of the squared L2 norm in the cost

    :type n_epochs: int
    :param n_epochs: number of epochs to run

    :type dataset: string
    :param dataset: path of the MNIST dataset file from
        http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    ``batch_size`` is accepted but not used: every compiled function below
    works on exactly one example at a time.
    """
    train_set, valid_set, test_set = load_data(dataset)
    train_set_x, train_set_y = train_set
    valid_set_x, valid_set_y = valid_set
    test_set_x, test_set_y = test_set

    print('... building the model')

    # Symbolic variables for the data.
    index = T.lscalar()  # index of the example to use
    x = T.matrix('x')    # rasterized images
    y = T.ivector('y')   # 1D vector of [int] labels

    rng = numpy.random.RandomState(1234)

    mlp = MLP(
        rng=rng,
        input=x,
        n_in=28 * 28,
        n_hidden=n_hidden,
        n_out=10
    )

    def single_example(data_x, data_y):
        # Substitute a one-row slice of the shared data for x and y.
        return {
            x: data_x[index:index+1],
            y: data_y[index:index+1]
        }

    # Compiled functions computing the mistakes made on one example.
    test_model = theano.function(
        inputs=[index],
        outputs=mlp.maxent.errors(y),
        givens=single_example(test_set_x, test_set_y)
    )

    validate_model = theano.function(
        inputs=[index],
        outputs=mlp.maxent.errors(y),
        givens=single_example(valid_set_x, valid_set_y)
    )

    # Cost: negative log likelihood plus both regularization terms.
    cost = mlp.neg_ll(y) + L1_reg * mlp.L1 + L2_reg * mlp.L2_sqr
    gparams = [T.grad(cost, param) for param in mlp.params]

    # (variable, update expression) pairs for plain gradient descent.
    updates = [(param, param - (learning_rate * gparam))
               for param, gparam in zip(mlp.params, gparams)]

    # Returns the cost and, as a side effect, applies ``updates``.
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens=single_example(train_set_x, train_set_y)
    )

    print('... training')

    start_time = time.clock()

    n_examples = train_set_x.get_value(borrow=True).shape[0]
    n_dev_examples = valid_set_x.get_value(borrow=True).shape[0]
    n_test_examples = test_set_x.get_value(borrow=True).shape[0]

    for epoch in range(1, n_epochs+1):
        for idx in xrange(n_examples):
            train_model(idx)
        # Zero-one loss on the validation set.
        dev_errors = [validate_model(dev_idx) for dev_idx in xrange(n_dev_examples)]
        error = numpy.mean(dev_errors)
        print('epoch %i, validation error %f %%' % (epoch, error * 100))

    end_time = time.clock()
    elapsed_minutes = (end_time - start_time) / 60.
    sys.stderr.write('The code for file ' +
                     os.path.split(__file__)[1] +
                     ' ran for %.2fm' % elapsed_minutes + '\n')
|
||||
|
||||
|
||||
# Running the module as a script trains the demo MLP with default settings.
if __name__ == '__main__':
    test_mlp()
|
|
@ -61,18 +61,18 @@ cdef class Model:
|
|||
self._model.load(self.model_loc, freq_thresh=0)
|
||||
|
||||
def predict(self, Example eg):
    """Score ``eg`` and store the best valid class in ``eg.guess``.

    Pointers to the first element of the example's buffers are passed to
    the C-level helpers: ``set_scores`` receives the scores and atoms,
    then ``arg_max_if_true`` picks a class using ``eg.is_valid`` as the
    mask.  This is the memoryview-indexing form; the earlier
    ``.data`` pointer casts are gone.
    """
    self.set_scores(&eg.scores[0], &eg.atoms[0])
    eg.guess = arg_max_if_true(&eg.scores[0], &eg.is_valid[0],
                               self.n_classes)
|
||||
|
||||
def train(self, Example eg):
    """Score ``eg``, record guess/best/cost on it, and update the model.

    ``eg.guess`` is chosen by ``arg_max_if_true`` using ``eg.is_valid``,
    ``eg.best`` by ``arg_max_if_zero`` using ``eg.costs``, and ``eg.cost``
    is the cost of the guessed class.  All buffers are passed as pointers
    to their first element (memoryview indexing) instead of the earlier
    ``.data`` pointer casts.
    """
    self.set_scores(&eg.scores[0], &eg.atoms[0])
    eg.guess = arg_max_if_true(&eg.scores[0],
                               &eg.is_valid[0], self.n_classes)
    eg.best = arg_max_if_zero(&eg.scores[0], &eg.costs[0],
                              self.n_classes)
    eg.cost = eg.costs[eg.guess]
    self.update(&eg.atoms[0], eg.guess, eg.best, eg.cost)
|
||||
|
||||
cdef const weight_t* score(self, atom_t* context) except NULL:
|
||||
cdef int n_feats
|
||||
|
|
3
spacy/_nn.py
Normal file
3
spacy/_nn.py
Normal file
|
@ -0,0 +1,3 @@
|
|||
"""Feed-forward neural network, using Thenao."""
|
||||
|
||||
|
146
spacy/_nn.pyx
Normal file
146
spacy/_nn.pyx
Normal file
|
@ -0,0 +1,146 @@
|
|||
"""Feed-forward neural network, using Thenao."""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
import numpy
|
||||
|
||||
import theano
|
||||
import theano.tensor as T
|
||||
import plac
|
||||
|
||||
from spacy.gold import read_json_file
|
||||
from spacy.gold import GoldParse
|
||||
from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir
|
||||
|
||||
|
||||
def build_model(n_classes, n_vocab, n_hidden, n_word_embed, n_tag_embed,
                n_tags=None, learning_rate=0.01, L1_reg=0.0, L2_reg=0.0001):
    """Compile Theano train/evaluate functions for an embedding-based classifier.

    Word and tag indices are looked up in embedding tables, concatenated,
    passed through a tanh hidden layer and a softmax output layer.

    :param n_classes: number of output classes
    :param n_vocab: number of rows of the word embedding table
    :param n_hidden: number of hidden units
    :param n_word_embed: width of a word embedding
    :param n_tag_embed: width of a tag embedding
    :param n_tags: number of rows of the tag embedding table; defaults to
        ``n_classes``
    :param learning_rate: step size of the gradient updates
    :param L1_reg: weight of the L1 penalty
    :param L2_reg: weight of the L2 penalty
    :returns: ``(train_model, evaluate_model)``.  Both take
        ``(words, tags, y)``; ``train_model`` returns the predicted class
        and applies one gradient step, ``evaluate_model`` returns whether
        the prediction differs from ``y``.

    The earlier body referenced several names that were never defined
    (``n_words``, ``n_tags``, ``word_indices``, ``tag_indices``, ``y``,
    ``learning_rate``, ``L1_reg``, ``L2_reg``); they are now either
    parameters or symbolic inputs created here.
    """
    if n_tags is None:
        # NOTE(review): assumes the tag inventory matches the class
        # inventory -- pass n_tags explicitly if it does not.
        n_tags = n_classes

    # Integer vectors: they index rows of the embedding tables.
    words = T.lvector('words')
    tags = T.lvector('tags')
    y = T.lscalar('y')

    word_e = _init_embedding(n_vocab, n_word_embed)
    tag_e = _init_embedding(n_tags, n_tag_embed)
    maxent_W, maxent_b = _init_maxent_weights(n_hidden, n_classes)
    # NOTE(review): the hidden layer's input width assumes exactly one word
    # index and one tag index per example -- confirm against the caller.
    hidden_W, hidden_b = _init_hidden_weights(n_word_embed + n_tag_embed,
                                              n_hidden, T.tanh)
    # Only parameters that take part in the cost are listed; T.grad raises
    # for a parameter the cost does not depend on.
    params = [hidden_W, hidden_b, maxent_W, maxent_b, word_e, tag_e]

    x = T.concatenate([
        T.flatten(word_e[words], outdim=1),
        T.flatten(tag_e[tags], outdim=1)])

    hidden = feed_layer(T.tanh, hidden_W, hidden_b, x)
    # Softmax returns a matrix; row 0 is the distribution for this example.
    p_y_given_x = feed_layer(T.nnet.softmax, maxent_W, maxent_b, hidden)[0]

    guess = T.argmax(p_y_given_x)

    # The regularizers are applied two weight matrices at a time.
    cost = (
        -T.log(p_y_given_x[y])
        + L1(L1_reg, maxent_W, hidden_W) + L1(L1_reg, word_e, tag_e)
        + L2(L2_reg, maxent_W, hidden_W) + L2(L2_reg, word_e, tag_e)
    )

    train_model = theano.function(
        inputs=[words, tags, y],
        outputs=guess,
        updates=[update(learning_rate, param, cost) for param in params]
    )

    # Inputs must be the symbolic input variables, not the computed ``x``.
    evaluate_model = theano.function(
        inputs=[words, tags, y],
        outputs=T.neq(y, guess),
    )
    return train_model, evaluate_model
|
||||
|
||||
|
||||
def _init_embedding(vocab_size, n_dim):
    """Return a shared embedding table of shape ``(vocab_size + 1, n_dim)``.

    Values are drawn uniformly from [-0.2, 0.2).  The array is converted
    to ``floatX`` *before* it is wrapped: calling ``astype`` on the shared
    variable itself yields a symbolic cast expression, which cannot be the
    target of an update.
    """
    embedding = 0.2 * numpy.random.uniform(-1.0, 1.0, (vocab_size+1, n_dim))
    return theano.shared(embedding.astype(theano.config.floatX))
|
||||
|
||||
|
||||
def _init_maxent_weights(n_hidden, n_out):
    """Return zero-initialized shared ``(W, b)`` for the softmax layer.

    ``W`` has shape ``(n_hidden, n_out)`` and ``b`` has shape ``(n_out,)``.
    The output size now follows ``n_out`` instead of a hard-coded 10.
    """
    weights = numpy.zeros((n_hidden, n_out), dtype=theano.config.floatX)
    bias = numpy.zeros((n_out,), dtype=theano.config.floatX)
    return (
        theano.shared(name='W', borrow=True, value=weights),
        theano.shared(name='b', borrow=True, value=bias)
    )
|
||||
|
||||
|
||||
def _init_hidden_weights(n_in, n_out, activation=T.tanh):
    """Return shared ``(W, b)`` for a hidden layer of shape ``(n_in, n_out)``.

    ``W`` is drawn uniformly from
    [-sqrt(6 / (n_in + n_out)), sqrt(6 / (n_in + n_out))] using a generator
    seeded with 1234; ``b`` is all zeros.  For a sigmoid ``activation``
    the initial weights are scaled by 4, the same rule the ``HiddenLayer``
    class applies; previously ``activation`` was ignored.
    """
    rng = numpy.random.RandomState(1234)
    bound = numpy.sqrt(6. / (n_in + n_out))
    weights = numpy.asarray(
        rng.uniform(low=-bound, high=bound, size=(n_in, n_out)),
        dtype=theano.config.floatX
    )
    if activation == T.nnet.sigmoid:
        weights *= 4

    bias = numpy.zeros((n_out,), dtype=theano.config.floatX)
    return (
        theano.shared(value=weights, name='W', borrow=True),
        theano.shared(value=bias, name='b', borrow=True)
    )
|
||||
|
||||
|
||||
def feed_layer(activation, weights, bias, input):
    """Apply one dense layer: ``activation(dot(input, weights) + bias)``."""
    pre_activation = T.dot(input, weights) + bias
    return activation(pre_activation)
|
||||
|
||||
|
||||
def L1(L1_reg, w1, w2, *more):
    """Return ``L1_reg`` times the summed absolute values of all weights.

    ``w1`` and ``w2`` are required; any further weight arrays passed
    positionally are included in the penalty as well.
    """
    penalty = abs(w1).sum() + abs(w2).sum()
    for weights in more:
        penalty = penalty + abs(weights).sum()
    return L1_reg * penalty
|
||||
|
||||
|
||||
def L2(L2_reg, w1, w2, *more):
    """Return ``L2_reg`` times the summed squares of all weights.

    ``w1`` and ``w2`` are required; any further weight arrays passed
    positionally are included in the penalty as well.
    """
    penalty = (w1 ** 2).sum() + (w2 ** 2).sum()
    for weights in more:
        penalty = penalty + (weights ** 2).sum()
    return L2_reg * penalty
|
||||
|
||||
|
||||
def update(eta, param, cost):
    """Build one gradient-descent ``(param, new_value)`` update pair.

    The new value is ``param`` moved against the gradient of ``cost`` with
    respect to ``param``, scaled by the learning rate ``eta``.
    """
    gradient = T.grad(cost, param)
    step = eta * gradient
    return (param, param - step)
|
||||
|
||||
|
||||
def main(train_loc, eval_loc, model_dir, n_hidden=200, n_epochs=10):
    """Train the Theano tagger model on the gold data at ``train_loc``.

    Args:
        train_loc: Path of the JSON training file read by ``read_json_file``.
        eval_loc: Path of the evaluation data (not used yet; the validation
            code at the bottom of the loop is still commented out).
        model_dir: Model directory; its ``pos`` sub-directory is wiped and
            re-created.
        n_hidden: Width of the hidden layer.  The original code referenced
            this name without defining it; the default is an assumption.
        n_epochs: Number of passes over the training data.  Also previously
            undefined; the default is an assumption.
    """
    learning_rate = 0.01
    L1_reg = 0.00
    L2_reg = 0.0001
    # Values arriving from the command line via plac are strings.
    n_hidden = int(n_hidden)
    n_epochs = int(n_epochs)

    print("... reading the data")
    gold_train = list(read_json_file(train_loc))
    print('... building the model')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(pos_model_dir)

    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)

    train_model, evaluate_model = build_model(n_hidden, len(POS_TAGS), learning_rate,
                                              L1_reg, L2_reg)

    print('... training')
    for epoch in range(1, n_epochs+1):
        # Reset the running error count each epoch; it was previously used
        # with ``+=`` without ever being initialised.
        loss = 0
        # Iterate the data that was actually loaded above (``gold_train``);
        # the loop used to reference an undefined ``gold_tuples``.
        for raw_text, sents in gold_train:
            for (ids, words, tags, ner, heads, deps), _ in sents:
                # NOTE(review): ``nlp`` is not defined in this function;
                # presumably a module-level pipeline object is expected --
                # confirm before running.
                tokens = nlp.tokenizer.tokens_from_list(words)
                for t in tokens:
                    # NOTE(review): ``train_model`` is compiled with three
                    # inputs (words, tags, y) but is called with two here --
                    # confirm the intended arguments.
                    guess = train_model([t.orth], [t.tag])
                    loss += guess != t.tag
        print(loss)
        # compute zero-one loss on validation set
        #error = numpy.mean([evaluate_model(x, y) for x, y in dev_examples])
        #print('epoch %i, validation error %f %%' % (epoch, error * 100))
|
||||
|
||||
|
||||
# Script entry point: plac builds the command-line interface from main's
# signature and calls it with the parsed arguments.
if __name__ == '__main__':
    plac.call(main)
|
13
spacy/_theano.pxd
Normal file
13
spacy/_theano.pxd
Normal file
|
@ -0,0 +1,13 @@
|
|||
from ._ml cimport Model
|
||||
from thinc.nn cimport InputLayer
|
||||
|
||||
|
||||
# Attribute declarations for the Theano-backed model.
cdef class TheanoModel(Model):
    # Layer that fills an example's embeddings from its atoms and receives
    # the update computed by the training function.
    cdef InputLayer input_layer
    # Callables supplied to the constructor; invoked by train() and predict().
    cdef object train_func
    cdef object predict_func
    # Optional object supplied to the constructor and stored as-is.
    cdef object debug

    # Values forwarded to input_layer.update() during training; ``t`` is
    # incremented once per train() call.
    cdef public float eta
    cdef public float mu
    cdef public float t
|
|
@ -9,7 +9,8 @@ from os import path
|
|||
|
||||
|
||||
cdef class TheanoModel(Model):
|
||||
def __init__(self, n_classes, input_spec, train_func, predict_func, model_loc=None):
|
||||
def __init__(self, n_classes, input_spec, train_func, predict_func, model_loc=None,
|
||||
debug=None):
|
||||
if model_loc is not None and path.isdir(model_loc):
|
||||
model_loc = path.join(model_loc, 'model')
|
||||
|
||||
|
@ -20,6 +21,7 @@ cdef class TheanoModel(Model):
|
|||
self.input_layer = InputLayer(input_spec, initializer)
|
||||
self.train_func = train_func
|
||||
self.predict_func = predict_func
|
||||
self.debug = debug
|
||||
|
||||
self.n_classes = n_classes
|
||||
self.n_feats = len(self.input_layer)
|
||||
|
@ -27,18 +29,25 @@ cdef class TheanoModel(Model):
|
|||
|
||||
def predict(self, Example eg):
    """Score ``eg`` with the Theano predict function and set ``eg.guess``.

    Fills the example's embeddings from its atoms, copies the first row of
    the predict function's output into ``eg.scores`` and picks the
    best-scoring class among those flagged in ``eg.is_valid``.
    """
    cdef int i
    self.input_layer.fill(eg.embeddings, eg.atoms)
    theano_scores = self.predict_func(eg.embeddings)[0]
    for i in range(self.n_classes):
        eg.scores[i] = theano_scores[i]
    # Pass the *address* of the first element.  ``<int*>eg.is_valid[0]``
    # would cast the element's value to a pointer instead.
    eg.guess = arg_max_if_true(&eg.scores[0], <int*>&eg.is_valid[0],
                               self.n_classes)
|
||||
|
||||
def train(self, Example eg):
    """Run one training step on ``eg`` and record its guess, best and cost.

    Fills the example's embeddings, calls the Theano training function,
    forwards the returned update to the input layer, then copies the scores
    into ``eg`` and derives ``eg.guess``, ``eg.best`` and ``eg.cost``.
    Increments ``self.t`` once per call.
    """
    cdef int i
    self.input_layer.fill(eg.embeddings, eg.atoms)
    theano_scores, update, y = self.train_func(eg.embeddings, eg.costs, self.eta)
    self.input_layer.update(update, eg.atoms, self.t, self.eta, self.mu)
    for i in range(self.n_classes):
        eg.scores[i] = theano_scores[i]
    # Pass element *addresses*.  ``<int*>eg.is_valid[0]`` and
    # ``<int*>eg.costs[0]`` would cast element values to pointers.
    eg.guess = arg_max_if_true(&eg.scores[0], <int*>&eg.is_valid[0],
                               self.n_classes)
    eg.best = arg_max_if_zero(&eg.scores[0], <int*>&eg.costs[0],
                              self.n_classes)
    eg.cost = eg.costs[eg.guess]
    self.t += 1
|
||||
|
||||
def end_training(self):
|
||||
pass
|
||||
|
|
17
spacy/syntax/joint.pxd
Normal file
17
spacy/syntax/joint.pxd
Normal file
|
@ -0,0 +1,17 @@
|
|||
from cymem.cymem cimport Pool
|
||||
|
||||
from thinc.typedefs cimport weight_t
|
||||
|
||||
from .stateclass cimport StateClass
|
||||
|
||||
from .transition_system cimport TransitionSystem, Transition
|
||||
from ..gold cimport GoldParseC
|
||||
|
||||
|
||||
# Declaration of the transition system implemented in joint.pyx; it adds no
# C-level attributes of its own.
cdef class ArcEager(TransitionSystem):
    pass


# Oracle cost helpers implemented in joint.pyx, declared here so other
# modules can cimport them.
cdef int push_cost(StateClass stcls, const GoldParseC* gold, int target) nogil
cdef int arc_cost(StateClass stcls, const GoldParseC* gold, int head, int child) nogil
|
||||
|
452
spacy/syntax/joint.pyx
Normal file
452
spacy/syntax/joint.pyx
Normal file
|
@ -0,0 +1,452 @@
|
|||
# cython: profile=True
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import ctypes
|
||||
import os
|
||||
|
||||
from ..structs cimport TokenC
|
||||
|
||||
from .transition_system cimport do_func_t, get_cost_func_t
|
||||
from .transition_system cimport move_cost_func_t, label_cost_func_t
|
||||
from ..gold cimport GoldParse
|
||||
from ..gold cimport GoldParseC
|
||||
|
||||
from libc.stdint cimport uint32_t
|
||||
from libc.string cimport memcpy
|
||||
|
||||
from cymem.cymem cimport Pool
|
||||
from .stateclass cimport StateClass
|
||||
|
||||
|
||||
# Compile-time switches.
# NON_MONOTONIC is consulted by push_cost, USE_BREAK by Break.is_valid, and
# USE_ROOT_ARC_SEGMENT by arc_is_gold and ArcEager.finalize_state.
DEF NON_MONOTONIC = True
DEF USE_BREAK = True
DEF USE_ROOT_ARC_SEGMENT = True

# Starting value for the running best score in ArcEager.best_valid.
cdef weight_t MIN_SCORE = -90000

# Break transition from here
# http://www.aclweb.org/anthology/P13-1074
# Move identifiers; N_MOVES is the number of distinct moves and sizes the
# per-move arrays below.
cdef enum:
    SHIFT
    REDUCE
    LEFT
    RIGHT

    BREAK

    N_MOVES


# Short string name of each move, indexed by move id.  Used by
# ArcEager.lookup_transition and ArcEager.move_name.
MOVE_NAMES = [None] * N_MOVES
MOVE_NAMES[SHIFT] = 'S'
MOVE_NAMES[REDUCE] = 'D'
MOVE_NAMES[LEFT] = 'L'
MOVE_NAMES[RIGHT] = 'R'
MOVE_NAMES[BREAK] = 'B'
|
||||
|
||||
|
||||
# Helper functions for the arc-eager oracle
|
||||
|
||||
cdef int push_cost(StateClass stcls, const GoldParseC* gold, int target) nogil:
    """Count the gold arcs between `target` and the words on the stack.

    One is added for every stack word that is the gold head of `target`, and
    one for every stack word whose gold head is `target` (subject to the
    NON_MONOTONIC switch or the word not yet having a head).  One more is
    added when a Break move is currently valid and would itself cost zero.
    """
    cdef int total = 0
    cdef int depth, stack_word
    for depth in range(stcls.stack_depth()):
        stack_word = stcls.S(depth)
        total += gold.heads[target] == stack_word
        if gold.heads[stack_word] == target:
            if NON_MONOTONIC or not stcls.has_head(stack_word):
                total += 1
    if Break.is_valid(stcls, -1) and Break.move_cost(stcls, gold) == 0:
        total += 1
    return total
|
||||
|
||||
|
||||
cdef int pop_cost(StateClass stcls, const GoldParseC* gold, int target) nogil:
    """Count the gold arcs between `target` and the words in the buffer.

    The buffer scan stops after the first word that is its own gold head or
    whose gold head lies before `target`.  One more is added when a Break
    move is currently valid and would itself cost zero.
    """
    cdef int total = 0
    cdef int offset, buffer_word
    for offset in range(stcls.buffer_length()):
        buffer_word = stcls.B(offset)
        if gold.heads[buffer_word] == target:
            total += 1
        if gold.heads[target] == buffer_word:
            total += 1
        if gold.heads[buffer_word] == buffer_word:
            break
        if gold.heads[buffer_word] < target:
            break
    if Break.is_valid(stcls, -1) and Break.move_cost(stcls, gold) == 0:
        total += 1
    return total
|
||||
|
||||
|
||||
cdef int arc_cost(StateClass stcls, const GoldParseC* gold, int head, int child) nogil:
    """Return 1 if adding the arc head -> child gives up a gold arc, else 0."""
    if arc_is_gold(gold, head, child):
        return 0
    # The child already has its gold head attached.
    if stcls.H(child) == gold.heads[child]:
        return 1
    # Head in buffer
    if gold.heads[child] >= stcls.B(0) and stcls.B(1) != -1:
        return 1
    return 0
|
||||
|
||||
|
||||
cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil:
    """Tell whether the arc head -> child is compatible with the gold parse.

    An arc counts as gold when the child's gold label is missing (-1), when
    root-arc segmentation is enabled and both words are gold roots, or when
    `head` is the child's gold head.
    """
    if gold.labels[child] == -1:
        return True
    if USE_ROOT_ARC_SEGMENT and _is_gold_root(gold, head) and _is_gold_root(gold, child):
        return True
    return gold.heads[child] == head
|
||||
|
||||
|
||||
cdef bint label_is_gold(const GoldParseC* gold, int head, int child, int label) nogil:
    """Tell whether `label` is acceptable for the child's incoming arc.

    True when the child's gold label is missing (-1), when `label` itself is
    -1, or when `label` equals the child's gold label.  `head` is not
    consulted.
    """
    if gold.labels[child] == -1:
        return True
    if label == -1:
        return True
    return gold.labels[child] == label
|
||||
|
||||
|
||||
cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil:
    """True if `word` has a missing gold label (-1) or is its own gold head."""
    if gold.labels[word] == -1:
        return True
    return gold.heads[word] == word
|
||||
|
||||
|
||||
cdef class Shift:
    """The SHIFT move: validity check, state change and oracle cost."""

    @staticmethod
    cdef bint is_valid(StateClass st, int label) nogil:
        # Requires at least two buffer words, a first buffer word that has not
        # been shifted before, and no sentence start at the first buffer word.
        if st.buffer_length() < 2:
            return False
        if st.shifted[st.B(0)]:
            return False
        return not st.B_(0).sent_start

    @staticmethod
    cdef int transition(StateClass st, int label) nogil:
        st.push()
        st.fast_forward()

    @staticmethod
    cdef int cost(StateClass st, const GoldParseC* gold, int label) nogil:
        cdef int move_part = Shift.move_cost(st, gold)
        cdef int label_part = Shift.label_cost(st, gold, label)
        return move_part + label_part

    @staticmethod
    cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil:
        # Cost of pushing the first buffer word.
        cdef int next_word = s.B(0)
        return push_cost(s, gold, next_word)

    @staticmethod
    cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
        # SHIFT never incurs a label cost.
        return 0
|
||||
|
||||
|
||||
cdef class Reduce:
    """The REDUCE move: validity check, state change and oracle cost."""

    @staticmethod
    cdef bint is_valid(StateClass st, int label) nogil:
        # Needs at least two words on the stack.
        if st.stack_depth() < 2:
            return False
        return True

    @staticmethod
    cdef int transition(StateClass st, int label) nogil:
        # A headless stack top is unshifted; one that has a head is popped.
        if not st.has_head(st.S(0)):
            st.unshift()
        else:
            st.pop()
        st.fast_forward()

    @staticmethod
    cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil:
        cdef int move_part = Reduce.move_cost(s, gold)
        cdef int label_part = Reduce.label_cost(s, gold, label)
        return move_part + label_part

    @staticmethod
    cdef inline int move_cost(StateClass st, const GoldParseC* gold) nogil:
        # Cost of removing the stack top.
        cdef int top = st.S(0)
        return pop_cost(st, gold, top)

    @staticmethod
    cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
        # REDUCE never incurs a label cost.
        return 0
|
||||
|
||||
|
||||
cdef class LeftArc:
    """The LEFT move: attach the stack top to the first buffer word."""

    @staticmethod
    cdef bint is_valid(StateClass st, int label) nogil:
        # Not allowed when the first buffer word is marked as a sentence start.
        return not st.B_(0).sent_start

    @staticmethod
    cdef int transition(StateClass st, int label) nogil:
        # Add the arc B0 -> S0 with `label`, then pop S0.
        st.add_arc(st.B(0), st.S(0), label)
        st.pop()
        st.fast_forward()

    @staticmethod
    cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil:
        return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label)

    @staticmethod
    cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil:
        cdef int cost = 0
        cdef int i
        if arc_is_gold(gold, s.B(0), s.S(0)):
            return 0
        else:
            # Account for deps we might lose between S0 and stack
            if not s.has_head(s.S(0)):
                for i in range(1, s.stack_depth()):
                    cost += gold.heads[s.S(i)] == s.S(0)
                    cost += gold.heads[s.S(0)] == s.S(i)
            # Include the stack-internal losses counted above; they were
            # previously computed and then dropped from the result.
            return cost + pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0))

    @staticmethod
    cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
        # A label is only penalised when the arc itself is gold.
        return arc_is_gold(gold, s.B(0), s.S(0)) and not label_is_gold(gold, s.B(0), s.S(0), label)
|
||||
|
||||
|
||||
cdef class RightArc:
    """The RIGHT move: attach the first buffer word to the stack top."""

    @staticmethod
    cdef bint is_valid(StateClass st, int label) nogil:
        # Not allowed when the first buffer word is marked as a sentence start.
        if st.B_(0).sent_start:
            return False
        return True

    @staticmethod
    cdef int transition(StateClass st, int label) nogil:
        # Add the arc S0 -> B0 with `label`, then push B0.
        st.add_arc(st.S(0), st.B(0), label)
        st.push()
        st.fast_forward()

    @staticmethod
    cdef inline int cost(StateClass s, const GoldParseC* gold, int label) nogil:
        cdef int move_part = RightArc.move_cost(s, gold)
        cdef int label_part = RightArc.label_cost(s, gold, label)
        return move_part + label_part

    @staticmethod
    cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil:
        cdef int total
        if arc_is_gold(gold, s.S(0), s.B(0)):
            return 0
        total = push_cost(s, gold, s.B(0))
        # The arc cost is only added when B0 has not been shifted before.
        if not s.shifted[s.B(0)]:
            total += arc_cost(s, gold, s.S(0), s.B(0))
        return total

    @staticmethod
    cdef int label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
        # A label is only penalised when the arc itself is gold.
        if not arc_is_gold(gold, s.S(0), s.B(0)):
            return 0
        return not label_is_gold(gold, s.S(0), s.B(0), label)
|
||||
|
||||
|
||||
cdef class Break:
    """The BREAK move: mark a sentence break at the first buffer word."""

    @staticmethod
    cdef bint is_valid(StateClass st, int label) nogil:
        if not USE_BREAK:
            return False
        if st.at_break():
            return False
        if st.B(0) == 0:
            return False
        if st.stack_depth() < 1:
            return False
        # Must break at the token boundary
        return (st.S(0) + 1) == st.B(0)

    @staticmethod
    cdef int transition(StateClass st, int label) nogil:
        st.set_break(st.B(0))
        st.fast_forward()

    @staticmethod
    cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil:
        cdef int move_part = Break.move_cost(s, gold)
        cdef int label_part = Break.label_cost(s, gold, label)
        return move_part + label_part

    @staticmethod
    cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil:
        cdef int total = 0
        cdef int depth, offset, stack_word, buffer_word
        cdef int s0_root, b0_root
        # Count every gold arc that links a stack word with a buffer word.
        for depth in range(s.stack_depth()):
            stack_word = s.S(depth)
            for offset in range(s.buffer_length()):
                buffer_word = s.B(offset)
                if gold.heads[stack_word] == buffer_word:
                    total += 1
                if gold.heads[buffer_word] == stack_word:
                    total += 1
        # Check for sentence boundary --- if it's here, we can't have any deps
        # between stack and buffer, so rest of action is irrelevant.
        s0_root = _get_root(s.S(0), gold)
        b0_root = _get_root(s.B(0), gold)
        # Both words resolve to the same known root: add one to the cost.
        if s0_root == b0_root and s0_root != -1 and b0_root != -1:
            total += 1
        return total

    @staticmethod
    cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
        # BREAK never incurs a label cost.
        return 0
|
||||
|
||||
cdef int _get_root(int word, const GoldParseC* gold) nogil:
    """Follow gold heads from `word` up to its root.

    Returns the root's index, or -1 when the walk reaches a word whose gold
    label is missing (-1) or when `word` is negative.

    The bounds check on `word` is evaluated *before* the gold arrays are
    indexed, so a negative index is never dereferenced.
    """
    while word >= 0 and gold.heads[word] != word and gold.labels[word] != -1:
        word = gold.heads[word]
    if word < 0 or gold.labels[word] == -1:
        return -1
    else:
        return word
|
||||
|
||||
|
||||
cdef class ArcEager(TransitionSystem):
    """Arc-eager transition system with SHIFT, REDUCE, LEFT, RIGHT and BREAK."""

    @classmethod
    def get_labels(cls, gold_parses):
        """Collect the labels seen with each move in `gold_parses`.

        Returns a dict mapping move id to a dict of label -> True.  Non-ROOT
        labels go to RIGHT when the head precedes the child and to LEFT when
        the head follows it.
        """
        move_labels = {SHIFT: {'': True}, REDUCE: {'': True}, RIGHT: {'ROOT': True},
                       LEFT: {'ROOT': True}, BREAK: {'ROOT': True}}
        for raw_text, sents in gold_parses:
            for (ids, words, tags, heads, labels, iob), ctnts in sents:
                for child, head, label in zip(ids, heads, labels):
                    # Normalise the case of the root label.
                    if label.upper() == 'ROOT':
                        label = 'ROOT'
                    if label != 'ROOT':
                        if head < child:
                            move_labels[RIGHT][label] = True
                        elif head > child:
                            move_labels[LEFT][label] = True
        return move_labels

    cdef int preprocess_gold(self, GoldParse gold) except -1:
        """Copy heads, labels and brackets from `gold` into its C struct."""
        for i in range(gold.length):
            if gold.heads[i] is None: # Missing values
                gold.c.heads[i] = i
                gold.c.labels[i] = -1
            else:
                label = gold.labels[i]
                if label.upper() == 'ROOT':
                    label = 'ROOT'
                gold.c.heads[i] = gold.heads[i]
                gold.c.labels[i] = self.strings[label]
        for end, brackets in gold.brackets.items():
            for start, label_strs in brackets.items():
                gold.c.brackets[start][end] = 1
                # Iterate over a snapshot: the loop body adds to
                # gold.brackets[end][start], the same collection as
                # label_strs, and a set must not change size while it is
                # being iterated.
                for label_str in list(label_strs):
                    # Add the encoded label to the set
                    gold.brackets[end][start].add(self.strings[label_str])

    cdef Transition lookup_transition(self, object name) except *:
        """Return the transition named `name` ('MOVE' or 'MOVE-label').

        Raises:
            KeyError: if no registered transition has that move and label.
        """
        if '-' in name:
            move_str, label_str = name.split('-', 1)
            label = self.label_ids[label_str]
        else:
            # A bare move name carries no label; previously move_str was left
            # unbound on this path.
            move_str = name
            label = 0
        move = MOVE_NAMES.index(move_str)
        for i in range(self.n_moves):
            if self.c[i].move == move and self.c[i].label == label:
                return self.c[i]
        # Falling off the end would return an uninitialised struct.
        raise KeyError(name)

    def move_name(self, int move, int label):
        """Return 'MOVE-label', or just 'MOVE' when the label string is empty."""
        label_str = self.strings[label]
        if label_str:
            return MOVE_NAMES[move] + '-' + label_str
        else:
            return MOVE_NAMES[move]

    cdef Transition init_transition(self, int clas, int move, int label) except *:
        """Build the Transition struct for class `clas` with its callbacks."""
        # TODO: Apparent Cython bug here when we try to use the Transition()
        # constructor with the function pointers
        cdef Transition t
        t.score = 0
        t.clas = clas
        t.move = move
        t.label = label
        if move == SHIFT:
            t.is_valid = Shift.is_valid
            t.do = Shift.transition
            t.get_cost = Shift.cost
        elif move == REDUCE:
            t.is_valid = Reduce.is_valid
            t.do = Reduce.transition
            t.get_cost = Reduce.cost
        elif move == LEFT:
            t.is_valid = LeftArc.is_valid
            t.do = LeftArc.transition
            t.get_cost = LeftArc.cost
        elif move == RIGHT:
            t.is_valid = RightArc.is_valid
            t.do = RightArc.transition
            t.get_cost = RightArc.cost
        elif move == BREAK:
            t.is_valid = Break.is_valid
            t.do = Break.transition
            t.get_cost = Break.cost
        else:
            raise Exception(move)
        return t

    cdef int initialize_state(self, StateClass st) except -1:
        """Reset sentence starts and edges of every token, then fast-forward."""
        # Ensure sent_start is set to 0 throughout
        for i in range(st.length):
            st._sent[i].sent_start = False
            st._sent[i].l_edge = i
            st._sent[i].r_edge = i
        st.fast_forward()

    cdef int finalize_state(self, StateClass st) except -1:
        """Assign the ROOT label to unattached tokens and reset root heads."""
        cdef int root_label = self.strings['ROOT']
        for i in range(st.length):
            if st._sent[i].head == 0 and st._sent[i].dep == 0:
                st._sent[i].dep = root_label
            # If we're not using the Break transition, we segment via root-labelled
            # arcs between the root words.
            elif USE_ROOT_ARC_SEGMENT and st._sent[i].dep == root_label:
                st._sent[i].head = 0

    cdef int set_valid(self, bint* output, StateClass stcls) except -1:
        """Write one validity flag per transition class into `output`.

        Validity is computed once per move and shared by all of its labels.
        Asserts that at least one class is valid.
        """
        cdef bint[N_MOVES] is_valid
        is_valid[SHIFT] = Shift.is_valid(stcls, -1)
        is_valid[REDUCE] = Reduce.is_valid(stcls, -1)
        is_valid[LEFT] = LeftArc.is_valid(stcls, -1)
        is_valid[RIGHT] = RightArc.is_valid(stcls, -1)
        is_valid[BREAK] = Break.is_valid(stcls, -1)
        cdef int i
        n_valid = 0
        for i in range(self.n_moves):
            output[i] = is_valid[self.c[i].move]
            n_valid += output[i]
        assert n_valid >= 1

    cdef int set_costs(self, int* output, StateClass stcls, GoldParse gold) except -1:
        """Write the oracle cost of each transition class into `output`.

        Invalid classes receive 9000.  The move cost is computed at most once
        per move and reused for each of its labels.  Asserts that at least
        one class has zero cost.
        """
        cdef int i, move, label
        cdef label_cost_func_t[N_MOVES] label_cost_funcs
        cdef move_cost_func_t[N_MOVES] move_cost_funcs
        cdef int[N_MOVES] move_costs
        # -1 marks a move whose cost has not been computed yet.
        for i in range(N_MOVES):
            move_costs[i] = -1
        move_cost_funcs[SHIFT] = Shift.move_cost
        move_cost_funcs[REDUCE] = Reduce.move_cost
        move_cost_funcs[LEFT] = LeftArc.move_cost
        move_cost_funcs[RIGHT] = RightArc.move_cost
        move_cost_funcs[BREAK] = Break.move_cost

        label_cost_funcs[SHIFT] = Shift.label_cost
        label_cost_funcs[REDUCE] = Reduce.label_cost
        label_cost_funcs[LEFT] = LeftArc.label_cost
        label_cost_funcs[RIGHT] = RightArc.label_cost
        label_cost_funcs[BREAK] = Break.label_cost

        n_gold = 0
        for i in range(self.n_moves):
            if self.c[i].is_valid(stcls, self.c[i].label):
                move = self.c[i].move
                label = self.c[i].label
                if move_costs[move] == -1:
                    move_costs[move] = move_cost_funcs[move](stcls, &gold.c)
                output[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label)
                n_gold += output[i] == 0
            else:
                output[i] = 9000
        assert n_gold >= 1

    cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *:
        """Return the highest-scoring transition whose move is valid.

        Only transitions scoring above MIN_SCORE are considered.  Asserts
        that one was found.
        """
        cdef bint[N_MOVES] is_valid
        is_valid[SHIFT] = Shift.is_valid(stcls, -1)
        is_valid[REDUCE] = Reduce.is_valid(stcls, -1)
        is_valid[LEFT] = LeftArc.is_valid(stcls, -1)
        is_valid[RIGHT] = RightArc.is_valid(stcls, -1)
        is_valid[BREAK] = Break.is_valid(stcls, -1)
        cdef Transition best
        cdef weight_t score = MIN_SCORE
        cdef int i
        for i in range(self.n_moves):
            if scores[i] > score and is_valid[self.c[i].move]:
                best = self.c[i]
                score = scores[i]
        # Check that a transition was selected before reading `best`, which
        # is uninitialised when no candidate qualified.
        assert score > MIN_SCORE, (stcls.stack_depth(), stcls.buffer_length(), stcls.is_final(), stcls._b_i, stcls.length)
        assert best.clas < self.n_moves
        return best
|
|
@ -51,18 +51,21 @@ def get_templates(name):
|
|||
return pf.ner
|
||||
elif name == 'debug':
|
||||
return pf.unigrams
|
||||
elif name.startswith('embed'):
|
||||
return ((10, pf.words), (10, pf.tags), (10, pf.labels))
|
||||
else:
|
||||
return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 + pf.n0_n1 + \
|
||||
pf.tree_shape + pf.trigrams)
|
||||
|
||||
|
||||
cdef class Parser:
|
||||
def __init__(self, StringStore strings, model_dir, transition_system):
|
||||
def __init__(self, StringStore strings, model_dir, transition_system,
|
||||
get_model=Model):
|
||||
assert os.path.exists(model_dir) and os.path.isdir(model_dir)
|
||||
self.cfg = Config.read(model_dir, 'config')
|
||||
self.moves = transition_system(strings, self.cfg.labels)
|
||||
templates = get_templates(self.cfg.features)
|
||||
self.model = Model(self.moves.n_moves, templates, model_dir)
|
||||
self.model = get_model(self.moves.n_moves, templates, model_dir)
|
||||
|
||||
def __call__(self, Tokens tokens):
|
||||
cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
|
||||
|
@ -71,8 +74,8 @@ cdef class Parser:
|
|||
cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE, self.model.n_feats)
|
||||
while not stcls.is_final():
|
||||
eg.wipe()
|
||||
fill_context(<atom_t*>eg.atoms.data, stcls)
|
||||
self.moves.set_valid(<bint*>eg.is_valid.data, stcls)
|
||||
fill_context(&eg.atoms[0], stcls)
|
||||
self.moves.set_valid(<bint*>&eg.is_valid[0], stcls)
|
||||
|
||||
self.model.predict(eg)
|
||||
|
||||
|
@ -88,8 +91,8 @@ cdef class Parser:
|
|||
cdef int cost = 0
|
||||
while not stcls.is_final():
|
||||
eg.wipe()
|
||||
fill_context(<atom_t*>eg.atoms.data, stcls)
|
||||
self.moves.set_costs(<bint*>eg.is_valid.data, <int*>eg.costs.data, stcls, gold)
|
||||
fill_context(&eg.atoms[0], stcls)
|
||||
self.moves.set_costs(<bint*>&eg.is_valid[0], &eg.costs[0], stcls, gold)
|
||||
|
||||
self.model.train(eg)
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user