* Try beam search for SGD

Matthew Honnibal 2016-02-25 03:00:35 +01:00
parent db87db87ea
commit a76316ae7e
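In outline, the commit swaps the earlier single-model training loop for a small beam search over SGD runs: a queue of candidate taggers is kept, each candidate is copied with a Gaussian-perturbed learning rate (floored at 0.0001), trained on a random sample of sentences, scored on held-out data, and only the top few candidates survive to the next round. A minimal, self-contained sketch of that idea (train_epoch and score here are hypothetical caller-supplied callables standing in for the Tagger training and evaluation calls in the diff below):

import random

def beam_sgd_sketch(init_model, train_epoch, score, init_lr=0.005,
                    limit=4, n_steps=10):
    # Each beam entry is (dev_score, learn_rate, model).
    beam = [(score(init_model), init_lr, init_model)]
    for _ in range(n_steps):
        for dev_score, lr, model in list(beam):
            # Perturb the learning rate, train a copy, and score it on dev data.
            new_lr = max(random.gauss(lr, 0.001), 0.0001)
            new_model = train_epoch(model, new_lr)      # returns a trained copy
            beam.append((score(new_model), new_lr, new_model))
        beam.sort(key=lambda entry: entry[0], reverse=True)
        beam = beam[:limit]                             # keep only the best few
    return beam[0]

In the diff itself, beam_sgd plays this role, get_new_model does the copy-and-train step, and score_model does the evaluation.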


@@ -22,45 +22,21 @@ from spacy.de import German
 import spacy.util
 from spacy.syntax.util import Config
-from spacy.scorer import Scorer
 from spacy.tagger import Tagger
-from spacy.tagger import P2_orth, P2_shape, P2_prefix, P2_suffix, P2_pos, P2_flags
-from spacy.tagger import P1_orth, P1_shape, P1_prefix, P1_suffix, P1_pos, P1_flags
-from spacy.tagger import W_orth, W_shape, W_prefix, W_suffix, W_pos, W_flags
-from spacy.tagger import N1_orth, N1_shape, N1_prefix, N1_suffix, N1_pos, N1_flags
-from spacy.tagger import N2_orth, N2_shape, N2_prefix, N2_suffix, N2_pos, N2_flags
-
-
-templates = {
-    'de': [
-        (W_orth,),
-        (P1_orth, P1_pos),
-        (P2_orth, P2_pos),
-        (N1_orth,),
-        (N2_orth,),
-        (W_suffix,),
-        (W_prefix,),
-        (P1_pos,),
-        (P2_pos,),
-        (P1_pos, P2_pos),
-        (P1_pos, W_orth),
-        (P1_suffix,),
-        (N1_suffix,),
-        (W_shape,),
-        (W_flags,),
-        (N1_flags,),
-        (N2_flags,),
-        (P1_flags,),
-        (P2_flags,)
-    ]
-}
+
+
+class GoldSents(object):
+    def __init__(self, tokenizer, sents, n=5000):
+        self.tokenizer = tokenizer
+        self.sents = sents
+        self.n = n
+
+    def __iter__(self):
+        random.shuffle(self.sents)
+        for words, gold in self.sents[:self.n]:
+            tokens = self.tokenizer.tokens_from_list(words)
+            yield tokens, gold
 
 
 def read_conll(file_):
     """Read a standard CoNLL/MALT-style format"""
@@ -85,21 +61,64 @@ def _parse_line(line):
         pos = pieces[4]
     return id_, word, pos
 
 
-def score_model(nlp, gold_tuples, verbose=False):
+def beam_sgd(tagger, train_data, check_data):
+    print(tagger.model.widths)
+    print("Itn.\tTrain\tPrev\tNew")
+    queue = [(score_model(check_data, tagger), 0, tagger)]
+    workers = [None] * 100
+    limit = 4
+    while True:
+        for prev_score, i, tagger in list(queue):
+            #prev_score, i, tagger = max(queue)
+            train_acc, new_model = get_new_model(train_data, tagger)
+            new_score = score_model(check_data, new_model)
+            queue.append((new_score, i+1, new_model))
+            print('%d:\t%.3f\t%.3f\t%.3f\t%.4f' % (i, train_acc, prev_score, new_score,
+                                                   tagger.model.eta))
+        queue.sort(reverse=True)
+        queue = queue[:limit]
+    return max(queue)
+
+
+def score_model(gold_sents, tagger):
     correct = 0.0
     total = 0.0
-    for words, gold_tags in gold_tuples:
-        tokens = nlp.tokenizer.tokens_from_list(words)
-        nlp.tagger(tokens)
+    for tokens, gold_tags in gold_sents:
+        tagger(tokens)
         for token, gold in zip(tokens, gold_tags):
             correct += token.tag_ == gold
             total += 1
     return (correct / total) * 100
+
+
+def get_new_model(gold_sents, tagger):
+    learn_rate = numpy.random.normal(loc=tagger.model.learn_rate, scale=0.001)
+    if learn_rate < 0.0001:
+        learn_rate = 0.0001
+    new_model = Tagger.blank(tagger.vocab, [],
+                    learn_rate=learn_rate,
+                    depth=tagger.model.depth,
+                    hidden_width=tagger.model.hidden_width,
+                    chars_width=tagger.model.chars_width,
+                    tags_width=tagger.model.tags_width,
+                    left_window=tagger.model.left_window,
+                    right_window=tagger.model.right_window,
+                    tags_window=tagger.model.tags_window,
+                    chars_per_word=tagger.model.chars_per_word)
+    new_model.model.embeddings = tagger.model.embeddings
+    new_model.model.weights = tagger.model.weights
+    correct = 0.0
+    total = 0.0
+    for tokens, gold in gold_sents:
+        correct += new_model.train(tokens, gold)
+        total += len(tokens)
+    return (correct / total), new_model
 
 
 def train(Language, train_sents, dev_sents, model_dir, n_iter=15, seed=0,
-          gold_preproc=False, eta=0.005):
+          gold_preproc=False, **model_args):
     pos_model_dir = path.join(model_dir, 'pos')
     if path.exists(pos_model_dir):
         shutil.rmtree(pos_model_dir)
@@ -109,59 +128,75 @@ def train(Language, train_sents, dev_sents, model_dir, n_iter=15, seed=0,
     for words, tags in train_sents:
         for word in words:
             _ = nlp.vocab[word]
-    nlp.tagger = Tagger.blank(nlp.vocab, templates['de'], learn_rate=eta)
-    print(nlp.tagger.model.widths)
-    print("Itn.\tTrain\tCheck\tDev")
     nr_train = len(train_sents)
     random.shuffle(train_sents)
     heldout_sents = train_sents[:int(nr_train * 0.1)]
     train_sents = train_sents[len(heldout_sents):]
-    prev_score = 0.0
-    variance = 0.001
-    last_good_learn_rate = nlp.tagger.model.eta
-    n = 0
-    total = 0
-    acc = 0
-    while True:
-        words, gold_tags = random.choice(train_sents)
-        tokens = nlp.tokenizer.tokens_from_list(words)
-        acc += nlp.tagger.train(tokens, gold_tags)
-        total += len(tokens)
-        n += 1
-        if n and n % 20000 == 0:
-            dev_score = score_model(nlp, heldout_sents)
-            eval_score = score_model(nlp, dev_sents)
-            if dev_score >= prev_score:
-                nlp.tagger.model.keep_update()
-                prev_score = dev_score
-                variance = 0.001
-                last_good_learn_rate = nlp.tagger.model.eta
-                nlp.tagger.model.eta *= 1.01
-                print('%d:\t%.3f\t%.3f\t%.3f\t%.4f' % (n, acc/total, dev_score, eval_score, nlp.tagger.model.eta))
-            else:
-                nlp.tagger.model.backtrack()
-                new_eta = numpy.random.normal(loc=last_good_learn_rate, scale=variance)
-                if new_eta >= 0.0001:
-                    nlp.tagger.model.eta = new_eta
-                else:
-                    nlp.tagger.model.eta = 0.0001
-                print('X:\t%.3f\t%.3f\t%.3f\t%.4f' % (acc/total, dev_score, eval_score, nlp.tagger.model.eta))
-                variance *= 1.1
-                prev_score *= 0.9999
-            acc = 0.0
-            total = 0.0
-    nlp.end_training(data_dir=model_dir)
-    return nlp
+
+    train_sents = GoldSents(nlp.tokenizer, train_sents)
+    heldout_sents = GoldSents(nlp.tokenizer, heldout_sents)
+
+    tagger = Tagger.blank(nlp.vocab, [], **model_args)
+    return beam_sgd(tagger, train_sents, heldout_sents)
+
+    #prev_score = 0.0
+    #variance = 0.001
+    #last_good_learn_rate = nlp.tagger.model.eta
+    #n = 0
+    #total = 0
+    #acc = 0
+    #last_model = (nlp.tagger.model.weights, nlp.tagger.model.embeddings)
+    #while True:
+    #    words, gold_tags = random.choice(train_sents)
+    #    tokens = nlp.tokenizer.tokens_from_list(words)
+    #    acc += nlp.tagger.train(tokens, gold_tags)
+    #    total += len(tokens)
+    #    n += 1
+    #    if n and n % 20000 == 0:
+    #        dev_score = score_model(nlp, heldout_sents)
+    #        eval_score = score_model(nlp, dev_sents)
+    #        if dev_score >= prev_score:
+    #            last_model = (nlp.tagger.model.weights, nlp.tagger.model.embeddings)
+    #            prev_score = dev_score
+    #            variance = 0.001
+    #            last_good_learn_rate = nlp.tagger.model.eta
+    #            nlp.tagger.model.eta *= 1.01
+    #
+    #        else:
+    #            nlp.tagger.model.weights = last_model[0]
+    #            nlp.tagger.model.embeddings = last_model[1]
+    #            new_eta = numpy.random.normal(loc=last_good_learn_rate, scale=variance)
+    #            if new_eta >= 0.0001:
+    #                nlp.tagger.model.eta = new_eta
+    #            else:
+    #                nlp.tagger.model.eta = 0.0001
+    #            print('X:\t%.3f\t%.3f\t%.3f\t%.4f' % (acc/total, dev_score, eval_score, nlp.tagger.model.eta))
+    #            variance *= 1.1
+    #            prev_score *= 0.9999
+    #        acc = 0.0
+    #        total = 0.0
+    #nlp.end_training(data_dir=model_dir)
+    #return nlp
 
 
 @plac.annotations(
     train_loc=("Location of training file or directory"),
     dev_loc=("Location of development file or directory"),
     model_dir=("Location of output model directory",),
-    eta=("Learning rate for Adagrad optimizer", "option", "e", float),
+    learn_rate=("Learning rate for SGD", "option", "e", float),
     n_iter=("Number of training iterations", "option", "i", int),
+    depth=("Number of hidden layers", "option", "d", int),
+    hidden_width=("Number of neurons in each hidden layers", "option", "H", int),
+    chars_width=("Width of character embedding", "option", "C", int),
+    tags_width=("Width of tag embedding", "option", "T", int),
+    left_window=("Number of words of left context", "option", "l", int),
+    right_window=("Number of words of right context", "option", "r", int),
+    tags_window=("Number of tags in history", "option", "t", int),
+    chars_per_word=("Number of characters per word", "option", "c", int),
 )
-def main(lang_id, train_loc, dev_loc, model_dir, n_iter=5, eta=0.005):
+def main(lang_id, train_loc, dev_loc, model_dir, n_iter=5, learn_rate=0.005,
+         depth=3, hidden_width=100, chars_width=5, tags_width=10, left_window=2,
+         right_window=2, tags_window=2, chars_per_word=8):
     if lang_id == 'en':
         Language = English
     elif lang_id == 'de':
@@ -173,11 +208,11 @@ def main(lang_id, train_loc, dev_loc, model_dir, n_iter=5, eta=0.005):
     with codecs.open(train_loc, 'r', 'utf8') as file_:
         train_sents = read_conll(file_)
     dev_sents = read_conll(codecs.open(dev_loc, 'r', 'utf8'))
-    nlp = train(Language, train_sents, dev_sents, model_dir, n_iter=n_iter, eta=eta)
-    #nlp = Language(data_dir=model_dir)
-    scorer = score_model(nlp, dev_sents)
-    print('TOK', 100-scorer.token_acc)
-    print('POS', scorer.tags_acc)
+    nlp = train(Language, train_sents, dev_sents, model_dir,
+                n_iter=n_iter, learn_rate=learn_rate,
+                depth=depth, hidden_width=hidden_width, chars_width=chars_width, tags_width=tags_width,
+                left_window=left_window, right_window=right_window, tags_window=tags_window,
+                chars_per_word=chars_per_word)
 
 
 if __name__ == '__main__':
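For reference, the plac annotations above expose the new model settings as short command-line options (-e learn_rate, -i n_iter, -d depth, -H hidden_width, -C chars_width, -T tags_width, -l left_window, -r right_window, -t tags_window, -c chars_per_word); main can also be invoked directly, e.g. main('de', 'train.conll', 'dev.conll', '/tmp/de_model', learn_rate=0.005, depth=3), where the file paths are placeholders rather than paths taken from this commit.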