commit a76316ae7e (parent db87db87ea)

    Try beam search for SGD
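The diff below replaces the hand-tuned learning-rate schedule with a small beam search over parallel SGD runs: a queue holds (score, iteration, tagger) candidates; each round every candidate spawns a clone trained for one more pass with a slightly perturbed learning rate, the clones are scored on held-out data, and only the best `limit` candidates survive. A minimal, self-contained sketch of that loop follows; ToyModel and its quadratic objective are invented stand-ins for the real Tagger and held-out accuracy.

import random


class ToyModel(object):
    """Stand-in for the Tagger: one parameter trained by SGD on f(x) = (x - 3)^2."""
    def __init__(self, x=0.0, learn_rate=0.005):
        self.x = x
        self.learn_rate = learn_rate

    def train_epoch(self, n_steps=100):
        for _ in range(n_steps):
            grad = 2 * (self.x - 3.0)
            self.x -= self.learn_rate * grad

    def score(self):
        return -((self.x - 3.0) ** 2)  # higher is better


def spawn(parent):
    # Clone the parent and nudge its learning rate (cf. get_new_model below).
    new_rate = max(0.0001, random.gauss(parent.learn_rate, 0.001))
    return ToyModel(x=parent.x, learn_rate=new_rate)


def beam_sgd(model, n_iters=10, limit=4):
    queue = [(model.score(), 0, model)]
    for _ in range(n_iters):
        for prev_score, i, parent in list(queue):
            child = spawn(parent)
            child.train_epoch()
            queue.append((child.score(), i + 1, child))
        # Prune to the `limit` best candidates, as in the diff's beam_sgd.
        queue.sort(key=lambda item: item[0], reverse=True)
        queue = queue[:limit]
    return max(queue, key=lambda item: item[0])


if __name__ == '__main__':
    best_score, steps, best = beam_sgd(ToyModel())
    print(best_score, steps, best.learn_rate)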
@@ -22,45 +22,21 @@ from spacy.de import German
 import spacy.util
 from spacy.syntax.util import Config
 
-from spacy.scorer import Scorer
 from spacy.tagger import Tagger
 
 
-from spacy.tagger import P2_orth, P2_shape, P2_prefix, P2_suffix, P2_pos, P2_flags
-from spacy.tagger import P1_orth, P1_shape, P1_prefix, P1_suffix, P1_pos, P1_flags
-from spacy.tagger import W_orth, W_shape, W_prefix, W_suffix, W_pos, W_flags
-from spacy.tagger import N1_orth, N1_shape, N1_prefix, N1_suffix, N1_pos, N1_flags
-from spacy.tagger import N2_orth, N2_shape, N2_prefix, N2_suffix, N2_pos, N2_flags
-
-
-templates = {
-    'de': [
-        (W_orth,),
-        (P1_orth, P1_pos),
-        (P2_orth, P2_pos),
-        (N1_orth,),
-        (N2_orth,),
-
-        (W_suffix,),
-        (W_prefix,),
-
-        (P1_pos,),
-        (P2_pos,),
-        (P1_pos, P2_pos),
-        (P1_pos, W_orth),
-        (P1_suffix,),
-        (N1_suffix,),
-
-        (W_shape,),
-
-        (W_flags,),
-        (N1_flags,),
-        (N2_flags,),
-        (P1_flags,),
-        (P2_flags,)
-    ]
-}
-
-
+class GoldSents(object):
+    def __init__(self, tokenizer, sents, n=5000):
+        self.tokenizer = tokenizer
+        self.sents = sents
+        self.n = n
+
+    def __iter__(self):
+        random.shuffle(self.sents)
+        for words, gold in self.sents[:self.n]:
+            tokens = self.tokenizer.tokens_from_list(words)
+            yield tokens, gold
+
+
 def read_conll(file_):
     """Read a standard CoNLL/MALT-style format"""
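The new GoldSents helper turns the training and held-out sets into re-shufflable iterables: every pass it shuffles the sentences and yields freshly tokenized (tokens, gold) pairs for at most n of them. A small usage sketch, with an invented FakeTokenizer standing in for nlp.tokenizer:

import random


class FakeTokenizer(object):
    def tokens_from_list(self, words):
        return list(words)  # the real call returns a spaCy token sequence


class GoldSents(object):
    def __init__(self, tokenizer, sents, n=5000):
        self.tokenizer = tokenizer
        self.sents = sents
        self.n = n

    def __iter__(self):
        random.shuffle(self.sents)
        for words, gold in self.sents[:self.n]:
            yield self.tokenizer.tokens_from_list(words), gold


sents = [(["Das", "ist", "gut"], ["PDS", "VAFIN", "ADJD"])]
for tokens, gold in GoldSents(FakeTokenizer(), sents, n=2):
    print(tokens, gold)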
@@ -85,21 +61,64 @@ def _parse_line(line):
     pos = pieces[4]
     return id_, word, pos
 
 
-def score_model(nlp, gold_tuples, verbose=False):
+def beam_sgd(tagger, train_data, check_data):
+    print(tagger.model.widths)
+    print("Itn.\tTrain\tPrev\tNew")
+    queue = [(score_model(check_data, tagger), 0, tagger)]
+    workers = [None] * 100
+    limit = 4
+    while True:
+        for prev_score, i, tagger in list(queue):
+            #prev_score, i, tagger = max(queue)
+            train_acc, new_model = get_new_model(train_data, tagger)
+            new_score = score_model(check_data, new_model)
+            queue.append((new_score, i+1, new_model))
+            print('%d:\t%.3f\t%.3f\t%.3f\t%.4f' % (i, train_acc, prev_score, new_score,
+                                                   tagger.model.eta))
+        queue.sort(reverse=True)
+        queue = queue[:limit]
+    return max(queue)
+
+
+def score_model(gold_sents, tagger):
     correct = 0.0
     total = 0.0
-    for words, gold_tags in gold_tuples:
-        tokens = nlp.tokenizer.tokens_from_list(words)
-        nlp.tagger(tokens)
+    for tokens, gold_tags in gold_sents:
+        tagger(tokens)
         for token, gold in zip(tokens, gold_tags):
             correct += token.tag_ == gold
             total += 1
     return (correct / total) * 100
 
 
+def get_new_model(gold_sents, tagger):
+    learn_rate = numpy.random.normal(loc=tagger.model.learn_rate, scale=0.001)
+    if learn_rate < 0.0001:
+        learn_rate = 0.0001
+
+    new_model = Tagger.blank(tagger.vocab, [],
+                    learn_rate=learn_rate,
+                    depth=tagger.model.depth,
+                    hidden_width=tagger.model.hidden_width,
+                    chars_width=tagger.model.chars_width,
+                    tags_width=tagger.model.tags_width,
+                    left_window=tagger.model.left_window,
+                    right_window=tagger.model.right_window,
+                    tags_window=tagger.model.tags_window,
+                    chars_per_word=tagger.model.chars_per_word)
+    new_model.model.embeddings = tagger.model.embeddings
+    new_model.model.weights = tagger.model.weights
+    correct = 0.0
+    total = 0.0
+    for tokens, gold in gold_sents:
+        correct += new_model.train(tokens, gold)
+        total += len(tokens)
+    return (correct / total), new_model
+
+
 def train(Language, train_sents, dev_sents, model_dir, n_iter=15, seed=0,
-          gold_preproc=False, eta=0.005):
+          gold_preproc=False, **model_args):
     pos_model_dir = path.join(model_dir, 'pos')
     if path.exists(pos_model_dir):
         shutil.rmtree(pos_model_dir)
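get_new_model warm-starts each candidate: the clone reuses the parent's embeddings and weights, and only the learning rate changes, sampled from a Gaussian centred on the parent's rate and floored at 0.0001. A sketch of just that sampling step, using the scale and floor values from the diff:

import numpy

def perturbed_learn_rate(parent_rate, scale=0.001, floor=0.0001):
    # Draw a new rate near the parent's and clip it from below,
    # mirroring the first lines of get_new_model above.
    rate = numpy.random.normal(loc=parent_rate, scale=scale)
    return max(float(rate), floor)

print(perturbed_learn_rate(0.005))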
@@ -109,59 +128,75 @@ def train(Language, train_sents, dev_sents, model_dir, n_iter=15, seed=0,
     for words, tags in train_sents:
         for word in words:
             _ = nlp.vocab[word]
-    nlp.tagger = Tagger.blank(nlp.vocab, templates['de'], learn_rate=eta)
-    print(nlp.tagger.model.widths)
-    print("Itn.\tTrain\tCheck\tDev")
     nr_train = len(train_sents)
     random.shuffle(train_sents)
     heldout_sents = train_sents[:int(nr_train * 0.1)]
     train_sents = train_sents[len(heldout_sents):]
-    prev_score = 0.0
-    variance = 0.001
-    last_good_learn_rate = nlp.tagger.model.eta
-    n = 0
-    total = 0
-    acc = 0
-    while True:
-        words, gold_tags = random.choice(train_sents)
-        tokens = nlp.tokenizer.tokens_from_list(words)
-        acc += nlp.tagger.train(tokens, gold_tags)
-        total += len(tokens)
-        n += 1
-        if n and n % 20000 == 0:
-            dev_score = score_model(nlp, heldout_sents)
-            eval_score = score_model(nlp, dev_sents)
-            if dev_score >= prev_score:
-                nlp.tagger.model.keep_update()
-                prev_score = dev_score
-                variance = 0.001
-                last_good_learn_rate = nlp.tagger.model.eta
-                nlp.tagger.model.eta *= 1.01
-                print('%d:\t%.3f\t%.3f\t%.3f\t%.4f' % (n, acc/total, dev_score, eval_score, nlp.tagger.model.eta))
-            else:
-                nlp.tagger.model.backtrack()
-                new_eta = numpy.random.normal(loc=last_good_learn_rate, scale=variance)
-                if new_eta >= 0.0001:
-                    nlp.tagger.model.eta = new_eta
-                else:
-                    nlp.tagger.model.eta = 0.0001
-                print('X:\t%.3f\t%.3f\t%.3f\t%.4f' % (acc/total, dev_score, eval_score, nlp.tagger.model.eta))
-                variance *= 1.1
-                prev_score *= 0.9999
-            acc = 0.0
-            total = 0.0
-    nlp.end_training(data_dir=model_dir)
-    return nlp
+    train_sents = GoldSents(nlp.tokenizer, train_sents)
+    heldout_sents = GoldSents(nlp.tokenizer, heldout_sents)
+
+    tagger = Tagger.blank(nlp.vocab, [], **model_args)
+    return beam_sgd(tagger, train_sents, heldout_sents)
+
+    #prev_score = 0.0
+    #variance = 0.001
+    #last_good_learn_rate = nlp.tagger.model.eta
+    #n = 0
+    #total = 0
+    #acc = 0
+    #last_model = (nlp.tagger.model.weights, nlp.tagger.model.embeddings)
+    #while True:
+    #    words, gold_tags = random.choice(train_sents)
+    #    tokens = nlp.tokenizer.tokens_from_list(words)
+    #    acc += nlp.tagger.train(tokens, gold_tags)
+    #    total += len(tokens)
+    #    n += 1
+    #    if n and n % 20000 == 0:
+    #        dev_score = score_model(nlp, heldout_sents)
+    #        eval_score = score_model(nlp, dev_sents)
+    #        if dev_score >= prev_score:
+    #            last_model = (nlp.tagger.model.weights, nlp.tagger.model.embeddings)
+    #            prev_score = dev_score
+    #            variance = 0.001
+    #            last_good_learn_rate = nlp.tagger.model.eta
+    #            nlp.tagger.model.eta *= 1.01
+    #
+    #        else:
+    #            nlp.tagger.model.weights = last_model[0]
+    #            nlp.tagger.model.embeddings = last_model[1]
+    #            new_eta = numpy.random.normal(loc=last_good_learn_rate, scale=variance)
+    #            if new_eta >= 0.0001:
+    #                nlp.tagger.model.eta = new_eta
+    #            else:
+    #                nlp.tagger.model.eta = 0.0001
+    #            print('X:\t%.3f\t%.3f\t%.3f\t%.4f' % (acc/total, dev_score, eval_score, nlp.tagger.model.eta))
+    #            variance *= 1.1
+    #            prev_score *= 0.9999
+    #        acc = 0.0
+    #        total = 0.0
+    #nlp.end_training(data_dir=model_dir)
+    #return nlp
 
 
 @plac.annotations(
     train_loc=("Location of training file or directory"),
     dev_loc=("Location of development file or directory"),
     model_dir=("Location of output model directory",),
-    eta=("Learning rate for Adagrad optimizer", "option", "e", float),
+    learn_rate=("Learning rate for SGD", "option", "e", float),
     n_iter=("Number of training iterations", "option", "i", int),
+    depth=("Number of hidden layers", "option", "d", int),
+    hidden_width=("Number of neurons in each hidden layers", "option", "H", int),
+    chars_width=("Width of character embedding", "option", "C", int),
+    tags_width=("Width of tag embedding", "option", "T", int),
+    left_window=("Number of words of left context", "option", "l", int),
+    right_window=("Number of words of right context", "option", "r", int),
+    tags_window=("Number of tags in history", "option", "t", int),
+    chars_per_word=("Number of characters per word", "option", "c", int),
 )
-def main(lang_id, train_loc, dev_loc, model_dir, n_iter=5, eta=0.005):
+def main(lang_id, train_loc, dev_loc, model_dir, n_iter=5, learn_rate=0.005,
+         depth=3, hidden_width=100, chars_width=5, tags_width=10, left_window=2,
+         right_window=2, tags_window=2, chars_per_word=8):
     if lang_id == 'en':
         Language = English
     elif lang_id == 'de':
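The new hyperparameters are exposed on the command line through plac's annotation tuples of (help, kind, abbreviation, type). A trimmed, hypothetical example of how two of those options reach main() as keyword arguments:

import plac

@plac.annotations(
    learn_rate=("Learning rate for SGD", "option", "e", float),
    depth=("Number of hidden layers", "option", "d", int),
)
def main(lang_id, learn_rate=0.005, depth=3):
    # plac builds the argument parser from the annotations, so
    # `python train.py de -e 0.01 -d 4` calls main('de', 0.01, 4).
    print(lang_id, learn_rate, depth)

if __name__ == '__main__':
    plac.call(main)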
@@ -173,11 +208,11 @@ def main(lang_id, train_loc, dev_loc, model_dir, n_iter=5, eta=0.005):
     with codecs.open(train_loc, 'r', 'utf8') as file_:
         train_sents = read_conll(file_)
     dev_sents = read_conll(codecs.open(dev_loc, 'r', 'utf8'))
-    nlp = train(Language, train_sents, dev_sents, model_dir, n_iter=n_iter, eta=eta)
-    #nlp = Language(data_dir=model_dir)
-    scorer = score_model(nlp, dev_sents)
-    print('TOK', 100-scorer.token_acc)
-    print('POS', scorer.tags_acc)
+    nlp = train(Language, train_sents, dev_sents, model_dir,
+                n_iter=n_iter, learn_rate=learn_rate,
+                depth=depth, hidden_width=hidden_width, chars_width=chars_width, tags_width=tags_width,
+                left_window=left_window, right_window=right_window, tags_window=tags_window,
+                chars_per_word=chars_per_word)
 
 
 if __name__ == '__main__':