Matthew Honnibal 2016-07-27 02:56:36 +02:00
parent 6a98a3142f
commit ac63274e15
5 changed files with 55 additions and 221 deletions

View File

@@ -163,26 +163,43 @@ def train(Language, gold_tuples, model_dir, dev_loc, n_iter=15, feat_set=u'basic
     nr_trimmed = 0
     eg_seen = 0
     loss = 0
+    micro_eval = gold_tuples[:50]
     for itn in range(n_iter):
-        random.shuffle(gold_tuples)
-        for _, sents in gold_tuples:
-            for annot_tuples, _ in sents:
-                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
-                nlp.tagger.tag_from_strings(tokens, annot_tuples[2])
-                gold = GoldParse(tokens, annot_tuples)
-                loss += nlp.parser.train(tokens, gold)
-                eg_seen += 1
-                if eg_seen % 10000 == 0:
-                    dev_uas = score_file(nlp, dev_loc).uas
-                    train_uas = score_sents(nlp, gold_tuples[:1000]).uas
-                    size = nlp.parser.model.mem.size
-                    print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%d' % (itn, int(loss), nr_trimmed,
-                                                             train_uas, dev_uas, size))
-                    loss = 0
-    nlp.end_training(model_dir)
+        try:
+            eg_seen = _train_epoch(nlp, gold_tuples, eg_seen, itn,
+                                   dev_loc, micro_eval)
+        except KeyboardInterrupt:
+            print("Saving model...")
+            break
+    #nlp.end_training(model_dir)
+    nlp.parser.model.end_training()
+    print("Saved. Evaluating...")
     return nlp
+
+
+def _train_epoch(nlp, gold_tuples, eg_seen, itn, dev_loc, micro_eval):
+    random.shuffle(gold_tuples)
+    loss = 0
+    nr_trimmed = 0
+    for _, sents in gold_tuples:
+        for annot_tuples, _ in sents:
+            tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
+            nlp.tagger.tag_from_strings(tokens, annot_tuples[2])
+            gold = GoldParse(tokens, annot_tuples)
+            loss += nlp.parser.train(tokens, gold, itn=itn)
+            eg_seen += 1
+            if eg_seen % 1000 == 0:
+                if eg_seen % 20000 == 0:
+                    dev_uas = score_file(nlp, dev_loc).uas
+                else:
+                    dev_uas = 0.0
+                train_uas = score_sents(nlp, micro_eval).uas
+                size = nlp.parser.model.mem.size
+                nr_upd = nlp.parser.model.time
+                print('%d,%d:\t%d\t%.3f\t%.3f\t%.3f\t%d' % (itn, nr_upd, int(loss), nr_trimmed,
+                                                            train_uas, dev_uas, size))
+                loss = 0
+    return eg_seen


 @plac.annotations(
     train_loc=("Location of CoNLL 09 formatted training file"),
@@ -206,11 +223,13 @@ def main(train_loc, dev_loc, model_dir, n_iter=15, neural=False, batch_norm=Fals
         batch_norm=batch_norm,
         learn_rate=learn_rate,
         update_step=update_step)
-    scorer = Scorer()
-    with io.open(dev_loc, 'r', encoding='utf8') as file_:
-        for _, sents in read_conll(file_):
-            for annot_tuples, _ in sents:
-                score_model(scorer, nlp, None, annot_tuples)
+    scorer = score_file(nlp, dev_loc)
+    #scorer = Scorer()
+    #with io.open(dev_loc, 'r', encoding='utf8') as file_:
+    #    for _, sents in read_conll(file_):
+    #        for annot_tuples, _ in sents:
+    #            score_model(scorer, nlp, None, annot_tuples)
     print('TOK', scorer.token_acc)
     print('POS', scorer.tags_acc)
     print('UAS', scorer.uas)
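
The refactor above splits the epoch body into _train_epoch and wraps each epoch in try/except KeyboardInterrupt, so an interrupted run still falls through to saving and evaluating the model. A minimal sketch of that pattern, where train_one_epoch and save are hypothetical stand-ins for the spaCy-specific calls:

    def train(model, data, n_iter=15):
        # Sketch only: `train_one_epoch` and `save` are hypothetical stand-ins.
        eg_seen = 0
        for itn in range(n_iter):
            try:
                eg_seen = train_one_epoch(model, data, eg_seen, itn)
            except KeyboardInterrupt:
                # Ctrl-C ends training early but still reaches the save below.
                print("Saving model...")
                break
        save(model)
        return model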

View File

@@ -50,6 +50,7 @@ MOD_NAMES = [
     'spacy.syntax.stateclass',
     'spacy.syntax._state',
     'spacy.tokenizer',
+    'spacy.syntax._neural',
     'spacy.syntax.parser',
     'spacy.syntax.beam_parser',
     'spacy.syntax.nonproj',
@@ -174,7 +175,8 @@ def setup_package():
         mod_path = mod_name.replace('.', '/') + '.cpp'
         ext_modules.append(
             Extension(mod_name, [mod_path],
-                language='c++', include_dirs=include_dirs))
+                language='c++', include_dirs=include_dirs,
+                libraries=['/Users/matt/blis/lib/blis']))
     if not is_source_release(root):
         generate_cython(root, 'spacy')
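
The setup.py hunk links every extension against a local BLIS build by passing a filesystem path in libraries. For reference, setuptools' Extension normally takes bare library names in libraries and search paths in library_dirs; a sketch of that spelling, where the paths are placeholder assumptions, not the ones from this commit:

    from setuptools import Extension

    # Sketch: linking a C++ extension against an external BLAS-like library.
    # The paths below are placeholders; adjust to wherever BLIS is installed.
    ext = Extension(
        'spacy.syntax._neural',
        ['spacy/syntax/_neural.cpp'],
        language='c++',
        include_dirs=['/opt/blis/include'],  # headers for the library
        library_dirs=['/opt/blis/lib'],      # where libblis.* lives
        libraries=['blis'],                  # resolved by the linker as -lblis
    )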

View File

@@ -114,15 +114,13 @@ cdef class BeamParser(Parser):
         else:
             violn.check_crf(pred, gold)
         if isinstance(self.model, ParserNeuralNet):
-            min_grad = 0.01 ** (itn+1)
+            min_grad = 0.1 ** (itn+1)
             for grad, hist in zip(violn.p_probs, violn.p_hist):
-                assert not math.isnan(grad)
-                assert not math.isinf(grad)
+                assert not math.isnan(grad) and not math.isinf(grad)
                 if abs(grad) >= min_grad:
                     self._update_dense(tokens, hist, grad)
             for grad, hist in zip(violn.g_probs, violn.g_hist):
-                assert not math.isnan(grad)
-                assert not math.isinf(grad)
+                assert not math.isnan(grad) and not math.isinf(grad)
                 if abs(grad) >= min_grad:
                     self._update_dense(tokens, hist, grad)
         else:
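
The hunk above loosens the gradient cutoff from 0.01 ** (itn+1) to 0.1 ** (itn+1): updates whose absolute gradient falls below the threshold are skipped, and the threshold tightens tenfold each iteration. A small Python sketch of that schedule, with illustrative names not taken from the commit:

    import math

    def significant_gradients(grads_and_hists, itn):
        # Iteration-dependent cutoff: 0.1 on itn=0, 0.01 on itn=1, ...
        min_grad = 0.1 ** (itn + 1)
        for grad, hist in grads_and_hists:
            assert not math.isnan(grad) and not math.isinf(grad)
            if abs(grad) >= min_grad:
                yield grad, hist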

View File

@@ -1,33 +1,9 @@
-from thinc.linear.avgtron cimport AveragedPerceptron
-from thinc.neural.nn cimport NeuralNet
-from thinc.linear.features cimport ConjunctionExtracter
-from thinc.base cimport Model
-from thinc.extra.eg cimport Example
-from thinc.typedefs cimport weight_t
-from thinc.structs cimport FeatureC
-from .stateclass cimport StateClass
 from .arc_eager cimport TransitionSystem
-from ..tokens.doc cimport Doc
-from ..structs cimport TokenC
-from thinc.structs cimport NeuralNetC, ExampleC
 from ._state cimport StateC
+from ..structs cimport TokenC
+from thinc.base cimport Model
+from thinc.linalg cimport *
-
-cdef class ParserNeuralNet(NeuralNet):
-    cdef ConjunctionExtracter extracter
-    cdef void set_featuresC(self, ExampleC* eg, const void* _state) nogil
-
-cdef class ParserPerceptron(AveragedPerceptron):
-    cdef void set_featuresC(self, ExampleC* eg, const void* _state) nogil
-
-cdef class ParserNeuralNetEnsemble(ParserNeuralNet):
-    cdef object _models
-    cdef NeuralNetC** _models_c
-    cdef int** _masks
-    cdef int _nr_model
-
 cdef class Parser:
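
The .pxd now declares only what Parser itself needs; the ParserNeuralNet, ParserPerceptron, and ParserNeuralNetEnsemble declarations move to the new spacy.syntax._neural module registered in setup.py above. In Cython, a .pxd file carries the C-level declarations that other modules cimport, while the matching .pyx holds the implementation. A minimal sketch with a hypothetical module:

    # _counter.pxd (hypothetical): the C-level interface other modules cimport
    cdef class Counter:
        cdef int total
        cdef void bump(self, int n)

    # _counter.pyx: the implementation behind those declarations
    cdef class Counter:
        cdef void bump(self, int n):
            self.total += n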

View File

@@ -24,10 +24,13 @@ import random

 from cymem.cymem cimport Pool, Address
 from murmurhash.mrmr cimport hash64
 from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t, idx_t
 from thinc.linear.avgtron cimport AveragedPerceptron
 from thinc.linalg cimport VecVec
 from thinc.structs cimport NeuralNetC, SparseArrayC, ExampleC
+from thinc.extra.eg cimport Example
 from preshed.maps cimport MapStruct
 from preshed.maps cimport map_get
 from thinc.structs cimport FeatureC
@@ -50,6 +53,7 @@ from ._parse_features cimport fill_context
 from ._parse_features cimport *
 from .stateclass cimport StateClass
 from ._state cimport StateC
+from ._neural cimport ParserNeuralNet, ParserPerceptron

 DEBUG = False
@@ -77,171 +81,6 @@ def ParserFactory(transition_system):
     return lambda strings, dir_: Parser(strings, dir_, transition_system)

-cdef class ParserPerceptron(AveragedPerceptron):
-    @property
-    def widths(self):
-        return (self.extracter.nr_templ,)
-
-    def update(self, Example eg):
-        '''Does regression on negative cost. Sort of cute?'''
-        self.time += 1
-        cdef weight_t loss = 0.0
-        best = eg.best
-        for clas in range(eg.c.nr_class):
-            if not eg.c.is_valid[clas]:
-                continue
-            if eg.c.scores[clas] < eg.c.scores[best]:
-                continue
-            loss += (-eg.c.costs[clas] - eg.c.scores[clas]) ** 2
-            d_loss = 2 * (-eg.c.costs[clas] - eg.c.scores[clas])
-            step = d_loss * 0.001
-            for feat in eg.c.features[:eg.c.nr_feat]:
-                self.update_weight(feat.key, clas, feat.value * step)
-        return int(loss)
-
-    cdef void set_featuresC(self, ExampleC* eg, const void* _state) nogil:
-        state = <const StateC*>_state
-        fill_context(eg.atoms, state)
-        eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms)
-
-
-cdef class ParserNeuralNet(NeuralNet):
-    def __init__(self, shape, **kwargs):
-        vector_widths = [4] * 76
-        slots =  [0, 1, 2, 3] # S0
-        slots += [4, 5, 6, 7] # S1
-        slots += [8, 9, 10, 11] # S2
-        slots += [12, 13, 14, 15] # S3+
-        slots += [16, 17, 18, 19] # B0
-        slots += [20, 21, 22, 23] # B1
-        slots += [24, 25, 26, 27] # B2
-        slots += [28, 29, 30, 31] # B3+
-        slots += [32, 33, 34, 35] * 2 # S0l, S0r
-        slots += [36, 37, 38, 39] * 2 # B0l, B0r
-        slots += [40, 41, 42, 43] * 2 # S1l, S1r
-        slots += [44, 45, 46, 47] * 2 # S2l, S2r
-        slots += [48, 49, 50, 51, 52, 53, 54, 55]
-        slots += [53, 54, 55, 56]
-        input_length = sum(vector_widths[slot] for slot in slots)
-        widths = [input_length] + shape
-        NeuralNet.__init__(self, widths, embed=(vector_widths, slots), **kwargs)
-
-    @property
-    def nr_feat(self):
-        return 2000
-
-    cdef void set_featuresC(self, ExampleC* eg, const void* _state) nogil:
-        memset(eg.features, 0, 2000 * sizeof(FeatureC))
-        state = <const StateC*>_state
-        fill_context(eg.atoms, state)
-        feats = eg.features
-        feats = _add_token(feats, 0, state.S_(0), 1.0)
-        feats = _add_token(feats, 4, state.S_(1), 1.0)
-        feats = _add_token(feats, 8, state.S_(2), 1.0)
-        # Rest of the stack, with exponential decay
-        for i in range(3, state.stack_depth()):
-            feats = _add_token(feats, 12, state.S_(i), 1.0 * 0.5**(i-2))
-        feats = _add_token(feats, 16, state.B_(0), 1.0)
-        feats = _add_token(feats, 20, state.B_(1), 1.0)
-        feats = _add_token(feats, 24, state.B_(2), 1.0)
-        # Rest of the buffer, with exponential decay
-        for i in range(3, min(8, state.buffer_length())):
-            feats = _add_token(feats, 28, state.B_(i), 1.0 * 0.5**(i-2))
-        feats = _add_subtree(feats, 32, state, state.S(0))
-        feats = _add_subtree(feats, 40, state, state.B(0))
-        feats = _add_subtree(feats, 48, state, state.S(1))
-        feats = _add_subtree(feats, 56, state, state.S(2))
-        feats = _add_pos_bigram(feats, 64, state.S_(0), state.B_(0))
-        feats = _add_pos_bigram(feats, 65, state.S_(1), state.S_(0))
-        feats = _add_pos_bigram(feats, 66, state.S_(1), state.B_(0))
-        feats = _add_pos_bigram(feats, 67, state.S_(0), state.B_(1))
-        feats = _add_pos_bigram(feats, 68, state.S_(0), state.R_(state.S(0), 1))
-        feats = _add_pos_bigram(feats, 69, state.S_(0), state.R_(state.S(0), 2))
-        feats = _add_pos_bigram(feats, 70, state.S_(0), state.L_(state.S(0), 1))
-        feats = _add_pos_bigram(feats, 71, state.S_(0), state.L_(state.S(0), 2))
-        feats = _add_pos_trigram(feats, 72, state.S_(1), state.S_(0), state.B_(0))
-        feats = _add_pos_trigram(feats, 73, state.S_(0), state.B_(0), state.B_(1))
-        feats = _add_pos_trigram(feats, 74, state.S_(0), state.R_(state.S(0), 1),
-                                 state.R_(state.S(0), 2))
-        feats = _add_pos_trigram(feats, 75, state.S_(0), state.L_(state.S(0), 1),
-                                 state.L_(state.S(0), 2))
-        eg.nr_feat = feats - eg.features
-
-    cdef void _set_delta_lossC(self, weight_t* delta_loss,
-            const weight_t* Zs, const weight_t* scores) nogil:
-        for i in range(self.c.widths[self.c.nr_layer-1]):
-            delta_loss[i] = Zs[i]
-
-    cdef void _softmaxC(self, weight_t* out) nogil:
-        pass
-
-
-cdef inline FeatureC* _add_token(FeatureC* feats,
-        int slot, const TokenC* token, weight_t value) nogil:
-    # Word
-    feats.i = slot
-    feats.key = token.lex.norm
-    feats.value = value
-    feats += 1
-    # POS tag
-    feats.i = slot+1
-    feats.key = token.tag
-    feats.value = value
-    feats += 1
-    # Dependency label
-    feats.i = slot+2
-    feats.key = token.dep
-    feats.value = value
-    feats += 1
-    # Word, label, tag
-    feats.i = slot+3
-    cdef uint64_t key[3]
-    key[0] = token.lex.cluster
-    key[1] = token.tag
-    key[2] = token.dep
-    feats.key = hash64(key, sizeof(key), 0)
-    feats.value = value
-    feats += 1
-    return feats
-
-
-cdef inline FeatureC* _add_subtree(FeatureC* feats, int slot, const StateC* state, int t) nogil:
-    value = 1.0
-    for i in range(state.n_R(t)):
-        feats = _add_token(feats, slot, state.R_(t, i+1), value)
-        value *= 0.5
-    slot += 4
-    value = 1.0
-    for i in range(state.n_L(t)):
-        feats = _add_token(feats, slot, state.L_(t, i+1), value)
-        value *= 0.5
-    return feats
-
-
-cdef inline FeatureC* _add_pos_bigram(FeatureC* feat, int slot,
-        const TokenC* t1, const TokenC* t2) nogil:
-    cdef uint64_t[2] key
-    key[0] = t1.tag
-    key[1] = t2.tag
-    feat.i = slot
-    feat.key = hash64(key, sizeof(key), slot)
-    feat.value = 1.0
-    return feat+1
-
-
-cdef inline FeatureC* _add_pos_trigram(FeatureC* feat, int slot,
-        const TokenC* t1, const TokenC* t2, const TokenC* t3) nogil:
-    cdef uint64_t[3] key
-    key[0] = t1.tag
-    key[1] = t2.tag
-    key[2] = t3.tag
-    feat.i = slot
-    feat.key = hash64(key, sizeof(key), slot)
-    feat.value = 1.0
-    return feat+1
-
 cdef class Parser:
     def __init__(self, StringStore strings, transition_system, model):
         self.moves = transition_system
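
The removed block (now living in spacy.syntax._neural) fills a fixed-size feature array by pointer bumping, weighting deeper stack and buffer items by 0.5 ** (i - 2) so distant context contributes exponentially less. A plain-Python sketch of that decay, using the same indexing as the loops above; the helper name is illustrative:

    def decayed_weights(n_items, full=3, base=0.5):
        # Items 0..2 get weight 1.0; item i >= 3 gets base ** (i - 2),
        # matching `1.0 * 0.5**(i-2)` in the removed set_featuresC.
        return [1.0 if i < full else base ** (i - full + 1)
                for i in range(n_items)]

    # decayed_weights(6) -> [1.0, 1.0, 1.0, 0.5, 0.25, 0.125]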