Fix parsing and tok2vec models

This commit is contained in:
Matthew Honnibal 2017-09-06 05:50:58 -05:00
parent e88a42e460
commit 24ff6b0ad9
3 changed files with 14 additions and 16 deletions

View File

@ -229,20 +229,18 @@ def drop_layer(layer, factor=2.):
def Tok2Vec(width, embed_size, preprocess=None): def Tok2Vec(width, embed_size, preprocess=None):
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}): with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower') norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm')
prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2, name='embed_prefix') prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix')
suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix') suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix')
shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2, name='embed_shape') shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape')
embed = (norm | prefix | suffix | shape ) >> LN(Maxout(width, width*4, pieces=3)) embed = (norm | prefix | suffix | shape ) >> LN(Maxout(width, width*4, pieces=3))
tok2vec = ( tok2vec = (
with_flatten( with_flatten(
asarray(Model.ops, dtype='uint64') asarray(Model.ops, dtype='uint64')
>> uniqued(embed, column=5) >> uniqued(embed, column=5)
>> drop_layer( >> Residual(
Residual(
(ExtractWindow(nW=1) >> LN(Maxout(width, width*3))) (ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
)
) ** 4, pad=4 ) ** 4, pad=4
) )
) )
@ -372,6 +370,7 @@ def fine_tune(embedding, combine=None):
"fine_tune currently only supports addition. Set combine=None") "fine_tune currently only supports addition. Set combine=None")
def fine_tune_fwd(docs_tokvecs, drop=0.): def fine_tune_fwd(docs_tokvecs, drop=0.):
docs, tokvecs = docs_tokvecs docs, tokvecs = docs_tokvecs
lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i') lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i')
vecs, bp_vecs = embedding.begin_update(docs, drop=drop) vecs, bp_vecs = embedding.begin_update(docs, drop=drop)

View File

@ -72,8 +72,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
util.env_opt('batch_compound', 1.001)) util.env_opt('batch_compound', 1.001))
if resume: if resume:
prints(output_path / 'model19.pickle', title="Resuming training") prints(output_path / 'model9.pickle', title="Resuming training")
nlp = dill.load((output_path / 'model19.pickle').open('rb')) nlp = dill.load((output_path / 'model9.pickle').open('rb'))
else: else:
nlp = lang_class(pipeline=pipeline) nlp = lang_class(pipeline=pipeline)
corpus = GoldCorpus(train_path, dev_path, limit=n_sents) corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
@ -87,7 +87,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
if resume: if resume:
i += 20 i += 20
with tqdm.tqdm(total=n_train_words, leave=False) as pbar: with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
train_docs = corpus.train_docs(nlp, projectivize=True, train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
gold_preproc=gold_preproc, max_length=0) gold_preproc=gold_preproc, max_length=0)
losses = {} losses = {}
for batch in minibatch(train_docs, size=batch_sizes): for batch in minibatch(train_docs, size=batch_sizes):

View File

@ -393,8 +393,7 @@ cdef class Parser:
tokvecs = self.model[0].ops.flatten(tokvecses) tokvecs = self.model[0].ops.flatten(tokvecses)
if USE_FINE_TUNE: if USE_FINE_TUNE:
# TODO: This is incorrect! Unhack when training next model tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
nr_state = len(docs) nr_state = len(docs)
nr_class = self.moves.n_moves nr_class = self.moves.n_moves
@ -533,7 +532,7 @@ cdef class Parser:
golds = [golds] golds = [golds]
if USE_FINE_TUNE: if USE_FINE_TUNE:
my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop) my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
tokvecs += self.model[0].ops.flatten(my_tokvecs) tokvecs = self.model[0].ops.flatten(my_tokvecs)
cuda_stream = get_cuda_stream() cuda_stream = get_cuda_stream()
@ -706,7 +705,7 @@ cdef class Parser:
lower, stream, drop=dropout) lower, stream, drop=dropout)
return state2vec, upper return state2vec, upper
nr_feature = 13 nr_feature = 8
def get_token_ids(self, states): def get_token_ids(self, states):
cdef StateClass state cdef StateClass state