From 24ff6b0ad9a5ee976cf8b8e88704136d66c0c93b Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 6 Sep 2017 05:50:58 -0500
Subject: [PATCH] Fix parsing and tok2vec models

---
 spacy/_ml.py               | 17 ++++++++---------
 spacy/cli/train.py         |  6 +++---
 spacy/syntax/nn_parser.pyx |  7 +++----
 3 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/spacy/_ml.py b/spacy/_ml.py
index e6437cdcf..003541f4b 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -229,20 +229,18 @@ def drop_layer(layer, factor=2.):
 def Tok2Vec(width, embed_size, preprocess=None):
     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
     with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
-        norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower')
-        prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2, name='embed_prefix')
-        suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix')
-        shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2, name='embed_shape')
+        norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm')
+        prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix')
+        suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix')
+        shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape')

         embed = (norm | prefix | suffix | shape ) >> LN(Maxout(width, width*4, pieces=3))
         tok2vec = (
             with_flatten(
                 asarray(Model.ops, dtype='uint64')
                 >> uniqued(embed, column=5)
-                >> drop_layer(
-                    Residual(
-                        (ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
-                    )
+                >> Residual(
+                    (ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
                 ) ** 4, pad=4
             )
         )
@@ -372,6 +370,7 @@ def fine_tune(embedding, combine=None):
             "fine_tune currently only supports addition. Set combine=None")
     def fine_tune_fwd(docs_tokvecs, drop=0.):
         docs, tokvecs = docs_tokvecs
+        lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i')

         vecs, bp_vecs = embedding.begin_update(docs, drop=drop)
@@ -556,7 +555,7 @@ def build_text_classifier(nr_class, width=64, **cfg):

     cnn_model = (
         # TODO Make concatenate support lists
-        concatenate_lists(trained_vectors, static_vectors)
+        concatenate_lists(trained_vectors, static_vectors)
         >> with_flatten(
             LN(Maxout(width, width*2))
             >> Residual(
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index ddec2c069..a22db6abc 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -72,8 +72,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                                    util.env_opt('batch_compound', 1.001))

     if resume:
-        prints(output_path / 'model19.pickle', title="Resuming training")
-        nlp = dill.load((output_path / 'model19.pickle').open('rb'))
+        prints(output_path / 'model9.pickle', title="Resuming training")
+        nlp = dill.load((output_path / 'model9.pickle').open('rb'))
     else:
         nlp = lang_class(pipeline=pipeline)
     corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
@@ -87,7 +87,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
         if resume:
             i += 20
         with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
-            train_docs = corpus.train_docs(nlp, projectivize=True,
+            train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
                                            gold_preproc=gold_preproc, max_length=0)
             losses = {}
             for batch in minibatch(train_docs, size=batch_sizes):
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 34e504da9..1c4107c06 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -393,8 +393,7 @@ cdef class Parser:
         tokvecs = self.model[0].ops.flatten(tokvecses)
         if USE_FINE_TUNE:
-            # TODO: This is incorrect! Unhack when training next model
-            tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
+            tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))

         nr_state = len(docs)
         nr_class = self.moves.n_moves
@@ -533,7 +532,7 @@ cdef class Parser:
             golds = [golds]
         if USE_FINE_TUNE:
             my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
-            tokvecs += self.model[0].ops.flatten(my_tokvecs)
+            tokvecs = self.model[0].ops.flatten(my_tokvecs)

         cuda_stream = get_cuda_stream()
@@ -706,7 +705,7 @@ cdef class Parser:
                                       lower, stream, drop=dropout)
         return state2vec, upper

-    nr_feature = 13
+    nr_feature = 8

     def get_token_ids(self, states):
         cdef StateClass state