Fix parsing and tok2vec models

2026-02-02 05:26:01 +03:00 · 2017-09-06 05:50:58 -05:00 · 2017-09-06 05:50:58 -05:00 · 24ff6b0ad9
commit 24ff6b0ad9
parent e88a42e460
3 changed files with 14 additions and 16 deletions
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -229,20 +229,18 @@ def drop_layer(layer, factor=2.):
 def Tok2Vec(width, embed_size, preprocess=None):
    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
    with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
-        norm = get_col(cols.index(NORM))     >> HashEmbed(width, embed_size, name='embed_lower')
-        prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2, name='embed_prefix')
-        suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix')
-        shape = get_col(cols.index(SHAPE))   >> HashEmbed(width, embed_size//2, name='embed_shape')
+        norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm')
+        prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix')
+        suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix')
+        shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape')

        embed = (norm | prefix | suffix | shape ) >> LN(Maxout(width, width*4, pieces=3))
        tok2vec = (
            with_flatten(
                asarray(Model.ops, dtype='uint64')
                >> uniqued(embed, column=5)
-                >> drop_layer(
-                    Residual(
-                        (ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
-                    )
+                >> Residual(
+                    (ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
                ) ** 4, pad=4
            )
        )
@@ -372,6 +370,7 @@ def fine_tune(embedding, combine=None):
            "fine_tune currently only supports addition. Set combine=None")
    def fine_tune_fwd(docs_tokvecs, drop=0.):
        docs, tokvecs = docs_tokvecs
+
        lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i')

        vecs, bp_vecs = embedding.begin_update(docs, drop=drop)
@@ -556,7 +555,7 @@ def build_text_classifier(nr_class, width=64, **cfg):

        cnn_model = (
            # TODO Make concatenate support lists
-            concatenate_lists(trained_vectors, static_vectors) 
+            concatenate_lists(trained_vectors, static_vectors)
            >> with_flatten(
                LN(Maxout(width, width*2))
                >> Residual(
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -72,8 +72,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                                   util.env_opt('batch_compound', 1.001))

    if resume:
-        prints(output_path / 'model19.pickle', title="Resuming training")
-        nlp = dill.load((output_path / 'model19.pickle').open('rb'))
+        prints(output_path / 'model9.pickle', title="Resuming training")
+        nlp = dill.load((output_path / 'model9.pickle').open('rb'))
    else:
        nlp = lang_class(pipeline=pipeline)
    corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
@@ -87,7 +87,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
            if resume:
                i += 20
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
-                train_docs = corpus.train_docs(nlp, projectivize=True,
+                train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
                                               gold_preproc=gold_preproc, max_length=0)
                losses = {}
                for batch in minibatch(train_docs, size=batch_sizes):
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -393,8 +393,7 @@ cdef class Parser:

        tokvecs = self.model[0].ops.flatten(tokvecses)
        if USE_FINE_TUNE:
-            # TODO: This is incorrect! Unhack when training next model
-            tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
+            tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))

        nr_state = len(docs)
        nr_class = self.moves.n_moves
@@ -533,7 +532,7 @@ cdef class Parser:
            golds = [golds]
        if USE_FINE_TUNE:
            my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
-            tokvecs += self.model[0].ops.flatten(my_tokvecs)
+            tokvecs = self.model[0].ops.flatten(my_tokvecs)

        cuda_stream = get_cuda_stream()

@@ -706,7 +705,7 @@ cdef class Parser:
                        lower, stream, drop=dropout)
        return state2vec, upper

-    nr_feature = 13
+    nr_feature = 8

    def get_token_ids(self, states):
        cdef StateClass state