Get pre-computed version working

2025-10-31 16:07:41 +03:00 · 2017-05-08 00:38:35 +02:00 · 2017-05-08 00:38:35 +02:00 · 10682d35ab
commit 10682d35ab
parent 35458987e8
4 changed files with 106 additions and 91 deletions
--- a/bin/parser/train_ud.py
+++ b/bin/parser/train_ud.py
@@ -144,7 +144,6 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
        docs = list(Xs)
        for doc in docs:
            encoder(doc)
        parser.begin_training(docs, ys)
        nn_loss = [0.]
        def track_progress():
            scorer = score_model(vocab, encoder, tagger, parser, dev_Xs, dev_ys)
@@ -153,7 +152,7 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
            nn_loss.append(0.)
        trainer.each_epoch.append(track_progress)
        trainer.batch_size = 12
-        trainer.nb_epoch = 2
+        trainer.nb_epoch = 20
        for docs, golds in trainer.iterate(Xs, ys, progress_bar=False):
            docs = [Doc(vocab, words=[w.text for w in doc]) for doc in docs]
            tokvecs, upd_tokvecs = encoder.begin_update(docs)
@@ -161,9 +160,9 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
                doc.tensor = tokvec
            for doc, gold in zip(docs, golds):
                tagger.update(doc, gold)
-            d_tokvecs, loss = parser.update(docs, golds, sgd=optimizer)
+            d_tokvecs = parser.update(docs, golds, sgd=optimizer)
            upd_tokvecs(d_tokvecs, sgd=optimizer)
-            nn_loss[-1] += loss
+            #nn_loss[-1] += loss
    nlp = LangClass(vocab=vocab, tagger=tagger, parser=parser)
    #nlp.end_training(model_dir)
    #scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
@@ -173,7 +172,7 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
 if __name__ == '__main__':
    import cProfile
    import pstats
-    if 0:
+    if 1:
        plac.call(main)
    else:
        cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -51,47 +51,6 @@ def doc2feats(cols):
    model = layerize(forward)
    return model
 def build_feature_precomputer(model, feat_maps):
    '''Allow a model to be "primed" by pre-computing input features in bulk.
    This is used for the parser, where we want to take a batch of documents,
    and compute vectors for each (token, position) pair. These vectors can then
    be reused, especially for beam-search.
    Let's say we're using 12 features for each state, e.g. word at start of
    buffer, three words on stack, their children, etc. In the normal arc-eager
    system, a document of length N is processed in 2*N states. This means we'll
    create 2*N*12 feature vectors --- but if we pre-compute, we only need
    N*12 vector computations. The saving for beam-search is much better:
    if we have a beam of k, we'll normally make 2*N*12*K computations -- 
    so we can save the factor k. This also gives a nice CPU/GPU division:
    we can do all our hard maths up front, packed into large multiplications,
    and do the hard-to-program parsing on the CPU.

    model: the layer chained after the feature-summing layer.
    feat_maps: one layer per feature position; each is run once over the
        input vectors via begin_update().
    Returns the `precompute` closure, which takes the input vectors and
    returns the chained model.
    '''
    def precompute(input_vectors):
        # Run every feature map once over the whole input, keeping each
        # map's output (cached[j]) and its backprop callback (backprops[j]).
        # NOTE(review): the list comprehension below is never closed with
        # ']', so this statement does not parse as written.
        cached, backprops = zip(*[lyr.begin_update(input_vectors)
                                for lyr in feat_maps)
        def forward(batch_token_ids, drop=0.):
            # NOTE(review): batch_size and output_width are not defined in
            # this function -- presumably meant to be derived from the
            # inputs; confirm. `ops` also comes from outside this block.
            output = ops.allocate((batch_size, output_width))
            # i: batch index
            # j: position index (i.e. N0, S0, etc.)
            # tok_i: Index of the token within its document
            for i, token_ids in enumerate(batch_token_ids):
                for j, tok_i in enumerate(token_ids):
                    # Sum the pre-computed vector for each (position, token)
                    # pair into row i of the output.
                    output[i] += cached[j][tok_i]
            def backward(d_vector, sgd=None):
                # NOTE(review): n_feat and vec_width are not defined in this
                # function either -- confirm where they should come from.
                d_inputs = ops.allocate((batch_size, n_feat, vec_width))
                for i, token_ids in enumerate(batch_token_ids):
                    for j in range(len(token_ids)):
                        # The same d_vector is passed to every position's
                        # backprop callback, for every batch row.
                        d_inputs[i][j] = backprops[j](d_vector, sgd)
                # Return the IDs, so caller can associate to correct token
                return (batch_token_ids, d_inputs)
            # NOTE(review): `vector` is not defined in this function;
            # `output` looks like the intended return value -- confirm.
            return vector, backward
        return chain(layerize(forward), model)
    return precompute
 def print_shape(prefix):
    def forward(X, drop=0.):
        return X, lambda dX, **kwargs: dX
@@ -114,3 +73,47 @@ def flatten(seqs, drop=0.):
        return d_X
    X = ops.xp.concatenate([ops.asarray(seq) for seq in seqs])
    return X, finish_update
 #def build_feature_precomputer(model, feat_maps):
 #    '''Allow a model to be "primed" by pre-computing input features in bulk.
 #
 #    This is used for the parser, where we want to take a batch of documents,
 #    and compute vectors for each (token, position) pair. These vectors can then
 #    be reused, especially for beam-search.
 #
 #    Let's say we're using 12 features for each state, e.g. word at start of
 #    buffer, three words on stack, their children, etc. In the normal arc-eager
 #    system, a document of length N is processed in 2*N states. This means we'll
 #    create 2*N*12 feature vectors --- but if we pre-compute, we only need
 #    N*12 vector computations. The saving for beam-search is much better:
 #    if we have a beam of k, we'll normally make 2*N*12*K computations -- 
 #    so we can save the factor k. This also gives a nice CPU/GPU division:
 #    we can do all our hard maths up front, packed into large multiplications,
 #    and do the hard-to-program parsing on the CPU.
 #    '''
 #    def precompute(input_vectors):
 #        cached, backprops = zip(*[lyr.begin_update(input_vectors)
 #                                for lyr in feat_maps)
 #        def forward(batch_token_ids, drop=0.):
 #            output = ops.allocate((batch_size, output_width))
 #            # i: batch index
 #            # j: position index (i.e. N0, S0, etc.)
 #            # tok_i: Index of the token within its document
 #            for i, token_ids in enumerate(batch_token_ids):
 #                for j, tok_i in enumerate(token_ids):
 #                    output[i] += cached[j][tok_i]
 #            def backward(d_vector, sgd=None):
 #                d_inputs = ops.allocate((batch_size, n_feat, vec_width))
 #                for i, token_ids in enumerate(batch_token_ids):
 #                    for j in range(len(token_ids)):
 #                        d_inputs[i][j] = backprops[j](d_vector, sgd)
 #                # Return the IDs, so caller can associate to correct token
 #                return (batch_token_ids, d_inputs)
 #            return vector, backward
 #        return chain(layerize(forward), model)
 #    return precompute
 #
 #
--- a/spacy/syntax/parser.pxd
+++ b/spacy/syntax/parser.pxd
@@ -13,5 +13,6 @@ cdef class Parser:
    cdef readonly object model
    cdef readonly TransitionSystem moves
    cdef readonly object cfg
    cdef public object feature_maps
    #cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@@ -28,10 +28,11 @@ from murmurhash.mrmr cimport hash64
 from preshed.maps cimport MapStruct
 from preshed.maps cimport map_get
 from thinc.api import layerize
-from numpy import exp
+from thinc.api import layerize, chain
 from thinc.neural import Model, Maxout
 from .._ml import get_col
 from . import _parse_features
 from ._parse_features cimport CONTEXT_SIZE
 from ._parse_features cimport fill_context
@@ -46,8 +47,9 @@ from ..strings cimport StringStore
 from ..gold cimport GoldParse
 from ..attrs cimport TAG, DEP
 from .._ml import build_state2vec, build_model, precompute_hiddens
 def get_templates(*args, **kwargs):
    """Return an empty template list; all arguments are accepted and ignored."""
    return []
 USE_FTRL = True
 DEBUG = False
@@ -56,30 +58,39 @@ def set_debug(val):
    DEBUG = val
-def get_greedy_model_for_batch(tokvecs, TransitionSystem moves, feat_maps, upper_model):
+def get_greedy_model_for_batch(tokvecs, TransitionSystem moves, upper_model, feat_maps):
    cdef int[:, :] is_valid_
    cdef float[:, :] costs_
-    cdef int[:, :] token_ids
+    lengths = [len(t) for t in tokvecs]
    tokvecs = upper_model.ops.flatten(tokvecs)
    is_valid = upper_model.ops.allocate((len(tokvecs), moves.n_moves), dtype='i')
    costs = upper_model.ops.allocate((len(tokvecs), moves.n_moves), dtype='f')
-    token_ids = upper_model.ops.allocate((len(tokvecs), StateClass.nr_context_tokens()),
+    token_ids = upper_model.ops.allocate((len(tokvecs), len(feat_maps)), dtype='i')
                                         dtype='uint64')
    cached, backprops = zip(*[lyr.begin_update(tokvecs) for lyr in feat_maps])
    is_valid_ = is_valid
    costs_ = costs
-    def forward(states, drop=0.):
+    def forward(states_offsets, drop=0.):
        nonlocal is_valid, costs, token_ids, moves
        states, offsets = states_offsets
        is_valid = is_valid[:len(states)]
        costs = costs[:len(states)]
        token_ids = token_ids[:len(states)]
        is_valid = is_valid[:len(states)]
        cdef StateClass state
-        for i, state in enumerate(states):
+        cdef int i
        for i, (offset, state) in enumerate(zip(offsets, states)):
            state.set_context_tokens(token_ids[i])
            moves.set_valid(&is_valid_[i, 0], state.c)
-
+        adjusted_ids = token_ids.copy()
-        features = cached[token_ids].sum(axis=1)
+        for i, offset in enumerate(offsets):
            adjusted_ids[i] *= token_ids[i] >= 0
            adjusted_ids[i] += offset
        features = upper_model.ops.allocate((len(states), 64), dtype='f')
        for i in range(len(states)):
            for j, tok_i in enumerate(adjusted_ids[i]):
                if tok_i >= 0:
                    features[i] += cached[j][tok_i]
        scores, bp_scores = upper_model.begin_update(features, drop=drop)
        softmaxed = upper_model.ops.softmax(scores)
@@ -89,15 +100,16 @@ def get_greedy_model_for_batch(tokvecs, TransitionSystem moves, feat_maps, upper
        def backward(golds, sgd=None):
            nonlocal costs_, is_valid_, moves
            cdef int i
            for i, (state, gold) in enumerate(zip(states, golds)):
                moves.set_costs(&is_valid_[i, 0], &costs_[i, 0],
                    state, gold)
            d_scores = scores.copy()
            d_scores.fill(0)
            set_log_loss(upper_model.ops, d_scores,
-                scores, is_valid_, costs_)
+                scores, is_valid, costs)
            d_tokens = bp_scores(d_scores, sgd)
-            return d_tokens
+            return (token_ids, d_tokens)
        return softmaxed, backward
@@ -127,14 +139,18 @@ def transition_batch(TransitionSystem moves, states, scores):
 def init_states(TransitionSystem moves, docs):
    states = []
    cdef Doc doc
    cdef StateClass state
    offsets = []
    states = []
    offset = 0
    for i, doc in enumerate(docs):
        state = StateClass.init(doc.c, doc.length)
        moves.initialize_state(state.c)
        states.append(state)
-    return states
+        offsets.append(offset)
        offset += len(doc)
    return states, offsets
 cdef class Parser:
@@ -184,18 +200,22 @@ cdef class Parser:
        cfg['actions'] = TransitionSystem.get_actions(**cfg)
        self.moves = TransitionSystem(vocab.strings, cfg['actions'])
        if model is None:
-            model = self.build_model(**cfg)
+            self.model, self.feature_maps = self.build_model(**cfg)
-        self.model = model
+        else:
            self.model, self.feature_maps = model
        self.cfg = cfg
    def __reduce__(self):
        return (Parser, (self.vocab, self.moves, self.model), None, None)
-    def build_model(self, width=32, nr_vector=1000, nF=1, nB=1, nS=1, nL=1, nR=1, **_):
+    def build_model(self, width=64, nr_vector=1000, nF=1, nB=1, nS=1, nL=1, nR=1, **_):
        nr_context_tokens = StateClass.nr_context_tokens(nF, nB, nS, nL, nR)
-        self.model = build_model(width*2, 2, self.moves.n_moves)
+
        model = chain(Maxout(width, width), Maxout(self.moves.n_moves, width))
        # TODO
-        self.feature_maps = [] #build_feature_maps(nr_context_tokens, width, nr_vector)
+        feature_maps = [Maxout(width, width)
                        for i in range(nr_context_tokens)]
        return model, feature_maps
    def __call__(self, Doc tokens):
        """
@@ -245,19 +265,21 @@ cdef class Parser:
        cdef Doc doc
        cdef StateClass state
        model = get_greedy_model_for_batch([d.tensor for d in docs],
-                    self.moves, self.model, self.feat_maps)
+                    self.moves, self.model, self.feature_maps)
-        states = [StateClass.init(doc.c, doc.length) for doc in docs]
+        states, offsets = init_states(self.moves, docs)
-        todo = list(states)
+        all_states = list(states)
        todo = list(zip(states, offsets))
        while todo:
-            scores = model(todo)
+            states, offsets = zip(*todo)
-            transition_batch(self.moves, todo, scores)
+            scores = model((states, offsets))
-            todo = [st for st in states if not st.is_final()]
+            transition_batch(self.moves, states, scores)
-        for state, doc in zip(states, docs):
+            todo = [st for st in todo if not st[0].py_is_final()]
        for state, doc in zip(all_states, docs):
            self.moves.finalize_state(state.c)
            for i in range(doc.length):
                doc.c[i] = state.c._sent[i]
        for doc in docs:
-            self.moves.finalize_parse(doc)
+            self.moves.finalize_doc(doc)
    def update(self, docs, golds, drop=0., sgd=None):
        if isinstance(docs, Doc) and isinstance(golds, GoldParse):
@@ -266,33 +288,23 @@ cdef class Parser:
            self.moves.preprocess_gold(gold)
        model = get_greedy_model_for_batch([d.tensor for d in docs],
-                    self.moves, self.model, self.feat_maps)
+                    self.moves, self.model, self.feature_maps)
-        states = init_states(self.moves, docs)
+        states, offsets = init_states(self.moves, docs)
        d_tokens = [self.model.ops.allocate(d.tensor.shape) for d in docs]
        output = list(d_tokens)
-        todo = zip(states, golds, d_tokens)
+        todo = zip(states, offsets, golds, d_tokens)
        while todo:
-            states, golds, d_tokens = zip(*todo)
+            states, offsets, golds, d_tokens = zip(*todo)
-            scores, finish_update = model.begin_update(token_ids)
+            scores, finish_update = model.begin_update((states, offsets))
-            d_state_features = finish_update(golds, sgd=sgd)
+            (token_ids, d_state_features) = finish_update(golds, sgd=sgd)
            for i, token_ids in enumerate(token_ids):
                d_tokens[i][token_ids] += d_state_features[i]
-            transition_batch(self.moves, states)
+            transition_batch(self.moves, states, scores)
            # Get unfinished states (and their matching gold and token gradients)
            todo = filter(lambda sp: not sp[0].py_is_final(), todo)
        return output
    def begin_training(self, docs, golds):
        """Prepare the parser's model for training on a batch of documents.

        Each gold parse is passed through the transition system's
        preprocess_gold(). Features are then built from the states returned
        by self._init_states(docs) and from each document's `tensor`
        attribute, and handed to self.model.begin_training().

        docs: iterable of documents; each must expose a `tensor` attribute.
        golds: iterable of gold-standard parses, one per document
            (presumably GoldParse objects -- confirm against callers).
        """
        for gold in golds:
            self.moves.preprocess_gold(gold)
        # NOTE(review): _init_states and _get_features are not defined in the
        # visible part of this file -- confirm they still exist on Parser.
        states = self._init_states(docs)
        tokvecs = [d.tensor for d in docs]
        features = self._get_features(states, tokvecs)
        self.model.begin_training(features)
    def step_through(self, Doc doc, GoldParse gold=None):
        """
        Set up a stepwise state, to introspect and control the transition sequence.