Merge pull request #1392 from explosion/feature/parser-history-model

💫 Parser history features
2025-12-14 05:34:16 +03:00 · 2017-10-07 15:07:02 +02:00 · 2017-10-07 15:07:02 +02:00 · eb0595bea9
commit eb0595bea9
parent 36c68015f3 d70cf19158
7 changed files with 188 additions and 52 deletions
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@ -32,7 +32,7 @@ import io
 # TODO: Unset this once we don't want to support models previous models.
 import thinc.neural._classes.layernorm
-thinc.neural._classes.layernorm.set_compat_six_eight(True)
+thinc.neural._classes.layernorm.set_compat_six_eight(False)
 VECTORS_KEY = 'spacy_pretrained_vectors'
@ -213,6 +213,72 @@ class PrecomputableMaxouts(Model):
            return dXf
        return Yfp, backward
 # Thinc's Embed class is a bit broken atm, so drop this here.
 from thinc import describe
 from thinc.neural._classes.embed import _uniform_init
@describe.attributes(
    nV=describe.Dimension("Number of vectors"),
    nO=describe.Dimension("Size of output"),
    vectors=describe.Weights("Embedding table",
        lambda obj: (obj.nV, obj.nO),
        _uniform_init(-0.1, 0.1)
    ),
    d_vectors=describe.Gradient("vectors")
 )
 class Embed(Model):
    name = 'embed'
    def __init__(self, nO, nV=None, **kwargs):
        if nV is not None:
            nV += 1
        Model.__init__(self, **kwargs)
        if 'name' in kwargs:
            self.name = kwargs['name']
        self.column = kwargs.get('column', 0)
        self.nO = nO
        self.nV = nV
    def predict(self, ids):
        if ids.ndim == 2:
            ids = ids[:, self.column]
        return self.ops.xp.ascontiguousarray(self.vectors[ids], dtype='f')
    def begin_update(self, ids, drop=0.):
        if ids.ndim == 2:
            ids = ids[:, self.column]
        vectors = self.ops.xp.ascontiguousarray(self.vectors[ids], dtype='f')
        def backprop_embed(d_vectors, sgd=None):
            n_vectors = d_vectors.shape[0]
            self.ops.scatter_add(self.d_vectors, ids, d_vectors)
            if sgd is not None:
                sgd(self._mem.weights, self._mem.gradient, key=self.id)
            return None
        return vectors, backprop_embed
 def HistoryFeatures(nr_class, hist_size=8, nr_dim=8):
    '''Wrap a model, adding features representing action history.'''
    if hist_size == 0:
        return layerize(noop())
    embed_tables = [Embed(nr_dim, nr_class, column=i, name='embed%d')
                    for i in range(hist_size)]
    embed = concatenate(*embed_tables)
    ops = embed.ops
    def add_history_fwd(vectors_hists, drop=0.):
        vectors, hist_ids = vectors_hists
        hist_feats, bp_hists = embed.begin_update(hist_ids, drop=drop)
        outputs = ops.xp.hstack((vectors, hist_feats))
        def add_history_bwd(d_outputs, sgd=None):
            d_vectors = d_outputs[:, :vectors.shape[1]]
            d_hists = d_outputs[:, vectors.shape[1]:]
            bp_hists(d_hists, sgd=sgd)
            return embed.ops.xp.ascontiguousarray(d_vectors)
        return outputs, add_history_bwd
    return wrap(add_history_fwd, embed)
 def drop_layer(layer, factor=2.):
    def drop_layer_fwd(X, drop=0.):
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@ -42,7 +42,8 @@ def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
    Evaluate a model. To render a sample of parses in a HTML file, set an output
    directory as the displacy_path argument.
    """
-    util.use_gpu(gpu_id)
+    if gpu_id >= 0:
        util.use_gpu(gpu_id)
    util.set_env_log(False)
    data_path = util.ensure_path(data_path)
    displacy_path = util.ensure_path(displacy_path)
--- a/spacy/syntax/_beam_utils.pyx
+++ b/spacy/syntax/_beam_utils.pyx
@ -21,6 +21,7 @@ cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves)
    moves = <const Transition*>_moves
    dest.clone(src)
    moves[clas].do(dest.c, moves[clas].label)
    dest.c.push_hist(clas)
 cdef int _check_final_state(void* _state, void* extra_args) except -1:
@ -148,8 +149,8 @@ def get_token_ids(states, int n_tokens):
 nr_update = 0
 def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
                states, golds,
-                state2vec, vec2scores, 
+                state2vec, vec2scores,
-                int width, float density,
+                int width, float density, int hist_feats,
                losses=None, drop=0.):
    global nr_update
    cdef MaxViolation violn
@ -180,7 +181,11 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
        # Now that we have our flat list of states, feed them through the model
        token_ids = get_token_ids(states, nr_feature)
        vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop)
-        scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)
+        if hist_feats:
            hists = numpy.asarray([st.history[:hist_feats] for st in states], dtype='i')
            scores, bp_scores = vec2scores.begin_update((vectors, hists), drop=drop)
        else:
            scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)
        # Store the callbacks for the backward pass
        backprops.append((token_ids, bp_vectors, bp_scores))
--- a/spacy/syntax/_state.pxd
+++ b/spacy/syntax/_state.pxd
@ -1,4 +1,4 @@
-from libc.string cimport memcpy, memset
+from libc.string cimport memcpy, memset, memmove
 from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint32_t, uint64_t
@ -15,6 +15,23 @@ from ..typedefs cimport attr_t
 cdef inline bint is_space_token(const TokenC* token) nogil:
    return Lexeme.c_check_flag(token.lex, IS_SPACE)
 cdef struct RingBufferC:
    int[8] data
    int i
    int default
 cdef inline int ring_push(RingBufferC* ring, int value) nogil:
    ring.data[ring.i] = value
    ring.i += 1
    if ring.i >= 8:
        ring.i = 0
 cdef inline int ring_get(RingBufferC* ring, int i) nogil:
    if i >= ring.i:
        return ring.default
    else:
        return ring.data[ring.i-i]
 cdef cppclass StateC:
    int* _stack
@ -23,6 +40,7 @@ cdef cppclass StateC:
    TokenC* _sent
    Entity* _ents
    TokenC _empty_token
    RingBufferC _hist
    int length
    int offset
    int _s_i
@ -37,6 +55,7 @@ cdef cppclass StateC:
        this.shifted = <bint*>calloc(length + (PADDING * 2), sizeof(bint))
        this._sent = <TokenC*>calloc(length + (PADDING * 2), sizeof(TokenC))
        this._ents = <Entity*>calloc(length + (PADDING * 2), sizeof(Entity))
        memset(&this._hist, 0, sizeof(this._hist))
        this.offset = 0
        cdef int i
        for i in range(length + (PADDING * 2)):
@ -74,6 +93,9 @@ cdef cppclass StateC:
        free(this.shifted - PADDING)
    void set_context_tokens(int* ids, int n) nogil:
        if n == 2:
            ids[0] = this.B(0)
            ids[1] = this.S(0)
        if n == 8:
            ids[0] = this.B(0)
            ids[1] = this.B(1)
@ -271,7 +293,14 @@ cdef cppclass StateC:
        sig[8] = this.B_(0)[0]
        sig[9] = this.E_(0)[0]
        sig[10] = this.E_(1)[0]
-        return hash64(sig, sizeof(sig), this._s_i)
+        return hash64(sig, sizeof(sig), this._s_i) \
             + hash64(<void*>&this._hist, sizeof(RingBufferC), 1)
    void push_hist(int act) nogil:
        ring_push(&this._hist, act+1)
    int get_hist(int i) nogil:
        return ring_get(&this._hist, i)
    void push() nogil:
        if this.B(0) != -1:
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@ -50,6 +50,7 @@ from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
 from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune
 from .._ml import Residual, drop_layer, flatten
 from .._ml import link_vectors_to_models
 from .._ml import HistoryFeatures
 from ..compat import json_dumps
 from . import _parse_features
@ -67,12 +68,10 @@ from ..gold cimport GoldParse
 from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG
 from . import _beam_utils
 USE_FINE_TUNE = True
 def get_templates(*args, **kwargs):
    return []
 USE_FTRL = True
 DEBUG = False
 def set_debug(val):
    global DEBUG
@ -239,12 +238,17 @@ cdef class Parser:
    Base class of the DependencyParser and EntityRecognizer.
    """
    @classmethod
-    def Model(cls, nr_class, token_vector_width=128, hidden_width=200, depth=1, **cfg):
+    def Model(cls, nr_class, **cfg):
-        depth = util.env_opt('parser_hidden_depth', depth)
+        depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 2))
-        token_vector_width = util.env_opt('token_vector_width', token_vector_width)
+        token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 128))
-        hidden_width = util.env_opt('hidden_width', hidden_width)
+        hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 128))
-        parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2)
+        parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 1))
-        embed_size = util.env_opt('embed_size', 7000)
+        embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000))
        hist_size = util.env_opt('history_feats', cfg.get('hist_size', 4))
        hist_width = util.env_opt('history_width', cfg.get('hist_width', 16))
        if hist_size >= 1 and depth == 0:
            raise ValueError("Inconsistent hyper-params: "
                "history_feats >= 1 but parser_hidden_depth==0")
        tok2vec = Tok2Vec(token_vector_width, embed_size,
                          pretrained_dims=cfg.get('pretrained_dims', 0))
        tok2vec = chain(tok2vec, flatten)
@ -262,22 +266,40 @@ cdef class Parser:
            if depth == 0:
                upper = chain()
                upper.is_noop = True
-            else:
+            elif hist_size and depth == 1:
                upper = chain(
-                    clone(Maxout(hidden_width), depth-1),
+                    HistoryFeatures(nr_class=nr_class, hist_size=hist_size,
                                    nr_dim=hist_width),
                    zero_init(Affine(nr_class, hidden_width+hist_size*hist_width,
                                     drop_factor=0.0)))
                upper.is_noop = False
            elif hist_size:
                upper = chain(
                    HistoryFeatures(nr_class=nr_class, hist_size=hist_size,
                                    nr_dim=hist_width),
                    LayerNorm(Maxout(hidden_width, hidden_width+hist_size*hist_width)),
                    clone(LayerNorm(Maxout(hidden_width, hidden_width)), depth-2),
                    zero_init(Affine(nr_class, hidden_width, drop_factor=0.0))
                )
                upper.is_noop = False
            else:
                upper = chain(
                    clone(LayerNorm(Maxout(hidden_width, hidden_width)), depth-1),
                    zero_init(Affine(nr_class, hidden_width, drop_factor=0.0))
                )
                upper.is_noop = False
        # TODO: This is an unfortunate hack atm!
        # Used to set input dimensions in network.
        lower.begin_training(lower.ops.allocate((500, token_vector_width)))
        upper.begin_training(upper.ops.allocate((500, hidden_width)))
        cfg = {
            'nr_class': nr_class,
-            'depth': depth,
+            'hidden_depth': depth,
            'token_vector_width': token_vector_width,
            'hidden_width': hidden_width,
-            'maxout_pieces': parser_maxout_pieces
+            'maxout_pieces': parser_maxout_pieces,
            'hist_size': hist_size,
            'hist_width': hist_width
        }
        return (tok2vec, lower, upper), cfg
@ -350,7 +372,7 @@ cdef class Parser:
            _cleanup(beam)
            return output
-    def pipe(self, docs, int batch_size=1000, int n_threads=2,
+    def pipe(self, docs, int batch_size=256, int n_threads=2,
             beam_width=None, beam_density=None):
        """
        Process a stream of documents.
@ -427,12 +449,18 @@ cdef class Parser:
                    self._parse_step(next_step[i],
                        feat_weights, nr_class, nr_feat, nr_piece)
            else:
                hists = []
                for i in range(nr_step):
                    st = next_step[i]
                    st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
                    self.moves.set_valid(&c_is_valid[i*nr_class], st)
                    hists.append([st.get_hist(j+1) for j in range(8)])
                hists = numpy.asarray(hists)
                vectors = state2vec(token_ids[:next_step.size()])
-                scores = vec2scores(vectors)
+                if self.cfg.get('hist_size'):
                    scores = vec2scores((vectors, hists))
                else:
                    scores = vec2scores(vectors)
                c_scores = <float*>scores.data
                for i in range(nr_step):
                    st = next_step[i]
@ -440,6 +468,7 @@ cdef class Parser:
                        &c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class)
                    action = self.moves.c[guess]
                    action.do(st, action.label)
                    st.push_hist(guess)
            this_step, next_step = next_step, this_step
            next_step.clear()
            for st in this_step:
@ -478,7 +507,12 @@ cdef class Parser:
                        states.append(stcls)
                token_ids = self.get_token_ids(states)
                vectors = state2vec(token_ids)
-                scores = vec2scores(vectors)
+                if self.cfg.get('hist_size', 0):
                    hists = numpy.asarray([st.history[:self.cfg['hist_size']]
                                           for st in states], dtype='i')
                    scores = vec2scores((vectors, hists))
                else:
                    scores = vec2scores(vectors)
                j = 0
                c_scores = <float*>scores.data
                for i in range(beam.size):
@ -497,8 +531,6 @@ cdef class Parser:
            const float* feat_weights,
            int nr_class, int nr_feat, int nr_piece) nogil:
        '''This only works with no hidden layers -- fast but inaccurate'''
        #for i in cython.parallel.prange(next_step.size(), num_threads=4, nogil=True):
        #    self._parse_step(next_step[i], feat_weights, nr_class, nr_feat)
        token_ids = <int*>calloc(nr_feat, sizeof(int))
        scores = <float*>calloc(nr_class * nr_piece, sizeof(float))
        is_valid = <int*>calloc(nr_class, sizeof(int))
@ -510,6 +542,7 @@ cdef class Parser:
        guess = arg_maxout_if_valid(scores, is_valid, nr_class, nr_piece)
        action = self.moves.c[guess]
        action.do(state, action.label)
        state.push_hist(guess)
        free(is_valid)
        free(scores)
@ -550,7 +583,11 @@ cdef class Parser:
            if drop != 0:
                mask = vec2scores.ops.get_dropout_mask(vector.shape, drop)
                vector *= mask
-            scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
+            hists = numpy.asarray([st.history for st in states], dtype='i')
            if self.cfg.get('hist_size', 0):
                scores, bp_scores = vec2scores.begin_update((vector, hists), drop=drop)
            else:
                scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
            d_scores = self.get_batch_loss(states, golds, scores)
            d_scores /= len(docs)
@ -569,7 +606,8 @@ cdef class Parser:
            else:
                backprops.append((token_ids, d_vector, bp_vector))
            self.transition_batch(states, scores)
-            todo = [st for st in todo if not st[0].is_final()]
+            todo = [(st, gold) for (st, gold) in todo
                    if not st.is_final()]
            if losses is not None:
                losses[self.name] += (d_scores**2).sum()
            n_steps += 1
@ -602,7 +640,7 @@ cdef class Parser:
        states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500,
                                        states, golds,
                                        state2vec, vec2scores,
-                                        width, density,
+                                        width, density, self.cfg.get('hist_size', 0),
                                        drop=drop, losses=losses)
        backprop_lower = []
        cdef float batch_size = len(docs)
@ -648,6 +686,7 @@ cdef class Parser:
                while state.B(0) < start and not state.is_final():
                    action = self.moves.c[oracle_actions.pop(0)]
                    action.do(state.c, action.label)
                    state.c.push_hist(action.clas)
                    n_moves += 1
                has_gold = self.moves.has_gold(gold, start=start,
                                               end=start+max_length)
@ -711,6 +750,7 @@ cdef class Parser:
            action = self.moves.c[guess]
            action.do(state.c, action.label)
            c_scores += scores.shape[1]
            state.c.push_hist(guess)
    def get_batch_loss(self, states, golds, float[:, ::1] scores):
        cdef StateClass state
@ -934,6 +974,7 @@ cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves)
    moves = <const Transition*>_moves
    dest.clone(src)
    moves[clas].do(dest.c, moves[clas].label)
    dest.c.push_hist(clas)
 cdef int _check_final_state(void* _state, void* extra_args) except -1:
--- a/spacy/syntax/stateclass.pyx
+++ b/spacy/syntax/stateclass.pyx
@ -4,6 +4,7 @@ from __future__ import unicode_literals
 from libc.string cimport memcpy, memset
 from libc.stdint cimport uint32_t, uint64_t
 import numpy
 from ..vocab cimport EMPTY_LEXEME
 from ..structs cimport Entity
@ -38,6 +39,13 @@ cdef class StateClass:
    def token_vector_lenth(self):
        return self.doc.tensor.shape[1]
    @property
    def history(self):
        hist = numpy.ndarray((8,), dtype='i')
        for i in range(8):
            hist[i] = self.c.get_hist(i+1)
        return hist
    def is_final(self):
        return self.c.is_final()
@ -54,27 +62,3 @@ cdef class StateClass:
        n0 = words[self.B(0)]
        n1 = words[self.B(1)]
        return ' '.join((third, second, top, '|', n0, n1))
    @classmethod
    def nr_context_tokens(cls):
        return 13
    def set_context_tokens(self, int[::1] output):
        output[0] = self.B(0)
        output[1] = self.B(1)
        output[2] = self.S(0)
        output[3] = self.S(1)
        output[4] = self.S(2)
        output[5] = self.L(self.S(0), 1)
        output[6] = self.L(self.S(0), 2)
        output[6] = self.R(self.S(0), 1)
        output[7] = self.L(self.B(0), 1)
        output[8] = self.R(self.S(0), 2)
        output[9] = self.L(self.S(1), 1)
        output[10] = self.L(self.S(1), 2)
        output[11] = self.R(self.S(1), 1)
        output[12] = self.R(self.S(1), 2)
        for i in range(13):
            if output[i] != -1:
                output[i] += self.c.offset
--- a/website/api/_top-level/_cli.jade
+++ b/website/api/_top-level/_cli.jade
@ -314,6 +314,16 @@ p
        +cell Size of the parser's and NER's hidden layers.
        +cell #[code 128]
    +row
        +cell #[code history_feats]
        +cell Number of previous action ID features for parser and NER.
        +cell #[code 128]
    +row
        +cell #[code history_width]
        +cell Number of embedding dimensions for each action ID.
        +cell #[code 128]
    +row
        +cell #[code learn_rate]
        +cell Learning rate.