# cython: infer_types=True, cdivision=True, boundscheck=False
cimport cython.parallel
cimport numpy as np
from libc.math cimport exp
from libcpp.vector cimport vector
from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free, realloc
from cymem.cymem cimport Pool
from thinc.extra.search cimport Beam
from thinc.backends.linalg cimport Vec, VecVec
cimport blis.cy

import numpy
import numpy.random
from thinc.api import Linear, Model, CupyOps, NumpyOps, use_ops

from ..typedefs cimport weight_t, class_t, hash_t
from ..tokens.doc cimport Doc
from ..gold cimport GoldParse
from .stateclass cimport StateClass
from .transition_system cimport Transition

from ..compat import copy_array
from ..errors import Errors, TempErrors
from ..util import link_vectors_to_models, create_default_optimizer
from .. import util
from . import _beam_utils
from . import nonproj


cdef WeightsC get_c_weights(model) except *:
    cdef WeightsC output
    cdef precompute_hiddens state2vec = model.state2vec
    output.feat_weights = state2vec.get_feat_weights()
    output.feat_bias = <const float*>state2vec.bias.data
    cdef np.ndarray vec2scores_W
    cdef np.ndarray vec2scores_b
    if model.vec2scores is None:
        output.hidden_weights = NULL
        output.hidden_bias = NULL
    else:
        vec2scores_W = model.vec2scores.get_param("W")
        vec2scores_b = model.vec2scores.get_param("b")
        output.hidden_weights = <const float*>vec2scores_W.data
        output.hidden_bias = <const float*>vec2scores_b.data
    cdef np.ndarray class_mask = model._class_mask
    output.seen_classes = <const float*>class_mask.data
    return output


cdef SizesC get_c_sizes(model, int batch_size) except *:
    cdef SizesC output
    output.states = batch_size
    if model.vec2scores is None:
        output.classes = model.state2vec.get_dim("nO")
    else:
        output.classes = model.vec2scores.get_dim("nO")
    output.hiddens = model.state2vec.get_dim("nO")
    output.pieces = model.state2vec.get_dim("nP")
    output.feats = model.state2vec.get_dim("nF")
    output.embed_width = model.tokvecs.shape[1]
    return output


cdef ActivationsC alloc_activations(SizesC n) nogil:
    cdef ActivationsC A
    memset(&A, 0, sizeof(A))
    resize_activations(&A, n)
    return A


cdef void free_activations(const ActivationsC* A) nogil:
    free(A.token_ids)
    free(A.scores)
    free(A.unmaxed)
    free(A.hiddens)
    free(A.is_valid)


cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
    if n.states <= A._max_size:
        A._curr_size = n.states
        return
    if A._max_size == 0:
        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
        A.scores = <float*>calloc(n.states * n.classes, sizeof(A.scores[0]))
        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
        A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
        A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
        A._max_size = n.states
    else:
        A.token_ids = <int*>realloc(A.token_ids,
            n.states * n.feats * sizeof(A.token_ids[0]))
        A.scores = <float*>realloc(A.scores,
            n.states * n.classes * sizeof(A.scores[0]))
        A.unmaxed = <float*>realloc(A.unmaxed,
            n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
        A.hiddens = <float*>realloc(A.hiddens,
            n.states * n.hiddens * sizeof(A.hiddens[0]))
        A.is_valid = <int*>realloc(A.is_valid,
            n.states * n.classes * sizeof(A.is_valid[0]))
        A._max_size = n.states
    A._curr_size = n.states


cdef void predict_states(ActivationsC* A, StateC** states,
        const WeightsC* W, SizesC n) nogil:
    cdef double one = 1.0
    resize_activations(A, n)
    for i in range(n.states):
        states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
    memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
    sum_state_features(A.unmaxed,
        W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces)
    for i in range(n.states):
        VecVec.add_i(&A.unmaxed[i*n.hiddens*n.pieces],
            W.feat_bias, 1., n.hiddens * n.pieces)
        for j in range(n.hiddens):
            index = i * n.hiddens * n.pieces + j * n.pieces
            which = Vec.arg_max(&A.unmaxed[index], n.pieces)
            A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
    memset(A.scores, 0, n.states * n.classes * sizeof(float))
    if W.hidden_weights == NULL:
        memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
    else:
        # Compute hidden-to-output
        blis.cy.gemm(blis.cy.NO_TRANSPOSE, blis.cy.TRANSPOSE,
            n.states, n.classes, n.hiddens, one,
            <float*>A.hiddens, n.hiddens, 1,
            <float*>W.hidden_weights, n.hiddens, 1,
            one,
            <float*>A.scores, n.classes, 1)
        # Add bias
        for i in range(n.states):
            VecVec.add_i(&A.scores[i*n.classes],
                W.hidden_bias, 1., n.classes)
    # Set unseen classes to minimum value
    i = 0
    min_ = A.scores[0]
    for i in range(1, n.states * n.classes):
        if A.scores[i] < min_:
            min_ = A.scores[i]
    for i in range(n.states):
        for j in range(n.classes):
            if not W.seen_classes[j]:
                A.scores[i*n.classes+j] = min_


cdef void sum_state_features(float* output,
        const float* cached, const int* token_ids, int B, int F, int O) nogil:
    cdef int idx, b, f, i
    cdef const float* feature
    padding = cached
    cached += F * O
    cdef int id_stride = F*O
    cdef float one = 1.
    for b in range(B):
        for f in range(F):
            if token_ids[f] < 0:
                feature = &padding[f*O]
            else:
                idx = token_ids[f] * id_stride + f*O
                feature = &cached[idx]
            blis.cy.axpyv(blis.cy.NO_CONJUGATE, O, one,
                <float*>feature, 1,
                &output[b*O], 1)
        token_ids += F


cdef void cpu_log_loss(float* d_scores,
        const float* costs, const int* is_valid, const float* scores,
        int O) nogil:
    """Do multi-label log loss"""
    cdef double max_, gmax, Z, gZ
    best = arg_max_if_gold(scores, costs, is_valid, O)
    guess = Vec.arg_max(scores, O)
    if best == -1 or guess == -1:
        # These shouldn't happen, but if they do, we want to make sure we don't
        # cause an OOB access.
        return
    Z = 1e-10
    gZ = 1e-10
    max_ = scores[guess]
    gmax = scores[best]
    for i in range(O):
        Z += exp(scores[i] - max_)
        if costs[i] <= costs[best]:
            gZ += exp(scores[i] - gmax)
    for i in range(O):
        if costs[i] <= costs[best]:
            d_scores[i] = (exp(scores[i]-max_) / Z) - (exp(scores[i]-gmax)/gZ)
        else:
            d_scores[i] = exp(scores[i]-max_) / Z


cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
        const int* is_valid, int n) nogil:
    # Find minimum cost
    cdef float cost = 1
    for i in range(n):
        if is_valid[i] and costs[i] < cost:
            cost = costs[i]
    # Now find best-scoring with that cost
    cdef int best = -1
    for i in range(n):
        if costs[i] <= cost and is_valid[i]:
            if best == -1 or scores[i] > scores[best]:
                best = i
    return best


cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil:
    cdef int best = -1
    for i in range(n):
        if is_valid[i] >= 1:
            if best == -1 or scores[i] > scores[best]:
                best = i
    return best


class ParserModel(Model):
    def __init__(self, tok2vec, lower_model, upper_model, unseen_classes=None):
        # don't define nO for this object, because we can't dynamically change it
        Model.__init__(self, name="parser_model", forward=forward, dims={"nI": None})
        if tok2vec.has_dim("nI"):
            self.set_dim("nI", tok2vec.get_dim("nI"))
        self._layers = [tok2vec, lower_model]
        if upper_model is not None:
            self._layers.append(upper_model)
        self.unseen_classes = set()
        if unseen_classes:
            for class_ in unseen_classes:
                self.unseen_classes.add(class_)
        self.set_ref("tok2vec", tok2vec)

    def predict(self, docs):
        step_model = ParserStepModel(docs, self._layers,
                        unseen_classes=self.unseen_classes, train=False)
        return step_model

    def resize_output(self, new_nO):
        if len(self._layers) == 2:
            return
        if self.upper.has_dim("nO") and (new_nO == self.upper.get_dim("nO")):
            return
        smaller = self.upper
        nI = None
        if smaller.has_dim("nI"):
            nI = smaller.get_dim("nI")
        with use_ops('numpy'):
            larger = Linear(nO=new_nO, nI=nI)
            larger.init = smaller.init
        # it could be that the model is not initialized yet, then skip this bit
        if nI:
            larger_W = larger.ops.alloc2f(new_nO, nI)
            larger_b = larger.ops.alloc1f(new_nO)
            smaller_W = smaller.get_param("W")
            smaller_b = smaller.get_param("b")
            # Weights are stored in (nr_out, nr_in) format, so we're basically
            # just adding rows here.
            if smaller.has_dim("nO"):
                larger_W[:smaller.get_dim("nO")] = smaller_W
                larger_b[:smaller.get_dim("nO")] = smaller_b
                for i in range(smaller.get_dim("nO"), new_nO):
                    self.unseen_classes.add(i)

            larger.set_param("W", larger_W)
            larger.set_param("b", larger_b)
        self._layers[-1] = larger

    def initialize(self, X=None, Y=None):
        self.tok2vec.initialize()
        self.lower.initialize(X=X, Y=Y)
        if self.upper is not None:
            # In case we need to trigger the callbacks
            statevecs = self.ops.alloc((2, self.lower.get_dim("nO")))
            self.upper.initialize(X=statevecs)

    def finish_update(self, optimizer):
        self.tok2vec.finish_update(optimizer)
        self.lower.finish_update(optimizer)
        if self.upper is not None:
            self.upper.finish_update(optimizer)

    @property
    def tok2vec(self):
        return self._layers[0]

    @property
    def lower(self):
        return self._layers[1]

    @property
    def upper(self):
        return self._layers[2]


def forward(model:ParserModel, X, is_train):
    step_model = ParserStepModel(X, model._layers, unseen_classes=model.unseen_classes,
        train=is_train)

    return step_model, step_model.finish_steps


class ParserStepModel(Model):
    def __init__(self, docs, layers, unseen_classes=None, train=True):
        Model.__init__(self, name="parser_step_model", forward=step_forward)
        self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
        if layers[1].get_dim("nP") >= 2:
            activation = "maxout"
        elif len(layers) == 2:
            activation = None
        else:
            activation = "relu"
        self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
                                            activation=activation, train=train)
        if len(layers) == 3:
            self.vec2scores = layers[-1]
        else:
            self.vec2scores = None
        self.cuda_stream = util.get_cuda_stream(non_blocking=True)
        self.backprops = []
        if self.vec2scores is None:
            self._class_mask = numpy.zeros((self.state2vec.nO,), dtype='f')
        else:
            self._class_mask = numpy.zeros((self.vec2scores.get_dim("nO"),), dtype='f')
        self._class_mask.fill(1)
        if unseen_classes is not None:
            for class_ in unseen_classes:
                self._class_mask[class_] = 0.

    @property
    def nO(self):
        return self.state2vec.nO

    def class_is_unseen(self, class_):
        return self._class_mask[class_]

    def mark_class_unseen(self, class_):
        self._class_mask[class_] = 0

    def mark_class_seen(self, class_):
        self._class_mask[class_] = 1

    def get_token_ids(self, batch):
        states = _beam_utils.collect_states(batch)
        cdef StateClass state
        states = [state for state in states if not state.is_final()]
        cdef np.ndarray ids = numpy.zeros((len(states), self.state2vec.nF),
                                          dtype='i', order='C')
        ids.fill(-1)
        c_ids = <int*>ids.data
        for state in states:
            state.c.set_context_tokens(c_ids, ids.shape[1])
            c_ids += ids.shape[1]
        return ids

    def finish_steps(self, golds):
        # Add a padding vector to the d_tokvecs gradient, so that missing
        # values don't affect the real gradient.
        d_tokvecs = self.ops.alloc((self.tokvecs.shape[0]+1, self.tokvecs.shape[1]))
        # Tells CUDA to block, so our async copies complete.
        if self.cuda_stream is not None:
            self.cuda_stream.synchronize()
        for ids, d_vector, bp_vector in self.backprops:
            d_state_features = bp_vector((d_vector, ids))
            ids = ids.flatten()
            d_state_features = d_state_features.reshape(
                (ids.size, d_state_features.shape[2]))
            self.ops.scatter_add(d_tokvecs, ids,
                d_state_features)
        # Padded -- see update()
        self.bp_tokvecs(d_tokvecs[:-1])
        return d_tokvecs


def step_forward(model: ParserStepModel, states, is_train):
    token_ids = model.get_token_ids(states)
    vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
    if model.vec2scores is not None:
        scores, get_d_vector = model.vec2scores(vector, is_train)
    else:
        scores = NumpyOps().asarray(vector)
        get_d_vector = lambda d_scores: d_scores
    # If the class is unseen, make sure its score is minimum
    scores[:, model._class_mask == 0] = numpy.nanmin(scores)

    def backprop_parser_step(d_scores):
        # Zero vectors for unseen classes
        d_scores *= model._class_mask
        d_vector = get_d_vector(d_scores)
        if isinstance(model.state2vec.ops, CupyOps) \
        and not isinstance(token_ids, model.state2vec.ops.xp.ndarray):
            # Move token_ids and d_vector to GPU, asynchronously
            model.backprops.append((
                util.get_async(model.cuda_stream, token_ids),
                util.get_async(model.cuda_stream, d_vector),
                get_d_tokvecs
            ))
        else:
            model.backprops.append((token_ids, d_vector, get_d_tokvecs))
        return None
    return scores, backprop_parser_step


cdef class precompute_hiddens:
    """Allow a model to be "primed" by pre-computing input features in bulk.

    This is used for the parser, where we want to take a batch of documents,
    and compute vectors for each (token, position) pair. These vectors can then
    be reused, especially for beam-search.

    Let's say we're using 12 features for each state, e.g. word at start of
    buffer, three words on stack, their children, etc. In the normal arc-eager
    system, a document of length N is processed in 2*N states. This means we'll
    create 2*N*12 feature vectors --- but if we pre-compute, we only need
    N*12 vector computations. The saving for beam-search is much better:
    if we have a beam of k, we'll normally make 2*N*12*K computations --
    so we can save the factor k. This also gives a nice CPU/GPU division:
    we can do all our hard maths up front, packed into large multiplications,
    and do the hard-to-program parsing on the CPU.
    """
    cdef readonly int nF, nO, nP
    cdef bint _is_synchronized
    cdef public object ops
    cdef np.ndarray _features
    cdef np.ndarray _cached
    cdef np.ndarray bias
    cdef object _cuda_stream
    cdef object _bp_hiddens
    cdef object activation

    def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
                 activation="maxout", train=False):
        gpu_cached, bp_features = lower_model(tokvecs, train)
        cdef np.ndarray cached
        if not isinstance(gpu_cached, numpy.ndarray):
            # Note the passing of cuda_stream here: it lets
            # cupy make the copy asynchronously.
            # We then have to block before first use.
            cached = gpu_cached.get(stream=cuda_stream)
        else:
            cached = gpu_cached
        if not isinstance(lower_model.get_param("b"), numpy.ndarray):
            self.bias = lower_model.get_param("b").get(stream=cuda_stream)
        else:
            self.bias = lower_model.get_param("b")
        self.nF = cached.shape[1]
        if lower_model.has_dim("nP"):
            self.nP = lower_model.get_dim("nP")
        else:
            self.nP = 1
        self.nO = cached.shape[2]
        self.ops = lower_model.ops
        assert activation in (None, "relu", "maxout")
        self.activation = activation
        self._is_synchronized = False
        self._cuda_stream = cuda_stream
        self._cached = cached
        self._bp_hiddens = bp_features

    cdef const float* get_feat_weights(self) except NULL:
        if not self._is_synchronized and self._cuda_stream is not None:
            self._cuda_stream.synchronize()
            self._is_synchronized = True
        return <float*>self._cached.data

    def has_dim(self, name):
        if name == "nF":
            return self.nF if self.nF is not None else True
        elif name == "nP":
            return self.nP if self.nP is not None else True
        elif name == "nO":
            return self.nO if self.nO is not None else True
        else:
            return False

    def get_dim(self, name):
        if name == "nF":
            return self.nF
        elif name == "nP":
            return self.nP
        elif name == "nO":
            return self.nO
        else:
            raise ValueError(f"Dimension {name} invalid -- only nO, nF, nP")

    def set_dim(self, name, value):
        if name == "nF":
            self.nF = value
        elif name == "nP":
            self.nP = value
        elif name == "nO":
            self.nO = value
        else:
            raise ValueError(f"Dimension {name} invalid -- only nO, nF, nP")

    def __call__(self, X, bint is_train):
        if is_train:
            return self.begin_update(X)
        else:
            return self.predict(X), lambda X: X

    def predict(self, X):
        return self.begin_update(X)[0]

    def begin_update(self, token_ids):
        cdef np.ndarray state_vector = numpy.zeros(
            (token_ids.shape[0], self.nO, self.nP), dtype='f')
        # This is tricky, but (assuming GPU available);
        # - Input to forward on CPU
        # - Output from forward on CPU
        # - Input to backward on GPU!
        # - Output from backward on GPU
        bp_hiddens = self._bp_hiddens

        feat_weights = self.get_feat_weights()
        cdef int[:, ::1] ids = token_ids
        sum_state_features(<float*>state_vector.data,
            feat_weights, &ids[0,0],
            token_ids.shape[0], self.nF, self.nO*self.nP)
        state_vector = state_vector + self.bias
        state_vector, bp_nonlinearity = self._nonlinearity(state_vector)

        def backward(d_state_vector_ids):
            d_state_vector, token_ids = d_state_vector_ids
            d_state_vector = bp_nonlinearity(d_state_vector)
            d_tokens = bp_hiddens((d_state_vector, token_ids))
            return d_tokens
        return state_vector, backward

    def _nonlinearity(self, state_vector):
        if isinstance(state_vector, numpy.ndarray):
            ops = NumpyOps()
        else:
            ops = CupyOps()

        if self.activation == "maxout":
            state_vector, mask = ops.maxout(state_vector)
        else:
            state_vector = state_vector.reshape(state_vector.shape[:-1])
            if self.activation == "relu":
                mask = state_vector >= 0.
                state_vector *= mask
            else:
                mask = None

        def backprop_nonlinearity(d_best):
            if isinstance(d_best, numpy.ndarray):
                ops = NumpyOps()
            else:
                ops = CupyOps()
            if mask is not None:
                mask_ = ops.asarray(mask)
            # This will usually be on GPU
            d_best = ops.asarray(d_best)
            # Fix nans (which can occur from unseen classes.)
            d_best[ops.xp.isnan(d_best)] = 0.
            if self.activation == "maxout":
                mask_ = ops.asarray(mask)
                return ops.backprop_maxout(d_best, mask_, self.nP)
            elif self.activation == "relu":
                mask_ = ops.asarray(mask)
                d_best *= mask_
                d_best = d_best.reshape((d_best.shape + (1,)))
                return d_best
            else:
                return d_best.reshape((d_best.shape + (1,)))
        return state_vector, backprop_nonlinearity