/**
 * @file: Mlp.hpp
 *
 * Interface to the MLP computations. Cares not about multi- or
 * single-objectivity, or anything else regarding the learning
 * algorithm; it just provides objective function values and
 * derivatives for the differentiable ones.
 *
 * Implements a simple ASCII-text serialization that can be easily
 * parsed or written in Matlab or any other analysis tool with minimal
 * string manipulations or conversions.
 *
 * All computations require an external workspace of doubles. This
 * inconvenience provides for a couple of optimizations:
 *
 * (1) The forward pass needs to be done only once even if the error
 *     and backpropagation phases are done for different formulations.
 *     A copy of the workspace must be made before error and gradient
 *     evaluation, but the costly sums and sigmoids need not be
 *     evaluated twice.
 *
 * (2) A single per-thread storage can be reused for sequentially
 *     evaluating a population of MLPs when the maximum required
 *     space is known beforehand.
 */

#ifndef MLP_HPP_
#define MLP_HPP_

#include "SynapticRandomizer.hpp"

#include <vector>
#include <iostream>
#include <ostream>
#include <istream>
#include <string>

using std::vector;
using std::cout;
using std::ostream;
using std::istream;
using std::string;

namespace jymlp{

  /** As of now, define activation functions as an enum.
      FIXME: hyptan actually is tansig */
  namespace actf{
    enum ActF : int {Unset = 0, linear = 1, hyptan = 2, logsig = 3};
  }

  /**
   * The error function type. q2a2 corresponds to computing mean squared
   * error (MSE), q2a1 to computing mean Euclidean vector norm error
   * (MEE), and q1a1 to computing mean absolute error (MAE).
   */
  namespace errt{
    enum ErrT : int {Unset = 0, q2a2 = 1, q2a1 = 2, q1a1 = 3};
  }

  using jymlp::actf::ActF;
  using jymlp::errt::ErrT;

  enum PrettyPrintStyle : int {plaintext = 0, latexeq = 1};

  class Mlp{
  protected:

    /** Linear storage of weights */
    vector<double> weights;

    /** Linear storage of neuron counts, in-hid-out */
    vector<unsigned int> nneur;

    /** Linear storage of activation types */
    vector<ActF> actf;

    /** Re-initialize weights using random values from U(-a,a). */
    void initRnd(double a, SynapticRandomizer & sr);

  public:

    /**
     * Construct uninitialized; only for reading content from a stream
     * very soon.
     */
    Mlp();

    /** Construct with all-zero weights and tanh-tanh-lin activations
        (default). */
    Mlp(const vector<unsigned int>& inneur);

    /** Construct with given weights and actfs. Deep copies are made. */
    Mlp(const vector<unsigned int>& inneur,
        const vector<ActF>& iactf,
        const vector<double>& iweights);

    /** Construct a deep copy of another similar dude. */
    Mlp(const Mlp &other);

    /** Construct with given layer sizes and actfs; initialize the
     *  weights with a given SynapticRandomizer. If you use threads,
     *  initialize the randomizers properly for each thread! */
    Mlp(const vector<unsigned int>& inneur,
        const vector<ActF>& iactf,
        SynapticRandomizer& sr);

    /**
     * Return the number of doubles needed in the external workspace
     * storage that is always required for the layerwise computations.
     */
    size_t getWorkspaceSize();

    /** Return the number of layers as input-hidden-...-hidden-output. */
    size_t getNLayers() const {return nneur.size();}

    /** Return the total number of weights. */
    size_t getNWeights() const {return weights.size();}

    /** Return the number of nodes/neurons on a layer. */
    size_t getNNeurons(int layer) const {return nneur[layer];}

    /** Return the total number of neurons on the hidden layers. */
    size_t getNHiddenNeurons() const;

    /**
     * Return the number of non-zero weights; the count is for strict
     * zeros without numerical tolerance, so small weights should be
     * pruned before this measure is useful.
     */
    size_t getNumNonzeroWeights() const;
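
    /* Usage sketch (illustrative only, not part of the interface):
     * construct a network and reserve the external workspace described
     * in the file header. The layer sizes, the variable names, and the
     * choice to keep the workspace in a std::vector are assumptions.
     *
     *   std::vector<unsigned int> layers = {3, 4, 3};   // a 3-4-3 network
     *   jymlp::Mlp net(layers);                         // tanh-tanh-lin default
     *   std::vector<double> ws(net.getWorkspaceSize()); // external workspace
     *   // ws.data() can now be passed as the workspace pointer of
     *   // forward(), errorVec(), and backwardEucSq() declared below.
     */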

    /**
     * Return the number of connected input nodes, i.e., those which
     * feed into at least one neuron on the next layer.
     */
    size_t getNumConnectedInputs() const;

    /** Return the number of input nodes. */
    size_t getNumInputs() const {return nneur[0];}

    /** Return the number of output nodes. */
    size_t getNumOutputs() const {return nneur[nneur.size()-1];}

    /**
     * Feed an input vector to produce the outputs of each layer; the
     * outputs are stored in the external workspace array for future
     * examination or backpropagation computations. The size of the
     * workspace must be at least numberOfLayers * max(nneur).
     */
    void forward(const vector<double> &input, double * workspace) const;

    /**
     * Evaluate the error vector (N(x)-t) for a target vector; call
     * this after forward() if the error is to be computed instead of
     * only the network outputs; updates the result in the workspace.
     */
    void errorVec(const vector<double> &target, double * workspace) const;

    /** Create and return a copy of the current workspace output
     *  (network output or error vector, depending on the workspace
     *  state). */
    vector<double> copyOutputVec(double * workspace) const;

    /** Return a 1-based index to the class (1,2,...,C) represented by
     *  the output (must be called directly after the forward pass). */
    size_t outputVecAsClassIndex(const double * workspace) const;

#if 0
    /**
     * Add coeff * ||e||^2 to destination, without backpropagation.
     */
    void addEuc2(double coeff, double * dest, double * workspace) const;
#endif

    /**
     * Add the weighted error to the destination value and optionally
     * its gradient by backpropagating through the workspace; must be
     * called after forward() and errorVec() because the error vector
     * must be available in the workspace in order to evaluate the
     * error contribution or backpropagate. Passing nullptr as destG
     * bypasses the backpropagation step, in which case no changes are
     * made to the workspace.
     *
     * FIXME: That means that the function could be overloaded with a
     * const* version for the workspace!!
     */
    void backwardEucSq(double coeff, double * destE, double * destG,
                       double * workspace, ErrT errortype) const;

    /**
     * Add a weight decay term of the form coeff/2 * sum w_i^2, where
     * the sum is over either all weights, or all weights except the
     * biases of the output layer.
     */
    void weightDecaySq(double coeff, double *destE, double *destG,
                       bool excludeOutputBias) const;

    /**
     * Add a weight decay term of the form coeff * sum abs(w_i), where
     * the sum is over either all weights, or all weights except the
     * biases of the output layer.
     */
    void weightDecayAbs(double coeff, double *destE, double *destG,
                        bool excludeOutputBias) const;

    /**
     * Update the weights by adding values multiplied by a coefficient
     * to the current weight values. For example, update(-.001, grad)
     * gives the simplest kind of steepest-descent backprop. The same
     * function can be used for random "jogging", for example
     * update(.1, gaussian).
     */
    void update(double coeff, const double *wupd);
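
    /* Sketch of one steepest-descent training step using the workspace
     * protocol described above (illustrative only; net, x, t, and the
     * learning rate 0.001 are assumptions, not part of the interface):
     *
     *   std::vector<double> ws(net.getWorkspaceSize());
     *   std::vector<double> grad(net.getNWeights(), 0.0);
     *   double err = 0.0;
     *   net.forward(x, ws.data());          // forward pass, done only once
     *   net.errorVec(t, ws.data());         // workspace now holds N(x)-t
     *   net.backwardEucSq(1.0, &err, grad.data(), ws.data(),
     *                     jymlp::errt::q2a2);
     *   net.update(-0.001, grad.data());    // w <- w - 0.001 * grad
     */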

    /**
     * Evaluate and add to the referred outputs the squared or
     * non-squared error and the partial (sub)derivatives w.r.t. the
     * weights, multiplied by a coefficient, for a given input and a
     * given target vector.
     *
     * This is likely to be the innermost computation in algorithmic
     * iteration, and thus the function that dominates the
     * computational cost. Therefore, exceptionally, the output
     * parameters are raw pointers to numerical vectors, and a nullptr
     * may be passed for any value that is not needed for the current
     * purpose.
     *
     * Some profiling should be done to make sure this really is the
     * place to optimize and whether useful optimizations can be done.
     *
     * FIXME: Split this into (1) forward eval, (2) error computation,
     * (3) backward loop.
     */
    void addErrorAndGradient(const vector<double> &input,
                             const vector<double> &target,
                             double coefficient,
                             double * sqerror, double * error,
                             vector<double> * mseGrad,
                             vector<double> * meeGrad) const;

    /** Stream to an ASCII version, easily readable in simple scripts,
     *  e.g., in Matlab or Octave.
     *
     *  Format: Output always begins with the format version. Currently
     *  only "version 1" is implemented.
     *
     *  Version 1 stream (values packed & separated with a single space):
     *
     *    int              version,
     *    int              number of layers (nlay),
     *    int[nlay]        number of neurons on each layer,
     *    ActF[nlay]       activation functions (input activation must
     *                     always be Unset == 0),
     *    double[nweights] synaptic weights as a row-major linear array.
     *
     *  A round-trip usage sketch is given at the end of this file.
     */
    void toStream(ostream & o);

    /** Set new values from an ASCII stream, as created by toStream(). */
    void fromStream(istream & ins);

    /** Pretty-print layer sizes, e.g., "3-4-3". */
    string prettyPrintLayerSizes();

    /** Pretty-print layer activations, e.g., "in-tanh-tanh-lin". */
    string prettyPrintLayerActivations();

    /** Pretty-print weight matrices. */
    string prettyPrintWeights() const;

    /** Pretty-print gradient matrices. */
    string prettyPrintGradient(const vector<double> & grad) const;

    /** Pretty-print everything. */
    string prettyPrint();
  };
}

#endif /* MLP_HPP_ */
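
/* Serialization round-trip sketch (illustrative only; the stream and
 * variable names are assumptions). The ASCII version written by
 * toStream() can be read back with fromStream(), or parsed in Matlab or
 * Octave as a plain whitespace-separated numeric vector.
 *
 *   #include <sstream>
 *
 *   std::ostringstream out;
 *   net.toStream(out);                 // "1 <nlay> <neurons> <actfs> <weights>"
 *   jymlp::Mlp copy;                   // uninitialized, filled from the stream
 *   std::istringstream in(out.str());
 *   copy.fromStream(in);
 */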