/**
 * I declare this source public domain knowledge. Use as you wish.
 */

/**
 * A simple multilayered perceptron.
 * 
 * This is a feedforward neural network without much possibility 
 * of customization. It can have any number of hidden layers with the
 * basic tanh activation function, and it always has a linear output 
 * layer.
 * 
 * Implementation note: Creating new objects during training is avoided by
 * pre-allocating working memory as object members during instantiation.
 * These work arrays are updated and used in all the key methods.
 * 
 * @author nieminen@jyu.fi
 *
 */
public class SimpleMLP {
    /**
     * Numbers of neurons on each layer (input and output included); this
     * describes the neural architecture. Can be given on instantiation only.
     */
    private int[] nneur;

    /**
     * Weights reside in layerwise matrices. W[L] has one row per neuron of
     * layer L+1 and one column per neuron of layer L plus one extra column
     * (presumably the bias weight; its position is decided by NeuralMath).
     */
    private double[][][] W;

    /** Space required for computing the gradient; same shape as W. */
    private double[][][] G;

    /**
     * Space required for computing one output vector: tmpout[L] holds the
     * output of weight layer L. It is shared by every forward and backward
     * pass, so concurrent use of one instance from several threads is not
     * supported.
     */
    private double[][] tmpout;

    /** Counter of optimization steps since random initialization of weights. */
    private int iterSinceInit = 0;

    /**
     * Creates an MLP with the given architecture and random initial weights.
     *
     * @param numbersOfNeurons  The number of neurons on each layer;
     *                          must include input and output, i.e. at
     *                          least two entries.
     * @throws IllegalArgumentException if fewer than two layer sizes are given
     */
    public SimpleMLP(int[] numbersOfNeurons) {
        /* Without both an input and an output layer there are no weights at
         * all and every later call would fail with an obscure index error. */
        if (numbersOfNeurons.length < 2) {
            throw new IllegalArgumentException(
                    "Architecture must include at least input and output layer sizes, got "
                    + numbersOfNeurons.length + " layer(s)");
        }
        nneur = numbersOfNeurons.clone();

        /* Create the required matrices for weights, gradient, and computation */
        int nlayers = nneur.length - 1;
        W = new double[nlayers][][];
        G = new double[nlayers][][];
        tmpout = new double[nlayers][];

        for (int L = 0; L < nlayers; L++) {
            /* One extra column per row in addition to the inputs. */
            W[L] = new double[nneur[L + 1]][nneur[L] + 1];
            G[L] = new double[nneur[L + 1]][nneur[L] + 1];
            tmpout[L] = new double[nneur[L + 1]];
        }

        initRandomly();
    }


    /**
     * Randomizes the network weights uniformly in [-1, 1) and resets the
     * counter of training steps.
     */
    public void initRandomly() {
        /* We just fill in, assuming W already exists */
        for (int L = 0; L < W.length; L++) {
            for (int i = 0; i < W[L].length; i++) {
                /* Use the row's own length so that an empty layer is simply skipped. */
                for (int j = 0; j < W[L][i].length; j++) {
                    W[L][i][j] = -1.0 + 2 * Math.random();
                }
            }
        }
        iterSinceInit = 0;
    }


    /**
     * Runs the forward pass for one input vector, leaving the output of
     * every layer in the shared work array {@code tmpout}.
     *
     * Hidden layers get the activation applied by
     * NeuralMath.applyTansigActivation; the last layer is left linear.
     *
     * @param input  one input vector
     * @return the work array holding the output layer values (not a copy;
     *         it is overwritten by the next forward pass)
     */
    private double[] forward(double[] input) {
        int nlayers = W.length;
        double[] in = input;
        for (int L = 0; L < nlayers; L++) {
            NeuralMath.vecMatWithBias(in, W[L], tmpout[L]);
            if (L < nlayers - 1) {
                NeuralMath.applyTansigActivation(tmpout[L]);
            }
            /* The output of this layer feeds the next one. */
            in = tmpout[L];
        }
        return tmpout[nlayers - 1];
    }


    /**
     * Evaluates the mean-of-squared-errors cost function along with its gradient.
     * Can be used for any gradient-based optimization algorithm.
     *
     * The gradient is accumulated into the member {@code G}; the returned
     * cost is the sum of the per-sample terms divided by 2N, where N is the
     * number of input vectors. N must be positive, otherwise the scaling
     * divides by zero.
     *
     * @param inputs   training inputs, one vector per row
     * @param targets  target outputs, one vector per row, parallel to inputs
     * @return the cost function value
     */
    private double costAndGrad(
            double[][] inputs, double[][] targets)
    {
        int N = inputs.length;

        /* Initialize to zero */
        double cost = 0.0;
        NeuralMath.scaleManyMatrices(G, 0.0);

        /* Sum over training data pairs */
        for (int i = 0; i < N; i++) {
            cost += costOneVec(inputs[i], targets[i]);
        }

        /* Scale and return */
        NeuralMath.scaleManyMatrices(G, 1.0 / N);
        return cost / (2 * N);
    }


    /**
     * Returns the cost summation term from one training input
     * and target pair, and adds the corresponding effect to the
     * gradient.
     *
     * We don't create new objects here in the inner loops; hence the
     * methods are designed to have side effects on arguments. In
     * particular the layer outputs in {@code tmpout} are reused as
     * scratch space during the backward loop.
     *
     * @param input   one training input vector
     * @param target  the desired output for that input
     * @return squared distance between network output and target
     */
    private double costOneVec(double[] input, double[] target)
    {
        double[][] out = tmpout;
        int nlayers = W.length;

        /* Forward loop for layer outputs */
        double[] e = forward(input);

        /* Squared distance ||output-target||^2 added to the cost: */
        NeuralMath.subtract(e, target);
        double costTerm = NeuralMath.dot(e, e);

        /* Back-propagate the error to get partial derivatives wrt weights. */
        for (int L = nlayers - 1; L >= 0; L--) {
            double[] outPrevious = L > 0 ? out[L - 1] : input;
            if (L < nlayers - 1) {
                /* NOTE(review): presumably overwrites out[L] with the error
                 * signal of layer L, computed from the layer above; confirm
                 * against NeuralMath. */
                NeuralMath.tanhDerivTimesSubmatTVec(W[L + 1], out[L + 1], out[L]);
            }
            NeuralMath.matPlusVecVecTwbias(G[L], out[L], outPrevious);
        }
        return costTerm;
    }


    /**
     * Applies the network for one input vector.
     *
     * @param input  one input vector
     * @return a fresh copy of the output layer values
     */
    private double[] feedVector(double[] input) {
        /* Last output is the network response; copy it because the work
         * array is overwritten by the next evaluation. */
        return forward(input).clone();
    }

    /**
     * Evaluate the MLP response for many input vectors.
     *
     * @param inputs  input vectors, one per row
     * @return network outputs, one row per input row
     */
    public double[][] feedMatrix(double[][] inputs) {
        double[][] outputs = new double[inputs.length][];
        for (int i = 0; i < inputs.length; i++) {
            outputs[i] = feedVector(inputs[i]);
        }
        return outputs;
    }


    /**
     * Trains the network using simple gradient descent method with
     * constant step size.
     *
     * This is the "original innovation" called "backpropagation training
     * using gradient descent"; but nowadays you should use a better
     * optimization algorithm like some conjugate gradient method,
     * Broyden-Fletcher-Goldfarb-Shanno, Levenberg-Marquardt, ...
     *
     * This shall suffice as a first example, though.
     *
     * Training stops after maxIter steps, or earlier when the cost or the
     * squared gradient norm falls to 1e-7 or below. A summary is printed to
     * standard output.
     *
     * @param inputs    training inputs, one vector per row
     * @param targets   target outputs, one vector per input row
     * @param stepsize  constant multiplier of the negated gradient
     * @param maxIter   maximum number of descent steps to take
     * @throws IllegalArgumentException if the training set is empty or there
     *                                  are fewer targets than inputs
     */
    public void trainGD(double[][] inputs, double[][] targets, double stepsize, int maxIter) {
        /* An empty set makes the cost and gradient NaN (division by N == 0),
         * and the descent step would then overwrite every weight with NaN. */
        if (inputs.length == 0) {
            throw new IllegalArgumentException("Training set is empty");
        }
        if (targets.length < inputs.length) {
            throw new IllegalArgumentException(
                    "Got " + inputs.length + " inputs but only " + targets.length + " targets");
        }

        double fgoal = 1e-7;
        double gnsgoal = 1e-7;

        double f = Double.NaN, gns = Double.NaN;
        long timeStart = System.currentTimeMillis();
        int iter;

        for (iter = 0; iter < maxIter; iter++) {
            f = costAndGrad(inputs, targets);
            gns = NeuralMath.squareSumOfMatrices(G);

            if (f <= fgoal) break;
            if (gns <= gnsgoal) break;

            /* Take a step downwards, i.e., along negated gradient. */
            NeuralMath.addManyMatrices(W, -stepsize, G);
        }
        iterSinceInit += iter;

        long timeTotal = System.currentTimeMillis() - timeStart;
        double seconds = timeTotal / 1000.0;
        /* No step was taken when maxIter <= 0 or the goal was met right away;
         * avoid reporting NaN/Infinity from a division by zero. */
        double secondsPerIter = iter > 0 ? seconds / iter : 0.0;

        System.out.printf("Finished in %.5f seconds, after %d iterations.%n", seconds, iter);
        System.out.printf("Average time per iteration %.9f seconds%n", secondsPerIter);
        System.out.printf("Total training steps since random initialization %d%n",iterSinceInit);
        System.out.printf("Current cost function value %.5f gradient norm squared %.5f%n", f, gns);
    }


    /**
     * Train with integer labels (for classification tasks).
     *
     * Labels are class indices starting from 1. Each label is turned into a
     * target vector holding 1.0 at position label-1 and -1.0 elsewhere.
     * Labels larger than the number of output neurons can never be returned
     * by {@link #classifyMatrix(double[][])}.
     *
     * @param inputs    training inputs, one vector per row
     * @param targets   class label of each input row
     * @param stepsize  constant multiplier of the negated gradient
     * @param maxIter   maximum number of descent steps to take
     * @throws IllegalArgumentException if the training set is empty or there
     *                                  are fewer labels than inputs
     */
    public void trainGD(double[][] inputs, int[] targets, double stepsize, int maxIter) {
        if (targets.length < inputs.length) {
            throw new IllegalArgumentException(
                    "Got " + inputs.length + " inputs but only " + targets.length + " labels");
        }

        int maxt = 0;
        for (int i = 0; i < targets.length; i++) {
            if (targets[i] > maxt) maxt = targets[i];
        }

        /* The target vectors must cover every output neuron even when the
         * highest class does not occur in this training set. */
        int width = Math.max(maxt, nneur[nneur.length - 1]);

        /* Build binary vectors for internal use. */
        double[][] targetvecs = new double[inputs.length][width];
        for (int i = 0; i < targetvecs.length; i++) {
            for (int j = 0; j < width; j++) {
                targetvecs[i][j] = j == targets[i] - 1 ? 1.0 : -1.0;
            }
        }

        trainGD(inputs, targetvecs, stepsize, maxIter);
    }


    /**
     * Feed inputs and convert output to integer labels (for classification tasks).
     *
     * @param inputs  input vectors, one per row
     * @return for each input row, the 1-based index of the output neuron
     *         with the largest value (the first one on ties)
     */
    public int[] classifyMatrix(double[][] inputs) {
        double[][] outs = feedMatrix(inputs);
        /* We just select the neuron whose output has the largest value. */
        int[] outclasses = new int[outs.length];
        for (int i = 0; i < outclasses.length; i++) {
            int imax = 0;
            double vmax = Double.NEGATIVE_INFINITY;
            for (int j = 0; j < outs[i].length; j++) {
                if (outs[i][j] > vmax) {
                    vmax = outs[i][j];
                    imax = j;
                }
            }
            outclasses[i] = imax + 1;  // Begin class indices from 1.
        }
        return outclasses;
    }
}