"""
MNIST handwritten digits identified by a basic ELM (Extreme Learning Machine)
The whole neural network is in class ELM, with methods train(), which is actually just a linear equation solution,
and predict().

Input data is in n samples each holding m data points, stored as x(n,m)
Since the data points of one sample should be kept together in memory, NumPy's default row-major (C) storage order favors x(n,m).
The output data is n samples each in i categories (numbers 0..9), stored as y(n,i)

For memory efficiency, I won't transpose these arrays, so the network, written in the component form, is
  y(n,i) = activation( x(n,m) @ w1(m,h) + b1(1,h) ) @ w2(h,i)
This is what method predict() computes using matrix products;
here h is the number of hidden layer nodes. Use random but fixed elements
in w1(m,h) and in b1(1,h). Define
  M(n,h) :=  activation( x(n,m) @ w1(m,h) + b1(1,h) )
and compute its pseudoinverse Mpinv(h,n), then solve for w2 using the training set (x,y),
  w2(h,i) = Mpinv(h,n) @ y(n,i)

The method predict() computes
  y(n,i) = activation( x(n,m) @ w1(m,h) + b1(1,h) ) @ w2(h,i)
for testing input X_test and compares with known y_test.

Run with
  python ELM_Mnist.py
to use a smaller train/test set, print out debugging information about array sizes, and plot a few of the digits.

Run with
  python -O ELM_Mnist.py
to train the network on the full MNIST data set.

Main computes the result for several numbers of hidden layer nodes to see how results improve - and take longer.

Vesa Apaja, October 2022

"""

import numpy as np
import csv
import matplotlib.pyplot as plt
from time import time as T

def read_MNIST_data():
    """
    Read the MNIST training and test sets from csv files in the current directory.

    The files 'mnist_train.csv' and 'mnist_test.csv' are read row by row and
    every field is converted to float. The number of rows read is limited by
    the module-level variables maxtrain and maxtest, which must be set before
    this function is called.

    Returns train, test: float arrays, one row per csv line read.

    If a file cannot be opened or its contents cannot be converted to a
    numeric array, a hint is printed and the program exits with status 1.
    """
    def read_rows(filename, last_index):
        # The limit is tested after the row has been appended, and with a strict
        # comparison, so rows 0..last_index+1 (last_index+2 rows) are kept.
        # newline='' is what the csv module recommends for files given to csv.reader.
        with open(filename, 'r', newline='') as f:
            print('reading '+f.name)
            rows = []
            for ind, line in enumerate(csv.reader(f)):
                rows.append(line)
                if ind > last_index:
                    break
        return np.array(rows).astype(float)

    try:
        train = read_rows('mnist_train.csv', maxtrain)
        test = read_rows('mnist_test.csv', maxtest)
    except (OSError, ValueError, csv.Error) as err:
        # Only I/O and parsing problems are reported as a failed read;
        # programming errors and Ctrl-C are no longer swallowed.
        print('Reading data failed')
        print('Reason:', err)
        print('Have you downloaded MNIST data in csv format from https://pjreddie.com/projects/mnist-in-csv/ ?')
        raise SystemExit(1)
    return train, test

def convert_data(train, test, n_classes=10):
    """
    Split raw MNIST arrays into scaled images and one-hot encoded labels.

    train, test: 2D arrays with one sample per row; column 0 is the digit
                 identification, the remaining columns are the pixel values
    n_classes (optional): number of categories, i.e. the width of the one-hot rows

    Returns X_train, y_train, X_test, y_test, y_train_orig, where
      X_train, X_test : pixel columns divided by 255, as new arrays
                        (the input arrays are left untouched)
      y_train, y_test : one-hot integer arrays of shape (samples, n_classes)
      y_train_orig    : original numeric training labels, integer array of shape (samples, 1)
    """
    if __debug__:
        print('train :',train.shape)
        print('test :',test.shape)

    # MNIST data is given as 28x28 pictures, that's 784 values per picture
    # the first value is the identification, from 0..9 (now a float, convert to int)
    # use :1 and not 0 to keep the labels as a (samples, 1) column
    y_train_orig = train[:, :1].astype(int)
    y_test_orig = test[:, :1].astype(int)

    # convert the number 0..9 in the labels to categories
    # 0 = [1,0,0,0,0,0,0,0,0,0]
    # 1 = [0,1,0,0,0,0,0,0,0,0]
    # ..
    # 9 = [0,0,0,0,0,0,0,0,0,1]
    # done with plain NumPy: one indexed assignment sets a single 1 per row
    def categorize(labels):
        n_samples = labels.shape[0]
        onehot = np.zeros((n_samples, n_classes), dtype=int)
        onehot[np.arange(n_samples), labels[:, 0]] = 1
        return onehot

    y_train = categorize(y_train_orig)
    y_test = categorize(y_test_orig)

    # Scale images to range [0, 1].
    # A plain division makes new arrays; an in-place "/=" on the slices would
    # overwrite the pixel columns of the caller's train/test arrays (the slices
    # are views) and would fail outright for integer input.
    X_train = train[:, 1:] / 255
    X_test = test[:, 1:] / 255

    if __debug__:
        print('X_train :',X_train.shape)
        print('y_train :',y_train.shape)
        print('X_test :',X_test.shape)
        print('y_test :',y_test.shape)
        print('y_test = ',y_test)

    return X_train, y_train, X_test, y_test, y_train_orig
        
# plotting
# --------


def plot_images(title,images,labels,findings=None):
    """
    Show up to 25 images in a 5x5 grid of subplots.

    title: title of the whole 5x5 plot
    images: set of images to plot, first 25 picked; fewer than 25 leaves the rest of the grid empty
    labels: set of digits the images are showing
    findings (optional): set of digits the NN thinks the images are showing;
                         when given, each subplot title reads '<label> not <finding>'
    """
    _fig, axs = plt.subplots(5, 5, figsize=(8, 8))
    plt.suptitle(title)

    # axs.flat walks the grid row by row, so k is both the cell number and the image index
    for k, ax in enumerate(axs.flat):
        ax.axis('off')
        if k >= len(images):
            continue  # no image left for this cell, leave it blank
        ax.imshow(images[k], cmap='Greys')
        if k >= len(labels):
            continue  # image without a label: show it untitled
        if findings is not None and k < len(findings):
            ax.set_title(f'{labels[k]} not {findings[k]}')
        else:
            ax.set_title(f'{labels[k]}')

    # Switch interactive mode on only around the draw/pause calls;
    # presumably this lets the figure appear without stopping the script.
    plt.ion()
    plt.draw()
    plt.pause(1e-3)
    plt.ioff()
  
    
# activation functions
def relu(z):
    """
    Rectified linear unit: elementwise max(z, 0).

    z is also given to np.maximum as the output array, so the result is
    written into z itself: the caller's array is overwritten and returned,
    and no new array is allocated.
    """
    return np.maximum(z, 0, out=z)

def sigmoid(z):
    """
    Logistic function 1/(1 + exp(-z)), evaluated elementwise.

    The result is a new value; z itself is not modified.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator


class ELM():
    """
    Basic Extreme Learning Machine with a single hidden layer.

    The input weights w1 and biases b1 are drawn at random in train() and
    are not fitted; only the output weights w2 are solved for, by multiplying
    the pseudoinverse of the hidden layer output with the training targets.
    """

    def __init__(self, hidden_size = 1000, activation = relu):
        """
        hidden_size: number of nodes in the hidden layer
        activation: function applied elementwise to the hidden layer input
        """
        self.hidden_size, self.activation = hidden_size, activation
        print('ELM: activation',self.activation.__name__)
        print('ELM: number of nodes in hidden layer ',self.hidden_size)

    def _set_weight_bias(self, n, m, std = 1.0):
        """Draw w1 (n,m) and b1 (1,m) from a zero-mean normal distribution with standard deviation std."""
        self.w1 = np.random.normal(0.0, std, (n, m))
        self.b1 = np.random.normal(0.0, std, (1, m))

    def _hidden(self, x):
        """Hidden layer output, activation(x @ w1 + b1)."""
        return self.activation(x @ self.w1 + self.b1)

    def train(self, x, y):
        """
        Draw new random w1, b1 and solve the output weights w2 for the training set (x, y).

        x: input samples, one per row
        y: target outputs, one row per sample
        """
        n_inputs = x.shape[1]
        self._set_weight_bias(n_inputs, self.hidden_size)
        if __debug__:
            print('w1:',self.w1.shape)
            print('x:',x.shape)
            print('b1:',self.b1.shape)
        M = self._hidden(x)
        if __debug__:
            print('M:',M.shape)
            print('y:',y.shape)
        Mpinv = np.linalg.pinv(M)
        if __debug__:
            print('Mpinv:',Mpinv.shape)
        # w2 = Mpinv @ y is the whole "training": a linear equation solution
        self.w2 = Mpinv @ y
        if __debug__:
            print('w2: ',self.w2.shape)

    def predict(self, x):
        """Return the network output for x, using the weights set by train()."""
        return self._hidden(x) @ self.w2
        

if __name__ == '__main__':

    # A plain "python" run is the debug run: small data set, shape printouts
    # and digit figures. "python -O" clears __debug__ and uses the full data set.
    plot_figures = __debug__

    # Row limits, read by read_MNIST_data() as module-level variables
    # (100000000 is large enough to mean "all").
    maxtrain = 398 if __debug__ else 100000000
    maxtest = 13 if __debug__ else 100000000

    train, test = read_MNIST_data()
    X_train, y_train, X_test, y_test, y_train_orig = convert_data(train, test)

    if plot_figures:
        training_images = X_train.reshape(X_train.shape[0],28,28)
        plot_images('Sample of training digits', training_images, y_train_orig[:,0])
        plt.draw()
        plt.pause(1e-3)

    # one [hidden nodes, accuracy, elapsed seconds] entry per trained network
    res = []
    for hidden in range(100,1100,100):

        tic = T()
        model = ELM(hidden_size = hidden)
        model.train(X_train, y_train)
        y_pred = model.predict(X_test)

        # back from one-hot rows to digits: the index of the largest output wins
        pred_values = y_pred.argmax(axis=1)
        true_values = y_test.argmax(axis=1)

        # indices of the test images that were identified wrong
        bad_filter = (pred_values != true_values).nonzero()[0]
        nbad = bad_filter.size
        n = true_values.size
        acc = 100.0*(n-nbad)/n
        print(f'accuracy {acc:.5f} %     {nbad} bad identifications out of {n} test images')
        toc = T()
        res.append([hidden,acc,toc-tic])

        if not plot_figures:
            # full run: update the accuracy and timing curves after every network size
            hids,accs,times = zip(*res)
            for fignum, series, ylabel in ((1, accs, 'accuracy (%)'),
                                           (2, times, 'timing (s)')):
                plt.figure(fignum)
                plt.plot(hids,series,'go-')
                plt.xlabel('hidden nodes')
                plt.ylabel(ylabel)
                plt.draw()
                plt.pause(1e-3)
            continue

        # debug run: show what went wrong for the first network size and stop
        test_images = X_test.reshape(X_test.shape[0],28,28)
        plot_images('Some misinterpreted digits',
                    test_images[bad_filter],
                    true_values[bad_filter],pred_values[bad_filter])
        break

    plt.show()