from time import perf_counter, time

import cupy as cp
import cupyx.scipy.fft as cufft
import matplotlib.pyplot as plt
import numpy as np
from numba import cuda
from scipy import fft

# Warm up CuPy before any timing: the first GPU FFT call pays one-off
# compilation/initialization costs that would otherwise be charged to the
# first measurement. Try without these lines to see the difference.
gpu_arr = cp.random.randint(0, 255, size=(10, 10))
cufft.fftn(gpu_arr)
# CuPy launches GPU work asynchronously; block until the warm-up has really
# finished so none of it can leak into the first timed region.
cp.cuda.Stream.null.synchronize()

# Each entry is [N, cpu_seconds, gpu_seconds] for one N x N integer matrix.
res = []
# max N in cupy is limited by GPU memory,
# too large N gives a cupy.cuda.memory.OutOfMemoryError

for N in range(1000, 9000, 1000):
    cpu_arr = np.random.randint(0, 255, size=(N, N))
    # The host-to-device copy happens here, outside the timed region, so the
    # GPU figure below covers the FFT only (not the transfer).
    gpu_arr = cp.asarray(cpu_arr)

    # perf_counter is monotonic and high-resolution, unlike wall-clock time().
    t_cpu_start = perf_counter()
    fft_arr = fft.fftn(cpu_arr)
    t_cpu_end = perf_counter()

    # CuPy calls return as soon as the work is queued on the GPU. Synchronize
    # before starting the clock (so pending work is not billed to the FFT) and
    # again before stopping it (so the FFT has actually completed); otherwise
    # only the launch overhead would be measured.
    cp.cuda.Stream.null.synchronize()
    t_gpu_start = perf_counter()
    fft_arr = cufft.fftn(gpu_arr)
    cp.cuda.Stream.null.synchronize()
    t_gpu_end = perf_counter()

    cpu_time = t_cpu_end - t_cpu_start
    gpu_time = t_gpu_end - t_gpu_start
    res.append([N, cpu_time, gpu_time])
    # Guard against a zero denominator on very coarse timers.
    ratio = cpu_time / gpu_time if gpu_time > 0 else float('inf')
    print(f'{res[-1]}, timing ratio = {ratio:8.4f}')


# Transpose the [N, cpu_seconds, gpu_seconds] rows into one series per column
# and draw both timing curves against the matrix size.
sizes, cpu_times, gpu_times = zip(*res)

fig, ax = plt.subplots()
ax.plot(sizes, cpu_times, label="CPU using scipy FFT")
ax.plot(sizes, gpu_times, label="GPU using cupyx.scipy FFT")
ax.set_title('FFT of integer matrix in CPU vs. GPU')
ax.set_xlabel('matrix size N')
ax.set_ylabel('time (s)')
ax.legend()
plt.show()