#
# Speed comparison of 2D Ising model simulation
# of Numba njit vs. numba_cuda GPU
#
# Notes on cuda, Numba and Nvidia
# -------------------------------
# (mostly about numba_cuda and torch version conflicts)
# I recommend you create a virtual environment
# python -m venv numba_cuda_env
# activate it 
# source numba_cuda_env/bin/activate
# pip install numba_cuda scipy matplotlib
# If you don't want to use Nvidia cuda bindings, but prefer to use the *legacy driver API*, set
# (in bash) export NUMBA_CUDA_USE_NVIDIA_BINDING=0
# or run the code from bash command line as
# NUMBA_CUDA_USE_NVIDIA_BINDING=0 python numba_cuda_ising_compare.py
# 
# The problematic library is  Nvidia JIT LTO Library "nvjitlink" version.
# My 2025 nvidia-smi shows Cuda version 13.0, while the system cuda-toolkit is version 12.3 (distro fedora 42).
# That causes a version mismatch.
# Installing
# pip install nvidia-nvjitlink-cu12==12.9.86
# has the correct 12.9 version, and
# python numba_cuda_ising_compare.py
# works as is, but may break PyTorch.
# 

import numpy as np
from numba import cuda, njit
from numba.cuda.random import create_xoroshiro128p_states, xoroshiro128p_uniform_float32
from math import exp
import time
import matplotlib.pyplot as plt

# -------------------------------------
# GPU Kernel of Checkerboard Metropolis
# -------------------------------------
@cuda.jit
def ising_step_gpu(gpu_spins, J, L, beta, rng_states, color):
    
    i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    j = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y
    
    if i >= L or j >= L:
        return

    # Checkerboard update only sites with given color
    if (i + j) % 2 != color:
        return

    thread_id = i + j * L
    s = gpu_spins[i, j]

    # Nearest neighbours with Periodic Boundary Conditions
    left  = gpu_spins[(i-1)%L, j]
    right = gpu_spins[(i+1)%L, j]
    down  = gpu_spins[i, (j-1)%L]
    up    = gpu_spins[i, (j+1)%L]

    # Energy change for flipping spin at site [i,j]
    dE = J * 2*s*(up+down+left+right)

    # Metropolis question
    acc = False
    if dE <= 0:
         acc = True
    else:
        rnd = xoroshiro128p_uniform_float32(rng_states, thread_id)
        if exp(-beta * dE) > rnd:
            acc = True
    if acc:
        gpu_spins[i, j] = -s

#    s1  ..
# s2  s  s3
#    s4  ..
# --------------------------------------
# CPU version of Checkerboard Metropolis 
# --------------------------------------
@njit
def ising_step_cpu(spins, J,  L, beta, color):
    for i in range(L):
        for j in range(L):
            if (i + j) % 2 != color:
                continue

            s = spins[i, j]
            left  = spins[(i-1)%L, j]
            right = spins[(i+1)%L, j]
            down  = spins[i, (j-1)%L]
            up    = spins[i, (j+1)%L]

            dE = J * 2 * s * (up + down + left + right)            
            
            # Metropolis question
            acc = False
            if dE <= 0:
                acc = True
            else:
                rnd = np.random.rand()
                if exp(-beta * dE) > rnd:
                    acc = True
            if acc:
                spins[i, j] = -s


# ---------------------------
# Parameters
# ---------------------------
L = 1024  # lattice size; try 128 or 512 
T = 2.1  # temperature
beta = 1.0 / T
J = 1    # Ising model J
N_sweeps = 5000  # number of Monte Carlo sweeps over the whole lattice

# GPU setup
threads_per_block = (8, 8)
x_blocks = (L + threads_per_block[0] - 1) // threads_per_block[0]
y_blocks = (L + threads_per_block[1] - 1) // threads_per_block[1]

# ---------------------------
# CPU simulation
# ---------------------------
print('CPU simulation begins')
spins_cpu = np.ones((L, L), dtype=np.int32)
start_cpu = time.time()
for sweep in range(N_sweeps):
    ising_step_cpu(spins_cpu, J, L, beta, 0)
    ising_step_cpu(spins_cpu, J, L, beta, 1)
end_cpu = time.time()
cpu_time = end_cpu - start_cpu


# ---------------------------
# GPU simulation
# ---------------------------
print('GPU simulation begins')
spins_gpu = np.ones((L, L), dtype=np.int32)
d_spins = cuda.to_device(spins_gpu) # copy data to GPU
rng_states = create_xoroshiro128p_states(L*L, seed=42) # threads need their own RNG state

start_gpu = time.time()
for sweep in range(N_sweeps):
    ising_step_gpu[(x_blocks, y_blocks), threads_per_block](
        d_spins, J, L, beta, rng_states, 0)
    ising_step_gpu[(x_blocks, y_blocks), threads_per_block](
        d_spins, J, L, beta, rng_states, 1)
cuda.synchronize() # wait until GPU operations are done or timing means nothing
end_gpu = time.time()
gpu_time = end_gpu - start_gpu


# Copy to host for plotting
spins_gpu = d_spins.copy_to_host()

# ---------------------------
# Plot CPU and GPU results
# ---------------------------
plt.figure(figsize=(12,5))
plt.subplot(1,2,1)
Lx = min(128, L) # plot only part of a large grid or no structures can be seen
note = '' if Lx>=L else '(plot only a 128x128 corner)'

    
plt.imshow(spins_cpu[0:Lx, 0:Lx], cmap='bwr')
plt.title(f'CPU timing {cpu_time:.3f} s '+note)

plt.subplot(1,2,2)
plt.imshow(spins_gpu[0:Lx, 0:Lx], cmap='bwr')
plt.title(f'GPU timing {gpu_time:.3f} s '+note)

plt.show()