# ===========================
# numba_cuda Ising simulation
# ===========================
#
# Notes on cuda, Numba and Nvidia
# -------------------------------
# (mostly about numba_cuda and torch version conflicts)
# I recommend you create a virtual environment
# python -m venv numba_cuda_env
# activate it 
# source numba_cuda_env/bin/activate
# pip install numba_cuda scipy matplotlib
# If you don't want to use Nvidia cuda bindings, but prefer to use the *legacy driver API*, set
# (in bash) export NUMBA_CUDA_USE_NVIDIA_BINDING=0
# or run the code from bash command line as
# NUMBA_CUDA_USE_NVIDIA_BINDING=0 python numba_cuda_ising.py
# 
# The problematic library is  Nvidia JIT LTO Library "nvjitlink" version.
# My 2025 nvidia-smi shows Cuda version 12.9, while the system cuda-toolkit is version 12.3 (distro fedora 42).
# That causes a version mismatch.
# Installing
# pip install nvidia-nvjitlink-cu12==12.9.86
# has the correct 12.9 version, and
# python numba_cuda_ising.py
# works as is, but may break PyTorch.
# 
import numpy as np
from numba import cuda, int32, jit
from numba.cuda.random import create_xoroshiro128p_states, xoroshiro128p_uniform_float32
from math import exp  # Numba GPU-compatible
import matplotlib.pyplot as plt
from time import time
from scipy.stats import sem


# ----------------------------------------
# CUDA kernel for checkerboard Metropolis
# ----------------------------------------
@cuda.jit
def ising_step(gpu_spins, J, L, beta, rng_states, color):
    
    i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    j = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y
    
    if i >= L or j >= L:
        return

    # Checkerboard update of sites with this color
    if (i + j) % 2 != color:
        return

    thread_id = i + j * L
    s = gpu_spins[i, j]

    # Nearest neighbours with Periodic Boundary Conditions
    left  = gpu_spins[(i-1)%L, j]
    right = gpu_spins[(i+1)%L, j]
    down  = gpu_spins[i, (j-1)%L]
    up    = gpu_spins[i, (j+1)%L]

    # Energy change for flipping spin at site [i,j]
    dE = J * 2*s*(up+down+left+right)

    # Metropolis question
    acc = False
    if dE <= 0:
         acc = True
    else:
        rnd = xoroshiro128p_uniform_float32(rng_states, thread_id)
        if exp(-beta * dE) > rnd:
            acc = True
    if acc:
        gpu_spins[i, j] = -s


# ---------------------------
# Host code
# ---------------------------
@jit 
def Energy(J, L, spins):
    E = 0
    for i in range(L):
        for j in range(L):
            E += -J*spins[i, j]*(spins[(i+1)%L, j] + spins[(i-1)%L, j] + spins[i, (j-1)%L] + spins[i, (j+1)%L])
    return E
#
# note: infinite lattice critical temperature is kB*Tc/J = 2.2692
# 
L = 128 # lattice is LxL
J = 1 # Ising model J
spins = np.ones((L, L), dtype=np.int32)  # initial state is all spins up

E = Energy(J, L, spins)
print(f'Initial lattice: E per spin = {E/(L*L):0.5f}')

# Copy spins to GPU
gpu_spins = cuda.to_device(spins)


# A CUDA kernel runs many threads in parallel.
# threads are grouped into blocks.
# 

# Threads and blocks
# ------------------
# let one thread handle one spin
threads_per_block = (8, 8) # block threads to 8x8 = 64 thread blocks

# count how many such blocks are needed to cover all LxL spins (threads) in x and y direction
x_blocks = (L + threads_per_block[0] - 1) // threads_per_block[0]
y_blocks = (L + threads_per_block[1] - 1) // threads_per_block[1]

# Why?
# For L=128 (= 2**7) these give (x_blocks, y_blocks) = (16, 16)
# 8x16 = 128, so all spins are covered. Same for L=512 (= 2**9).
# If you have L=129, (x_blocks, y_blocks) = (17, 17),
# and 8*17=136, so again all spins are covered and some extra:
# the extra are skipped using the test
# if i >= L or j >= L:
#         return

# Create a distinct RNG state for each thread
# -------------------------------------------
# This makes sure each thread generates non-overlapping random number sequences from the
# available cycle of 2**128-1 elements (that's what the 128 means in the name xoroshiro128)
# see https://nvidia.github.io/numba-cuda/user/random.html
# (the old cuda within numba was at https://numba.pydata.org/numba-doc/dev/cuda/random.html)

rng_states = cuda.random.create_xoroshiro128p_states(L*L, seed=235) # try seed=235


# Execute Monte Carlo Ising steps
# -------------------------------

# Number of Monte Carlo sweeps (MC steps are done spin by spin)
N_sweeps = 100
# number of N_sweep sweeps per measurement at a given T
Nmax = 10 

# GPU updates of spins are inherently parallel
# Each spin update depends on four neighboring spins
# (up, down, left, and right in the kernel ising_step() )
# Avoid race condition by updating spins in a checkerboard order of blacks and whites.


def do_sweeps(beta):
    for sweep in range(N_sweeps):
        # update black sites (0)  of the checkerboard
        ising_step[(x_blocks, y_blocks), threads_per_block](
            gpu_spins, J, L, beta, rng_states, 0)
        # update white sites (1) of the checkerboard
        ising_step[(x_blocks, y_blocks), threads_per_block](
            gpu_spins, J, L, beta, rng_states, 1)

# Warm up sweeps:
T = 5.0 # temperature 
beta = 1.0 / T  
do_sweeps(beta)
# Copy result back to host
spins = gpu_spins.copy_to_host()
E = Energy(J, L, spins)
print(f' T = {T:.8f} E = {E/(L*L):0.5f}')

# Plot spins and E(T)
plt.ion()
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
ax1.set_aspect("equal")
im = ax1.imshow(spins, cmap='bwr') 

def plot_spins(spins, N, N_sweeps):
    im.set_data(spins)
    fig.suptitle(f'2D Ising Model T = {T:.3f} {N} x {N_sweeps} sweeps')
    plt.draw()
    plt.pause(1e-3)

N = 1
plot_spins(spins, N, N_sweeps)
input('Press enter to go on simulation for range T=[5, 0]')

# Energy figure:
ax2.set(xlim=(0, 6), ylim=(-4.2, 0.2))
ax2.set_xlabel('T')
ax2.set_ylabel('E')
plt.draw()
plt.pause(1e-3)

    
res = []

E = np.zeros(Nmax)


for T in np.linspace(5.0, 0.00001, 100):
    beta = 1/T    
    i = 0
    for N in range(Nmax):
        # GPU sweeps:
        # -----------
        do_sweeps(beta)
        # host calculations:
        # ------------------
        spins = gpu_spins.copy_to_host()        
        E[i] = Energy(J, L, spins)/(L*L)
        plot_spins(spins, N, N_sweeps)
        i += 1

    err = sem(E)
    Eave = E.sum()/Nmax
    res.append([T, Eave, err])
    print(f'T = {T:.3f}  <E> = {Eave:.8f} +/- {err:.8f}')
        
    
    plt.subplot(1,2,2) 
    Ts, Es, errs = zip(*res)    
    plt.errorbar(Ts, Es, errs, marker='o', ls='none', markersize=2, capsize=2)
    plt.draw()
    plt.pause(1e-3)

# plot only errorbars
plt.figure(2)
Ts, Es, errs = zip(*res)
plt.plot(Ts, errs, '-')
plt.xlabel('T')
plt.ylabel('errorbar length')
plt.title(f'Errorbars for fixed number of sweeps for each T') 

print('All done')
plt.show()
input('Press enter to exit')