# =========================== # numba_cuda Ising simulation # =========================== # # Notes on cuda, Numba and Nvidia # ------------------------------- # (mostly about numba_cuda and torch version conflicts) # I recommend you create a virtual environment # python -m venv numba_cuda_env # activate it # source numba_cuda_env/bin/activate # pip install numba_cuda scipy matplotlib # If you don't want to use Nvidia cuda bindings, but prefer to use the *legacy driver API*, set # (in bash) export NUMBA_CUDA_USE_NVIDIA_BINDING=0 # or run the code from bash command line as # NUMBA_CUDA_USE_NVIDIA_BINDING=0 python numba_cuda_ising.py # # The problematic library is Nvidia JIT LTO Library "nvjitlink" version. # My 2025 nvidia-smi shows Cuda version 12.9, while the system cuda-toolkit is version 12.3 (distro fedora 42). # That causes a version mismatch. # Installing # pip install nvidia-nvjitlink-cu12==12.9.86 # has the correct 12.9 version, and # python numba_cuda_ising.py # works as is, but may break PyTorch. # import numpy as np from numba import cuda, int32, jit from numba.cuda.random import create_xoroshiro128p_states, xoroshiro128p_uniform_float32 from math import exp # Numba GPU-compatible import matplotlib.pyplot as plt from time import time from scipy.stats import sem # ---------------------------------------- # CUDA kernel for checkerboard Metropolis # ---------------------------------------- @cuda.jit def ising_step(gpu_spins, J, L, beta, rng_states, color): i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x j = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y if i >= L or j >= L: return # Checkerboard update of sites with this color if (i + j) % 2 != color: return thread_id = i + j * L s = gpu_spins[i, j] # Nearest neighbours with Periodic Boundary Conditions left = gpu_spins[(i-1)%L, j] right = gpu_spins[(i+1)%L, j] down = gpu_spins[i, (j-1)%L] up = gpu_spins[i, (j+1)%L] # Energy change for flipping spin at site [i,j] dE = J * 2*s*(up+down+left+right) # Metropolis question acc = False if dE <= 0: acc = True else: rnd = xoroshiro128p_uniform_float32(rng_states, thread_id) if exp(-beta * dE) > rnd: acc = True if acc: gpu_spins[i, j] = -s # --------------------------- # Host code # --------------------------- @jit def Energy(J, L, spins): E = 0 for i in range(L): for j in range(L): E += -J*spins[i, j]*(spins[(i+1)%L, j] + spins[(i-1)%L, j] + spins[i, (j-1)%L] + spins[i, (j+1)%L]) return E # # note: infinite lattice critical temperature is kB*Tc/J = 2.2692 # L = 128 # lattice is LxL J = 1 # Ising model J spins = np.ones((L, L), dtype=np.int32) # initial state is all spins up E = Energy(J, L, spins) print(f'Initial lattice: E per spin = {E/(L*L):0.5f}') # Copy spins to GPU gpu_spins = cuda.to_device(spins) # A CUDA kernel runs many threads in parallel. # threads are grouped into blocks. # # Threads and blocks # ------------------ # let one thread handle one spin threads_per_block = (8, 8) # block threads to 8x8 = 64 thread blocks # count how many such blocks are needed to cover all LxL spins (threads) in x and y direction x_blocks = (L + threads_per_block[0] - 1) // threads_per_block[0] y_blocks = (L + threads_per_block[1] - 1) // threads_per_block[1] # Why? # For L=128 (= 2**7) these give (x_blocks, y_blocks) = (16, 16) # 8x16 = 128, so all spins are covered. Same for L=512 (= 2**9). # If you have L=129, (x_blocks, y_blocks) = (17, 17), # and 8*17=136, so again all spins are covered and some extra: # the extra are skipped using the test # if i >= L or j >= L: # return # Create a distinct RNG state for each thread # ------------------------------------------- # This makes sure each thread generates non-overlapping random number sequences from the # available cycle of 2**128-1 elements (that's what the 128 means in the name xoroshiro128) # see https://nvidia.github.io/numba-cuda/user/random.html # (the old cuda within numba was at https://numba.pydata.org/numba-doc/dev/cuda/random.html) rng_states = cuda.random.create_xoroshiro128p_states(L*L, seed=235) # try seed=235 # Execute Monte Carlo Ising steps # ------------------------------- # Number of Monte Carlo sweeps (MC steps are done spin by spin) N_sweeps = 100 # number of N_sweep sweeps per measurement at a given T Nmax = 10 # GPU updates of spins are inherently parallel # Each spin update depends on four neighboring spins # (up, down, left, and right in the kernel ising_step() ) # Avoid race condition by updating spins in a checkerboard order of blacks and whites. def do_sweeps(beta): for sweep in range(N_sweeps): # update black sites (0) of the checkerboard ising_step[(x_blocks, y_blocks), threads_per_block]( gpu_spins, J, L, beta, rng_states, 0) # update white sites (1) of the checkerboard ising_step[(x_blocks, y_blocks), threads_per_block]( gpu_spins, J, L, beta, rng_states, 1) # Warm up sweeps: T = 5.0 # temperature beta = 1.0 / T do_sweeps(beta) # Copy result back to host spins = gpu_spins.copy_to_host() E = Energy(J, L, spins) print(f' T = {T:.8f} E = {E/(L*L):0.5f}') # Plot spins and E(T) plt.ion() fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5)) ax1.set_aspect("equal") im = ax1.imshow(spins, cmap='bwr') def plot_spins(spins, N, N_sweeps): im.set_data(spins) fig.suptitle(f'2D Ising Model T = {T:.3f} {N} x {N_sweeps} sweeps') plt.draw() plt.pause(1e-3) N = 1 plot_spins(spins, N, N_sweeps) input('Press enter to go on simulation for range T=[5, 0]') # Energy figure: ax2.set(xlim=(0, 6), ylim=(-4.2, 0.2)) ax2.set_xlabel('T') ax2.set_ylabel('E') plt.draw() plt.pause(1e-3) res = [] E = np.zeros(Nmax) for T in np.linspace(5.0, 0.00001, 100): beta = 1/T i = 0 for N in range(Nmax): # GPU sweeps: # ----------- do_sweeps(beta) # host calculations: # ------------------ spins = gpu_spins.copy_to_host() E[i] = Energy(J, L, spins)/(L*L) plot_spins(spins, N, N_sweeps) i += 1 err = sem(E) Eave = E.sum()/Nmax res.append([T, Eave, err]) print(f'T = {T:.3f} = {Eave:.8f} +/- {err:.8f}') plt.subplot(1,2,2) Ts, Es, errs = zip(*res) plt.errorbar(Ts, Es, errs, marker='o', ls='none', markersize=2, capsize=2) plt.draw() plt.pause(1e-3) # plot only errorbars plt.figure(2) Ts, Es, errs = zip(*res) plt.plot(Ts, errs, '-') plt.xlabel('T') plt.ylabel('errorbar length') plt.title(f'Errorbars for fixed number of sweeps for each T') print('All done') plt.show() input('Press enter to exit')