# # Speed comparison of 2D Ising model simulation # of Numba njit vs. numba_cuda GPU # # Notes on cuda, Numba and Nvidia # ------------------------------- # (mostly about numba_cuda and torch version conflicts) # I recommend you create a virtual environment # python -m venv numba_cuda_env # activate it # source numba_cuda_env/bin/activate # pip install numba_cuda scipy matplotlib # If you don't want to use Nvidia cuda bindings, but prefer to use the *legacy driver API*, set # (in bash) export NUMBA_CUDA_USE_NVIDIA_BINDING=0 # or run the code from bash command line as # NUMBA_CUDA_USE_NVIDIA_BINDING=0 python numba_cuda_ising_compare.py # # The problematic library is Nvidia JIT LTO Library "nvjitlink" version. # My 2025 nvidia-smi shows Cuda version 13.0, while the system cuda-toolkit is version 12.3 (distro fedora 42). # That causes a version mismatch. # Installing # pip install nvidia-nvjitlink-cu12==12.9.86 # has the correct 12.9 version, and # python numba_cuda_ising_compare.py # works as is, but may break PyTorch. # import numpy as np from numba import cuda, njit from numba.cuda.random import create_xoroshiro128p_states, xoroshiro128p_uniform_float32 from math import exp import time import matplotlib.pyplot as plt # ------------------------------------- # GPU Kernel of Checkerboard Metropolis # ------------------------------------- @cuda.jit def ising_step_gpu(gpu_spins, J, L, beta, rng_states, color): i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x j = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y if i >= L or j >= L: return # Checkerboard update only sites with given color if (i + j) % 2 != color: return thread_id = i + j * L s = gpu_spins[i, j] # Nearest neighbours with Periodic Boundary Conditions left = gpu_spins[(i-1)%L, j] right = gpu_spins[(i+1)%L, j] down = gpu_spins[i, (j-1)%L] up = gpu_spins[i, (j+1)%L] # Energy change for flipping spin at site [i,j] dE = J * 2*s*(up+down+left+right) # Metropolis question acc = False if dE <= 0: acc = True else: rnd = xoroshiro128p_uniform_float32(rng_states, thread_id) if exp(-beta * dE) > rnd: acc = True if acc: gpu_spins[i, j] = -s # s1 .. # s2 s s3 # s4 .. # -------------------------------------- # CPU version of Checkerboard Metropolis # -------------------------------------- @njit def ising_step_cpu(spins, J, L, beta, color): for i in range(L): for j in range(L): if (i + j) % 2 != color: continue s = spins[i, j] left = spins[(i-1)%L, j] right = spins[(i+1)%L, j] down = spins[i, (j-1)%L] up = spins[i, (j+1)%L] dE = J * 2 * s * (up + down + left + right) # Metropolis question acc = False if dE <= 0: acc = True else: rnd = np.random.rand() if exp(-beta * dE) > rnd: acc = True if acc: spins[i, j] = -s # --------------------------- # Parameters # --------------------------- L = 1024 # lattice size; try 128 or 512 T = 2.1 # temperature beta = 1.0 / T J = 1 # Ising model J N_sweeps = 5000 # number of Monte Carlo sweeps over the whole lattice # GPU setup threads_per_block = (8, 8) x_blocks = (L + threads_per_block[0] - 1) // threads_per_block[0] y_blocks = (L + threads_per_block[1] - 1) // threads_per_block[1] # --------------------------- # CPU simulation # --------------------------- print('CPU simulation begins') spins_cpu = np.ones((L, L), dtype=np.int32) start_cpu = time.time() for sweep in range(N_sweeps): ising_step_cpu(spins_cpu, J, L, beta, 0) ising_step_cpu(spins_cpu, J, L, beta, 1) end_cpu = time.time() cpu_time = end_cpu - start_cpu # --------------------------- # GPU simulation # --------------------------- print('GPU simulation begins') spins_gpu = np.ones((L, L), dtype=np.int32) d_spins = cuda.to_device(spins_gpu) # copy data to GPU rng_states = create_xoroshiro128p_states(L*L, seed=42) # threads need their own RNG state start_gpu = time.time() for sweep in range(N_sweeps): ising_step_gpu[(x_blocks, y_blocks), threads_per_block]( d_spins, J, L, beta, rng_states, 0) ising_step_gpu[(x_blocks, y_blocks), threads_per_block]( d_spins, J, L, beta, rng_states, 1) cuda.synchronize() # wait until GPU operations are done or timing means nothing end_gpu = time.time() gpu_time = end_gpu - start_gpu # Copy to host for plotting spins_gpu = d_spins.copy_to_host() # --------------------------- # Plot CPU and GPU results # --------------------------- plt.figure(figsize=(12,5)) plt.subplot(1,2,1) Lx = min(128, L) # plot only part of a large grid or no structures can be seen note = '' if Lx>=L else '(plot only a 128x128 corner)' plt.imshow(spins_cpu[0:Lx, 0:Lx], cmap='bwr') plt.title(f'CPU timing {cpu_time:.3f} s '+note) plt.subplot(1,2,2) plt.imshow(spins_gpu[0:Lx, 0:Lx], cmap='bwr') plt.title(f'GPU timing {gpu_time:.3f} s '+note) plt.show()