Demonstrating mathematical integrity under GPU parallelization and reduced precision.
This document explains the CUDA implementation details of the GPU-accelerated integrators for the Hénon–Heiles system. The focus is on how the mathematical structure of the problem maps onto GPU execution: the thread-to-trajectory layout, the memory organization, the integrator kernels, and energy-based validation.
We simulate a large number (N) of independent trajectories:
[ z_i(t) = (x_i, y_i, p_{x,i}, p_{y,i}), \quad i = 1, \dots, N ]
Each trajectory evolves according to:
[ \dot{z}_i = f(z_i) ]
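Componentwise, these are Hamilton's equations:
[ \dot{x} = p_x, \quad \dot{y} = p_y, \quad \dot{p}_x = -\frac{\partial V}{\partial x}, \quad \dot{p}_y = -\frac{\partial V}{\partial y} ]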
Key property:
All trajectories are independent → ideal for massive parallelism.
Each CUDA thread is responsible for one trajectory:
int i = blockIdx.x * blockDim.x + threadIdx.x;
If i < N, then thread i updates x[i], y[i], px[i], py[i].
Grid
├── Block 0
│ ├── Thread 0 → Trajectory 0
│ ├── Thread 1 → Trajectory 1
│ └── ...
├── Block 1
│ └── ...
└── ...
The implementation uses:
double* x;
double* y;
double* px;
double* py;
instead of:
struct State { double x, y, px, py; };
State* states;
The main benefit is memory coalescing: consecutive threads access consecutive addresses.
Thread 0 → x[0]
Thread 1 → x[1]
Thread 2 → x[2]
...
This allows the GPU to fetch memory in contiguous blocks.
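A sketch of the access-pattern difference inside a kernel (the AoS line is shown only for contrast; the implementation uses SoA):
// SoA: a warp's 32 threads load x[i], x[i+1], ... → one contiguous
// 256-byte region, coalesced into a few memory transactions.
double xi = x[i];
// AoS (for contrast): loads of states[i].x are strided by sizeof(State)
// = 32 bytes, so the same warp touches a 1 KB span and wastes bandwidth.
// double xi = states[i].x;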
Each kernel follows the pattern:
__global__ void integrator_kernel(...) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= N) return;
// Load state
double xi = x[i];
double yi = y[i];
double pxi = px[i];
double pyi = py[i];
// Time stepping loop
for (int step = 0; step < steps; ++step) {
// integrator update
}
// Store state
x[i] = xi;
y[i] = yi;
px[i] = pxi;
py[i] = pyi;
}
__device__ void compute_gradients(double x, double y,
double& dVdx, double& dVdy)
[ \frac{\partial V}{\partial x} = x + 2xy ]
[ \frac{\partial V}{\partial y} = y + x^2 - y^2 ]
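These are the gradients of the Hénon–Heiles potential used throughout (it also appears in the energy kernel below):
[ V(x, y) = \frac{1}{2}(x^2 + y^2) + x^2 y - \frac{1}{3} y^3 ]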
// Gradient at the current position
compute_gradients(x, y, dVdx, dVdy);
// Half-step momentum update
px -= 0.5 * dt * dVdx;
py -= 0.5 * dt * dVdy;
// Full-step position update
x += dt * px;
y += dt * py;
// Recompute gradients
compute_gradients(x, y, dVdx, dVdy);
// Second half-step momentum update
px -= 0.5 * dt * dVdx;
py -= 0.5 * dt * dVdy;
This update is symplectic: it preserves the phase-space structure of the Hamiltonian flow, so the energy error stays bounded over long integrations instead of drifting.
RK4 requires multiple intermediate evaluations:
k1 = f(z)
k2 = f(z + dt/2 * k1)
k3 = f(z + dt/2 * k2)
k4 = f(z + dt * k3)
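The stages are then combined with the classical weights (this matches the "Combine increments" step in the annotated kernel below):
[ z_{n+1} = z_n + \frac{\Delta t}{6}\,(k_1 + 2k_2 + 2k_3 + k_4) ]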
The explicit Euler integrator, by contrast, uses a single gradient evaluation per step:
x += dt * px;
y += dt * py;
px -= dt * dVdx;
py -= dt * dVdy;
To check conservation, compute the total energy of each trajectory:
[ E_i = \frac{1}{2}(p_x^2 + p_y^2) + V(x,y) ]
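A common integrity diagnostic (a standard convention, not something the kernels below compute) is the relative energy drift:
[ \delta_i(t) = \frac{|E_i(t) - E_i(0)|}{|E_i(0)|} ]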
__global__ void energy_kernel(...) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= N) return;
energy[i] = ...;
}
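After copying E back to the host, this drift can be evaluated there; a minimal sketch, assuming E and E0 are host arrays holding the final and initial energies:
// Hypothetical host-side check: maximum relative energy drift across trajectories.
double max_drift = 0.0;
for (int i = 0; i < N; ++i) {
    double drift = fabs(E[i] - E0[i]) / fabs(E0[i]);
    if (drift > max_drift) max_drift = drift;
}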
The host-side workflow is: allocate (cudaMalloc), copy in (cudaMemcpy), launch, copy out:
cudaMemcpy(d_x, x, size, cudaMemcpyHostToDevice);
kernel<<<grid, block>>>(...);
cudaMemcpy(x, d_x, size, cudaMemcpyDeviceToHost);
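Putting the pieces together, a minimal host-side driver sketch (allocation shown for x only; d_y, d_px, d_py are assumed to be set up the same way, and error handling is trimmed):
size_t size = N * sizeof(double);
double* d_x = nullptr;
cudaMalloc(&d_x, size);                            // device allocation
cudaMemcpy(d_x, x, size, cudaMemcpyHostToDevice);  // host → device
int blockSize = 256;
int gridSize = (N + blockSize - 1) / blockSize;    // ceil(N / blockSize)
symplectic_kernel<<<gridSize, blockSize>>>(d_x, d_y, d_px, d_py, N, steps, dt);
cudaDeviceSynchronize();                           // wait for the kernel to finish
cudaMemcpy(x, d_x, size, cudaMemcpyDeviceToHost);  // device → host
cudaFree(d_x);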
Performance trade-offs to balance:
- Block size vs. occupancy: 256 threads per block is a reasonable default.
- Symplectic integrator: two gradient evaluations per step and few intermediates → low register pressure.
- RK4: four gradient evaluations per step and many intermediates → higher register pressure.
- Avoid host-device transfers inside the time loop; keep the stepping loop in the kernel.
Two scaling experiments are natural: fix the number of trajectories and vary the GPU, or fix the GPU and increase N. In the latter case runtime scales linearly,
[ \text{Runtime} \sim \mathcal{O}(N) ]
and the GPU handles millions of trajectories efficiently.
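To measure this scaling, CUDA events give accurate kernel timings; a minimal sketch, reusing the launch parameters above:
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
symplectic_kernel<<<gridSize, blockSize>>>(d_x, d_y, d_px, d_py, N, steps, dt);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);  // elapsed kernel time in milliseconds
cudaEventDestroy(start);
cudaEventDestroy(stop);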
This CUDA implementation demonstrates that GPUs excel when the mathematical structure matches the hardware model. Here, independent trajectories map one-to-one onto threads, and structure-of-arrays storage turns state access into coalesced memory traffic. This alignment is what enables high-performance scientific computing.
This section adds line-by-line annotated CUDA kernels for the three integrators and key helpers. The goal is to make the mapping from mathematics → GPU execution completely explicit.
// Compute global thread index
int i = blockIdx.x * blockDim.x + threadIdx.x;
// Guard against out-of-bounds threads
if (i >= N) return;
Explanation
- blockIdx.x: block id within the grid
- blockDim.x: threads per block
- threadIdx.x: thread id within the block
- i maps thread → trajectory

__device__ __forceinline__
void compute_gradients(double x, double y,
double& dVdx, double& dVdy)
{
// ∂V/∂x = x + 2xy
dVdx = x + 2.0 * x * y;
// ∂V/∂y = y + x^2 - y^2
dVdy = y + x * x - y * y;
}
Explanation
- __device__: runs on the GPU
- __forceinline__: encourages inlining → avoids call overhead

__global__
void symplectic_kernel(double* x, double* y,
double* px, double* py,
int N, int steps, double dt)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= N) return;
// --- Load state into registers (fast) ---
double xi = x[i];
double yi = y[i];
double pxi = px[i];
double pyi = py[i];
double dVdx, dVdy;
// --- Time integration loop ---
for (int s = 0; s < steps; ++s) {
// Compute gradient at current position
compute_gradients(xi, yi, dVdx, dVdy);
// (1) Half-step momentum update (Kick)
pxi -= 0.5 * dt * dVdx;
pyi -= 0.5 * dt * dVdy;
// (2) Full-step position update (Drift)
xi += dt * pxi;
yi += dt * pyi;
// Recompute gradient at new position
compute_gradients(xi, yi, dVdx, dVdy);
// (3) Half-step momentum update (Kick)
pxi -= 0.5 * dt * dVdx;
pyi -= 0.5 * dt * dVdy;
}
// --- Store results back to global memory ---
x[i] = xi;
y[i] = yi;
px[i] = pxi;
py[i] = pyi;
}
Key Points
Implements the kick-drift-kick (Störmer–Verlet) scheme:
[ p_{n+1/2} = p_n - \frac{\Delta t}{2} \nabla V(q_n) ]
[ q_{n+1} = q_n + \Delta t \, p_{n+1/2} ]
[ p_{n+1} = p_{n+1/2} - \frac{\Delta t}{2} \nabla V(q_{n+1}) ]
The state stays in registers for the entire time loop; global memory is touched only by the initial load and the final store.
__global__
void rk4_kernel(double* x, double* y,
double* px, double* py,
int N, int steps, double dt)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= N) return;
double xi = x[i];
double yi = y[i];
double pxi = px[i];
double pyi = py[i];
for (int s = 0; s < steps; ++s) {
// k1
double dVdx1, dVdy1;
compute_gradients(xi, yi, dVdx1, dVdy1);
double k1_x = pxi;
double k1_y = pyi;
double k1_px = -dVdx1;
double k1_py = -dVdy1;
// k2 (midpoint using k1)
double x2 = xi + 0.5 * dt * k1_x;
double y2 = yi + 0.5 * dt * k1_y;
double px2 = pxi + 0.5 * dt * k1_px;
double py2 = pyi + 0.5 * dt * k1_py;
double dVdx2, dVdy2;
compute_gradients(x2, y2, dVdx2, dVdy2);
double k2_x = px2;
double k2_y = py2;
double k2_px = -dVdx2;
double k2_py = -dVdy2;
// k3 (another midpoint)
double x3 = xi + 0.5 * dt * k2_x;
double y3 = yi + 0.5 * dt * k2_y;
double px3 = pxi + 0.5 * dt * k2_px;
double py3 = pyi + 0.5 * dt * k2_py;
double dVdx3, dVdy3;
compute_gradients(x3, y3, dVdx3, dVdy3);
double k3_x = px3;
double k3_y = py3;
double k3_px = -dVdx3;
double k3_py = -dVdy3;
// k4 (full step)
double x4 = xi + dt * k3_x;
double y4 = yi + dt * k3_y;
double px4 = pxi + dt * k3_px;
double py4 = pyi + dt * k3_py;
double dVdx4, dVdy4;
compute_gradients(x4, y4, dVdx4, dVdy4);
double k4_x = px4;
double k4_y = py4;
double k4_px = -dVdx4;
double k4_py = -dVdy4;
// Combine increments
xi += dt / 6.0 * (k1_x + 2*k2_x + 2*k3_x + k4_x);
yi += dt / 6.0 * (k1_y + 2*k2_y + 2*k3_y + k4_y);
pxi += dt / 6.0 * (k1_px + 2*k2_px + 2*k3_px + k4_px);
pyi += dt / 6.0 * (k1_py + 2*k2_py + 2*k3_py + k4_py);
}
x[i] = xi;
y[i] = yi;
px[i] = pxi;
py[i] = pyi;
}
Key Points
- Four gradient evaluations per step (vs. two for the symplectic kernel).
- All k-stage intermediates live in registers, increasing register pressure.
- Fourth-order accurate per step, but not symplectic: energy errors accumulate over long runs.
__global__
void euler_kernel(double* x, double* y,
double* px, double* py,
int N, int steps, double dt)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= N) return;
double xi = x[i];
double yi = y[i];
double pxi = px[i];
double pyi = py[i];
double dVdx, dVdy;
for (int s = 0; s < steps; ++s) {
compute_gradients(xi, yi, dVdx, dVdy);
// Explicit Euler update
xi += dt * pxi;
yi += dt * pyi;
pxi -= dt * dVdx;
pyi -= dt * dVdy;
}
x[i] = xi;
y[i] = yi;
px[i] = pxi;
py[i] = pyi;
}
Key Points
- Explicit Euler is first-order and not symplectic; its energy drifts steadily, making it a useful baseline for the energy diagnostic below.
- Cheapest per step: a single gradient evaluation.
__global__
void energy_kernel(const double* x, const double* y,
const double* px, const double* py,
double* E, int N)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= N) return;
double xi = x[i];
double yi = y[i];
double pxi = px[i];
double pyi = py[i];
// Kinetic energy
double T = 0.5 * (pxi * pxi + pyi * pyi);
// Potential energy
double V = 0.5 * (xi * xi + yi * yi)
+ xi * xi * yi
- (1.0 / 3.0) * yi * yi * yi;
E[i] = T + V;
}
Key Points
- The state arrays are const: the kernel only reads state and writes one energy value per trajectory.
- Comparing E[i] against its initial value quantifies each integrator's conservation error.
int blockSize = 256;
int gridSize = (N + blockSize - 1) / blockSize;
symplectic_kernel<<<gridSize, blockSize>>>(d_x, d_y, d_px, d_py,
N, steps, dt);
Explanation
- blockSize = 256 is a common default; tune per device.
- gridSize = (N + blockSize - 1) / blockSize rounds up so every trajectory gets a thread (the i >= N guard handles the remainder).
- Per-kernel register usage can be inspected with nvcc --ptxas-options=-v.

These kernels illustrate a key HPC principle:
Performance emerges when mathematical structure aligns with hardware structure.