How to Build High-Performance GPU-Accelerated Simulations and Differentiable Physics Workflows Using NVIDIA Warp Kernels

angles = np.linspace(0.0, 2.0 * np.pi, n_particles, endpoint=False, dtype=np.float32)
px0_np = 0.4 * np.cos(angles).astype(np.float32)
py0_np = (0.7 + 0.15 * np.sin(angles)).astype(np.float32)
vx0_np = (-0.8 * np.sin(angles)).astype(np.float32)
vy0_np = (0.8 * np.cos(angles)).astype(np.float32)

px0_wp = wp.array(px0_np, dtype=wp.float32, device=device)
py0_wp = wp.array(py0_np, dtype=wp.float32, device=device)
vx0_wp = wp.array(vx0_np, dtype=wp.float32, device=device)
vy0_wp = wp.array(vy0_np, dtype=wp.float32, device=device)

state_size = (steps + 1) * n_particles
px_wp = wp.empty(state_size, dtype=wp.float32, device=device)
py_wp = wp.empty(state_size, dtype=wp.float32, device=device)
vx_wp = wp.empty(state_size, dtype=wp.float32, device=device)
vy_wp = wp.empty(state_size, dtype=wp.float32, device=device)

wp.launch(
kernel=init_particles_kernel,
dim=n_particles,
inputs=[n_particles, px0_wp, py0_wp, vx0_wp, vy0_wp],
outputs=[px_wp, py_wp, vx_wp, vy_wp],
device=device,
)

wp.launch(
kernel=simulate_particles_kernel,
dim=steps * n_particles,
inputs=[n_particles, dt, gravity, damping, bounce, radius],
outputs=[px_wp, py_wp, vx_wp, vy_wp],
device=device,
)
wp.synchronize()

px_traj = px_wp.numpy().reshape(steps + 1, n_particles)
py_traj = py_wp.numpy().reshape(steps + 1, n_particles)

sample_ids = np.linspace(0, n_particles – 1, 16, dtype=int)
plt.figure(figsize=(8, 6))
for idx in sample_ids:
plt.plot(px_traj[:, idx], py_traj[:, idx], linewidth=1.5)
plt.axhline(radius, linestyle=”–“)
plt.xlim(-1.05, 1.05)
plt.ylim(0.0, 1.25)
plt.title(f”Warp particle trajectories on {device}”)
plt.xlabel(“x”)
plt.ylabel(“y”)
plt.show()

proj_steps = 180
proj_dt = np.float32(0.025)
proj_g = np.float32(-9.8)
target_x = np.float32(3.8)
target_y = np.float32(0.0)

vx_value = np.float32(2.0)
vy_value = np.float32(6.5)
lr = 0.08
iters = 60

loss_history = []
vx_history = []
vy_history = []

for it in range(iters):
init_vx_wp = wp.array(np.array([vx_value], dtype=np.float32), dtype=wp.float32, device=device, requires_grad=True)
init_vy_wp = wp.array(np.array([vy_value], dtype=np.float32), dtype=wp.float32, device=device, requires_grad=True)

x_hist_wp = wp.zeros(proj_steps + 1, dtype=wp.float32, device=device, requires_grad=True)
y_hist_wp = wp.zeros(proj_steps + 1, dtype=wp.float32, device=device, requires_grad=True)
vx_hist_wp = wp.zeros(proj_steps + 1, dtype=wp.float32, device=device, requires_grad=True)
vy_hist_wp = wp.zeros(proj_steps + 1, dtype=wp.float32, device=device, requires_grad=True)
loss_wp = wp.zeros(1, dtype=wp.float32, device=device, requires_grad=True)

tape = wp.Tape()
with tape:
wp.launch(
kernel=init_projectile_kernel,
dim=1,
inputs=[],
outputs=[x_hist_wp, y_hist_wp, vx_hist_wp, vy_hist_wp, init_vx_wp, init_vy_wp],
device=device,
)
wp.launch(
kernel=projectile_step_kernel,
dim=proj_steps,
inputs=[proj_dt, proj_g],
outputs=[x_hist_wp, y_hist_wp, vx_hist_wp, vy_hist_wp],
device=device,
)
wp.launch(
kernel=projectile_loss_kernel,
dim=1,
inputs=[proj_steps, target_x, target_y],
outputs=[x_hist_wp, y_hist_wp, loss_wp],
device=device,
)

tape.backward(loss=loss_wp)
wp.synchronize()

current_loss = float(loss_wp.numpy()[0])
grad_vx = float(init_vx_wp.grad.numpy()[0])
grad_vy = float(init_vy_wp.grad.numpy()[0])

vx_value = np.float32(vx_value – lr * grad_vx)
vy_value = np.float32(vy_value – lr * grad_vy)

loss_history.append(current_loss)
vx_history.append(float(vx_value))
vy_history.append(float(vy_value))

if it % 10 == 0 or it == iters – 1:
print(f”iter={it:02d} loss={current_loss:.6f} vx={vx_value:.4f} vy={vy_value:.4f} grad=({grad_vx:.4f}, {grad_vy:.4f})”)

final_init_vx_wp = wp.array(np.array([vx_value], dtype=np.float32), dtype=wp.float32, device=device)
final_init_vy_wp = wp.array(np.array([vy_value], dtype=np.float32), dtype=wp.float32, device=device)
x_hist_wp = wp.zeros(proj_steps + 1, dtype=wp.float32, device=device)
y_hist_wp = wp.zeros(proj_steps + 1, dtype=wp.float32, device=device)
vx_hist_wp = wp.zeros(proj_steps + 1, dtype=wp.float32, device=device)
vy_hist_wp = wp.zeros(proj_steps + 1, dtype=wp.float32, device=device)

wp.launch(
kernel=init_projectile_kernel,
dim=1,
inputs=[],
outputs=[x_hist_wp, y_hist_wp, vx_hist_wp, vy_hist_wp, final_init_vx_wp, final_init_vy_wp],
device=device,
)
wp.launch(
kernel=projectile_step_kernel,
dim=proj_steps,
inputs=[proj_dt, proj_g],
outputs=[x_hist_wp, y_hist_wp, vx_hist_wp, vy_hist_wp],
device=device,
)
wp.synchronize()

x_path = x_hist_wp.numpy()
y_path = y_hist_wp.numpy()

fig = plt.figure(figsize=(15, 4))

ax1 = fig.add_subplot(1, 3, 1)
ax1.plot(loss_history)
ax1.set_title(“Optimization loss”)
ax1.set_xlabel(“Iteration”)
ax1.set_ylabel(“Squared distance”)

ax2 = fig.add_subplot(1, 3, 2)
ax2.plot(vx_history, label=”vx”)
ax2.plot(vy_history, label=”vy”)
ax2.set_title(“Learned initial velocity”)
ax2.set_xlabel(“Iteration”)
ax2.legend()

ax3 = fig.add_subplot(1, 3, 3)
ax3.plot(x_path, y_path, linewidth=2)
ax3.scatter([target_x], [target_y], s=80, marker=”x”)
ax3.set_title(“Differentiable projectile trajectory”)
ax3.set_xlabel(“x”)
ax3.set_ylabel(“y”)
ax3.set_ylim(-0.1, max(1.0, float(np.max(y_path)) + 0.3))

plt.tight_layout()
plt.show()

final_dx = float(x_path[-1] – target_x)
final_dy = float(y_path[-1] – target_y)
final_dist = math.sqrt(final_dx * final_dx + final_dy * final_dy)
print(f”Final target miss distance: {final_dist:.6f}”)
print(f”Optimized initial velocity: vx={vx_value:.6f}, vy={vy_value:.6f}”)

Source link

How to Build High-Performance GPU-Accelerated Simulations and Differentiable Physics Workflows Using NVIDIA Warp Kernels

U.S. Holds Off on New AI Chip Export Rules in Surprise Move in Tech Export Wars

Can AI help predict which heart-failure patients will worsen within a year? | MIT News

NanoClaw and Docker partner to make sandboxes the safest way for enterprises to deploy AI agents

How multi-agent AI economics influence business automation

Bitmine Immersion Technologies Crosses 4.59 million ETH After Purchasing 60,999 Tokens in a Single Week

DAOs May Need To Ditch Decentralization To Court Institutions

Bitcoin Bollinger Bands Setting Up BTC Price for “Powerful Move”

Stocks Settle Sharply Higher as Crude Oil Slumps

How to Build High-Performance GPU-Accelerated Simulations and Differentiable Physics Workflows Using NVIDIA Warp Kernels

Top Insights

Wife Uses CCTV To Pocket $176 Million

10 High Income Skills For Your 20s That AI Won’t Replace

How to Build High-Performance GPU-Accelerated Simulations and Differentiable Physics Workflows Using NVIDIA Warp Kernels

Related Posts