From 26c98ba721a47ef697894248b7d1a68afc464bc9 Mon Sep 17 00:00:00 2001
From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com>
Date: Thu, 14 Nov 2024 18:35:01 +0100
Subject: [PATCH] Make benchmarks run on the GPU (#77)

* Make WCSPH benchmark run on the GPU

* Add FP32 WCSPH benchmark

* Add GPU benchmark of N-Body as well

* Fix typo
---
 benchmarks/n_body.jl                          | 14 +++-
 benchmarks/smoothed_particle_hydrodynamics.jl | 76 +++++++++++++++++--
 2 files changed, 80 insertions(+), 10 deletions(-)

diff --git a/benchmarks/n_body.jl b/benchmarks/n_body.jl
index 5b04a26..6cd0272 100644
--- a/benchmarks/n_body.jl
+++ b/benchmarks/n_body.jl
@@ -12,8 +12,15 @@ This is a more realistic benchmark for particle-based simulations than
 However, due to the higher computational cost, differences between neighborhood search
 implementations are less pronounced.
 """
-function benchmark_n_body(neighborhood_search, coordinates; parallel = true)
-    mass = 1e10 * (rand(size(coordinates, 2)) .+ 1)
+function benchmark_n_body(neighborhood_search, coordinates_; parallel = true)
+    # Passing a different backend like `CUDA.CUDABackend`
+    # allows us to change the type of the array to run the benchmark on the GPU.
+    # Passing `parallel = true` or `parallel = false` will not change anything here.
+    coordinates = PointNeighbors.Adapt.adapt(parallel, coordinates_)
+    nhs = PointNeighbors.Adapt.adapt(parallel, neighborhood_search)
+
+    # This preserves the data type of `coordinates`, which makes it work for GPU types
+    mass = 1e10 * (rand!(similar(coordinates, size(coordinates, 2))) .+ 1)
     G = 6.6743e-11
 
     dv = similar(coordinates)
@@ -36,6 +43,5 @@ function benchmark_n_body(neighborhood_search, coordinates; parallel = true)
         return dv
     end
 
-    return @belapsed $compute_acceleration!($dv, $coordinates, $mass, $G,
-                                            $neighborhood_search, $parallel)
+    return @belapsed $compute_acceleration!($dv, $coordinates, $mass, $G, $nhs, $parallel)
 end
diff --git a/benchmarks/smoothed_particle_hydrodynamics.jl b/benchmarks/smoothed_particle_hydrodynamics.jl
index bb90c5a..2ccfb77 100644
--- a/benchmarks/smoothed_particle_hydrodynamics.jl
+++ b/benchmarks/smoothed_particle_hydrodynamics.jl
@@ -34,16 +34,80 @@ function benchmark_wcsph(neighborhood_search, coordinates; parallel = true)
                                                smoothing_length, viscosity = viscosity,
                                                density_diffusion = density_diffusion)
 
-    v = vcat(fluid.velocity, fluid.density')
-    u = copy(fluid.coordinates)
+    # Note that we cannot just disable parallelism in TrixiParticles.
+    # But passing a different backend like `CUDA.CUDABackend`
+    # allows us to change the type of the array to run the benchmark on the GPU.
+    if parallel isa Bool
+        system = fluid_system
+        nhs = neighborhood_search
+    else
+        system = PointNeighbors.Adapt.adapt(parallel, fluid_system)
+        nhs = PointNeighbors.Adapt.adapt(parallel, neighborhood_search)
+    end
+
+    v = PointNeighbors.Adapt.adapt(parallel, vcat(fluid.velocity, fluid.density'))
+    u = PointNeighbors.Adapt.adapt(parallel, coordinates)
     dv = zero(v)
 
     # Initialize the system
-    TrixiParticles.initialize!(fluid_system, neighborhood_search)
-    TrixiParticles.compute_pressure!(fluid_system, v)
+    TrixiParticles.initialize!(system, nhs)
+    TrixiParticles.compute_pressure!(system, v)
 
-    return @belapsed TrixiParticles.interact!($dv, $v, $u, $v, $u, $neighborhood_search,
-                                              $fluid_system, $fluid_system)
+    return @belapsed TrixiParticles.interact!($dv, $v, $u, $v, $u, $nhs, $system, $system)
+end
+
+"""
+    benchmark_wcsph_fp32(neighborhood_search, coordinates; parallel = true)
+
+Like [`benchmark_wcsph`](@ref), but using single precision floating point numbers.
+"""
+function benchmark_wcsph_fp32(neighborhood_search, coordinates_; parallel = true)
+    coordinates = convert(Matrix{Float32}, coordinates_)
+    density = 1000.0f0
+    fluid = InitialCondition(; coordinates, density, mass = 0.1f0)
+
+    # Compact support == smoothing length for the Wendland kernel
+    smoothing_length = convert(Float32, PointNeighbors.search_radius(neighborhood_search))
+    if ndims(neighborhood_search) == 1
+        smoothing_kernel = SchoenbergCubicSplineKernel{1}()
+    else
+        smoothing_kernel = WendlandC2Kernel{ndims(neighborhood_search)}()
+    end
+
+    sound_speed = 10.0f0
+    state_equation = StateEquationCole(; sound_speed, reference_density = density,
+                                       exponent = 1)
+
+    fluid_density_calculator = ContinuityDensity()
+    viscosity = ArtificialViscosityMonaghan(alpha = 0.02f0, beta = 0.0f0)
+    density_diffusion = DensityDiffusionMolteniColagrossi(delta = 0.1f0)
+
+    fluid_system = WeaklyCompressibleSPHSystem(fluid, fluid_density_calculator,
+                                               state_equation, smoothing_kernel,
+                                               smoothing_length, viscosity = viscosity,
+                                               acceleration = (0.0f0, 0.0f0, 0.0f0),
+                                               density_diffusion = density_diffusion)
+
+    # Note that we cannot just disable parallelism in TrixiParticles.
+    # But passing a different backend like `CUDA.CUDABackend`
+    # allows us to change the type of the array to run the benchmark on the GPU.
+    if parallel isa Bool
+        system = fluid_system
+        nhs = neighborhood_search
+    else
+        system = PointNeighbors.Adapt.adapt(parallel, fluid_system)
+        nhs = PointNeighbors.Adapt.adapt(parallel, neighborhood_search)
+    end
+
+    v = PointNeighbors.Adapt.adapt(parallel, vcat(fluid.velocity, fluid.density'))
+    u = PointNeighbors.Adapt.adapt(parallel, coordinates)
+    dv = zero(v)
+
+    # Initialize the system
+    TrixiParticles.initialize!(system, nhs)
+    TrixiParticles.compute_pressure!(system, v)
+
+    return @belapsed TrixiParticles.interact!($dv, $v, $u, $v, $u, $nhs, $system, $system)
 end
 
 """