From 26c98ba721a47ef697894248b7d1a68afc464bc9 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Thu, 14 Nov 2024 18:35:01 +0100 Subject: [PATCH] Make benchmarks run on the GPU (#77) * Make WCSPH benchmark run on the GPU * Add FP32 WCSPH benchmark * Add GPU benchmark of N-Body as well * Fix typo --- benchmarks/n_body.jl | 14 +++- benchmarks/smoothed_particle_hydrodynamics.jl | 76 +++++++++++++++++-- 2 files changed, 80 insertions(+), 10 deletions(-) diff --git a/benchmarks/n_body.jl b/benchmarks/n_body.jl index 5b04a26..6cd0272 100644 --- a/benchmarks/n_body.jl +++ b/benchmarks/n_body.jl @@ -12,8 +12,15 @@ This is a more realistic benchmark for particle-based simulations than However, due to the higher computational cost, differences between neighborhood search implementations are less pronounced. """ -function benchmark_n_body(neighborhood_search, coordinates; parallel = true) - mass = 1e10 * (rand(size(coordinates, 2)) .+ 1) +function benchmark_n_body(neighborhood_search, coordinates_; parallel = true) + # Passing a different backend like `CUDA.CUDABackend` + # allows us to change the type of the array to run the benchmark on the GPU. + # Passing `parallel = true` or `parallel = false` will not change anything here. + coordinates = PointNeighbors.Adapt.adapt(parallel, coordinates_) + nhs = PointNeighbors.Adapt.adapt(parallel, neighborhood_search) + + # This preserves the data type of `coordinates`, which makes it work for GPU types + mass = 1e10 * (rand!(similar(coordinates, size(coordinates, 2))) .+ 1) G = 6.6743e-11 dv = similar(coordinates) @@ -36,6 +43,5 @@ function benchmark_n_body(neighborhood_search, coordinates; parallel = true) return dv end - return @belapsed $compute_acceleration!($dv, $coordinates, $mass, $G, - $neighborhood_search, $parallel) + return @belapsed $compute_acceleration!($dv, $coordinates, $mass, $G, $nhs, $parallel) end diff --git a/benchmarks/smoothed_particle_hydrodynamics.jl b/benchmarks/smoothed_particle_hydrodynamics.jl index bb90c5a..2ccfb77 100644 --- a/benchmarks/smoothed_particle_hydrodynamics.jl +++ b/benchmarks/smoothed_particle_hydrodynamics.jl @@ -34,16 +34,80 @@ function benchmark_wcsph(neighborhood_search, coordinates; parallel = true) smoothing_length, viscosity = viscosity, density_diffusion = density_diffusion) - v = vcat(fluid.velocity, fluid.density') - u = copy(fluid.coordinates) + # Note that we cannot just disable parallelism in TrixiParticles. + # But passing a different backend like `CUDA.CUDABackend` + # allows us to change the type of the array to run the benchmark on the GPU. + if parallel isa Bool + system = fluid_system + nhs = neighborhood_search + else + system = PointNeighbors.Adapt.adapt(parallel, fluid_system) + nhs = PointNeighbors.Adapt.adapt(parallel, neighborhood_search) + end + + v = PointNeighbors.Adapt.adapt(parallel, vcat(fluid.velocity, fluid.density')) + u = PointNeighbors.Adapt.adapt(parallel, coordinates) dv = zero(v) # Initialize the system - TrixiParticles.initialize!(fluid_system, neighborhood_search) - TrixiParticles.compute_pressure!(fluid_system, v) + TrixiParticles.initialize!(system, nhs) + TrixiParticles.compute_pressure!(system, v) - return @belapsed TrixiParticles.interact!($dv, $v, $u, $v, $u, $neighborhood_search, - $fluid_system, $fluid_system) + return @belapsed TrixiParticles.interact!($dv, $v, $u, $v, $u, $nhs, $system, $system) +end + +""" + benchmark_wcsph_fp32(neighborhood_search, coordinates; parallel = true) + +Like [`benchmark_wcsph`](@ref), but using single precision floating point numbers. +""" +function benchmark_wcsph_fp32(neighborhood_search, coordinates_; parallel = true) + coordinates = convert(Matrix{Float32}, coordinates_) + density = 1000.0f0 + fluid = InitialCondition(; coordinates, density, mass = 0.1f0) + + # Compact support == smoothing length for the Wendland kernel + smoothing_length = convert(Float32, PointNeighbors.search_radius(neighborhood_search)) + if ndims(neighborhood_search) == 1 + smoothing_kernel = SchoenbergCubicSplineKernel{1}() + else + smoothing_kernel = WendlandC2Kernel{ndims(neighborhood_search)}() + end + + sound_speed = 10.0f0 + state_equation = StateEquationCole(; sound_speed, reference_density = density, + exponent = 1) + + fluid_density_calculator = ContinuityDensity() + viscosity = ArtificialViscosityMonaghan(alpha = 0.02f0, beta = 0.0f0) + density_diffusion = DensityDiffusionMolteniColagrossi(delta = 0.1f0) + + fluid_system = WeaklyCompressibleSPHSystem(fluid, fluid_density_calculator, + state_equation, smoothing_kernel, + smoothing_length, viscosity = viscosity, + acceleration = (0.0f0, 0.0f0, 0.0f0), + density_diffusion = density_diffusion) + + # Note that we cannot just disable parallelism in TrixiParticles. + # But passing a different backend like `CUDA.CUDABackend` + # allows us to change the type of the array to run the benchmark on the GPU. + if parallel isa Bool + system = fluid_system + nhs = neighborhood_search + else + system = PointNeighbors.Adapt.adapt(parallel, fluid_system) + nhs = PointNeighbors.Adapt.adapt(parallel, neighborhood_search) + end + + v = PointNeighbors.Adapt.adapt(parallel, vcat(fluid.velocity, fluid.density')) + u = PointNeighbors.Adapt.adapt(parallel, coordinates) + dv = zero(v) + + # Initialize the system + TrixiParticles.initialize!(system, nhs) + TrixiParticles.compute_pressure!(system, v) + + return @belapsed TrixiParticles.interact!($dv, $v, $u, $v, $u, $nhs, $system, $system) end """