Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avoid bounds checking where it is safe to do so #74

Merged
merged 10 commits into from
Nov 18, 2024
1 change: 1 addition & 0 deletions src/PointNeighbors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ using Reexport: @reexport

using Adapt: Adapt
using Atomix: Atomix
using Base: @propagate_inbounds
using GPUArraysCore: AbstractGPUArray
using KernelAbstractions: KernelAbstractions, @kernel, @index
using LinearAlgebra: dot
Expand Down
4 changes: 2 additions & 2 deletions src/cell_lists/full_grid.jl
Original file line number Diff line number Diff line change
Expand Up @@ -156,15 +156,15 @@ function each_cell_index(cell_list::FullGridCellList{Nothing})
error("`search_radius` is not defined for this cell list")
end

@inline function cell_index(cell_list::FullGridCellList, cell::Tuple)
svchb marked this conversation as resolved.
Show resolved Hide resolved
@propagate_inbounds function cell_index(cell_list::FullGridCellList, cell::Tuple)
(; linear_indices) = cell_list

return linear_indices[cell...]
end

@inline cell_index(::FullGridCellList, cell::Integer) = cell

@inline function Base.getindex(cell_list::FullGridCellList, cell)
@propagate_inbounds function Base.getindex(cell_list::FullGridCellList, cell)
LasNikas marked this conversation as resolved.
Show resolved Hide resolved
(; cells) = cell_list

return cells[cell_index(cell_list, cell)]
Expand Down
21 changes: 18 additions & 3 deletions src/neighborhood_search.jl
Original file line number Diff line number Diff line change
Expand Up @@ -164,8 +164,13 @@ end

@inline function foreach_point_neighbor(f, system_coords, neighbor_coords,
neighborhood_search, points, parallel::Val{true})
# Explicit bounds check before the hot loop (or GPU kernel)
@boundscheck checkbounds(system_coords, ndims(neighborhood_search))

@threaded system_coords for point in points
foreach_neighbor(f, system_coords, neighbor_coords, neighborhood_search, point)
# Now we can assume that `point` is inbounds
@inbounds foreach_neighbor(f, system_coords, neighbor_coords,
neighborhood_search, point)
end

return nothing
Expand All @@ -175,17 +180,27 @@ end
@inline function foreach_point_neighbor(f, system_coords, neighbor_coords,
neighborhood_search, points,
backend::ParallelizationBackend)
# Explicit bounds check before the hot loop (or GPU kernel)
@boundscheck checkbounds(system_coords, ndims(neighborhood_search))

@threaded backend for point in points
foreach_neighbor(f, system_coords, neighbor_coords, neighborhood_search, point)
# Now we can assume that `point` is inbounds
@inbounds foreach_neighbor(f, system_coords, neighbor_coords,
neighborhood_search, point)
end

return nothing
end

@inline function foreach_point_neighbor(f, system_coords, neighbor_coords,
neighborhood_search, points, parallel::Val{false})
# Explicit bounds check before the hot loop
@boundscheck checkbounds(system_coords, ndims(neighborhood_search))

for point in points
foreach_neighbor(f, system_coords, neighbor_coords, neighborhood_search, point)
# Now we can assume that `point` is inbounds
@inbounds foreach_neighbor(f, system_coords, neighbor_coords,
neighborhood_search, point)
end

return nothing
Expand Down
31 changes: 25 additions & 6 deletions src/nhs_grid.jl
Original file line number Diff line number Diff line change
Expand Up @@ -343,18 +343,37 @@ function update_grid!(neighborhood_search::GridNeighborhoodSearch{<:Any, Paralle
return neighborhood_search
end

@inline function foreach_neighbor(f, system_coords, neighbor_system_coords,
neighborhood_search::GridNeighborhoodSearch, point;
search_radius = search_radius(neighborhood_search))
@propagate_inbounds function foreach_neighbor(f, system_coords, neighbor_system_coords,
neighborhood_search::GridNeighborhoodSearch,
point;
search_radius = search_radius(neighborhood_search))
# Due to https://github.com/JuliaLang/julia/issues/30411, we cannot just remove
# a `@boundscheck` by calling this function with `@inbounds` because it has a kwarg.
# We have to use `@propagate_inbounds`, which will also remove boundschecks
# in the neighbor loop, which is not safe (see comment below).
# To avoid this, we have to use a function barrier to disable the `@inbounds` again.
point_coords = extract_svector(system_coords, Val(ndims(neighborhood_search)), point)

__foreach_neighbor(f, system_coords, neighbor_system_coords, neighborhood_search,
point, point_coords, search_radius)
end

@inline function __foreach_neighbor(f, system_coords, neighbor_system_coords,
neighborhood_search::GridNeighborhoodSearch,
point, point_coords, search_radius)
(; periodic_box) = neighborhood_search

point_coords = extract_svector(system_coords, Val(ndims(neighborhood_search)), point)
cell = cell_coords(point_coords, neighborhood_search)

for neighbor_cell_ in neighboring_cells(cell, neighborhood_search)
neighbor_cell = Tuple(neighbor_cell_)
neighbors = points_in_cell(neighbor_cell, neighborhood_search)

for neighbor_ in eachindex(neighbors)
neighbor = @inbounds neighbors[neighbor_]
svchb marked this conversation as resolved.
Show resolved Hide resolved

for neighbor in points_in_cell(neighbor_cell, neighborhood_search)
# Making the following `@inbounds` yields a ~2% speedup on an NVIDIA H100.
# But we don't know if `neighbor` (extracted from the cell list) is in bounds.
neighbor_coords = extract_svector(neighbor_system_coords,
Val(ndims(neighborhood_search)), neighbor)

Expand Down Expand Up @@ -392,7 +411,7 @@ end
for cell in neighboring_cells(cell, neighborhood_search))
end

@inline function points_in_cell(cell_index, neighborhood_search)
@propagate_inbounds function points_in_cell(cell_index, neighborhood_search)
(; cell_list) = neighborhood_search

return cell_list[periodic_cell_index(cell_index, neighborhood_search)]
Expand Down
21 changes: 16 additions & 5 deletions src/util.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
# Return the `i`-th column of the array `A` as an `SVector`.
@inline function extract_svector(A, ::Val{NDIMS}, i) where {NDIMS}
return SVector(ntuple(@inline(dim->A[dim, i]), NDIMS))
# Explicit bounds check, which can be removed by calling this function with `@inbounds`
@boundscheck checkbounds(A, NDIMS, i)

# Assume inbounds access now
return SVector(ntuple(@inline(dim->@inbounds A[dim, i]), NDIMS))
end

# When particles end up with coordinates so big that the cell coordinates
Expand All @@ -13,13 +17,20 @@ end
# If we threw an error here, we would prevent the time integration method from
# retrying with a smaller time step, and we would thus crash perfectly fine simulations.
@inline function floor_to_int(i)
if isnan(i) || i > typemax(Int)
# `Base.floor(Int, i)` is defined as `trunc(Int, round(x, RoundDown))`
rounded = round(i, RoundDown)

# `Base.trunc(Int, x)` throws an `InexactError` in these cases, and otherwise
# returns `unsafe_trunc(Int, rounded)`.
if isnan(rounded) || rounded >= typemax(Int)
return typemax(Int)
elseif i < typemin(Int)
elseif rounded <= typemin(Int)
return typemin(Int)
end

return floor(Int, i)
# After making sure that `rounded` is in the range of `Int`,
# we can safely call `unsafe_trunc`.
return unsafe_trunc(Int, rounded)
end

abstract type AbstractThreadingBackend end
Expand Down Expand Up @@ -134,7 +145,7 @@ end
# Call the generic kernel that is defined below, which only calls a function with
# the global GPU index.
generic_kernel(backend)(ndrange = ndrange) do i
@inline f(iterator[indices[i]])
@inbounds @inline f(iterator[indices[i]])
end

KernelAbstractions.synchronize(backend)
Expand Down
3 changes: 2 additions & 1 deletion src/vector_of_vectors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,10 @@ end
@inline function Base.getindex(vov::DynamicVectorOfVectors, i)
(; backend, lengths) = vov

# This is slightly faster than without explicit boundscheck and `@inbounds` below
@boundscheck checkbounds(vov, i)

return view(backend, 1:lengths[i], i)
return @inbounds view(backend, 1:lengths[i], i)
end

@inline function Base.push!(vov::DynamicVectorOfVectors, vector::AbstractVector)
Expand Down