
Commit

add some comments
Abdelrahman912 committed Nov 18, 2024
1 parent c7f4b0f commit d4d5967
Showing 4 changed files with 272 additions and 97 deletions.
9 changes: 7 additions & 2 deletions docs/src/literate-tutorials/gpu_qp_heat_equation.jl
@@ -6,10 +6,10 @@ using CUDA

left = Tensor{1, 2, Float32}((0, -0)) # define the left bottom corner of the grid.

-right = Tensor{1, 2, Float32}((100.0, 100.0)) # define the right top corner of the grid.
+right = Tensor{1, 2, Float32}((1000.0, 1000.0)) # define the right top corner of the grid.


-grid = generate_grid(Quadrilateral, (100, 100), left, right)
+grid = generate_grid(Quadrilateral, (1000, 1000), left, right)


ip = Lagrange{RefQuadrilateral, 2}() # define the interpolation function (i.e. biquadratic Lagrange)
@@ -168,3 +168,8 @@ norm(K)
## norm(Kgpu)
Kstd, Fstd = stassy(cellvalues, dh);
norm(Kstd)


## GPU Benchmarking, remove when not needed ##
## CUDA.@time gpu_kernel()
## CUDA.@profile trace = true gpu_kernel()
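For quick wall-clock numbers outside the profiler, a minimal benchmarking sketch in the same commented-out style (assumes the BenchmarkTools package is available and that `gpu_kernel` is the launcher object built earlier in the tutorial; illustrative, not part of this commit):
## using BenchmarkTools
## @btime CUDA.@sync $gpu_kernel()  ## interpolate with $ so global-variable overhead does not skew the timing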
90 changes: 76 additions & 14 deletions ext/GPU/CUDAKernelLauncher.jl
@@ -1,3 +1,23 @@
## This file handles the launch of GPU kernels on the CUDA backend ##

"""
Ferrite.init_kernel(::Type{BackendCUDA}, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti <: Integer}
Initialize a CUDA kernel for the Ferrite framework.
# Arguments
- `::Type{BackendCUDA}`: Specifies the CUDA backend.
- `n_cells::Ti`: Number of cells in the problem.
- `n_basefuncs::Ti`: Number of shape functions per cell.
- `kernel::Function`: The CUDA kernel function to execute.
- `args::Tuple`: Tuple of arguments for the kernel.
# Returns
- A `LazyKernel` object encapsulating the kernel and its execution configuration.
# Errors
Throws an `ArgumentError` if CUDA is not functional (e.g., due to missing drivers or improper installation).
"""
function Ferrite.init_kernel(::Type{BackendCUDA}, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti <: Integer}
if CUDA.functional()
return LazyKernel(n_cells, n_basefuncs, kernel, args, BackendCUDA)
@@ -6,14 +6,16 @@ function Ferrite.init_kernel(::Type{BackendCUDA}, n_cells::Ti, n_basefuncs::Ti,
end
end
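A minimal usage sketch for the entry point above (tutorial-style names; `assemble_gpu!` and the argument tuple are hypothetical stand-ins for the caller's kernel function and data):
kernel = Ferrite.init_kernel(BackendCUDA, Int32(n_cells), Int32(n_basefuncs), assemble_gpu!, (Kgpu, fgpu, cellvalues, dh))  ## hypothetical kernel + args
On a machine where `CUDA.functional()` is false (missing driver or device), this throws the `ArgumentError` instead of returning a `LazyKernel`.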


"""
-Ferrite.launch_kernel!(kernel_config::CUDAKernelLauncher{Ti}) where Ti
-Launch a CUDA kernel with the given configuration.
-Arguments:
-- `kernel_config`: The `CUDAKernelLauncher` object containing higher-level fields for kernel configuration.
+Ferrite.launch!(kernel::LazyKernel{Ti, BackendCUDA}) where {Ti}
+Launch a CUDA kernel encapsulated in a `LazyKernel` object.
+# Arguments
+- `kernel::LazyKernel`: The kernel to be launched, along with its configuration.
+# Returns
+- `nothing`: Indicates that the kernel was launched and synchronized successfully.
"""
function Ferrite.launch!(kernel::LazyKernel{Ti, BackendCUDA}) where {Ti}
n_cells = kernel.n_cells
@@ -27,18 +49,33 @@ function Ferrite.launch!(kernel::LazyKernel{Ti, BackendCUDA}) where {Ti}
blocks = _calculate_nblocks(threads, n_cells)

## use dynamic shared memory if possible
-_can_use_dynshmem(shared_mem) && return kernel(args...; threads, blocks, shmem = shared_mem)
+_can_use_dynshmem(shared_mem) && return CUDA.@sync kernel(args...; threads, blocks, shmem = shared_mem)

## otherwise use global memory
nes = blocks * threads
kes = CUDA.zeros(Float32, nes, n_basefuncs, n_basefuncs)
fes = CUDA.zeros(Float32, nes, n_basefuncs)
args = _to_localdh(args, kes, fes)
-@cuda blocks = blocks threads = threads ker(args...)
+CUDA.@sync @cuda blocks = blocks threads = threads ker(args...)
return nothing
end
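Taken together with `init_kernel`, the intended round trip is a two-liner (same hypothetical names as above):
kernel = Ferrite.init_kernel(BackendCUDA, n_cells, n_basefuncs, assemble_gpu!, args)
Ferrite.launch!(kernel)  ## returns nothing once the kernel has finished
The `CUDA.@sync` now present in both branches is what makes the launch safe to time and to follow immediately with host-side reads of the assembled results.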

"""
_to_localdh(args::Tuple, kes::AbstractArray, fes::AbstractArray)
Convert a global degree-of-freedom handler to a local handler for use on the GPU.
# Arguments
- `args::Tuple`: Kernel arguments.
- `kes::AbstractArray`: GPU storage for element stiffness matrices.
- `fes::AbstractArray`: GPU storage for element force vectors.
# Returns
- `Tuple`: Updated arguments tuple with the degree-of-freedom handler replaced by a local GPU handler.
# Errors
Throws an `ErrorException` if no `AbstractDofHandler` is found in `args`.
"""
function _to_localdh(args::Tuple, kes::AbstractArray, fes::AbstractArray)
dh_index = findfirst(x -> x isa Ferrite.AbstractDofHandler, args)
dh_index !== nothing || throw(ErrorException("No subtype of AbstractDofHandler found in the arguments"))
@@ -48,29 +48,54 @@ function _to_localdh(args::Tuple, kes::AbstractArray, fes::AbstractArray)
return Tuple(arr)
end
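The swap itself is just "locate the dof handler in the argument tuple, replace it in a mutable copy, re-tuple"; a self-contained toy of the same pattern (plain Julia, illustrative only):
args = (1.0, "dh", :other)            ## stand-in arguments; "dh" plays the dof handler
arr = collect(Any, args)              ## tuples are immutable, so copy into a Vector{Any}
i = findfirst(x -> x isa String, arr) ## i == 2
arr[i] = "localdh"                    ## swap in the local (GPU) handler
Tuple(arr)                            ## (1.0, "localdh", :other)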

"""
_calculate_shared_memory(threads::Integer, n_basefuncs::Integer)
Calculate the shared memory required for kernel execution.
# Arguments
- `threads::Integer`: Number of threads per block.
- `n_basefuncs::Integer`: Number of basis functions per cell.
# Returns
- `Integer`: Amount of shared memory in bytes.
"""
function _calculate_shared_memory(threads::Integer, n_basefuncs::Integer)
return sizeof(Float32) * threads * n_basefuncs * n_basefuncs + sizeof(Float32) * threads * n_basefuncs
end
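Concretely, with `Float32` (4 bytes), `threads = 256`, and `n_basefuncs = 4`, this reserves 4 * 256 * 4 * 4 = 16384 bytes for the element matrices plus 4 * 256 * 4 = 4096 bytes for the element vectors:
_calculate_shared_memory(256, 4)  ## 16384 + 4096 = 20480 bytes per block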

"""
_can_use_dynshmem(required_shmem::Integer)
Check if the GPU supports the required amount of dynamic shared memory.
# Arguments
- `required_shmem::Integer`: Required shared memory size in bytes.
# Returns
- `Bool`: `true` if the GPU can provide the required shared memory; `false` otherwise.
"""
function _can_use_dynshmem(required_shmem::Integer)
dev = device()
-MAX_DYN_SHMEM = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK) #size of dynamic shared memory
+MAX_DYN_SHMEM = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)
return required_shmem < MAX_DYN_SHMEM
end
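Most current NVIDIA devices report at least the classic 48 KiB (49152 bytes) per-block limit for this attribute, so the 20480-byte example above would pass the check and take the dynamic shared memory path (device-dependent; an illustration, not a guarantee):
_can_use_dynshmem(20480)  ## true on a device reporting 49152 bytes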


"""
-_calculate_nblocks(threads::Int, n_cells::Int)
-Calculate the number of blocks to be used in the kernel launch.
+_calculate_nblocks(threads::Integer, n_cells::Integer)
+Calculate the number of blocks required for kernel execution.
+# Arguments
+- `threads::Integer`: Number of threads per block.
+- `n_cells::Integer`: Total number of cells to process.
+# Returns
+- `Integer`: Number of blocks to launch.
"""
function _calculate_nblocks(threads::Integer, n_cells::Integer)
dev = device()
no_sms = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)
# the number of blocks is usually a multiple of the number of SMs
# an occupancy test should be done on threads and blocks
# the goal is to calculate how many active blocks per SM there are and multiply by the number of SMs
required_blocks = cld(n_cells, threads)
required_blocks < 2 * no_sms || return 2 * no_sms
return required_blocks
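A worked example on a hypothetical 80-SM device (roughly V100-class): for the 1000 x 1000 grid above, `n_cells = 1_000_000` and `threads = 256` give `cld(1_000_000, 256) == 3907` required blocks, which is not below `2 * 80 == 160`, so the cap applies and each launched thread covers several cells:
## _calculate_nblocks(256, 1_000_000) == 160  ## on the hypothetical 80-SM device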
