
Commit

add some comments
Abdelrahman912 committed Nov 18, 2024
1 parent c7f4b0f commit d4d5967
Showing 4 changed files with 272 additions and 97 deletions.
9 changes: 7 additions & 2 deletions docs/src/literate-tutorials/gpu_qp_heat_equation.jl
@@ -6,10 +6,10 @@ using CUDA

left = Tensor{1, 2, Float32}((0, -0)) # define the left bottom corner of the grid.

-right = Tensor{1, 2, Float32}((100.0, 100.0)) # define the right top corner of the grid.
+right = Tensor{1, 2, Float32}((1000.0, 1000.0)) # define the right top corner of the grid.


-grid = generate_grid(Quadrilateral, (100, 100), left, right)
+grid = generate_grid(Quadrilateral, (1000, 1000), left, right)


ip = Lagrange{RefQuadrilateral, 2}() # define the interpolation function (i.e. biquadratic Lagrange)
@@ -168,3 +168,8 @@ norm(K)
## norm(Kgpu)
Kstd, Fstd = stassy(cellvalues, dh);
norm(Kstd)


## GPU Benchmarking, remove when not needed ##
## CUDA.@time gpu_kernel()
## CUDA.@profile trace = true gpu_kernel()
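For quick wall-clock numbers outside the profiler, a minimal benchmarking sketch in the same commented-out style (assumes the BenchmarkTools package is available and that `gpu_kernel` is the launcher object built earlier in the tutorial; illustrative, not part of this commit):
## using BenchmarkTools
## @btime CUDA.@sync $gpu_kernel()  ## interpolate with $ so global-variable overhead does not skew the timing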
90 changes: 76 additions & 14 deletions ext/GPU/CUDAKernelLauncher.jl
@@ -1,3 +1,23 @@
## This file handles the launch of GPU kernels on the CUDA backend ##

"""
Ferrite.init_kernel(::Type{BackendCUDA}, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti <: Integer}
Initialize a CUDA kernel for the Ferrite framework.
# Arguments
- `::Type{BackendCUDA}`: Specifies the CUDA backend.
- `n_cells::Ti`: Number of cells in the problem.
- `n_basefuncs::Ti`: Number of shape functions per cell.
- `kernel::Function`: The CUDA kernel function to execute.
- `args::Tuple`: Tuple of arguments for the kernel.
# Returns
- A `LazyKernel` object encapsulating the kernel and its execution configuration.
# Errors
Throws an `ArgumentError` if CUDA is not functional (e.g., due to missing drivers or improper installation).
"""
function Ferrite.init_kernel(::Type{BackendCUDA}, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti <: Integer}
if CUDA.functional()
return LazyKernel(n_cells, n_basefuncs, kernel, args, BackendCUDA)
@@ -6,14 +6,16 @@ function Ferrite.init_kernel(::Type{BackendCUDA}, n_cells::Ti, n_basefuncs::Ti,
end
end
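A minimal usage sketch for the entry point above (tutorial-style names; `assemble_gpu!` and the argument tuple are hypothetical stand-ins for the caller's kernel function and data):
kernel = Ferrite.init_kernel(BackendCUDA, Int32(n_cells), Int32(n_basefuncs), assemble_gpu!, (Kgpu, fgpu, cellvalues, dh))  ## hypothetical kernel + args
On a machine where `CUDA.functional()` is false (missing driver or device), this throws the `ArgumentError` instead of returning a `LazyKernel`.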


"""
-Ferrite.launch_kernel!(kernel_config::CUDAKernelLauncher{Ti}) where Ti
-Launch a CUDA kernel with the given configuration.
-Arguments:
-- `kernel_config`: The `CUDAKernelLauncher` object containing higher-level fields for kernel configuration.
+Ferrite.launch!(kernel::LazyKernel{Ti, BackendCUDA}) where {Ti}
+Launch a CUDA kernel encapsulated in a `LazyKernel` object.
+# Arguments
+- `kernel::LazyKernel`: The kernel to be launched, along with its configuration.
+# Returns
+- `nothing`: Indicates that the kernel was launched and synchronized successfully.
"""
function Ferrite.launch!(kernel::LazyKernel{Ti, BackendCUDA}) where {Ti}
n_cells = kernel.n_cells
@@ -27,18 +49,33 @@ function Ferrite.launch!(kernel::LazyKernel{Ti, BackendCUDA}) where {Ti}
blocks = _calculate_nblocks(threads, n_cells)

## use dynamic shared memory if possible
-_can_use_dynshmem(shared_mem) && return kernel(args...; threads, blocks, shmem = shared_mem)
+_can_use_dynshmem(shared_mem) && return CUDA.@sync kernel(args...; threads, blocks, shmem = shared_mem)

## otherwise use global memory
nes = blocks * threads
kes = CUDA.zeros(Float32, nes, n_basefuncs, n_basefuncs)
fes = CUDA.zeros(Float32, nes, n_basefuncs)
args = _to_localdh(args, kes, fes)
-@cuda blocks = blocks threads = threads ker(args...)
+CUDA.@sync @cuda blocks = blocks threads = threads ker(args...)
return nothing
end
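Taken together with `init_kernel`, the intended round trip is a two-liner (same hypothetical names as above):
kernel = Ferrite.init_kernel(BackendCUDA, n_cells, n_basefuncs, assemble_gpu!, args)
Ferrite.launch!(kernel)  ## returns nothing once the kernel has finished
The `CUDA.@sync` now present in both branches is what makes the launch safe to time and to follow immediately with host-side reads of the assembled results.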

"""
_to_localdh(args::Tuple, kes::AbstractArray, fes::AbstractArray)
Convert a global degree-of-freedom handler to a local handler for use on the GPU.
# Arguments
- `args::Tuple`: Kernel arguments.
- `kes::AbstractArray`: GPU storage for element stiffness matrices.
- `fes::AbstractArray`: GPU storage for element force vectors.
# Returns
- `Tuple`: Updated arguments tuple with the degree-of-freedom handler replaced by a local GPU handler.
# Errors
Throws an `ErrorException` if no `AbstractDofHandler` is found in `args`.
"""
function _to_localdh(args::Tuple, kes::AbstractArray, fes::AbstractArray)
dh_index = findfirst(x -> x isa Ferrite.AbstractDofHandler, args)
dh_index !== nothing || throw(ErrorException("No subtype of AbstractDofHandler found in the arguments"))
@@ -48,29 +48,54 @@ function _to_localdh(args::Tuple, kes::AbstractArray, fes::AbstractArray)
return Tuple(arr)
end
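The swap itself is just "locate the dof handler in the argument tuple, replace it in a mutable copy, re-tuple"; a self-contained toy of the same pattern (plain Julia, illustrative only):
args = (1.0, "dh", :other)            ## stand-in arguments; "dh" plays the dof handler
arr = collect(Any, args)              ## tuples are immutable, so copy into a Vector{Any}
i = findfirst(x -> x isa String, arr) ## i == 2
arr[i] = "localdh"                    ## swap in the local (GPU) handler
Tuple(arr)                            ## (1.0, "localdh", :other)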

"""
_calculate_shared_memory(threads::Integer, n_basefuncs::Integer)
Calculate the shared memory required for kernel execution.
# Arguments
- `threads::Integer`: Number of threads per block.
- `n_basefuncs::Integer`: Number of basis functions per cell.
# Returns
- `Integer`: Amount of shared memory in bytes.
"""
function _calculate_shared_memory(threads::Integer, n_basefuncs::Integer)
return sizeof(Float32) * threads * n_basefuncs * n_basefuncs + sizeof(Float32) * threads * n_basefuncs
end
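Concretely, with `Float32` (4 bytes), `threads = 256`, and `n_basefuncs = 4`, this reserves 4 * 256 * 4 * 4 = 16384 bytes for the element matrices plus 4 * 256 * 4 = 4096 bytes for the element vectors:
_calculate_shared_memory(256, 4)  ## 16384 + 4096 = 20480 bytes per block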

"""
_can_use_dynshmem(required_shmem::Integer)
Check if the GPU supports the required amount of dynamic shared memory.
# Arguments
- `required_shmem::Integer`: Required shared memory size in bytes.
# Returns
- `Bool`: `true` if the GPU can provide the required shared memory; `false` otherwise.
"""
function _can_use_dynshmem(required_shmem::Integer)
dev = device()
-MAX_DYN_SHMEM = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK) #size of dynamic shared memory
+MAX_DYN_SHMEM = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)
return required_shmem < MAX_DYN_SHMEM
end
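Most current NVIDIA devices report at least the classic 48 KiB (49152 bytes) per-block limit for this attribute, so the 20480-byte example above would pass the check and take the dynamic shared memory path (device-dependent; an illustration, not a guarantee):
_can_use_dynshmem(20480)  ## true on a device reporting 49152 bytes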


"""
-_calculate_nblocks(threads::Int, n_cells::Int)
-Calculate the number of blocks to be used in the kernel launch.
+_calculate_nblocks(threads::Integer, n_cells::Integer)
+Calculate the number of blocks required for kernel execution.
+# Arguments
+- `threads::Integer`: Number of threads per block.
+- `n_cells::Integer`: Total number of cells to process.
+# Returns
+- `Integer`: Number of blocks to launch.
"""
function _calculate_nblocks(threads::Integer, n_cells::Integer)
dev = device()
no_sms = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)
# the number of blocks is usually a multiple of the number of SMs
# an occupancy test should be done on threads and blocks
# the goal is to calculate how many active blocks per SM there are and multiply by the number of SMs
required_blocks = cld(n_cells, threads)
required_blocks < 2 * no_sms || return 2 * no_sms
return required_blocks
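A worked example on a hypothetical 80-SM device (roughly V100-class): for the 1000 x 1000 grid above, `n_cells = 1_000_000` and `threads = 256` give `cld(1_000_000, 256) == 3907` required blocks, which is not below `2 * 80 == 160`, so the cap applies and each launched thread covers several cells:
## _calculate_nblocks(256, 1_000_000) == 160  ## on the hypothetical 80-SM device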
