Commit

adding comments and renaming variables for gpu kernels
danielpeter committed Nov 10, 2024
1 parent 40515c8 commit fadca31
Showing 5 changed files with 80 additions and 49 deletions.
6 changes: 6 additions & 0 deletions src/gpu/compute_coupling_cuda.cu
@@ -35,6 +35,9 @@

/* ----------------------------------------------------------------------------------------------- */

// coupling direction: elastic wavefield/domain -> acoustic wavefield/domain
// (updates acoustic potential_dot_dot wavefield)

extern EXTERN_LANG
void FC_FUNC_(compute_coupling_ac_el_cuda,
COMPUTE_COUPLING_AC_EL_CUDA)(long* Mesh_pointer,
@@ -131,6 +134,9 @@ void FC_FUNC_(compute_coupling_ac_el_cuda,

/* ----------------------------------------------------------------------------------------------- */

// coupling direction: acoustic wavefield/domain -> elastic wavefield/domain
// (updates elastic acceleration wavefield)

extern EXTERN_LANG
void FC_FUNC_(compute_coupling_el_ac_cuda,
COMPUTE_COUPLING_EL_AC_CUDA)(long* Mesh_pointer,
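The two comments added above state the coupling direction handled by each routine. For orientation, a heavily simplified sketch of what an elastic-to-acoustic coupling kernel of this kind typically does (the array names face_ibool, face_normal and face_weight and the exact weighting are assumptions for illustration, not the kernels wrapped by these routines):

// Hypothetical, simplified elastic -> acoustic coupling kernel:
// one thread per GLL point on the coupling interface, accumulating the
// normal component of the elastic displacement into the acoustic
// potential_dot_dot wavefield. Names and weighting are illustrative only.
__global__ void coupling_el_ac_sketch(const float* displ,          // elastic displacement (3 components per global node)
                                      float* potential_dot_dot,    // acoustic wavefield being updated
                                      const int* face_ibool,       // global node id per interface GLL point (1-based, Fortran)
                                      const float* face_normal,    // interface normal per GLL point (3 components)
                                      const float* face_weight,    // 2D GLL weight * jacobian per GLL point
                                      int num_face_points) {
  int igll = blockIdx.x*blockDim.x + threadIdx.x;
  if (igll >= num_face_points) return;

  // Fortran -> C index shift, as in the "-1" conventions used throughout these files
  int iglob = face_ibool[igll] - 1;

  // continuity of the normal displacement across the fluid-solid interface
  float dn = displ[3*iglob]*face_normal[3*igll]
           + displ[3*iglob+1]*face_normal[3*igll+1]
           + displ[3*iglob+2]*face_normal[3*igll+2];

  // neighboring interface elements can share global points, hence the atomic update
  atomicAdd(&potential_dot_dot[iglob], face_weight[igll]*dn);
}

The reverse routine (acoustic -> elastic) follows the same per-face-point pattern, with the acoustic contribution applied to the elastic acceleration wavefield instead.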
76 changes: 38 additions & 38 deletions src/gpu/kernels/Kernel_2_acoustic_impl.cu
@@ -110,8 +110,8 @@ Kernel_2_acoustic_impl(const int nb_blocks_to_compute,
realw* d_gammax,realw* d_gammay,realw* d_gammaz,
const realw xix_regular, const realw jacobian_regular,
realw_const_p d_hprime_xx,
realw_const_p hprimewgll_xx,
realw_const_p wgllwgll_xy,realw_const_p wgllwgll_xz,realw_const_p wgllwgll_yz,
realw_const_p d_hprimewgll_xx,
realw_const_p d_wgllwgll_xy,realw_const_p d_wgllwgll_xz,realw_const_p d_wgllwgll_yz,
realw* d_rhostore,
const int use_mesh_coloring_gpu,
const int gravity,
@@ -184,7 +184,7 @@ Kernel_2_acoustic_impl(const int nb_blocks_to_compute,
working_element = bx;
#else
//mesh coloring
if (use_mesh_coloring_gpu ){
if (use_mesh_coloring_gpu){
working_element = bx;
}else{
// iphase-1 and working_element-1 for Fortran->C array conventions
@@ -273,7 +273,7 @@ Kernel_2_acoustic_impl(const int nb_blocks_to_compute,
sh_hprime_xx[tx] = d_hprime_xx[tx];
#endif
// loads hprimewgll into shared memory
sh_hprimewgll_xx[tx] = hprimewgll_xx[tx];
sh_hprimewgll_xx[tx] = d_hprimewgll_xx[tx];
}

// counts:
@@ -287,9 +287,9 @@ Kernel_2_acoustic_impl(const int nb_blocks_to_compute,
__syncthreads();

// summed terms with added gll weights
fac1 = wgllwgll_yz[K*NGLLX+J];
fac2 = wgllwgll_xz[K*NGLLX+I];
fac3 = wgllwgll_xy[J*NGLLX+I];
fac1 = d_wgllwgll_yz[K*NGLLX+J];
fac2 = d_wgllwgll_xz[K*NGLLX+I];
fac3 = d_wgllwgll_xy[J*NGLLX+I];

// We make a loop over direct and adjoint wavefields inside the GPU kernel to increase arithmetic intensity
for (int k = 0 ; k < nb_field ; k++){
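The comment above explains the nb_field loop: the per-element geometry and shared-memory loads are reused for both the forward (k = 0) and the adjoint (k = 1) wavefield, which raises the ratio of arithmetic to memory traffic. A minimal stand-alone sketch of that pattern (illustrative names, not the actual kernel body):

// Hypothetical sketch of the "loop over fields" pattern: load the per-point
// factor once, then reuse it for every field processed by this kernel launch.
__global__ void two_field_sketch(const float* d_field_fwd,
                                 const float* d_field_adj,
                                 float* d_out_fwd,
                                 float* d_out_adj,
                                 const float* d_fac,   // per-point geometric factor
                                 int npoints,
                                 int nb_field) {       // 1 = forward only, 2 = forward + adjoint
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i >= npoints) return;

  float fac = d_fac[i];   // loaded once, reused for each field

  for (int k = 0; k < nb_field; k++) {
    const float* in  = (k == 0) ? d_field_fwd : d_field_adj;
    float*       out = (k == 0) ? d_out_fwd   : d_out_adj;
    out[i] += fac * in[i];
  }
}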
@@ -402,7 +402,7 @@ Kernel_2_acoustic_impl(const int nb_blocks_to_compute,
#endif // USE_TEXTURES_FIELDS
#else // MESH_COLORING
//mesh coloring
if (use_mesh_coloring_gpu ){
if (use_mesh_coloring_gpu){
// no atomic operation needed, colors don't share global points between elements
#ifdef USE_TEXTURES_FIELDS
if (k==0) d_potential_dot_dot_acoustic[iglob] = texfetch_potential_dot_dot<FORWARD_OR_ADJOINT>(iglob) + sum_terms;
@@ -525,8 +525,8 @@ template __global__ void Kernel_2_acoustic_impl<1>(const int nb_blocks_to_comput
realw* d_gammax,realw* d_gammay,realw* d_gammaz,
const realw xix_regular, const realw jacobian_regular,
realw_const_p d_hprime_xx,
realw_const_p hprimewgll_xx,
realw_const_p wgllwgll_xy,realw_const_p wgllwgll_xz,realw_const_p wgllwgll_yz,
realw_const_p d_hprimewgll_xx,
realw_const_p d_wgllwgll_xy,realw_const_p d_wgllwgll_xz,realw_const_p d_wgllwgll_yz,
realw* d_rhostore,
const int use_mesh_coloring_gpu,
const int gravity,
@@ -558,8 +558,8 @@ Kernel_2_acoustic_single_impl(const int nb_blocks_to_compute,
const int* d_irregular_element_number,
const realw xix_regular, const realw jacobian_regular,
realw_const_p d_hprime_xx,
realw_const_p hprimewgll_xx,
realw_const_p wgllwgll_xy,realw_const_p wgllwgll_xz,realw_const_p wgllwgll_yz,
realw_const_p d_hprimewgll_xx,
realw_const_p d_wgllwgll_xy,realw_const_p d_wgllwgll_xz,realw_const_p d_wgllwgll_yz,
realw* d_rhostore,
const int use_mesh_coloring_gpu,
const int gravity,
@@ -571,14 +571,23 @@ Kernel_2_acoustic_single_impl(const int nb_blocks_to_compute,
// block-id == number of local element id in phase_ispec array
int bx = blockIdx.y*gridDim.x+blockIdx.x;

// checks if anything to do
if (bx >= nb_blocks_to_compute) return;

// thread-id == GLL node id
// note: use only NGLL^3 = 125 active threads, plus 3 inactive/ghost threads,
// because we used memory padding from NGLL^3 = 125 to 128 to get coalescent memory accesses;
// to avoid execution branching and the need of registers to store an active state variable,
// the thread ids are put in valid range
int tx = threadIdx.x;
// limits thread ids to range [0,125-1]
if (tx >= NGLL3) tx = NGLL3-1;

// local index
int K = (tx/NGLL2);
int J = ((tx-K*NGLL2)/NGLLX);
int I = (tx-K*NGLL2-J*NGLLX);

int I,J,K;
int iglob,offset;
int working_element,ispec_irreg;

@@ -602,28 +611,24 @@ Kernel_2_acoustic_single_impl(const int nb_blocks_to_compute,
__shared__ realw sh_hprime_xx[NGLL2];
__shared__ realw sh_hprimewgll_xx[NGLL2];

// checks if anything to do
if (bx >= nb_blocks_to_compute) return;

// limits thread ids to range [0,125-1]
if (tx >= NGLL3) tx = NGLL3-1;

// spectral-element id
#ifdef USE_MESH_COLORING_GPU
working_element = bx;
#else
//mesh coloring
if (use_mesh_coloring_gpu ){
if (use_mesh_coloring_gpu){
working_element = bx;
}else{
// iphase-1 and working_element-1 for Fortran->C array conventions
working_element = d_phase_ispec_inner_acoustic[bx + num_phase_ispec_acoustic*(d_iphase-1)]-1;
working_element = d_phase_ispec_inner_acoustic[bx + num_phase_ispec_acoustic*(d_iphase-1)] - 1;
}
#endif

ispec_irreg = d_irregular_element_number[working_element] - 1;

// local padded index
offset = working_element*NGLL3_PADDED + tx;
ispec_irreg = d_irregular_element_number[working_element] - 1;

// global index
iglob = d_ibool[offset] - 1;

@@ -644,11 +649,6 @@ Kernel_2_acoustic_single_impl(const int nb_blocks_to_compute,
// gravity
if (gravity) kappa_invl = 1.f / d_kappastore[working_element*NGLL3 + tx];

// local index
K = (tx/NGLL2);
J = ((tx-K*NGLL2)/NGLLX);
I = (tx-K*NGLL2-J*NGLLX);

// calculates laplacian
if (ispec_irreg >= 0){
//irregular_element
@@ -680,7 +680,7 @@ Kernel_2_acoustic_single_impl(const int nb_blocks_to_compute,
sh_hprime_xx[tx] = d_hprime_xx[tx];
#endif
// loads hprimewgll into shared memory
sh_hprimewgll_xx[tx] = hprimewgll_xx[tx];
sh_hprimewgll_xx[tx] = d_hprimewgll_xx[tx];
}

// synchronize all the threads (one thread for each of the NGLL grid points of the
@@ -753,9 +753,9 @@ Kernel_2_acoustic_single_impl(const int nb_blocks_to_compute,
}

// summed terms with added gll weights
fac1 = wgllwgll_yz[K*NGLLX+J];
fac2 = wgllwgll_xz[K*NGLLX+I];
fac3 = wgllwgll_xy[J*NGLLX+I];
fac1 = d_wgllwgll_yz[K*NGLLX+J];
fac2 = d_wgllwgll_xz[K*NGLLX+I];
fac3 = d_wgllwgll_xy[J*NGLLX+I];

sum_terms = -(fac1*temp1l + fac2*temp2l + fac3*temp3l);

@@ -777,7 +777,7 @@ Kernel_2_acoustic_single_impl(const int nb_blocks_to_compute,
#endif // USE_TEXTURES_FIELDS
#else // MESH_COLORING
//mesh coloring
if (use_mesh_coloring_gpu ){
if (use_mesh_coloring_gpu){
// no atomic operation needed, colors don't share global points between elements
#ifdef USE_TEXTURES_FIELDS
if (FORWARD_OR_ADJOINT == 3){
@@ -817,8 +817,8 @@ Kernel_2_acoustic_perf_impl(const int nb_blocks_to_compute,
realw* d_etax,realw* d_etay,realw* d_etaz,
realw* d_gammax,realw* d_gammay,realw* d_gammaz,
realw_const_p d_hprime_xx,
realw_const_p hprimewgll_xx,
realw_const_p wgllwgll_xy,realw_const_p wgllwgll_xz,realw_const_p wgllwgll_yz,
realw_const_p d_hprimewgll_xx,
realw_const_p d_wgllwgll_xy,realw_const_p d_wgllwgll_xz,realw_const_p d_wgllwgll_yz,
realw* d_rhostore,
const int use_mesh_coloring_gpu,
const int gravity,
@@ -927,7 +927,7 @@ Kernel_2_acoustic_perf_impl(const int nb_blocks_to_compute,
sh_hprime_xx[tx] = d_hprime_xx[tx];
#endif
// loads hprimewgll into shared memory
sh_hprimewgll_xx[tx] = hprimewgll_xx[tx];
sh_hprimewgll_xx[tx] = d_hprimewgll_xx[tx];
}
// synchronize all the threads (one thread for each of the NGLL grid points of the
@@ -984,9 +984,9 @@ Kernel_2_acoustic_perf_impl(const int nb_blocks_to_compute,
}
// summed terms with added gll weights
fac1 = wgllwgll_yz[K*NGLLX+J];
fac2 = wgllwgll_xz[K*NGLLX+I];
fac3 = wgllwgll_xy[J*NGLLX+I];
fac1 = d_wgllwgll_yz[K*NGLLX+J];
fac2 = d_wgllwgll_xz[K*NGLLX+I];
fac3 = d_wgllwgll_xy[J*NGLLX+I];
sum_terms = -(fac1*temp1l + fac2*temp2l + fac3*temp3l);
12 changes: 6 additions & 6 deletions src/gpu/kernels/kernel_3_acoustic_cuda_device.cu
@@ -40,18 +40,18 @@ __global__ void kernel_3_acoustic_cuda_device(field* potential_dot_acoustic,

int id = threadIdx.x + (blockIdx.x + blockIdx.y*gridDim.x)*blockDim.x;

realw rmass;
field p_dot_dot;
// because of block and grid sizing problems, there is a small
// amount of buffer at the end of the calculation
if (id < size) {
rmass = rmass_acoustic[id];
realw rmass = rmass_acoustic[id];
// multiplies pressure with the inverse of the mass matrix
p_dot_dot = rmass*potential_dot_dot_acoustic[id];
field p_dot_dot = rmass * potential_dot_dot_acoustic[id];

potential_dot_dot_acoustic[id] = p_dot_dot;
potential_dot_acoustic[id] += deltatover2*p_dot_dot;
if (simulation_type==3) {
p_dot_dot = rmass*b_potential_dot_dot_acoustic[id];

if (simulation_type == 3) {
p_dot_dot = rmass * b_potential_dot_dot_acoustic[id];
b_potential_dot_dot_acoustic[id] = p_dot_dot;
b_potential_dot_acoustic[id] += b_deltatover2*p_dot_dot;
}
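The "if (id < size)" guard in the device code above exists because the launch rounds the grid size up; a minimal host-side sketch of that sizing (assumed block size and grid limits, not the repository's actual launch helper):

#include <cuda_runtime.h>

// Illustrative launch sizing for kernel_3_acoustic_cuda_device (assumptions only):
// ceil-division creates up to (blocksize-1) surplus threads, and folding an
// oversized grid into x/y dimensions can add a few surplus blocks as well.
// Those extra threads get an id >= size and are filtered by the guard.
static void launch_sizing_sketch(int size) {
  const int blocksize = 128;                          // assumed threads per block
  int nblocks = (size + blocksize - 1) / blocksize;   // ceil(size / blocksize)

  // fold into a 2D grid if the block count exceeds the per-dimension limit
  int num_blocks_x = nblocks;
  int num_blocks_y = 1;
  while (num_blocks_x > 65535) {
    num_blocks_x = (num_blocks_x + 1) / 2;
    num_blocks_y = num_blocks_y * 2;
  }

  dim3 grid(num_blocks_x, num_blocks_y);
  dim3 threads(blocksize, 1, 1);
  // kernel_3_acoustic_cuda_device<<<grid, threads>>>(..., size, ...);
  (void)grid; (void)threads;
}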
34 changes: 29 additions & 5 deletions src/gpu/mesh_constants_gpu.h
@@ -124,9 +124,27 @@ typedef double realw;
#endif

// maximum function
#if !defined(MAX)
#define MAX(x,y) (((x) < (y)) ? (y) : (x))
#endif
// minimum function
#if !defined(MIN)
#define MIN(a,b) (((a) > (b)) ? (b) : (a))
#endif

// HIP
#ifdef USE_HIP
// for HIP-CPU installation
#if defined(__HIP_CPU_RT__)
// forces __forceinline__ keyword to be inline to avoid "duplicate symbol.." linking errors
#if defined(__forceinline__)
#undef __forceinline__
#endif
#define __forceinline__ inline
// or
//#define __forceinline__ __attribute__((always_inline)) inline
#endif
#endif
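A brief illustration of why the __forceinline__ to inline mapping above matters for HIP-CPU builds, where device code is compiled as ordinary host C++ (hypothetical header, not part of this commit):

// hypothetical helpers.h, included by several translation units
#ifndef HELPERS_H
#define HELPERS_H

// With __forceinline__ mapped to plain "inline" (as above), each translation unit
// gets a mergeable copy of this header-defined function; without inline linkage,
// the linker would see the symbol once per translation unit and report
// "duplicate symbol" errors.
__forceinline__ float square_realw(float x) { return x*x; }

#endif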

/* ----------------------------------------------------------------------------------------------- */

@@ -260,15 +278,11 @@

/* ----------------------------------------------------------------------------------------------- */

// type of "working" variables: see also CUSTOM_REAL
// double precision temporary variables leads to 10% performance decrease
// in Kernel_2_impl (not very much..)
typedef float realw;

// textures
// note: texture templates are supported only for CUDA versions <= 11.x
// since CUDA 12.x, these are deprecated and texture objects should be used instead
// see: https://developer.nvidia.com/blog/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/
#if CUSTOM_REAL == 4
#if defined(USE_TEXTURES_FIELDS) || defined(USE_TEXTURES_CONSTANTS)
#ifdef USE_CUDA
typedef texture<float, cudaTextureType1D, cudaReadModeElementType> realw_texture;
Expand All @@ -277,6 +291,16 @@ typedef texture<float, cudaTextureType1D, cudaReadModeElementType> realw_texture
typedef texture<float, hipTextureType1D, hipReadModeElementType> realw_texture;
#endif
#endif
#elif CUSTOM_REAL == 8
#if defined(USE_TEXTURES_FIELDS) || defined(USE_TEXTURES_CONSTANTS)
#ifdef USE_CUDA
typedef texture<double, cudaTextureType1D, cudaReadModeElementType> realw_texture;
#endif
#ifdef USE_HIP
typedef texture<double, hipTextureType1D, hipReadModeElementType> realw_texture;
#endif
#endif
#endif

// pointer declarations
// restricted pointers: can improve performance on Kepler ~ 10%
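As noted above, texture references such as realw_texture are deprecated and removed from CUDA 12.x onward. A minimal sketch of the texture-object replacement (illustrative only, not the code path added by this commit):

#include <cuda_runtime.h>

// Wraps an existing device array in a 1D texture object (CUDA 12-friendly
// replacement for a file-scope texture<float,...> reference).
cudaTextureObject_t make_field_texture(float* d_field, size_t num_elements) {
  cudaResourceDesc res_desc = {};
  res_desc.resType                = cudaResourceTypeLinear;
  res_desc.res.linear.devPtr      = d_field;
  res_desc.res.linear.desc        = cudaCreateChannelDesc<float>();
  res_desc.res.linear.sizeInBytes = num_elements * sizeof(float);

  cudaTextureDesc tex_desc = {};
  tex_desc.readMode = cudaReadModeElementType;

  cudaTextureObject_t tex_obj = 0;
  cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL);
  return tex_obj;
}

// Device-side reads then use tex1Dfetch<float>(tex_obj, i) with the texture
// object passed as a kernel argument, instead of a global texture reference.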
1 change: 1 addition & 0 deletions src/gpu/update_displacement_cuda.cu
@@ -455,6 +455,7 @@ void FC_FUNC_(kernel_3_acoustic_cuda,

Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
int FORWARD_OR_ADJOINT = *FORWARD_OR_ADJOINT_f;

// safety check
if (FORWARD_OR_ADJOINT != 0 && FORWARD_OR_ADJOINT != 1 && FORWARD_OR_ADJOINT != 3) {
exit_on_error("Error invalid FORWARD_OR_ADJOINT in Kernel_2_acoustic() routine");
