From 2c95a81e8ed87aac347da582ccac62884ad403e7 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Wed, 5 Feb 2025 18:15:15 -0800 Subject: [PATCH 1/2] Ifpack2: BlockTriDi fix for large blocks In ExtractAndFactorize kernels that use scratch, fall back to level 1 when there isn't sufficient level 0. Signed-off-by: Brian Kelley --- .../src/Ifpack2_BlockTriDiContainer_impl.hpp | 51 ++++++++++++------- 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp index 256400e1470f..e5dd69c1a3d0 100644 --- a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp +++ b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp @@ -2750,7 +2750,7 @@ namespace Ifpack2 { */ Kokkos::Experimental::local_deep_copy(member, view1, view2); } - template + template struct ExtractAndFactorizeTridiags { public: using impl_type = BlockHelperDetails::ImplType; @@ -2785,6 +2785,8 @@ namespace Ifpack2 { using internal_vector_type = typename impl_type::internal_vector_type; static constexpr int vector_length = impl_type::vector_length; static constexpr int internal_vector_length = impl_type::internal_vector_length; + static_assert(vector_length >= internal_vector_length, "Ifpack2 BlockTriDi Numeric: vector_length must be at least as large as internal_vector_length"); + static_assert(vector_length % internal_vector_length == 0, "Ifpack2 BlockTriDi Numeric: vector_length must be divisible by internal_vector_length"); /// team policy member type using team_policy_type = Kokkos::TeamPolicy; @@ -2812,7 +2814,6 @@ namespace Ifpack2 { // diagonal safety const magnitude_type tiny; const local_ordinal_type vector_loop_size; - const local_ordinal_type vector_length_value; bool hasBlockCrsMatrix; @@ -2873,8 +2874,7 @@ namespace Ifpack2 { blocksize_square(blocksize*blocksize), // diagonal weight to avoid zero pivots tiny(tiny_), - vector_loop_size(vector_length/internal_vector_length), - vector_length_value(vector_length) { + vector_loop_size(vector_length/internal_vector_length) { using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type; using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type; @@ -3191,7 +3191,7 @@ namespace Ifpack2 { const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0); internal_vector_scratch_type_3d_view - WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size); + WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size); #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF printf("rank = %d, i0 = %d, npacks = %d, nrows = %d, packidx = %d, subpartidx = %d, partidx = %d, local_subpartidx = %d;\n", member.league_rank(), i0, npacks, nrows, packidx, subpartidx, partidx, local_subpartidx); @@ -3294,7 +3294,7 @@ namespace Ifpack2 { (void) npacks; internal_vector_scratch_type_3d_view - WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size); + WW(member.team_scratch(ScratchLevel), blocksize, num_vectors, vector_loop_size); if (local_subpartidx == 0) { Kokkos::parallel_for (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) { @@ -3334,9 +3334,6 @@ namespace Ifpack2 { //const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx); //const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0); - internal_vector_scratch_type_3d_view - WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size); - // Compute S = D - C E const local_ordinal_type local_subpartidx_schur = (local_subpartidx-1)/2; @@ -3440,7 +3437,7 @@ namespace Ifpack2 { const local_ordinal_type nrows = 2*(pack_td_ptr_schur.extent(1)-1); internal_vector_scratch_type_3d_view - WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size); + WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size); #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF printf("FactorizeSchurTag rank = %d, i0 = %d, nrows = %d, vector_loop_size = %d;\n", member.league_rank(), i0, nrows, vector_loop_size); @@ -3477,7 +3474,7 @@ namespace Ifpack2 { const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0); writeBTDValuesToFile(n_parts, scalar_values, "before.mm"); - policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); + policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch)); Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run", policy, *this); execution_space().fence(); @@ -3504,7 +3501,7 @@ namespace Ifpack2 { Kokkos::TeamPolicy policy(packindices_schur.extent(0)*packindices_schur.extent(1), team_size, vector_loop_size); - policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); + policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch)); Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run", policy, *this); execution_space().fence(); @@ -3523,7 +3520,7 @@ namespace Ifpack2 { Kokkos::TeamPolicy policy(packindices_sub.extent(0), team_size, vector_loop_size); - policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); + policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch)); Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run", policy, *this); execution_space().fence(); @@ -3544,7 +3541,6 @@ namespace Ifpack2 { Kokkos::TeamPolicy policy(packindices_schur.extent(0)*packindices_schur.extent(1), team_size, vector_loop_size); - policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run", policy, *this); writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_schur.mm"); @@ -3561,7 +3557,7 @@ namespace Ifpack2 { IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::FactorizeSchurTag", FactorizeSchurTag0); Kokkos::TeamPolicy policy(packindices_schur.extent(0), team_size, vector_loop_size); - policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); + policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch)); Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run", policy, *this); execution_space().fence(); @@ -3587,9 +3583,29 @@ namespace Ifpack2 { const BlockHelperDetails::PartInterface &interf, BlockTridiags &btdm, const typename BlockHelperDetails::ImplType::magnitude_type tiny) { + using impl_type = BlockHelperDetails::ImplType; + using execution_space = typename impl_type::execution_space; + using team_policy_type = Kokkos::TeamPolicy; + using internal_vector_scratch_type_3d_view = Scratch; + IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase", NumericPhase); - ExtractAndFactorizeTridiags function(btdm, interf, A, G, tiny); - function.run(); + + int blocksize = btdm.values.extent(1); + // Both Kokkos policy vector length and SIMD type vector length are hardcoded in KokkosBatched. + // For large block sizes, have to fall back to level 1 scratch. + int scratch_required = internal_vector_scratch_type_3d_view::shmem_size(blocksize, blocksize, impl_type::vector_length / impl_type::internal_vector_length); + int max_scratch = team_policy_type::scratch_size_max(0); + + if(scratch_required < max_scratch) { + // Can use level 0 scratch + ExtractAndFactorizeTridiags function(btdm, interf, A, G, tiny); + function.run(); + } + else { + // Not enough level 0 scratch, so fall back to level 1 + ExtractAndFactorizeTridiags function(btdm, interf, A, G, tiny); + function.run(); + } IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } @@ -3654,7 +3670,6 @@ namespace Ifpack2 { packed_multivector(pmv) {} // TODO:: modify this routine similar to the team level functions - // inline ---> FIXME HIP: should not need the KOKKOS_INLINE_FUNCTION below... KOKKOS_INLINE_FUNCTION void operator() (const local_ordinal_type &packidx) const { From 0e724f44a3a3a009769838c55fc582797346caef Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Thu, 6 Feb 2025 09:31:30 -0700 Subject: [PATCH 2/2] Ifpack2: Add BTDS tests to exercise large blocks Use max block size of 32 and a small grid. Test standard BTD, BTD with Schur line splitting, and Block Jacobi. Signed-off-by: Brian Kelley --- packages/ifpack2/example/CMakeLists.txt | 34 ++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/packages/ifpack2/example/CMakeLists.txt b/packages/ifpack2/example/CMakeLists.txt index 05f1956173d8..3df5e94665cb 100644 --- a/packages/ifpack2/example/CMakeLists.txt +++ b/packages/ifpack2/example/CMakeLists.txt @@ -35,6 +35,38 @@ ASSERT_DEFINED ( ${PACKAGE_NAME}_ENABLE_Galeri ) +IF(${PACKAGE_NAME}_ENABLE_Xpetra AND ${PACKAGE_NAME}_ENABLE_Galeri) +# Correctness test with maximum block size (32) +# Use a small grid so that GPU memory requirement isn't too large +# Block TriDi +TRIBITS_ADD_TEST( + BlockTriDiagonalSolver + NAME BlockTriDiLargeBlock + ARGS "--matrixType=Laplace3D --blockSize=32 --nx=20 --ny=20 --nz=20" + COMM serial mpi + NUM_MPI_PROCS 1-4 + STANDARD_PASS_OUTPUT +) +# Block TriDi with Schur line splitting +TRIBITS_ADD_TEST( + BlockTriDiagonalSolver + NAME BlockTriDiLargeBlockSchur + ARGS "--matrixType=Laplace3D --blockSize=32 --nx=20 --ny=20 --nz=20 --sublinesPerLine=1 --sublinesPerLineSchur=2" + COMM serial mpi + NUM_MPI_PROCS 1-4 + STANDARD_PASS_OUTPUT +) +# Block Jacobi +TRIBITS_ADD_TEST( + BlockTriDiagonalSolver + NAME BlockTriDiLargeBlockJacobi + ARGS "--matrixType=Laplace3D --blockSize=32 --nx=20 --ny=20 --nz=20 --sublinesPerLine=-1" + COMM serial mpi + NUM_MPI_PROCS 1-4 + STANDARD_PASS_OUTPUT +) +ENDIF() + IF(${PACKAGE_NAME}_ENABLE_Xpetra AND ${PACKAGE_NAME}_ENABLE_Galeri) set(blockSize 11) @@ -87,4 +119,4 @@ IF(${PACKAGE_NAME}_ENABLE_Xpetra AND ${PACKAGE_NAME}_ENABLE_Galeri) ENDWHILE() ENDIF() endforeach() -ENDIF() \ No newline at end of file +ENDIF()