Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ifpack2: BlockTriDi fix for large blocks #13792

Merged
merged 2 commits into from
Feb 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 33 additions & 1 deletion packages/ifpack2/example/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,38 @@ ASSERT_DEFINED (
${PACKAGE_NAME}_ENABLE_Galeri
)

# Large-block tests for the BlockTriDiagonalSolver example executable.
# They are only registered when both Xpetra and Galeri are enabled for this
# package. All three tests share the same problem setup (Laplace3D matrix,
# blockSize=32, nx=ny=nz=20) and the same COMM / NUM_MPI_PROCS settings; they
# differ only in the --sublinesPerLine* arguments.
IF(${PACKAGE_NAME}_ENABLE_Xpetra AND ${PACKAGE_NAME}_ENABLE_Galeri)
# Correctness test with maximum block size (32)
# Use a small grid so that GPU memory requirement isn't too large
# Block TriDi
# (no --sublinesPerLine* arguments are passed, so the executable's defaults apply)
TRIBITS_ADD_TEST(
BlockTriDiagonalSolver
NAME BlockTriDiLargeBlock
ARGS "--matrixType=Laplace3D --blockSize=32 --nx=20 --ny=20 --nz=20"
COMM serial mpi
NUM_MPI_PROCS 1-4
STANDARD_PASS_OUTPUT
)
# Block TriDi with Schur line splitting
# (same problem, with --sublinesPerLine=1 and --sublinesPerLineSchur=2)
TRIBITS_ADD_TEST(
BlockTriDiagonalSolver
NAME BlockTriDiLargeBlockSchur
ARGS "--matrixType=Laplace3D --blockSize=32 --nx=20 --ny=20 --nz=20 --sublinesPerLine=1 --sublinesPerLineSchur=2"
COMM serial mpi
NUM_MPI_PROCS 1-4
STANDARD_PASS_OUTPUT
)
# Block Jacobi
# (same problem, with --sublinesPerLine=-1; presumably -1 is what selects the
# Jacobi variant in the example executable -- confirm against its option parsing)
TRIBITS_ADD_TEST(
BlockTriDiagonalSolver
NAME BlockTriDiLargeBlockJacobi
ARGS "--matrixType=Laplace3D --blockSize=32 --nx=20 --ny=20 --nz=20 --sublinesPerLine=-1"
COMM serial mpi
NUM_MPI_PROCS 1-4
STANDARD_PASS_OUTPUT
)
ENDIF()

IF(${PACKAGE_NAME}_ENABLE_Xpetra AND ${PACKAGE_NAME}_ENABLE_Galeri)

set(blockSize 11)
Expand Down Expand Up @@ -87,4 +119,4 @@ IF(${PACKAGE_NAME}_ENABLE_Xpetra AND ${PACKAGE_NAME}_ENABLE_Galeri)
ENDWHILE()
ENDIF()
endforeach()
ENDIF()
ENDIF()
51 changes: 33 additions & 18 deletions packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2750,7 +2750,7 @@ namespace Ifpack2 {
*/
Kokkos::Experimental::local_deep_copy(member, view1, view2);
}
template<typename MatrixType>
template<typename MatrixType, int ScratchLevel>
struct ExtractAndFactorizeTridiags {
public:
using impl_type = BlockHelperDetails::ImplType<MatrixType>;
Expand Down Expand Up @@ -2785,6 +2785,8 @@ namespace Ifpack2 {
using internal_vector_type = typename impl_type::internal_vector_type;
static constexpr int vector_length = impl_type::vector_length;
static constexpr int internal_vector_length = impl_type::internal_vector_length;
static_assert(vector_length >= internal_vector_length, "Ifpack2 BlockTriDi Numeric: vector_length must be at least as large as internal_vector_length");
static_assert(vector_length % internal_vector_length == 0, "Ifpack2 BlockTriDi Numeric: vector_length must be divisible by internal_vector_length");

/// team policy member type
using team_policy_type = Kokkos::TeamPolicy<execution_space>;
Expand Down Expand Up @@ -2812,7 +2814,6 @@ namespace Ifpack2 {
// diagonal safety
const magnitude_type tiny;
const local_ordinal_type vector_loop_size;
const local_ordinal_type vector_length_value;

bool hasBlockCrsMatrix;

Expand Down Expand Up @@ -2873,8 +2874,7 @@ namespace Ifpack2 {
blocksize_square(blocksize*blocksize),
// diagonal weight to avoid zero pivots
tiny(tiny_),
vector_loop_size(vector_length/internal_vector_length),
vector_length_value(vector_length) {
vector_loop_size(vector_length/internal_vector_length) {
using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;

Expand Down Expand Up @@ -3191,7 +3191,7 @@ namespace Ifpack2 {
const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);

internal_vector_scratch_type_3d_view
WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
printf("rank = %d, i0 = %d, npacks = %d, nrows = %d, packidx = %d, subpartidx = %d, partidx = %d, local_subpartidx = %d;\n", member.league_rank(), i0, npacks, nrows, packidx, subpartidx, partidx, local_subpartidx);
Expand Down Expand Up @@ -3294,7 +3294,7 @@ namespace Ifpack2 {
(void) npacks;

internal_vector_scratch_type_3d_view
WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
WW(member.team_scratch(ScratchLevel), blocksize, num_vectors, vector_loop_size);
if (local_subpartidx == 0) {
Kokkos::parallel_for
(Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
Expand Down Expand Up @@ -3334,9 +3334,6 @@ namespace Ifpack2 {
//const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
//const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);

internal_vector_scratch_type_3d_view
WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);

// Compute S = D - C E

const local_ordinal_type local_subpartidx_schur = (local_subpartidx-1)/2;
Expand Down Expand Up @@ -3440,7 +3437,7 @@ namespace Ifpack2 {
const local_ordinal_type nrows = 2*(pack_td_ptr_schur.extent(1)-1);

internal_vector_scratch_type_3d_view
WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
printf("FactorizeSchurTag rank = %d, i0 = %d, nrows = %d, vector_loop_size = %d;\n", member.league_rank(), i0, nrows, vector_loop_size);
Expand Down Expand Up @@ -3477,7 +3474,7 @@ namespace Ifpack2 {
const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
writeBTDValuesToFile(n_parts, scalar_values, "before.mm");

policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeSubLineTag>",
policy, *this);
execution_space().fence();
Expand All @@ -3504,7 +3501,7 @@ namespace Ifpack2 {
Kokkos::TeamPolicy<execution_space,ExtractBCDTag>
policy(packindices_schur.extent(0)*packindices_schur.extent(1), team_size, vector_loop_size);

policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractBCDTag>",
policy, *this);
execution_space().fence();
Expand All @@ -3523,7 +3520,7 @@ namespace Ifpack2 {
Kokkos::TeamPolicy<execution_space,ComputeETag>
policy(packindices_sub.extent(0), team_size, vector_loop_size);

policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ComputeETag>",
policy, *this);
execution_space().fence();
Expand All @@ -3544,7 +3541,6 @@ namespace Ifpack2 {
Kokkos::TeamPolicy<execution_space,ComputeSchurTag>
policy(packindices_schur.extent(0)*packindices_schur.extent(1), team_size, vector_loop_size);

policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ComputeSchurTag>",
policy, *this);
writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_schur.mm");
Expand All @@ -3561,7 +3557,7 @@ namespace Ifpack2 {
IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::FactorizeSchurTag", FactorizeSchurTag0);
Kokkos::TeamPolicy<execution_space,FactorizeSchurTag>
policy(packindices_schur.extent(0), team_size, vector_loop_size);
policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<FactorizeSchurTag>",
policy, *this);
execution_space().fence();
Expand All @@ -3587,9 +3583,29 @@ namespace Ifpack2 {
const BlockHelperDetails::PartInterface<MatrixType> &interf,
BlockTridiags<MatrixType> &btdm,
const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tiny) {
using impl_type = BlockHelperDetails::ImplType<MatrixType>;
using execution_space = typename impl_type::execution_space;
using team_policy_type = Kokkos::TeamPolicy<execution_space>;
using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;

IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase", NumericPhase);
ExtractAndFactorizeTridiags<MatrixType> function(btdm, interf, A, G, tiny);
function.run();

int blocksize = btdm.values.extent(1);
// Both the Kokkos policy vector length and the SIMD type vector length are hardcoded in KokkosBatched.
// For large block sizes, we have to fall back to level 1 scratch.
int scratch_required = internal_vector_scratch_type_3d_view::shmem_size(blocksize, blocksize, impl_type::vector_length / impl_type::internal_vector_length);
int max_scratch = team_policy_type::scratch_size_max(0);

if(scratch_required < max_scratch) {
// Can use level 0 scratch
ExtractAndFactorizeTridiags<MatrixType, 0> function(btdm, interf, A, G, tiny);
function.run();
}
else {
// Not enough level 0 scratch, so fall back to level 1
ExtractAndFactorizeTridiags<MatrixType, 1> function(btdm, interf, A, G, tiny);
function.run();
}
IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
}

Expand Down Expand Up @@ -3654,7 +3670,6 @@ namespace Ifpack2 {
packed_multivector(pmv) {}

// TODO: modify this routine to be similar to the team-level functions
// inline ---> FIXME HIP: should not need the KOKKOS_INLINE_FUNCTION below...
KOKKOS_INLINE_FUNCTION
void
operator() (const local_ordinal_type &packidx) const {
Expand Down
Loading