Skip to content

Commit

Permalink
ch4/init: Wait for ops on init comm to complete
Browse files Browse the repository at this point in the history
If the special init comm used for the roots-only address exchange still has
pending operations at destroy time, it could cause an assertion failure
during MPI_INIT. Instead, we should wait for pending ops to complete to
avoid a crash. Fixes pmodels#7200.
  • Loading branch information
raffenet committed Nov 7, 2024
1 parent 0afd4f5 commit f50f7f1
Show file tree
Hide file tree
Showing 4 changed files with 10 additions and 4 deletions.
2 changes: 1 addition & 1 deletion src/mpid/ch4/include/mpidch4.h
Original file line number Diff line number Diff line change
Expand Up @@ -474,6 +474,6 @@ extern MPL_dbg_class MPIDI_CH4_DBG_MEMORY;

/* routines only used during init */
int MPIDI_create_init_comm(MPIR_Comm ** comm_ptr);
void MPIDI_destroy_init_comm(MPIR_Comm ** comm_ptr);
int MPIDI_destroy_init_comm(MPIR_Comm ** comm_ptr);

#endif /* MPIDCH4_H_INCLUDED */
2 changes: 1 addition & 1 deletion src/mpid/ch4/netmod/ofi/init_addrxchg.c
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ int MPIDI_OFI_addr_exchange_root_ctx(void)

fn_exit:
if (init_comm && !mpi_errno) {
MPIDI_destroy_init_comm(&init_comm);
mpi_errno = MPIDI_destroy_init_comm(&init_comm);
}
return mpi_errno;
fn_fail:
Expand Down
2 changes: 1 addition & 1 deletion src/mpid/ch4/netmod/ucx/ucx_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ static int initial_address_exchange(void)

fn_exit:
if (init_comm && !mpi_errno) {
MPIDI_destroy_init_comm(&init_comm);
mpi_errno = MPIDI_destroy_init_comm(&init_comm);
}
return mpi_errno;
fn_fail:
Expand Down
8 changes: 7 additions & 1 deletion src/mpid/ch4/src/init_comm.c
Original file line number Diff line number Diff line change
Expand Up @@ -60,18 +60,24 @@ int MPIDI_create_init_comm(MPIR_Comm ** comm)
goto fn_exit;
}

void MPIDI_destroy_init_comm(MPIR_Comm ** comm_ptr)
/* Tear down the temporary communicator used during init.
 *
 * comm_ptr: address of the communicator pointer. If *comm_ptr is NULL the
 *           call does nothing; otherwise *comm_ptr is set to NULL once the
 *           communicator has been freed.
 *
 * Returns mpi_errno, which starts as MPI_SUCCESS.
 * NOTE(review): no statement here assigns mpi_errno or jumps to fn_fail
 * directly; presumably MPIDIU_PROGRESS_WHILE does both on a progress
 * error -- confirm against the macro definition.
 */
int MPIDI_destroy_init_comm(MPIR_Comm ** comm_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Comm *init_comm = *comm_ptr;
    int still_in_use;

    /* nothing to tear down */
    if (init_comm == NULL) {
        goto fn_exit;
    }

    MPIDIU_release_lut(MPIDI_COMM(init_comm, map).irreg.lut.t);
    MPIDIG_destroy_comm(init_comm);

    /* Outstanding requests keep the reference count above 1; keep making
     * progress until only our own reference is left. */
    MPIDIU_PROGRESS_WHILE(MPIR_Object_get_ref(init_comm) > 1, 0);

    /* Drop the last reference; nothing else may still be using the comm. */
    MPIR_Object_release_ref(init_comm, &still_in_use);
    MPIR_Assertp(still_in_use == 0);

    MPII_COMML_FORGET(init_comm);
    MPIR_Handle_obj_free(&MPIR_Comm_mem, init_comm);
    *comm_ptr = NULL;

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

0 comments on commit f50f7f1

Please sign in to comment.