Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[v1.22.x]prov/efa: differentiate unresponsive receiver errors following rdma-core #10497

Open
wants to merge 1 commit into
base: v1.22.x
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions prov/efa/src/efa_errno.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,9 @@
_(10, REMOTE_ERROR_RNR, Destination resource not ready (no work queue entries posted on receive queue)) \
_(11, REMOTE_ERROR_BAD_LENGTH, Remote scatter-gather list too short) \
_(12, REMOTE_ERROR_BAD_STATUS, Unexpected status returned by responder) \
_(13, LOCAL_ERROR_UNRESP_REMOTE, Unresponsive remote (detected locally)) \
_(14, REMOTE_ERROR_UNKNOWN_PEER, No valid address handle at remote side (required for RDMA operations))
_(13, LOCAL_ERROR_UNRESP_REMOTE, Unresponsive remote (was previously responsive)) \
_(14, REMOTE_ERROR_UNKNOWN_PEER, No valid address handle at remote side (required for RDMA operations)) \
_(15, LOCAL_ERROR_UNREACH_REMOTE, Unreachable remote (never received a response))

/**
* @brief EFA provider proprietary error codes
Expand Down Expand Up @@ -105,7 +106,8 @@
_(4122, SHM_INTERNAL_ERROR, SHM internal error) \
_(4123, WRITE_SHM_CQ_ENTRY, Failure to write CQ entry for SHM operation) \
_(4124, ESTABLISHED_RECV_UNRESP, Unresponsive receiver (connection previously established)) \
_(4125, INVALID_PKT_TYPE_ZCPY_RX, Invalid packet type received when zero copy recv mode is ON)
_(4125, INVALID_PKT_TYPE_ZCPY_RX, Invalid packet type received when zero copy recv mode is ON) \
_(4126, UNESTABLISHED_RECV_UNRESP, Unresponsive receiver (reachable by EFA device but handshake failed))

/** @} */

Expand Down Expand Up @@ -156,13 +158,15 @@ static inline int to_fi_errno(enum efa_errno err) {
case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNSUPPORTED_OP:
case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_ADDRESS:
return FI_EINVAL;
case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE:
case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNREACH_REMOTE:
return FI_EHOSTUNREACH;
case EFA_IO_COMP_STATUS_LOCAL_ERROR_BAD_LENGTH:
case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_LENGTH:
return FI_EMSGSIZE;
case EFA_IO_COMP_STATUS_REMOTE_ERROR_ABORT:
case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE:
case FI_EFA_ERR_ESTABLISHED_RECV_UNRESP:
case FI_EFA_ERR_UNESTABLISHED_RECV_UNRESP:
return FI_ECONNABORTED;
case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_DEST_QPN:
case EFA_IO_COMP_STATUS_REMOTE_ERROR_UNKNOWN_PEER:
Expand Down
16 changes: 11 additions & 5 deletions prov/efa/src/efa_strerror.c
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,10 @@ void efa_show_help(enum efa_errno err) {
help = "This error is detected remotely; "
"typically encountered when the peer process is no longer present";
break;
case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE:
case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNREACH_REMOTE:
help = "This error is detected locally. "
"The connection status is unknown or was never established via "
"handshake. This typically indicates one or more misconfigured "
"The peer is not reachable by the EFA device. "
"This typically indicates one or more misconfigured "
"EC2 instances; most often due to incorrect inbound/outbound "
"security group rules and/or instances placed in different "
"subnets. Refer to the public AWS documentation for EFA for "
Expand All @@ -80,8 +80,14 @@ void efa_show_help(enum efa_errno err) {
case FI_EFA_ERR_ESTABLISHED_RECV_UNRESP:
help = "This error is detected locally. "
"The connection was previously established via handshake, "
"which indicates the error is likely due to the peer process no "
"longer being present.";
"which indicates the error is likely due to a hardware failure "
"on the remote peer, or the peer process no longer being present.";
break;
case FI_EFA_ERR_UNESTABLISHED_RECV_UNRESP:
help = "This error is detected locally. "
"The peer is reachable by the EFA device but libfabric failed "
"to complete a handshake, which indicates the error is likely "
"due to the peer process no longer being present.";
break;
case FI_EFA_ERR_INVALID_PKT_TYPE_ZCPY_RX:
help = "This error is detected locally. "
Expand Down
9 changes: 6 additions & 3 deletions prov/efa/src/rdm/efa_rdm_cq.c
Original file line number Diff line number Diff line change
Expand Up @@ -399,7 +399,9 @@ static void efa_rdm_cq_handle_recv_completion(struct efa_ibv_cq *ibv_cq, struct
*
* @todo Currently, this only checks for unresponsive receiver
* (#EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE) and attempts to promote it to
* #FI_EFA_ERR_ESTABLISHED_RECV_UNRESP. This should be expanded to handle other
* #FI_EFA_ERR_ESTABLISHED_RECV_UNRESP if a handshake was made, or
* #FI_EFA_ERR_UNESTABLISHED_RECV_UNRESP if the handshake failed.
* This should be expanded to handle other
* RDMA Core error codes (#EFA_IO_COMP_STATUSES) for the sake of more accurate
* error reporting
*/
Expand All @@ -418,8 +420,9 @@ static int efa_rdm_cq_get_prov_errno(struct ibv_cq_ex *ibv_cq_ex) {

switch (vendor_err) {
case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE: {
if (peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED)
vendor_err = FI_EFA_ERR_ESTABLISHED_RECV_UNRESP;
vendor_err = (peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED) ?
FI_EFA_ERR_ESTABLISHED_RECV_UNRESP :
FI_EFA_ERR_UNESTABLISHED_RECV_UNRESP;
break;
}
default:
Expand Down
17 changes: 17 additions & 0 deletions prov/efa/test/efa_unit_test_cq.c
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,23 @@ void test_rdm_cq_read_bad_send_status_unresponsive_receiver_missing_peer_host_id
EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE);
}

/**
 * @brief Verify RDM CQ error reporting when rdma-core reports an unreachable
 * remote for a send operation.
 *
 * A failed send is expected to make fi_cq_read() return -FI_EAVAIL (an error
 * entry is available); the caller then retrieves the error CQ entry, which
 * contains the error code, through fi_cq_readerr().
 *
 * The check itself is delegated to test_rdm_cq_read_bad_send_status(), invoked
 * with the vendor error EFA_IO_COMP_STATUS_LOCAL_ERROR_UNREACH_REMOTE.
 * NOTE(review): the two 64-bit constants are presumably the local and peer
 * host ids -- confirm against the helper's signature.
 *
 * @param[in]	state	struct efa_resource that is managed by the framework
 */
void test_rdm_cq_read_bad_send_status_unreachable_receiver(struct efa_resource **state)
{
	test_rdm_cq_read_bad_send_status(*state,
					 0x1234567812345678,
					 0x8765432187654321,
					 EFA_IO_COMP_STATUS_LOCAL_ERROR_UNREACH_REMOTE);
}

/**
* @brief test that RDM CQ's fi_cq_read()/fi_cq_readerr() works properly when rdma-core returns
* invalid qpn error for send.
Expand Down
1 change: 1 addition & 0 deletions prov/efa/test/efa_unit_tests.c
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ int main(void)
cmocka_unit_test_setup_teardown(test_rdm_cq_create_error_handling, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_unresponsive_receiver, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_unresponsive_receiver_missing_peer_host_id, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_unreachable_receiver, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_invalid_qpn, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_message_too_long, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_bad_recv_status, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
Expand Down
1 change: 1 addition & 0 deletions prov/efa/test/efa_unit_tests.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ void test_ibv_cq_ex_read_failed_poll();
void test_rdm_cq_create_error_handling();
void test_rdm_cq_read_bad_send_status_unresponsive_receiver();
void test_rdm_cq_read_bad_send_status_unresponsive_receiver_missing_peer_host_id();
void test_rdm_cq_read_bad_send_status_unreachable_receiver();
void test_rdm_cq_read_bad_send_status_invalid_qpn();
void test_rdm_cq_read_bad_send_status_message_too_long();
void test_ibv_cq_ex_read_bad_recv_status();
Expand Down
Loading