Skip to content

Commit ed1861e

Browse files
rauteric authored and rajachan committed
rdma: (fix) memory leak: don't alloc recv_conn_resp_req on EAGAIN
Move the call to `prepare_recv_conn_resp_req` before the entry point of the `COMM_SEND_CONN` stage. This resolves a potential memory leak: because the call sat after the `case COMM_SEND_CONN:` label, `prepare_recv_conn_resp_req` was called again each time `connect` was re-entered at that stage after EAGAIN. Also fix up a similar case in `accept`, although no memory is allocated there, so there was no leak. Signed-off-by: Eric Raut <[email protected]> (cherry picked from commit 398b853)
1 parent 6969756 commit ed1861e

File tree

1 file changed

+8
-8
lines changed

1 file changed

+8
-8
lines changed

src/nccl_ofi_rdma.c

+8 -8
Original file line numberDiff line numberDiff line change
@@ -3790,10 +3790,6 @@ static int accept(nccl_net_ofi_listen_comm_t *listen_comm,
37903790
/* Reset request state for connect response message */
37913791
prepare_send_conn_resp_req(l_comm);
37923792

3793-
l_comm->stage = COMM_SEND_CONN;
3794-
3795-
case COMM_SEND_CONN:
3796-
37973793
/* Initialize connect response message */
37983794
ret = prepare_conn_resp(ep, l_comm, dev_id);
37993795
if (ret != 0) {
@@ -3806,6 +3802,10 @@ static int accept(nccl_net_ofi_listen_comm_t *listen_comm,
38063802
/* Send r_comm's remote comm ID */
38073803
conn_msg->remote_comm_id = r_comm->remote_comm_id;
38083804

3805+
l_comm->stage = COMM_SEND_CONN;
3806+
3807+
case COMM_SEND_CONN:
3808+
38093809
/* COMM_SEND_CONN: Send connect response message to remote */
38103810
ret = post_send_conn_resp(r_comm, conn_msg, device, ep, req);
38113811
if (ret == -FI_EAGAIN) {
@@ -5159,17 +5159,17 @@ static int connect(nccl_net_ofi_ep_t *base_ep,
51595159
}
51605160
comm_state->req = &req->base;
51615161

5162-
comm_state->stage = COMM_SEND_CONN;
5163-
5164-
case COMM_SEND_CONN:
5165-
51665162
/* Prepare request to receive connect response message */
51675163
s_comm->conn_resp_req = prepare_recv_conn_resp_req(s_comm);
51685164
if (OFI_UNLIKELY(s_comm->conn_resp_req == NULL)) {
51695165
send_close(s_comm);
51705166
return -EINVAL;
51715167
}
51725168

5169+
comm_state->stage = COMM_SEND_CONN;
5170+
5171+
case COMM_SEND_CONN:
5172+
51735173
/* COMM_SEND_CONN: Post a connect message to send peer connections */
51745174
ret = post_send_conn(s_comm, device, ep, req);
51755175
if (ret == -FI_EAGAIN) {

0 commit comments

Comments
 (0)