Skip to content

Commit 4fcffef

Browse files
authored
Merge pull request #10581 from yosefe/topic/uct-dc-fix-segfault-in-uct-dc
UCT/DC: Fix segfault in uct_dc_mlx5_ep_fence
2 parents 9a56917 + 8e82e62 commit 4fcffef

File tree

8 files changed

+40
-33
lines changed

8 files changed

+40
-33
lines changed

src/uct/ib/mlx5/dc/dc_mlx5.c

+9-20
Original file line numberDiff line numberDiff line change
@@ -368,7 +368,8 @@ static void uct_ib_mlx5dv_dci_qp_init_attr(uct_ib_qp_init_attr_t *qp_attr,
368368

369369
ucs_status_t uct_dc_mlx5_iface_create_dci(uct_dc_mlx5_iface_t *iface,
370370
uct_dci_index_t dci_index,
371-
uint8_t num_dci_channels)
371+
uint8_t num_dci_channels,
372+
uct_dc_dci_t **dci_p)
372373
{
373374
uct_ib_iface_t *ib_iface = &iface->super.super.super;
374375
uct_ib_mlx5_qp_attr_t attr = {};
@@ -455,11 +456,6 @@ ucs_status_t uct_dc_mlx5_iface_create_dci(uct_dc_mlx5_iface_t *iface,
455456
goto err_qp;
456457
}
457458

458-
ucs_assertv_always(!uct_dc_mlx5_is_dci_valid(
459-
uct_dc_mlx5_iface_dci(iface, dci_index)),
460-
"iface=%p dci_index=%d", iface, dci_index);
461-
ucs_array_elem(&iface->tx.dcis, dci_index) = dci;
462-
463459
if (uct_dc_mlx5_iface_is_policy_shared(iface)) {
464460
ucs_arbiter_group_init(&dci->arb_group);
465461
} else {
@@ -477,6 +473,7 @@ ucs_status_t uct_dc_mlx5_iface_create_dci(uct_dc_mlx5_iface_t *iface,
477473

478474
uct_rc_txqp_available_set(&dci->txqp, dci->txwq.bb_max);
479475

476+
*dci_p = dci;
480477
return UCS_OK;
481478

482479
err:
@@ -759,11 +756,10 @@ static void uct_dc_mlx5_iface_dci_pool_destroy(uct_dc_mlx5_dci_pool_t *dci_pool)
759756
ucs_array_cleanup_dynamic(&dci_pool->stack);
760757
}
761758

762-
void uct_dc_mlx5_destroy_dci(uct_dc_mlx5_iface_t *iface, uint16_t dci_index)
759+
void uct_dc_mlx5_destroy_dci(uct_dc_mlx5_iface_t *iface, uct_dc_dci_t *dci)
763760
{
764761
uct_ib_mlx5_md_t *md = ucs_derived_of(iface->super.super.super.super.md,
765762
uct_ib_mlx5_md_t);
766-
uct_dc_dci_t *dci = uct_dc_mlx5_iface_dci(iface, dci_index);
767763

768764
uct_rc_txqp_cleanup(&iface->super.super, &dci->txqp);
769765
uct_ib_mlx5_destroy_qp(md, &dci->txwq.super);
@@ -774,17 +770,16 @@ void uct_dc_mlx5_destroy_dci(uct_dc_mlx5_iface_t *iface, uint16_t dci_index)
774770
uct_ib_mlx5_qp_mmio_cleanup(&dci->txwq.super, dci->txwq.reg);
775771

776772
ucs_free(dci);
777-
ucs_array_elem(&iface->tx.dcis, dci_index) = NULL;
778773
}
779774

780775
static void uct_dc_mlx5_iface_dcis_destroy(uct_dc_mlx5_iface_t *iface)
781776
{
782-
uct_dc_dci_t *dci UCS_V_UNUSED;
783777
uct_dci_index_t dci_index;
784778
uint8_t pool_index;
779+
uct_dc_dci_t *dci;
785780

786781
uct_dc_iface_for_each_valid_dci(dci_index, dci, iface) {
787-
uct_dc_mlx5_destroy_dci(iface, dci_index);
782+
uct_dc_mlx5_destroy_dci(iface, dci);
788783
}
789784

790785
for (pool_index = 0; pool_index < iface->tx.num_dci_pools; pool_index++) {
@@ -885,19 +880,13 @@ uct_dc_mlx5_iface_init_dcis_array(uct_dc_mlx5_iface_t *iface,
885880
return status;
886881
}
887882

888-
ucs_array_length(&iface->tx.dcis) = 0;
889-
890-
status = uct_dc_mlx5_iface_create_dci(iface, 0, 1);
883+
status = uct_dc_mlx5_iface_create_dci(iface, 0, 1, &dci);
891884
if (status != UCS_OK) {
892885
return status;
893886
}
894887

895-
dci = uct_dc_mlx5_iface_dci(iface, 0);
896-
897888
iface->tx.bb_max = dci->txwq.bb_max;
898-
uct_dc_mlx5_destroy_dci(iface, 0);
899-
900-
ucs_array_length(&iface->tx.dcis) = 0;
889+
uct_dc_mlx5_destroy_dci(iface, dci);
901890

902891
return UCS_OK;
903892
}
@@ -1603,7 +1592,7 @@ static UCS_CLASS_INIT_FUNC(uct_dc_mlx5_iface_t, uct_md_h tl_md, uct_worker_h wor
16031592
self->tx.ndci = uct_dc_mlx5_iface_is_hw_dcs(self) ? 1 : config->ndci;
16041593

16051594
if (config->dcis_initial_capacity == UCS_ULUNITS_AUTO) {
1606-
max_capacity = uct_dc_mlx5_iface_is_dci_rand(self) ? SIZE_MAX : 1;
1595+
max_capacity = uct_dc_mlx5_iface_is_dci_rand(self) ? SIZE_MAX : 0;
16071596
} else {
16081597
max_capacity = config->dcis_initial_capacity;
16091598
}

src/uct/ib/mlx5/dc/dc_mlx5.h

+6-2
Original file line numberDiff line numberDiff line change
@@ -412,7 +412,8 @@ void uct_dc_mlx5_iface_reset_dci(uct_dc_mlx5_iface_t *iface,
412412

413413
ucs_status_t uct_dc_mlx5_iface_create_dci(uct_dc_mlx5_iface_t *iface,
414414
uct_dci_index_t dci_index,
415-
uint8_t num_dci_channels);
415+
uint8_t num_dci_channels,
416+
uct_dc_dci_t **dci_p);
416417

417418
ucs_status_t uct_dc_mlx5_iface_resize_and_fill_dcis(uct_dc_mlx5_iface_t *iface,
418419
uint16_t size);
@@ -429,7 +430,7 @@ uct_dc_mlx5_dci_pool_get_or_create(uct_dc_mlx5_iface_t *iface,
429430
uint32_t
430431
uct_dc_mlx5_dci_config_hash(const uct_dc_mlx5_dci_config_t *dci_config);
431432

432-
void uct_dc_mlx5_destroy_dci(uct_dc_mlx5_iface_t *iface, uint16_t dci_index);
433+
void uct_dc_mlx5_destroy_dci(uct_dc_mlx5_iface_t *iface, uct_dc_dci_t *dci);
433434

434435
static UCS_F_ALWAYS_INLINE uint8_t uct_dc_mlx5_is_dci_valid(const uct_dc_dci_t *dci)
435436
{
@@ -439,6 +440,9 @@ static UCS_F_ALWAYS_INLINE uint8_t uct_dc_mlx5_is_dci_valid(const uct_dc_dci_t *
439440
static UCS_F_ALWAYS_INLINE uct_dc_dci_t *
440441
uct_dc_mlx5_iface_dci(uct_dc_mlx5_iface_t *iface, uct_dci_index_t dci_index)
441442
{
443+
ucs_assertv(dci_index < ucs_array_length(&iface->tx.dcis),
444+
"dci_index=%d dcis_length=%d", dci_index,
445+
ucs_array_length(&iface->tx.dcis));
442446
return ucs_array_elem(&iface->tx.dcis, dci_index);
443447
}
444448

src/uct/ib/mlx5/dc/dc_mlx5_ep.c

+10-3
Original file line numberDiff line numberDiff line change
@@ -273,10 +273,17 @@ ucs_status_t uct_dc_mlx5_ep_fence(uct_ep_h tl_ep, unsigned flags)
273273
{
274274
uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_dc_mlx5_iface_t);
275275
uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t);
276+
uct_dc_dci_t *dci;
276277

277-
return uct_rc_ep_fence(tl_ep,
278-
&uct_dc_mlx5_iface_dci(iface, ep->dci)->txwq.fi,
279-
ep->dci != UCT_DC_MLX5_EP_NO_DCI);
278+
if (ep->dci != UCT_DC_MLX5_EP_NO_DCI) {
279+
dci = uct_dc_mlx5_iface_dci(iface, ep->dci);
280+
ucs_assertv(uct_dc_mlx5_is_dci_valid(dci), "iface=%p ep=%p dci=%d",
281+
iface, ep, ep->dci);
282+
return uct_rc_ep_fence(tl_ep, &dci->txwq.fi);
283+
}
284+
285+
UCT_TL_EP_STAT_FENCE(&ep->super);
286+
return UCS_OK;
280287
}
281288

282289
static ucs_status_t UCS_F_ALWAYS_INLINE

src/uct/ib/mlx5/dc/dc_mlx5_ep.h

+8-3
Original file line numberDiff line numberDiff line change
@@ -365,17 +365,21 @@ uct_dc_mlx5_dci_pool_init_dci(uct_dc_mlx5_iface_t *iface, uint8_t pool_index,
365365
num_channels = iface->tx.num_dci_channels;
366366
}
367367

368-
status = uct_dc_mlx5_iface_create_dci(iface, dci_index, num_channels);
368+
status = uct_dc_mlx5_iface_create_dci(iface, dci_index, num_channels, &dci);
369369
if (status != UCS_OK) {
370370
ucs_error("iface %p: failed to create dci %u at pool %u", iface,
371371
dci_index, pool_index);
372372
goto err;
373373
}
374374

375-
dci = uct_dc_mlx5_iface_dci(iface, dci_index);
376375
dci->path_index = pool->config.path_index;
377376
dci->pool_index = pool_index;
378377

378+
ucs_assertv_always(!uct_dc_mlx5_is_dci_valid(
379+
uct_dc_mlx5_iface_dci(iface, dci_index)),
380+
"iface=%p dci_index=%d", iface, dci_index);
381+
ucs_array_elem(&iface->tx.dcis, dci_index) = dci;
382+
379383
status = uct_dc_mlx5_iface_dci_connect(iface, dci);
380384
if (status != UCS_OK) {
381385
ucs_error("iface %p: failed to connect dci %u at pool %u", iface,
@@ -393,7 +397,8 @@ uct_dc_mlx5_dci_pool_init_dci(uct_dc_mlx5_iface_t *iface, uint8_t pool_index,
393397
return UCS_OK;
394398

395399
err_destroy:
396-
uct_dc_mlx5_destroy_dci(iface, dci_index);
400+
ucs_array_elem(&iface->tx.dcis, dci_index) = NULL;
401+
uct_dc_mlx5_destroy_dci(iface, dci);
397402
err:
398403
return status;
399404
}

src/uct/ib/mlx5/rc/rc_mlx5_ep.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -520,7 +520,7 @@ ucs_status_t uct_rc_mlx5_base_ep_fence(uct_ep_h tl_ep, unsigned flags)
520520
{
521521
uct_rc_mlx5_base_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_mlx5_base_ep_t);
522522

523-
return uct_rc_ep_fence(tl_ep, &ep->tx.wq.fi, 1);
523+
return uct_rc_ep_fence(tl_ep, &ep->tx.wq.fi);
524524
}
525525

526526
void uct_rc_mlx5_base_ep_post_check(uct_ep_h tl_ep)

src/uct/ib/rc/base/rc_ep.h

+3-3
Original file line numberDiff line numberDiff line change
@@ -519,14 +519,14 @@ uct_rc_ep_fm(uct_rc_iface_t *iface, uct_ib_fence_info_t* fi, int flag)
519519
return fence;
520520
}
521521

522-
static UCS_F_ALWAYS_INLINE ucs_status_t
523-
uct_rc_ep_fence(uct_ep_h tl_ep, uct_ib_fence_info_t* fi, int fence)
522+
static UCS_F_ALWAYS_INLINE ucs_status_t uct_rc_ep_fence(uct_ep_h tl_ep,
523+
uct_ib_fence_info_t *fi)
524524
{
525525
uct_rc_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_rc_iface_t);
526526

527527
/* in case if fence is requested and enabled by configuration
528528
* we need to schedule fence for next RDMA operation */
529-
if (fence && (iface->config.fence_mode != UCT_RC_FENCE_MODE_NONE)) {
529+
if (iface->config.fence_mode != UCT_RC_FENCE_MODE_NONE) {
530530
fi->fence_beat = iface->tx.fi.fence_beat - 1;
531531
}
532532

src/uct/ib/rc/verbs/rc_verbs_ep.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -516,7 +516,7 @@ ucs_status_t uct_rc_verbs_ep_fence(uct_ep_h tl_ep, unsigned flags)
516516
{
517517
uct_rc_verbs_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_verbs_ep_t);
518518

519-
return uct_rc_ep_fence(tl_ep, &ep->fi, 1);
519+
return uct_rc_ep_fence(tl_ep, &ep->fi);
520520
}
521521

522522
void uct_rc_verbs_ep_post_check(uct_ep_h tl_ep)

test/gtest/uct/ib/test_dc.cc

+2
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,8 @@ UCS_TEST_P(test_dc, dcs_single) {
186186
0;
187187

188188
EXPECT_EQ(UCT_DC_MLX5_EP_NO_DCI, ep->dci);
189+
status = uct_ep_fence(m_e1->ep(0), 0);
190+
EXPECT_UCS_OK(status);
189191
status = uct_ep_am_short(m_e1->ep(0), 0, 0, NULL, 0);
190192
EXPECT_UCS_OK(status);
191193
/* dci 0 must be assigned to the ep */

0 commit comments

Comments
 (0)