Skip to content

Commit 55290bb

Browse files
authored
Merge pull request #10372 from evgeny-leksikov/uct_gga_res_list
UCT/GGA: filter out unsupported resources from component list
2 parents fec7739 + 777757e commit 55290bb

File tree

5 files changed

+181
-75
lines changed

5 files changed

+181
-75
lines changed

src/uct/ib/base/ib_md.c

+18-3
Original file line numberDiff line numberDiff line change
@@ -817,9 +817,10 @@ int uct_ib_device_is_accessible(struct ibv_device *device)
817817
return uct_ib_device_is_supported(device);
818818
}
819819

820-
ucs_status_t uct_ib_query_md_resources(uct_component_t *component,
821-
uct_md_resource_desc_t **resources_p,
822-
unsigned *num_resources_p)
820+
ucs_status_t
821+
uct_ib_base_query_md_resources(uct_md_resource_desc_t **resources_p,
822+
unsigned *num_resources_p,
823+
uct_ib_check_device_cb_t check_device_cb)
823824
{
824825
int num_resources = 0;
825826
uct_md_resource_desc_t *resources;
@@ -859,6 +860,11 @@ ucs_status_t uct_ib_query_md_resources(uct_component_t *component,
859860
continue;
860861
}
861862

863+
/* Skip devices that are not applicable */
864+
if (!check_device_cb(device_list[i])) {
865+
continue;
866+
}
867+
862868
ucs_snprintf_zero(resources[num_resources].md_name,
863869
sizeof(resources[num_resources].md_name),
864870
"%s", ibv_get_device_name(device_list[i]));
@@ -1101,6 +1107,15 @@ uct_ib_fork_init(const uct_ib_md_config_t *md_config, int *fork_init_p)
11011107
return UCS_OK;
11021108
}
11031109

1110+
static ucs_status_t
1111+
uct_ib_query_md_resources(uct_component_t *component,
1112+
uct_md_resource_desc_t **resources_p,
1113+
unsigned *num_resources_p)
1114+
{
1115+
return uct_ib_base_query_md_resources(resources_p, num_resources_p,
1116+
ucs_empty_function_return_one_int);
1117+
}
1118+
11041119
static ucs_status_t
11051120
uct_ib_md_open(uct_component_t *component, const char *md_name,
11061121
const uct_md_config_t *uct_md_config, uct_md_h *md_p)

src/uct/ib/base/ib_md.h

+13-3
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,15 @@
3535
uct_ib_device_name(&(_md)->dev), ## __VA_ARGS__)
3636

3737

38+
/**
39+
* Callback function to check devices and filter out those that are not applicable.
40+
*
41+
* @param [in] device IB device.
42+
* @return 1 for acceptable device, otherwise 0.
43+
*/
44+
typedef int (*uct_ib_check_device_cb_t)(struct ibv_device *device);
45+
46+
3847
/**
3948
* IB MD statistics counters
4049
*/
@@ -406,9 +415,10 @@ ucs_status_t uct_ib_rkey_unpack(uct_component_t *component,
406415
const void *rkey_buffer, uct_rkey_t *rkey_p,
407416
void **handle_p);
408417

409-
ucs_status_t uct_ib_query_md_resources(uct_component_t *component,
410-
uct_md_resource_desc_t **resources_p,
411-
unsigned *num_resources_p);
418+
ucs_status_t
419+
uct_ib_base_query_md_resources(uct_md_resource_desc_t **resources_p,
420+
unsigned *num_resources_p,
421+
uct_ib_check_device_cb_t check_device_cb);
412422

413423
ucs_status_t uct_ib_get_device_by_name(struct ibv_device **ib_device_list,
414424
int num_devices, const char *md_name,

src/uct/ib/mlx5/dv/ib_mlx5dv_md.c

+63-59
Original file line numberDiff line numberDiff line change
@@ -1765,27 +1765,34 @@ uct_ib_mlx5_devx_query_lag(uct_ib_mlx5_md_t *md, uint8_t *state)
17651765
return UCS_OK;
17661766
}
17671767

1768-
static struct ibv_context *
1769-
uct_ib_mlx5_devx_open_device(struct ibv_device *ibv_device)
1768+
struct ibv_context* uct_ib_mlx5_devx_open_device(struct ibv_device *ibv_device)
17701769
{
1771-
struct mlx5dv_context_attr dv_attr = {};
1772-
struct mlx5dv_devx_event_channel *event_channel;
1770+
struct mlx5dv_context_attr dv_attr = {
1771+
.flags = MLX5DV_CONTEXT_FLAGS_DEVX
1772+
};
17731773
struct ibv_context *ctx;
1774-
struct ibv_cq *cq;
17751774

1776-
dv_attr.flags |= MLX5DV_CONTEXT_FLAGS_DEVX;
17771775
ctx = mlx5dv_open_device(ibv_device, &dv_attr);
17781776
if (ctx == NULL) {
17791777
ucs_debug("mlx5dv_open_device(%s) failed: %m",
17801778
ibv_get_device_name(ibv_device));
17811779
return NULL;
17821780
}
17831781

1782+
return ctx;
1783+
}
1784+
1785+
static ucs_status_t
1786+
uct_ib_mlx5_devx_check_event_channel(struct ibv_context *ctx)
1787+
{
1788+
struct mlx5dv_devx_event_channel *event_channel;
1789+
struct ibv_cq *cq;
1790+
17841791
cq = ibv_create_cq(ctx, 1, NULL, NULL, 0);
17851792
if (cq == NULL) {
17861793
uct_ib_check_memlock_limit_msg(ctx, UCS_LOG_LEVEL_DEBUG,
17871794
"ibv_create_cq()");
1788-
goto close_ctx;
1795+
return UCS_ERR_UNSUPPORTED;
17891796
}
17901797

17911798
ibv_destroy_cq(cq);
@@ -1794,17 +1801,12 @@ uct_ib_mlx5_devx_open_device(struct ibv_device *ibv_device)
17941801
ctx, MLX5_IB_UAPI_DEVX_CR_EV_CH_FLAGS_OMIT_DATA);
17951802
if (event_channel == NULL) {
17961803
ucs_diag("mlx5dv_devx_create_event_channel(%s) failed: %m",
1797-
ibv_get_device_name(ibv_device));
1798-
goto close_ctx;
1804+
ibv_get_device_name(ctx->device));
1805+
return UCS_ERR_UNSUPPORTED;
17991806
}
18001807

18011808
mlx5dv_devx_destroy_event_channel(event_channel);
1802-
1803-
return ctx;
1804-
1805-
close_ctx:
1806-
ibv_close_device(ctx);
1807-
return NULL;
1809+
return UCS_OK;
18081810
}
18091811

18101812
static uct_ib_md_ops_t uct_ib_mlx5_devx_md_ops;
@@ -1851,8 +1853,21 @@ static void uct_ib_mlx5_devx_init_flush_mr(uct_ib_mlx5_md_t *md)
18511853
md->super.flush_rkey = uct_ib_mlx5_flush_rkey_make();
18521854
}
18531855

1854-
static ucs_status_t
1855-
uct_ib_mlx5_devx_query_cap_2(struct ibv_context *ctx, void *out, size_t size)
1856+
ucs_status_t uct_ib_mlx5_devx_query_cap(struct ibv_context *ctx, uint32_t opmod,
1857+
void *out, size_t size, char *msg_arg,
1858+
int silent)
1859+
{
1860+
char in[UCT_IB_MLX5DV_ST_SZ_BYTES(query_hca_cap_in)] = {};
1861+
1862+
UCT_IB_MLX5DV_SET(query_hca_cap_in, in, opcode,
1863+
UCT_IB_MLX5_CMD_OP_QUERY_HCA_CAP);
1864+
UCT_IB_MLX5DV_SET(query_hca_cap_in, in, op_mod, opmod);
1865+
return uct_ib_mlx5_devx_general_cmd(ctx, in, ucs_static_array_size(in),
1866+
out, size, msg_arg, silent);
1867+
}
1868+
1869+
ucs_status_t uct_ib_mlx5_devx_query_cap_2(struct ibv_context *ctx,
1870+
void *out, size_t size)
18561871
{
18571872
char in[UCT_IB_MLX5DV_ST_SZ_BYTES(query_hca_cap_in)] = {};
18581873

@@ -1866,8 +1881,7 @@ uct_ib_mlx5_devx_query_cap_2(struct ibv_context *ctx, void *out, size_t size)
18661881
"QUERY_HCA_CAP, CAP2", 1);
18671882
}
18681883

1869-
static void uct_ib_mlx5_devx_check_xgvmi(uct_ib_mlx5_md_t *md, void *cap_2,
1870-
uct_ib_device_t *dev)
1884+
int uct_ib_mlx5_devx_check_xgvmi(void *cap_2, const char *dev_name)
18711885
{
18721886
uint64_t object_for_other_vhca;
18731887
uint32_t object_to_object;
@@ -1881,13 +1895,11 @@ static void uct_ib_mlx5_devx_check_xgvmi(uct_ib_mlx5_md_t *md, void *cap_2,
18811895
UCT_IB_MLX5_HCA_CAPS_2_CROSS_VHCA_OBJ_TO_OBJ_LOCAL_MKEY_TO_REMOTE_MKEY) &&
18821896
(object_for_other_vhca &
18831897
UCT_IB_MLX5_HCA_CAPS_2_ALLOWED_OBJ_FOR_OTHER_VHCA_ACCESS_MKEY)) {
1884-
md->flags |= UCT_IB_MLX5_MD_FLAG_INDIRECT_XGVMI;
1885-
md->super.cap_flags |= UCT_MD_FLAG_EXPORTED_MKEY;
1886-
ucs_debug("%s: cross gvmi alias mkey is supported",
1887-
uct_ib_device_name(dev));
1898+
ucs_debug("%s: cross gvmi alias mkey is supported", dev_name);
1899+
return 1;
18881900
} else {
1889-
ucs_debug("%s: crossing_vhca_mkey is not supported",
1890-
uct_ib_device_name(dev));
1901+
ucs_debug("%s: crossing_vhca_mkey is not supported", dev_name);
1902+
return 0;
18911903
}
18921904
}
18931905

@@ -2157,22 +2169,18 @@ ucs_status_t uct_ib_mlx5_devx_md_open(struct ibv_device *ibv_device,
21572169
const uct_ib_md_config_t *md_config,
21582170
uct_ib_md_t **p_md)
21592171
{
2160-
size_t out_len = UCT_IB_MLX5DV_ST_SZ_BYTES(query_hca_cap_out);
2161-
size_t in_len = UCT_IB_MLX5DV_ST_SZ_BYTES(query_hca_cap_in);
2162-
size_t total_len = (2 * out_len) + in_len;
2163-
char *buf, *out, *in, *cap_2_out;
2172+
uint8_t lag_state = 0;
2173+
size_t out_len = UCT_IB_MLX5DV_ST_SZ_BYTES(query_hca_cap_out);
2174+
size_t total_len = 2 * out_len;
2175+
char *buf, *out, *cap_2_out;
2176+
void *cap, *cap_2;
21642177
ucs_status_t status;
2165-
void *cap_2;
2166-
uint8_t lag_state = 0;
21672178
uint8_t log_max_qp;
21682179
uint16_t vhca_id;
21692180
struct ibv_context *ctx;
21702181
uct_ib_device_t *dev;
21712182
uct_ib_mlx5_md_t *md;
21722183
unsigned max_rd_atomic_dc;
2173-
void *cap;
2174-
int ret;
2175-
ucs_log_level_t log_level;
21762184
ucs_mpool_params_t mp_params;
21772185
int ksm_atomic;
21782186

@@ -2184,8 +2192,7 @@ ucs_status_t uct_ib_mlx5_devx_md_open(struct ibv_device *ibv_device,
21842192
}
21852193

21862194
out = buf;
2187-
in = UCS_PTR_BYTE_OFFSET(out, out_len);
2188-
cap_2_out = UCS_PTR_BYTE_OFFSET(in, in_len);
2195+
cap_2_out = UCS_PTR_BYTE_OFFSET(out, out_len);
21892196

21902197
if (!mlx5dv_is_supported(ibv_device)) {
21912198
status = UCS_ERR_UNSUPPORTED;
@@ -2203,6 +2210,11 @@ ucs_status_t uct_ib_mlx5_devx_md_open(struct ibv_device *ibv_device,
22032210
goto err_free_buffer;
22042211
}
22052212

2213+
status = uct_ib_mlx5_devx_check_event_channel(ctx);
2214+
if (status != UCS_OK) {
2215+
goto err_free_context;
2216+
}
2217+
22062218
md = ucs_derived_of(uct_ib_md_alloc(sizeof(*md), "ib_mlx5_devx_md", ctx),
22072219
uct_ib_mlx5_md_t);
22082220
if (md == NULL) {
@@ -2224,24 +2236,12 @@ ucs_status_t uct_ib_mlx5_devx_md_open(struct ibv_device *ibv_device,
22242236
goto err_lru_cleanup;
22252237
}
22262238

2227-
cap = UCT_IB_MLX5DV_ADDR_OF(query_hca_cap_out, out, capability);
2228-
UCT_IB_MLX5DV_SET(query_hca_cap_in, in, opcode, UCT_IB_MLX5_CMD_OP_QUERY_HCA_CAP);
2229-
UCT_IB_MLX5DV_SET(query_hca_cap_in, in, op_mod, UCT_IB_MLX5_HCA_CAP_OPMOD_GET_CUR |
2230-
(UCT_IB_MLX5_CAP_GENERAL << 1));
2231-
ret = mlx5dv_devx_general_cmd(ctx, in, in_len, out, out_len);
2232-
if (ret != 0) {
2233-
if ((errno == EPERM) || (errno == EPROTONOSUPPORT) ||
2234-
(errno == EOPNOTSUPP)) {
2235-
status = UCS_ERR_UNSUPPORTED;
2236-
log_level = UCS_LOG_LEVEL_DEBUG;
2237-
} else {
2238-
status = UCS_ERR_IO_ERROR;
2239-
log_level = UCS_LOG_LEVEL_ERROR;
2240-
}
2241-
ucs_log(log_level,
2242-
"mlx5dv_devx_general_cmd(QUERY_HCA_CAP) failed,"
2243-
" syndrome 0x%x: %m",
2244-
UCT_IB_MLX5DV_GET(query_hca_cap_out, out, syndrome));
2239+
cap = UCT_IB_MLX5DV_ADDR_OF(query_hca_cap_out, out, capability);
2240+
status = uct_ib_mlx5_devx_query_cap(ctx,
2241+
UCT_IB_MLX5_HCA_CAP_OPMOD_GET_CUR |
2242+
(UCT_IB_MLX5_CAP_GENERAL << 1),
2243+
out, out_len, "QUERY_HCA_CAP", 0);
2244+
if (status != UCS_OK) {
22452245
goto err_lru_cleanup;
22462246
}
22472247

@@ -2350,8 +2350,11 @@ ucs_status_t uct_ib_mlx5_devx_md_open(struct ibv_device *ibv_device,
23502350
status = uct_ib_mlx5_devx_query_cap_2(ctx, cap_2_out, out_len);
23512351
if (status == UCS_OK) {
23522352
cap_2 = UCT_IB_MLX5DV_ADDR_OF(query_hca_cap_out, cap_2_out, capability);
2353+
if (uct_ib_mlx5_devx_check_xgvmi(cap_2, uct_ib_device_name(dev))) {
2354+
md->flags |= UCT_IB_MLX5_MD_FLAG_INDIRECT_XGVMI;
2355+
md->super.cap_flags |= UCT_MD_FLAG_EXPORTED_MKEY;
2356+
}
23532357

2354-
uct_ib_mlx5_devx_check_xgvmi(md, cap_2, dev);
23552358
uct_ib_mlx5_devx_check_mkey_by_name(md, cap_2, dev);
23562359
} else {
23572360
cap_2 = NULL;
@@ -2367,10 +2370,11 @@ ucs_status_t uct_ib_mlx5_devx_md_open(struct ibv_device *ibv_device,
23672370
uint8_t arg_size;
23682371
int cap_ops, mode8b;
23692372

2370-
UCT_IB_MLX5DV_SET(query_hca_cap_in, in, op_mod, UCT_IB_MLX5_HCA_CAP_OPMOD_GET_CUR |
2371-
(UCT_IB_MLX5_CAP_ATOMIC << 1));
2372-
status = uct_ib_mlx5_devx_general_cmd(ctx, in, in_len, out, out_len,
2373-
"QUERY_HCA_CAP, ATOMIC", 0);
2373+
status = uct_ib_mlx5_devx_query_cap(ctx,
2374+
UCT_IB_MLX5_HCA_CAP_OPMOD_GET_CUR |
2375+
(UCT_IB_MLX5_CAP_ATOMIC << 1),
2376+
out, out_len,
2377+
"QUERY_HCA_CAP, ATOMIC", 0);
23742378
if (status != UCS_OK) {
23752379
goto err_lru_cleanup;
23762380
}

0 commit comments

Comments
 (0)