Skip to content

Commit 97a4498

Browse files
committed
distances: don't enforce the matrix name for NVLink operations
It may work on custom matrices as long as ports are subtype'd "NVSwitch" for now. Makes #705 obsolete. Signed-off-by: Brice Goglin <[email protected]>
1 parent a4b5501 commit 97a4498

File tree

2 files changed

+19
-26
lines changed

2 files changed

+19
-26
lines changed

hwloc/distances.c

+5-19
Original file line numberDiff line numberDiff line change
@@ -1386,19 +1386,12 @@ static __hwloc_inline int is_nvswitch(hwloc_obj_t obj)
13861386
}
13871387

13881388
static int
1389-
hwloc__distances_transform_merge_switch_ports(hwloc_topology_t topology,
1390-
struct hwloc_distances_s *distances)
1389+
hwloc__distances_transform_merge_switch_ports(struct hwloc_distances_s *distances)
13911390
{
1392-
struct hwloc_internal_distances_s *dist = hwloc__internal_distances_from_public(topology, distances);
13931391
hwloc_obj_t *objs = distances->objs;
13941392
hwloc_uint64_t *values = distances->values;
13951393
unsigned first, i, j, nbobjs = distances->nbobjs;
13961394

1397-
if (!dist || strcmp(dist->name, "NVLinkBandwidth")) {
1398-
errno = EINVAL;
1399-
return -1;
1400-
}
1401-
14021395
/* find the first port */
14031396
first = (unsigned) -1;
14041397
for(i=0; i<nbobjs; i++)
@@ -1434,20 +1427,13 @@ hwloc__distances_transform_merge_switch_ports(hwloc_topology_t topology,
14341427
}
14351428

14361429
static int
1437-
hwloc__distances_transform_transitive_closure(hwloc_topology_t topology,
1438-
struct hwloc_distances_s *distances)
1430+
hwloc__distances_transform_transitive_closure(struct hwloc_distances_s *distances)
14391431
{
1440-
struct hwloc_internal_distances_s *dist = hwloc__internal_distances_from_public(topology, distances);
14411432
hwloc_obj_t *objs = distances->objs;
14421433
hwloc_uint64_t *values = distances->values;
14431434
unsigned nbobjs = distances->nbobjs;
14441435
unsigned i, j, k;
14451436

1446-
if (!dist || strcmp(dist->name, "NVLinkBandwidth")) {
1447-
errno = EINVAL;
1448-
return -1;
1449-
}
1450-
14511437
for(i=0; i<nbobjs; i++) {
14521438
hwloc_uint64_t bw_i2sw = 0;
14531439
if (is_nvswitch(objs[i]))
@@ -1475,7 +1461,7 @@ hwloc__distances_transform_transitive_closure(hwloc_topology_t topology,
14751461
}
14761462

14771463
int
1478-
hwloc_distances_transform(hwloc_topology_t topology,
1464+
hwloc_distances_transform(hwloc_topology_t topology __hwloc_attribute_unused,
14791465
struct hwloc_distances_s *distances,
14801466
enum hwloc_distances_transform_e transform,
14811467
void *transform_attr,
@@ -1494,13 +1480,13 @@ hwloc_distances_transform(hwloc_topology_t topology,
14941480
case HWLOC_DISTANCES_TRANSFORM_MERGE_SWITCH_PORTS:
14951481
{
14961482
int err;
1497-
err = hwloc__distances_transform_merge_switch_ports(topology, distances);
1483+
err = hwloc__distances_transform_merge_switch_ports(distances);
14981484
if (!err)
14991485
err = hwloc__distances_transform_remove_null(distances);
15001486
return err;
15011487
}
15021488
case HWLOC_DISTANCES_TRANSFORM_TRANSITIVE_CLOSURE:
1503-
return hwloc__distances_transform_transitive_closure(topology, distances);
1489+
return hwloc__distances_transform_transitive_closure(distances);
15041490
default:
15051491
errno = EINVAL;
15061492
return -1;

include/hwloc/distances.h

+14-7
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright © 2010-2024 Inria. All rights reserved.
2+
* Copyright © 2010-2025 Inria. All rights reserved.
33
* See COPYING in top-level directory.
44
*/
55

@@ -234,17 +234,24 @@ enum hwloc_distances_transform_e {
234234
HWLOC_DISTANCES_TRANSFORM_LINKS = 1,
235235

236236
/** \brief Merge switches with multiple ports into a single object.
237-
* This currently only applies to NVSwitches where GPUs seem connected to different
238-
* separate switch ports in the NVLinkBandwidth matrix. This transformation will
239-
* replace all of them with the same port connected to all GPUs.
240-
* Other ports are removed by applying ::HWLOC_DISTANCES_TRANSFORM_REMOVE_NULL internally.
237+
*
238+
* This currently only applies to NVSwitches where GPUs seem connected
239+
* to different switch ports. Switch ports must be objects with subtype
240+
* "NVSwitch" as in the NVLinkBandwidth matrix.
241+
*
242+
* This transformation will replace all ports with only the first one,
243+
* now connected to all GPUs. Other ports are removed by applying
244+
* ::HWLOC_DISTANCES_TRANSFORM_REMOVE_NULL internally.
241245
* \hideinitializer
242246
*/
243247
HWLOC_DISTANCES_TRANSFORM_MERGE_SWITCH_PORTS = 2,
244248

245249
/** \brief Apply a transitive closure to the matrix to connect objects across switches.
246-
* This currently only applies to GPUs and NVSwitches in the NVLinkBandwidth matrix.
247-
* All pairs of GPUs will be reported as directly connected.
250+
*
251+
* All pairs of GPUs will be reported as directly connected instead of GPUs being
252+
* only connected to switches.
253+
*
254+
* Switch ports must be objects with subtype "NVSwitch" as in the NVLinkBandwidth matrix.
248255
* \hideinitializer
249256
*/
250257
HWLOC_DISTANCES_TRANSFORM_TRANSITIVE_CLOSURE = 3

0 commit comments

Comments
 (0)