@@ -44,7 +44,7 @@ mpibind: Automatically assign CPU and GPU affinity using best-guess defaults.\n\
44
44
\n\
45
45
The default behavior attempts to bind MPI tasks to specific processing\n\
46
46
units. If OMP_NUM_THREADS is set, each thread will be similarly bound\n\
47
- to a processing unit. MPI+OpenMP programs must set OMP_NUM_THREADS. \n\
47
+ to a processing unit.\n\
48
48
\n\
49
49
Option Usage: --mpibind[=args...]\n\
50
50
where args... is a period (.) separated list of one or more of the\n\
@@ -649,6 +649,7 @@ int slurm_spank_task_init (spank_t sp, int32_t ac, char **av)
649
649
cpusets [j ] = hwloc_bitmap_dup (obj -> cpuset );
650
650
} else {
651
651
slurm_error ("mpibind: failed to get core %d" , i );
652
+ return (ESPANK_ERROR );
652
653
}
653
654
j ++ ;
654
655
}
@@ -675,6 +676,7 @@ int slurm_spank_task_init (spank_t sp, int32_t ac, char **av)
675
676
} else {
676
677
slurm_error ("mpibind: failed to get object %d at depth %d" , i ,
677
678
depth );
679
+ return (ESPANK_ERROR );
678
680
}
679
681
}
680
682
}
@@ -700,21 +702,25 @@ int slurm_spank_task_init (spank_t sp, int32_t ac, char **av)
700
702
for (obj = hwloc_get_next_osdev (topology , NULL ); obj ;
701
703
obj = hwloc_get_next_osdev (topology , obj )) {
702
704
if (obj -> attr -> osdev .type == HWLOC_OBJ_OSDEV_GPU ) {
703
- hwloc_obj_t numaobj ;
705
+ hwloc_obj_t ancestor ;
704
706
#if HWLOC_API_VERSION < 0x00010b00
705
- numaobj = hwloc_get_ancestor_obj_by_type (topology ,
707
+ ancestor = hwloc_get_ancestor_obj_by_type (topology ,
706
708
HWLOC_OBJ_NODE , obj );
707
709
#else
708
- numaobj = hwloc_get_ancestor_obj_by_type (topology ,
710
+ ancestor = hwloc_get_ancestor_obj_by_type (topology ,
709
711
HWLOC_OBJ_NUMANODE , obj );
710
712
#endif
711
- if (numaobj ) {
712
- gpusets [gpus ] = hwloc_bitmap_dup (numaobj -> cpuset );
713
+ if (!ancestor )
714
+ /* The parent of GPUs on KNL nodes may be the
715
+ * machine instead of a NUMA node. */
716
+ ancestor = hwloc_get_ancestor_obj_by_type (topology ,
717
+ HWLOC_OBJ_MACHINE , obj );
718
+ if (ancestor ) {
719
+ gpusets [gpus ] = hwloc_bitmap_dup (ancestor -> cpuset );
713
720
gpus ++ ;
714
721
} else {
715
- slurm_error ("mpibind: failed to get numa parent of NVIDIA "
716
- "obj" );
717
- break ;
722
+ slurm_error ("mpibind: failed to find ancestor of GPU obj" );
723
+ return (ESPANK_ERROR );
718
724
}
719
725
}
720
726
}
0 commit comments