@@ -43,7 +43,7 @@ static const char mpibind_help [] =
 mpibind: Automatically assign CPU and GPU affinity using best-guess defaults.\n\
 \n\
 The default behavior attempts to bind MPI tasks to specific processing\n\
-units. If OMP_NUM_THREDS is set, each thread will be similarly bound\n\
+units. If OMP_NUM_THREADS is set, each thread will be similarly bound\n\
 to a processing unit. MPI+OpenMP programs must set OMP_NUM_THREADS.\n\
 \n\
 Option Usage: --mpibind[=args...]\n\
@@ -69,13 +69,15 @@ E.g., MPIBIND=w.0-9\n\
 ****************************************************************************/
 
 static hwloc_topology_t topology;
-static int32_t disabled = 0;    /* True if disabled by --mpibind=off */
-static int32_t enabled = 1;     /* True if enabled by configuration */
+static int32_t disabled = 0;       /* True if disabled by --mpibind=off */
+static int32_t enabled = 1;        /* True if enabled by configuration */
 static int32_t verbose = 0;
-static uint32_t cpus = 0;       /* a bitmap of <range> specified cores */
-static uint32_t level_size = 0; /* the number of available cores on this node */
-static uint32_t local_rank = 0; /* the rank relative to this node */
-static uint32_t local_size = 0; /* the number of tasks to run on this node */
+static uint32_t cpus = 0;          /* a bitmap of <range> specified cores */
+static uint32_t level_size = 0;    /* number of processing units available */
+static uint32_t local_rank = 0;    /* rank relative to this node */
+static uint32_t local_size = 0;    /* number of tasks to run on this node */
+static uint32_t local_threads = 0; /* number of threads to run on this node */
+static uint32_t num_cores = 0;     /* number of physical cores available */
 static uint32_t num_threads = 0;
 static uint32_t rank = 0;
 
@@ -123,7 +125,7 @@ static int parse_option (const char *opt, int32_t remote)
     else if (!strncmp (opt, "vv", 3)) {
         verbose = 3;
         if (remote)
-            slurm_debug2 ("setting 'vv' verbosity");
+            slurm_debug2 ("mpibind: setting 'vv' verbosity");
         else
             printf ("setting 'vv' verbosity\n");
     } else if (!strncmp (opt, "v", 2) || !strncmp (opt, "verbose", 8))
@@ -208,6 +210,19 @@ static int get_local_env ()
     char *val = NULL;
     int32_t rc = -1;
 
+    if ((val = getenv ("MPIBIND"))) {
+        if (verbose > 1)
+            printf ("mpibind: processing MPIBIND=%s\n", val);
+        /* This next call is essentially a validation exercise. The
+         * MPIBIND options will be parsed and validated and the user
+         * will be informed or alerted at their requested
+         * verbosity. The actual options specified in MPIBIND will be
+         * processed in get_remote_env(). */
+        rc = parse_user_option (0, val, 0);
+    } else {
+        rc = 0;
+    }
+
     /* Need the number of threads for the 'mem' policy */
     if ((val = getenv ("OMP_NUM_THREADS"))) {
         num_threads = strtol (val, NULL, 10);
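The MPIBIND block moves ahead of the OMP_NUM_THREADS handling so that any verbosity requested through MPIBIND is already in effect when the thread-count messages are emitted. A minimal, self-contained sketch of the resulting control flow, with a simplified stand-in for the plugin's parse_user_option() (the stand-in body recognizes only 'v' and is illustrative, not the plugin's grammar):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <stdint.h>

    static int32_t verbose = 0;
    static uint32_t num_threads = 0;

    /* Illustrative stand-in for parse_user_option(). */
    static int parse_user_option_sketch (const char *val)
    {
        if (!strncmp (val, "v", 2))
            verbose = 2;
        return 0;
    }

    static int get_local_env_sketch (void)
    {
        char *val = NULL;
        int32_t rc = 0;

        /* Parse MPIBIND first so its verbosity applies below. */
        if ((val = getenv ("MPIBIND")))
            rc = parse_user_option_sketch (val);

        if ((val = getenv ("OMP_NUM_THREADS"))) {
            num_threads = strtol (val, NULL, 10);
            if (verbose > 1)
                printf ("mpibind: OMP_NUM_THREADS=%u\n", num_threads);
        }
        return rc;
    }

    int main (void)
    {
        return get_local_env_sketch ();
    }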
@@ -222,19 +237,6 @@ static int get_local_env ()
                 "program\n");
     }
 
-    if ((val = getenv ("MPIBIND"))) {
-        if (verbose > 1)
-            printf ("mpibind: processing MPIBIND=%s\n", val);
-        /* This next call is essentially a validation exercise. The
-         * MPIBIND options will be parsed and validated and the user
-         * will be informed or alerted at their requested
-         * verbosity. The actual options specified in MPIBIND will be
-         * processed in get_remote_env(). */
-        rc = parse_user_option (0, val, 0);
-    } else {
-        rc = 0;
-    }
-
     return rc;
 }
 
@@ -248,7 +250,7 @@ static int get_remote_env (spank_t sp)
         if (rank)
             verbose = 0;
     } else {
-        slurm_error ("Failed to retrieve global rank from environment");
+        slurm_error ("mpibind: Failed to retrieve global rank from environment");
         goto ret;
     }
 
@@ -260,7 +262,7 @@ static int get_remote_env (spank_t sp)
         if (verbose > 1)
             slurm_debug ("mpibind: retrieved local rank %u", local_rank);
     } else {
-        slurm_error ("Failed to retrieve local rank from environment");
+        slurm_error ("mpibind: Failed to retrieve local rank from environment");
         goto ret;
     }
 
@@ -269,7 +271,7 @@ static int get_remote_env (spank_t sp)
         if (verbose > 1)
             slurm_debug ("mpibind: retrieved local size %u", local_size);
     } else {
-        slurm_error ("Failed to retrieve local size from environment");
+        slurm_error ("mpibind: Failed to retrieve local size from environment");
        goto ret;
     }
 
@@ -413,15 +415,18 @@ static void decimate_gpusets (hwloc_cpuset_t *gpusets, uint32_t numaobjs,
 static char *get_gomp_str (hwloc_cpuset_t cpuset)
 {
     char *str = NULL;
-    int32_t i;
+    int32_t i, j;
 
     i = hwloc_bitmap_first (cpuset);
-    while (i != -1) {
+    j = num_threads;
+
+    while ((i != -1) && (j > 0)) {
         if (str)
             asprintf (&str, "%s,%d", str, i);
         else
             asprintf (&str, "%d", i);
         i = hwloc_bitmap_next (cpuset, i);
+        j--;
     }
 
     return str;
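The new j counter caps the affinity list at num_threads entries, so GOMP_CPU_AFFINITY names only as many processing units as there are OpenMP threads. A self-contained sketch of the same capped iteration, assuming hwloc is available; unlike the patch's asprintf chain, this variant also frees each intermediate string rather than leaking it:

    #define _GNU_SOURCE /* for asprintf */
    #include <stdio.h>
    #include <stdlib.h>
    #include <hwloc.h>

    /* Emit at most 'limit' processing-unit indices from 'cpuset' as a
     * comma-separated string suitable for GOMP_CPU_AFFINITY.  Returns
     * a malloc'd string the caller frees, or NULL for an empty set. */
    static char *gomp_str_sketch (hwloc_cpuset_t cpuset, int limit)
    {
        char *str = NULL, *tmp = NULL;
        int i = hwloc_bitmap_first (cpuset);

        while (i != -1 && limit-- > 0) {
            if (str) {
                if (asprintf (&tmp, "%s,%d", str, i) < 0)
                    break;
                free (str);
                str = tmp;
            } else if (asprintf (&str, "%d", i) < 0) {
                break;
            }
            i = hwloc_bitmap_next (cpuset, i);
        }
        return str;
    }

    int main (void)
    {
        hwloc_bitmap_t set = hwloc_bitmap_alloc ();
        char *s;

        hwloc_bitmap_set_range (set, 0, 3);   /* pus {0,1,2,3} */
        s = gomp_str_sketch (set, 2);         /* cap at 2 threads */
        printf ("GOMP_CPU_AFFINITY=%s\n", s); /* prints 0,1 */
        free (s);
        hwloc_bitmap_free (set);
        return 0;
    }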
@@ -504,16 +509,16 @@ int slurm_spank_user_init (spank_t sp, int32_t ac, char **av)
 int slurm_spank_task_init (spank_t sp, int32_t ac, char **av)
 {
     char *str;
+    float num_pus_per_task;
     hwloc_cpuset_t *cpusets = NULL;
     hwloc_cpuset_t *gpusets = NULL;
     hwloc_cpuset_t cpuset;
     hwloc_obj_t obj;
     int32_t gpus = 0;
     int32_t i;
+    int32_t index;
     int32_t numaobjs;
-    int64_t index;
     uint32_t gpu_bits = 0;
-    uint32_t num_pus_per_task;
 
     if (!spank_remote (sp))
         return (0);
@@ -527,13 +532,18 @@ int slurm_spank_task_init (spank_t sp, int32_t ac, char **av)
         display_cpubind ("starting binding");
     }
 
+    local_threads = local_size;
+    if (num_threads)
+        local_threads *= num_threads;
+
     cpuset = hwloc_bitmap_alloc ();
 
     if (cpus) {
         int32_t coreobjs = hwloc_get_nbobjs_by_type (topology, HWLOC_OBJ_CORE);
         int j = 0;
 
         /* level_size has been set in process_opt() */
+        num_cores = level_size;
         cpusets = calloc (level_size, sizeof (hwloc_cpuset_t));
 
         for (i = 0; i < coreobjs; i++) {
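The local_threads value computed above sizes the binding level by threads rather than tasks: for example, with local_size = 4 tasks and OMP_NUM_THREADS = 6, local_threads is 24, so the depth search in the next hunk keeps descending until it finds a topology level with at least 24 processing units.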
@@ -550,10 +560,11 @@ int slurm_spank_task_init (spank_t sp, int32_t ac, char **av)
     } else {
         uint32_t depth;
         uint32_t topodepth = hwloc_topology_get_depth (topology);
+        num_cores = hwloc_get_nbobjs_by_type (topology, HWLOC_OBJ_CORE);
 
         for (depth = 0; depth < topodepth; depth++) {
             level_size = hwloc_get_nbobjs_by_depth (topology, depth);
-            if (level_size >= local_size)
+            if (level_size >= local_threads)
                 break;
         }
         if (depth == topodepth)
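Targeting local_threads instead of local_size means the chosen level must offer a slot per OpenMP thread, not merely one per task. A minimal sketch of the same search, assuming an already-loaded hwloc topology; the helper name here is illustrative:

    #include <stdio.h>
    #include <stdint.h>
    #include <hwloc.h>

    /* Return the shallowest topology depth whose level has at least
     * 'want' objects, or -1 if none does.  With the patch, 'want' is
     * local_threads (tasks x threads) instead of local_size. */
    static int find_wide_enough_level (hwloc_topology_t topo, uint32_t want,
                                       uint32_t *width)
    {
        uint32_t depth, topodepth = hwloc_topology_get_depth (topo);

        for (depth = 0; depth < topodepth; depth++) {
            *width = hwloc_get_nbobjs_by_depth (topo, depth);
            if (*width >= want)
                return (int) depth;
        }
        return -1;
    }

    int main (void)
    {
        hwloc_topology_t topo;
        uint32_t width = 0;
        int depth;

        hwloc_topology_init (&topo);
        hwloc_topology_load (topo);
        depth = find_wide_enough_level (topo, 24, &width);
        printf ("depth %d has %u objects\n", depth, width);
        hwloc_topology_destroy (topo);
        return 0;
    }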
@@ -609,17 +620,36 @@ int slurm_spank_task_init (spank_t sp, int32_t ac, char **av)
         decimate_gpusets (gpusets, numaobjs, gpus);
     }
 
-    num_pus_per_task = level_size / local_size;
-    if (!num_pus_per_task)
-        num_pus_per_task = 1;
+    num_pus_per_task = (float) level_size / local_size;
+    if (num_pus_per_task < 1.0)
+        num_pus_per_task = 1.0;
 
     if (!local_rank && verbose > 2)
-        printf ("mpibind: level size: %u, local size: %u, pus per task %u\n",
-                level_size, local_size, num_pus_per_task);
+        slurm_debug2 ("mpibind: level size: %u, local size: %u, pus per task "
+                      "%f\n", level_size, local_size, num_pus_per_task);
+
+    /* If the user did not set it, we set OMP_NUM_THREADS to the
+     * number of cores per task. */
+    if (!num_threads) {
+        num_threads = num_cores / local_size;
+        if (!num_threads)
+            num_threads = 1;
+        asprintf (&str, "%u", num_threads);
+        spank_setenv (sp, "OMP_NUM_THREADS", str, 0);
+        if (verbose > 2)
+            slurm_debug2 ("mpibind: setting OMP_NUM_THREADS to %s\n", str);
+        free (str);
+    }
 
-    index = (local_rank * num_pus_per_task) % level_size;
+    /*
+     * Note: num_pus_per_task is a float value. The next few
+     * statements result in an even distribution of tasks to cores
+     * across the available cores and also guarantees an even
+     * distribution of tasks to NUMA nodes.
+     */
+    index = (int32_t) (local_rank * num_pus_per_task);
 
-    for (i = index; i < index + num_pus_per_task; i++) {
+    for (i = index; i < index + (int32_t) num_pus_per_task; i++) {
         hwloc_bitmap_or (cpuset, cpuset, cpusets[i]);
         if (gpus) {
             int32_t j;
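The float arithmetic is what spreads tasks evenly when level_size is not a multiple of local_size. A standalone worked example with illustrative numbers (12 processing units, 5 tasks; not values taken from the patch):

    #include <stdio.h>
    #include <stdint.h>

    int main (void)
    {
        uint32_t level_size = 12, local_size = 5, local_rank;
        float num_pus_per_task = (float) level_size / local_size; /* 2.4 */

        for (local_rank = 0; local_rank < local_size; local_rank++) {
            /* Same truncation as the patch: start index and span. */
            int32_t index = (int32_t) (local_rank * num_pus_per_task);
            int32_t span = (int32_t) num_pus_per_task;
            printf ("task %u -> pus %d..%d\n", local_rank, index,
                    index + span - 1);
        }
        return 0;
    }

This prints pus 0..1, 2..3, 4..5, 7..8, and 9..10. The old integer version would have packed all five tasks contiguously onto pus 0..9, while the float version leaves gaps at pus 6 and 11, so tasks land across NUMA boundaries more evenly.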
@@ -640,37 +670,35 @@ int slurm_spank_task_init (spank_t sp, int32_t ac, char **av)
         numaobjs = hwloc_get_nbobjs_inside_cpuset_by_type (topology, cpuset,
                                                            HWLOC_OBJ_NODE);
         if ((local_size < numaobjs) && (num_threads > 1)) {
-            printf ("mpibind: Consider using at least %d MPI tasks per node\n",
-                    numaobjs);
+            slurm_verbose ("mpibind: Consider using at least %d MPI tasks per "
+                           "node\n", numaobjs);
         }
     }
 
     hwloc_bitmap_asprintf (&str, cpuset);
     if (verbose > 2)
-        printf ("mpibind: resulting cpuset %s\n", str);
+        slurm_debug2 ("mpibind: resulting cpuset %s\n", str);
 
     if (hwloc_set_cpubind (topology, cpuset, 0)) {
         slurm_error ("mpibind: could not bind to cpuset %s: %s", str,
                      strerror (errno));
     } else if (verbose > 2) {
-        printf ("mpibind: bound cpuset %s\n", str);
+        slurm_debug2 ("mpibind: bound cpuset %s\n", str);
     }
     free (str);
 
-    if (num_threads) {
-        if ((str = get_gomp_str (cpuset))) {
-            spank_setenv (sp, "GOMP_CPU_AFFINITY", str, 1);
-            if (verbose > 1)
-                printf ("mpibind: GOMP_CPU_AFFINITY=%s\n", str);
-            free (str);
-        }
+    if ((str = get_gomp_str (cpuset))) {
+        spank_setenv (sp, "GOMP_CPU_AFFINITY", str, 1);
+        if (verbose > 1)
+            slurm_debug ("mpibind: GOMP_CPU_AFFINITY=%s\n", str);
+        free (str);
     }
 
     if (gpus) {
         if ((str = get_cuda_str (gpus, gpu_bits))) {
             spank_setenv (sp, "CUDA_VISIBLE_DEVICES", str, 1);
             if (verbose > 1)
-                printf ("mpibind: CUDA_VISIBLE_DEVICES=%s\n", str);
+                slurm_debug ("mpibind: CUDA_VISIBLE_DEVICES=%s\n", str);
             free (str);
         }
 
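get_cuda_str() itself is outside this diff; the following is a plausible sketch of what it might produce from a GPU-visibility bitmask, following the same asprintf pattern as get_gomp_str. The function body is an assumption for illustration, not the plugin's actual code:

    #define _GNU_SOURCE /* for asprintf */
    #include <stdio.h>
    #include <stdlib.h>
    #include <stdint.h>

    /* Hypothetical reconstruction: turn a bitmask of visible GPUs into
     * a comma-separated list for CUDA_VISIBLE_DEVICES, e.g. 0x5 -> "0,2". */
    static char *cuda_str_sketch (int32_t gpus, uint32_t gpu_bits)
    {
        char *str = NULL, *tmp;
        int32_t i;

        for (i = 0; i < gpus; i++) {
            if (!(gpu_bits & (1u << i)))
                continue;
            if (str) {
                if (asprintf (&tmp, "%s,%d", str, i) < 0)
                    break;
                free (str);
                str = tmp;
            } else if (asprintf (&str, "%d", i) < 0) {
                break;
            }
        }
        return str;
    }

    int main (void)
    {
        char *s = cuda_str_sketch (4, 0x5);
        printf ("CUDA_VISIBLE_DEVICES=%s\n", s); /* prints 0,2 */
        free (s);
        return 0;
    }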