Commit 46c4929

Merge pull request #2 from lipari/set_OMP_NUM_THREADS
Set OMP_NUM_THREADS
2 parents: 8299417 + ccedca4

File tree: 3 files changed, +82 -49 lines changed

META (+1 -1)

@@ -1,4 +1,4 @@
 Name: slurm-spank-plugins
-Version: 0.27
+Version: 0.28
 Release: 1
 Author: Mark Grondona <[email protected]>

NEWS (+5)

@@ -1,3 +1,8 @@
+Version 0.28 (2016-03-08)
+ - Improve mpibind
+   - Address problem reported in TOSS-3112 where tasks were not distributed
+     evenly to available NUMA nodes.
+
 Version 0.27 (2015-11-19):
  - add mpibind

mpibind.c (+76 -48)

@@ -43,7 +43,7 @@ static const char mpibind_help [] =
 mpibind: Automatically assign CPU and GPU affinity using best-guess defaults.\n\
 \n\
 The default behavior attempts to bind MPI tasks to specific processing\n\
-units. If OMP_NUM_THREDS is set, each thread will be similarly bound\n\
+units. If OMP_NUM_THREADS is set, each thread will be similarly bound\n\
 to a processing unit. MPI+OpenMP programs must set OMP_NUM_THREADS.\n\
 \n\
 Option Usage: --mpibind[=args...]\n\
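
For illustration, a hypothetical launch that exercises this help text (application name, node count, and task count are made up; the --mpibind[=args...] option and the OMP_NUM_THREADS convention come from the help text itself):

    # Hybrid MPI+OpenMP run: mpibind binds each task and, because
    # OMP_NUM_THREADS is set, each of its 4 threads to a processing unit.
    OMP_NUM_THREADS=4 srun -N 2 -n 8 --mpibind=v ./hybrid_app
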
@@ -69,13 +69,15 @@ E.g., MPIBIND=w.0-9\n\
 ****************************************************************************/
 
 static hwloc_topology_t topology;
-static int32_t disabled = 0;    /* True if disabled by --mpibind=off */
-static int32_t enabled = 1;     /* True if enabled by configuration */
+static int32_t disabled = 0;       /* True if disabled by --mpibind=off */
+static int32_t enabled = 1;        /* True if enabled by configuration */
 static int32_t verbose = 0;
-static uint32_t cpus = 0;       /* a bitmap of <range> specified cores */
-static uint32_t level_size = 0; /* the number of available cores on this node */
-static uint32_t local_rank = 0; /* the rank relative to this node */
-static uint32_t local_size = 0; /* the number of tasks to run on this node */
+static uint32_t cpus = 0;          /* a bitmap of <range> specified cores */
+static uint32_t level_size = 0;    /* number of processing units available */
+static uint32_t local_rank = 0;    /* rank relative to this node */
+static uint32_t local_size = 0;    /* number of tasks to run on this node */
+static uint32_t local_threads = 0; /* number of threads to run on this node */
+static uint32_t num_cores = 0;     /* number of physical cores available */
 static uint32_t num_threads = 0;
 static uint32_t rank = 0;
 

@@ -123,7 +125,7 @@ static int parse_option (const char *opt, int32_t remote)
     else if (!strncmp (opt, "vv", 3)) {
         verbose = 3;
         if (remote)
-            slurm_debug2 ("setting 'vv' verbosity");
+            slurm_debug2 ("mpibind: setting 'vv' verbosity");
         else
             printf ("setting 'vv' verbosity\n");
     } else if (!strncmp (opt, "v", 2) || !strncmp (opt, "verbose", 8))
@@ -208,6 +210,19 @@ static int get_local_env ()
     char *val = NULL;
     int32_t rc = -1;
 
+    if ((val = getenv ("MPIBIND"))) {
+        if (verbose > 1)
+            printf ("mpibind: processing MPIBIND=%s\n", val);
+        /* This next call is essentially a validation exercise. The
+         * MPIBIND options will be parsed and validated and the user
+         * will be informed or alerted at their requested
+         * verbosity. The actual options specified in MPIBIND will be
+         * processed in get_remote_env(). */
+        rc = parse_user_option (0, val, 0);
+    } else {
+        rc = 0;
+    }
+
     /* Need the number of threads for the 'mem' policy */
     if ((val = getenv ("OMP_NUM_THREADS"))) {
         num_threads = strtol (val, NULL, 10);
@@ -222,19 +237,6 @@ static int get_local_env ()
                 "program\n");
     }
 
-    if ((val = getenv ("MPIBIND"))) {
-        if (verbose > 1)
-            printf ("mpibind: processing MPIBIND=%s\n", val);
-        /* This next call is essentially a validation exercise. The
-         * MPIBIND options will be parsed and validated and the user
-         * will be informed or alerted at their requested
-         * verbosity. The actual options specified in MPIBIND will be
-         * processed in get_remote_env(). */
-        rc = parse_user_option (0, val, 0);
-    } else {
-        rc = 0;
-    }
-
     return rc;
 }
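
The same options can also be supplied through the MPIBIND environment variable, which get_local_env() now validates before anything else so that a verbosity level requested there takes effect for the messages that follow; the actual processing still happens in get_remote_env(). A hypothetical usage sketch (the w.0-9 argument is the example given in the plugin's own help text; the application name is made up):

    # Equivalent to passing --mpibind=w.0-9 on the srun command line.
    export MPIBIND=w.0-9
    srun -n 16 ./hybrid_app
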

@@ -248,7 +250,7 @@ static int get_remote_env (spank_t sp)
         if (rank)
             verbose = 0;
     } else {
-        slurm_error ("Failed to retrieve global rank from environment");
+        slurm_error ("mpibind: Failed to retrieve global rank from environment");
         goto ret;
     }
 

@@ -260,7 +262,7 @@ static int get_remote_env (spank_t sp)
         if (verbose > 1)
             slurm_debug ("mpibind: retrieved local rank %u", local_rank);
     } else {
-        slurm_error ("Failed to retrieve local rank from environment");
+        slurm_error ("mpibind: Failed to retrieve local rank from environment");
         goto ret;
     }
 

@@ -269,7 +271,7 @@ static int get_remote_env (spank_t sp)
         if (verbose > 1)
             slurm_debug ("mpibind: retrieved local size %u", local_size);
     } else {
-        slurm_error ("Failed to retrieve local size from environment");
+        slurm_error ("mpibind: Failed to retrieve local size from environment");
         goto ret;
     }
 

@@ -413,15 +415,18 @@ static void decimate_gpusets (hwloc_cpuset_t *gpusets, uint32_t numaobjs,
 static char *get_gomp_str (hwloc_cpuset_t cpuset)
 {
     char *str = NULL;
-    int32_t i;
+    int32_t i, j;
 
     i = hwloc_bitmap_first (cpuset);
-    while (i != -1) {
+    j = num_threads;
+
+    while ((i != -1) && (j > 0)) {
         if (str)
             asprintf (&str, "%s,%d", str, i);
         else
             asprintf (&str, "%d", i);
         i = hwloc_bitmap_next (cpuset, i);
+        j--;
     }
 
     return str;
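
The new j counter caps the affinity list at OMP_NUM_THREADS entries rather than listing every PU in the task's cpuset. A self-contained sketch of the same loop, with a plain array standing in for the hwloc bitmap (PU numbers and thread count are illustrative):

    #define _GNU_SOURCE                 /* for asprintf() */
    #include <stdio.h>
    #include <stdlib.h>

    /* Mirror of get_gomp_str(): emit at most num_threads PU ids as a
     * comma-separated list suitable for GOMP_CPU_AFFINITY. */
    static char *gomp_str_sketch (const int *pus, int npus, int num_threads)
    {
        char *str = NULL, *next;
        int i;

        for (i = 0; i < npus && i < num_threads; i++) {
            if (str) {
                if (asprintf (&next, "%s,%d", str, pus[i]) < 0)
                    break;
                free (str);             /* release the previous buffer */
                str = next;
            } else if (asprintf (&str, "%d", pus[i]) < 0) {
                return NULL;
            }
        }
        return str;
    }

    int main (void)
    {
        int pus[] = { 8, 9, 10, 11 };                   /* PUs bound to this task */
        char *s = gomp_str_sketch (pus, 4, 2);          /* OMP_NUM_THREADS == 2 */
        printf ("GOMP_CPU_AFFINITY=%s\n", s ? s : "");  /* prints 8,9 */
        free (s);
        return 0;
    }

Because the commit guarantees num_threads is non-zero by the time binding happens (it is defaulted to cores-per-task further down), the call to get_gomp_str() later in the diff no longer needs its old `if (num_threads)` guard.
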
@@ -504,16 +509,16 @@ int slurm_spank_user_init (spank_t sp, int32_t ac, char **av)
 int slurm_spank_task_init (spank_t sp, int32_t ac, char **av)
 {
     char *str;
+    float num_pus_per_task;
     hwloc_cpuset_t *cpusets = NULL;
     hwloc_cpuset_t *gpusets = NULL;
     hwloc_cpuset_t cpuset;
     hwloc_obj_t obj;
     int32_t gpus = 0;
     int32_t i;
+    int32_t index;
     int32_t numaobjs;
-    int64_t index;
     uint32_t gpu_bits = 0;
-    uint32_t num_pus_per_task;
 
     if (!spank_remote (sp))
         return (0);
@@ -527,13 +532,18 @@ int slurm_spank_task_init (spank_t sp, int32_t ac, char **av)
         display_cpubind ("starting binding");
     }
 
+    local_threads = local_size;
+    if (num_threads)
+        local_threads *= num_threads;
+
     cpuset = hwloc_bitmap_alloc();
 
     if (cpus) {
         int32_t coreobjs = hwloc_get_nbobjs_by_type (topology, HWLOC_OBJ_CORE);
         int j = 0;
 
         /* level_size has been set in process_opt() */
+        num_cores = level_size;
         cpusets = calloc (level_size, sizeof (hwloc_cpuset_t));
 
         for (i = 0; i < coreobjs; i++) {
@@ -550,10 +560,11 @@ int slurm_spank_task_init (spank_t sp, int32_t ac, char **av)
     } else {
         uint32_t depth;
         uint32_t topodepth = hwloc_topology_get_depth (topology);
+        num_cores = hwloc_get_nbobjs_by_type (topology, HWLOC_OBJ_CORE);
 
         for (depth = 0; depth < topodepth; depth++) {
             level_size = hwloc_get_nbobjs_by_depth (topology, depth);
-            if (level_size >= local_size)
+            if (level_size >= local_threads)
                 break;
         }
         if (depth == topodepth)
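
Worked example of the new search target (numbers are illustrative): with local_size = 2 tasks per node and OMP_NUM_THREADS = 8, local_threads = 2 * 8 = 16, so the loop now stops at the first topology depth offering at least 16 processing units rather than at a depth with only 2 objects. If OMP_NUM_THREADS is unset at this point, local_threads equals local_size and the depth search behaves as it did before.
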
@@ -609,17 +620,36 @@ int slurm_spank_task_init (spank_t sp, int32_t ac, char **av)
         decimate_gpusets (gpusets, numaobjs, gpus);
     }
 
-    num_pus_per_task = level_size / local_size;
-    if (!num_pus_per_task)
-        num_pus_per_task = 1;
+    num_pus_per_task = (float) level_size / local_size;
+    if (num_pus_per_task < 1.0)
+        num_pus_per_task = 1.0;
 
     if (!local_rank && verbose > 2)
-        printf ("mpibind: level size: %u, local size: %u, pus per task %u\n",
-                level_size, local_size, num_pus_per_task);
+        slurm_debug2 ("mpibind: level size: %u, local size: %u, pus per task "
+                      "%f\n", level_size, local_size, num_pus_per_task);
+
+    /* If the user did not set it, we set OMP_NUM_THREADS to the
+     * number of cores per task. */
+    if (!num_threads) {
+        num_threads = num_cores / local_size;
+        if (!num_threads)
+            num_threads = 1;
+        asprintf (&str, "%u", num_threads);
+        spank_setenv (sp, "OMP_NUM_THREADS", str, 0);
+        if (verbose > 2)
+            slurm_debug2 ("mpibind: setting OMP_NUM_THREADS to %s\n", str);
+        free (str);
+    }
 
-    index = (local_rank * num_pus_per_task) % level_size;
+    /*
+     * Note: num_pus_per_task is a float value. The next few
+     * statements result in an even distribution of tasks to cores
+     * across the available cores and also guarantees an even
+     * distribution of tasks to NUMA nodes.
+     */
+    index = (int32_t) (local_rank * num_pus_per_task);
 
-    for (i = index; i < index + num_pus_per_task; i++) {
+    for (i = index; i < index + (int32_t) num_pus_per_task; i++) {
         hwloc_bitmap_or (cpuset, cpuset, cpusets[i]);
         if (gpus) {
             int32_t j;
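
To see why the float arithmetic fixes the uneven NUMA distribution described in the NEWS entry, here is a standalone sketch of just the start-index calculation, old versus new, for an illustrative node with 48 PUs split across two 24-PU NUMA domains and 10 tasks:

    #include <stdio.h>
    #include <stdint.h>

    int main (void)
    {
        uint32_t level_size = 48;    /* PUs at the chosen depth (illustrative) */
        uint32_t local_size = 10;    /* tasks on this node (illustrative)      */
        uint32_t old_ppt = level_size / local_size;           /* 4   */
        float    new_ppt = (float) level_size / local_size;   /* 4.8 */
        uint32_t local_rank;

        for (local_rank = 0; local_rank < local_size; local_rank++)
            printf ("rank %2u: old start %2u, new start %2d\n",
                    (unsigned) local_rank,
                    (unsigned) ((local_rank * old_ppt) % level_size),
                    (int) (local_rank * new_ppt));
        /* old starts: 0 4 8 12 16 20 24 28 32 36 -> PUs 40-47 idle,
         *             6 tasks on NUMA 0, 4 on NUMA 1
         * new starts: 0 4 9 14 19 24 28 33 38 43 -> whole range used,
         *             5 tasks on each NUMA domain                      */
        return 0;
    }

The same hunk also defaults OMP_NUM_THREADS to num_cores / local_size (minimum 1) when the user has not set it, so a hybrid job gets a usable thread count without exporting it by hand.
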
@@ -640,37 +670,35 @@ int slurm_spank_task_init (spank_t sp, int32_t ac, char **av)
         numaobjs = hwloc_get_nbobjs_inside_cpuset_by_type (topology, cpuset,
                                                            HWLOC_OBJ_NODE);
         if ((local_size < numaobjs) && (num_threads > 1)) {
-            printf ("mpibind: Consider using at least %d MPI tasks per node\n",
-                    numaobjs);
+            slurm_verbose ("mpibind: Consider using at least %d MPI tasks per "
+                           "node\n", numaobjs);
         }
     }
 
     hwloc_bitmap_asprintf (&str, cpuset);
     if (verbose > 2)
-        printf ("mpibind: resulting cpuset %s\n", str);
+        slurm_debug2 ("mpibind: resulting cpuset %s\n", str);
 
     if (hwloc_set_cpubind (topology, cpuset, 0)) {
         slurm_error ("mpibind: could not bind to cpuset %s: %s", str,
                      strerror(errno));
     } else if (verbose > 2) {
-        printf ("mpibind: bound cpuset %s\n", str);
+        slurm_debug2 ("mpibind: bound cpuset %s\n", str);
     }
     free (str);
 
-    if (num_threads) {
-        if ((str = get_gomp_str (cpuset))) {
-            spank_setenv (sp, "GOMP_CPU_AFFINITY", str, 1);
-            if (verbose > 1)
-                printf ("mpibind: GOMP_CPU_AFFINITY=%s\n", str);
-            free (str);
-        }
+    if ((str = get_gomp_str (cpuset))) {
+        spank_setenv (sp, "GOMP_CPU_AFFINITY", str, 1);
+        if (verbose > 1)
+            slurm_debug ("mpibind: GOMP_CPU_AFFINITY=%s\n", str);
+        free (str);
     }
 
     if (gpus) {
         if ((str = get_cuda_str (gpus, gpu_bits))) {
             spank_setenv (sp, "CUDA_VISIBLE_DEVICES", str, 1);
             if (verbose > 1)
-                printf ("mpibind: CUDA_VISIBLE_DEVICES=%s\n", str);
+                slurm_debug ("mpibind: CUDA_VISIBLE_DEVICES=%s\n", str);
             free (str);
         }
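
Putting the pieces together, a task launched under the updated plugin ends up with an environment along these lines. All values below are hypothetical and depend on the node topology, the per-node task count, and whether GPUs are present; only the variable names are taken from the code above:

    OMP_NUM_THREADS=4                # defaulted to num_cores / local_size if the user left it unset
    GOMP_CPU_AFFINITY=12,13,14,15    # the first OMP_NUM_THREADS PUs of the task's cpuset
    CUDA_VISIBLE_DEVICES=1           # only set when GPUs were detected
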
