Skip to content

Commit ae2cfe7

Browse files
AboorvaDevarajan authored and tyreld committed
ppc64_cpu: Fix handling of non-contiguous CPU IDs
In ppc64le environments, adding or removing CPUs dynamically through DLPAR can create gaps in CPU IDs, such as `0-103,120-151`; in this case CPUs 104-119 are missing. ppc64_cpu doesn't handle this scenario and always considers CPU IDs to be contiguous, causing issues in core numbering, CPU info and SMT mode reporting. To illustrate the issues this patch fixes, consider the following system configuration: $ lscpu Architecture: ppc64le Byte Order: Little Endian CPU(s): 136 On-line CPU(s) list: 0-103,120-151 **Note: CPU IDs are non-contiguous** ----------------------------------------------------------------- Before Patch: ----------------------------------------------------------------- $ ppc64_cpu --info Core 0: 0* 1* 2* 3* 4* 5* 6* 7* Core 1: 8* 9* 10* 11* 12* 13* 14* 15* Core 2: 16* 17* 18* 19* 20* 21* 22* 23* Core 3: 24* 25* 26* 27* 28* 29* 30* 31* Core 4: 32* 33* 34* 35* 36* 37* 38* 39* Core 5: 40* 41* 42* 43* 44* 45* 46* 47* Core 6: 48* 49* 50* 51* 52* 53* 54* 55* Core 7: 56* 57* 58* 59* 60* 61* 62* 63* Core 8: 64* 65* 66* 67* 68* 69* 70* 71* Core 9: 72* 73* 74* 75* 76* 77* 78* 79* Core 10: 80* 81* 82* 83* 84* 85* 86* 87* Core 11: 88* 89* 90* 91* 92* 93* 94* 95* Core 12: 96* 97* 98* 99* 100* 101* 102* 103* ........................................................... 
*gap* Core 13: 120* 121* 122* 123* 124* 125* 126* 127* Core 14: 128* 129* 130* 131* 132* 133* 134* 135* Core 15: 136* 137* 138* 139* 140* 141* 142* 143* Core 16: 144* 145* 146* 147* 148* 149* 150* 151* **Although the CPU IDs are non-contiguous, the associated core IDs are represented in contiguous order, which makes the output harder to interpret.** ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ $ ppc64_cpu --cores-on Number of cores online = 15 **Expected: Number of online cores = 17** ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ $ ppc64_cpu --offline-cores Cores offline = 13, 14 **Even though no cores are actually offline, two cores (13, 14) are displayed as offline.** ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ $ ppc64_cpu --online-cores Cores online = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16 **The list of online cores is missing two cores (13, 14).** ----------------------------------------------------------------- To resolve this, use the present CPU list from sysfs to assign numbers to CPUs and cores, so that the numbering matches the actual CPU IDs. $ cat /sys/devices/system/cpu/present 0-103,120-151 With this patch, the command output correctly reflects the current CPU configuration, providing a more precise representation of the system state. 
----------------------------------------------------------------- After Patch: ----------------------------------------------------------------- $ ppc64_cpu --info Core 0: 0* 1* 2* 3* 4* 5* 6* 7* Core 1: 8* 9* 10* 11* 12* 13* 14* 15* Core 2: 16* 17* 18* 19* 20* 21* 22* 23* Core 3: 24* 25* 26* 27* 28* 29* 30* 31* Core 4: 32* 33* 34* 35* 36* 37* 38* 39* Core 5: 40* 41* 42* 43* 44* 45* 46* 47* Core 6: 48* 49* 50* 51* 52* 53* 54* 55* Core 7: 56* 57* 58* 59* 60* 61* 62* 63* Core 8: 64* 65* 66* 67* 68* 69* 70* 71* Core 9: 72* 73* 74* 75* 76* 77* 78* 79* Core 10: 80* 81* 82* 83* 84* 85* 86* 87* Core 11: 88* 89* 90* 91* 92* 93* 94* 95* Core 12: 96* 97* 98* 99* 100* 101* 102* 103* ........................................................... *gap* Core 15: 120* 121* 122* 123* 124* 125* 126* 127* Core 16: 128* 129* 130* 131* 132* 133* 134* 135* Core 17: 136* 137* 138* 139* 140* 141* 142* 143* Core 18: 144* 145* 146* 147* 148* 149* 150* 151* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ $ ppc64_cpu --cores-on Number of cores online = 17 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ $ ppc64_cpu --offline-cores Cores offline = ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ $ ppc64_cpu --online-cores Cores online = 0,1,2,3,4,5,6,7,8,9,10,11,12,15,16,17,18 ----------------------------------------------------------------- Signed-off-by: Aboorva Devarajan <[email protected]> Signed-off-by: Tyrel Datwyler <[email protected]>
1 parent d6a771d commit ae2cfe7

File tree

2 files changed

+221
-106
lines changed

2 files changed

+221
-106
lines changed

src/common/cpu_info_helpers.c

+62-34
Original file line numberDiff line numberDiff line change
@@ -311,67 +311,94 @@ int get_present_core_list(int **present_cores, int *num_present_cores, int threa
311311
}
312312

313313
static void print_cpu_list(const cpu_set_t *cpuset, int cpuset_size,
314-
int cpus_in_system)
314+
int threads_per_cpu)
315315
{
316-
int core;
316+
int *present_cores = NULL;
317+
int num_present_cores;
318+
int start, end, i = 0;
317319
const char *comma = "";
318320

319-
for (core = 0; core < cpus_in_system; core++) {
320-
int begin = core;
321-
if (CPU_ISSET_S(core, cpuset_size, cpuset)) {
322-
while (CPU_ISSET_S(core+1, cpuset_size, cpuset))
323-
core++;
321+
if (get_present_core_list(&present_cores, &num_present_cores, threads_per_cpu) != 0) {
322+
fprintf(stderr, "Failed to get present_cores list\n");
323+
return;
324+
}
324325

325-
if (core > begin)
326-
printf("%s%d-%d", comma, begin, core);
327-
else
328-
printf("%s%d", comma, core);
326+
while (i < num_present_cores) {
327+
start = present_cores[i];
328+
if (CPU_ISSET_S(start, cpuset_size, cpuset)) {
329+
end = start;
330+
while (i + 1 < num_present_cores &&
331+
CPU_ISSET_S(present_cores[i + 1], cpuset_size, cpuset) &&
332+
present_cores[i + 1] == end + 1) {
333+
end = present_cores[++i];
334+
}
335+
if (start == end) {
336+
printf("%s%d", comma, start);
337+
} else {
338+
printf("%s%d-%d", comma, start, end);
339+
}
329340
comma = ",";
330341
}
342+
i++;
331343
}
344+
free(present_cores);
332345
}
333346

334-
int __do_smt(bool numeric, int cpus_in_system, int threads_per_cpu,
335-
bool print_smt_state)
347+
int __do_smt(bool numeric, int cpus_in_system, int threads_per_cpu, bool print_smt_state)
336348
{
337-
int thread, c, smt_state = 0;
338349
cpu_set_t **cpu_states = NULL;
339-
int cpu_state_size = CPU_ALLOC_SIZE(cpus_in_system);
340-
int start_cpu = 0, stop_cpu = cpus_in_system;
350+
int thread, smt_state = -1;
351+
int cpu_state_size;
341352
int rc = 0;
353+
int i, core_id, threads_online;
354+
int *present_cores = NULL;
355+
int num_present_cores;
342356

343-
cpu_states = (cpu_set_t **)calloc(threads_per_cpu, sizeof(cpu_set_t));
344-
if (!cpu_states)
357+
if (get_present_core_list(&present_cores, &num_present_cores, threads_per_cpu) != 0) {
358+
fprintf(stderr, "Failed to get present core list\n");
345359
return -ENOMEM;
360+
}
361+
cpu_state_size = CPU_ALLOC_SIZE(num_present_cores);
362+
cpu_states = (cpu_set_t **)calloc(threads_per_cpu, sizeof(cpu_set_t *));
363+
if (!cpu_states) {
364+
rc = -ENOMEM;
365+
goto cleanup_present_cores;
366+
}
346367

347368
for (thread = 0; thread < threads_per_cpu; thread++) {
348-
cpu_states[thread] = CPU_ALLOC(cpus_in_system);
369+
cpu_states[thread] = CPU_ALLOC(num_present_cores);
370+
if (!cpu_states[thread]) {
371+
rc = -ENOMEM;
372+
goto cleanup_cpu_states;
373+
}
349374
CPU_ZERO_S(cpu_state_size, cpu_states[thread]);
350375
}
351376

352-
for (c = start_cpu; c < stop_cpu; c++) {
353-
int threads_online = __get_one_smt_state(c, threads_per_cpu);
354-
377+
for (i = 0; i < num_present_cores; i++) {
378+
core_id = present_cores[i];
379+
threads_online = __get_one_smt_state(core_id, threads_per_cpu);
355380
if (threads_online < 0) {
356381
rc = threads_online;
357-
goto cleanup_get_smt;
382+
goto cleanup_cpu_states;
383+
}
384+
if (threads_online) {
385+
CPU_SET_S(core_id, cpu_state_size, cpu_states[threads_online - 1]);
358386
}
359-
if (threads_online)
360-
CPU_SET_S(c, cpu_state_size,
361-
cpu_states[threads_online - 1]);
362387
}
363388

364389
for (thread = 0; thread < threads_per_cpu; thread++) {
365390
if (CPU_COUNT_S(cpu_state_size, cpu_states[thread])) {
366-
if (smt_state == 0)
391+
if (smt_state == -1)
367392
smt_state = thread + 1;
368393
else if (smt_state > 0)
369394
smt_state = 0; /* mix of SMT modes */
370395
}
371396
}
372397

373-
if (!print_smt_state)
374-
return smt_state;
398+
if (!print_smt_state) {
399+
rc = smt_state;
400+
goto cleanup_cpu_states;
401+
}
375402

376403
if (smt_state == 1) {
377404
if (numeric)
@@ -380,21 +407,22 @@ int __do_smt(bool numeric, int cpus_in_system, int threads_per_cpu,
380407
printf("SMT is off\n");
381408
} else if (smt_state == 0) {
382409
for (thread = 0; thread < threads_per_cpu; thread++) {
383-
if (CPU_COUNT_S(cpu_state_size,
384-
cpu_states[thread])) {
410+
if (CPU_COUNT_S(cpu_state_size, cpu_states[thread])) {
385411
printf("SMT=%d: ", thread + 1);
386-
print_cpu_list(cpu_states[thread],
387-
cpu_state_size, cpus_in_system);
412+
print_cpu_list(cpu_states[thread], cpu_state_size, threads_per_cpu);
388413
printf("\n");
389414
}
390415
}
391416
} else {
392417
printf("SMT=%d\n", smt_state);
393418
}
394419

395-
cleanup_get_smt:
420+
cleanup_cpu_states:
396421
for (thread = 0; thread < threads_per_cpu; thread++)
397422
CPU_FREE(cpu_states[thread]);
423+
free(cpu_states);
424+
cleanup_present_cores:
425+
free(present_cores);
398426

399427
return rc;
400428
}

0 commit comments

Comments
 (0)