Skip to content

Commit ccc3dbb

Browse files
FedeDP and Andreagit97
authored and committed
fix(driver): avoid crashing when a CPU that was offline prior to agent start is hotplugged.
Signed-off-by: Federico Di Pierro <[email protected]> Co-authored-by: Andrea Terzolo <[email protected]>
1 parent f0419d5 commit ccc3dbb

File tree

2 files changed

+23
-122
lines changed

2 files changed

+23
-122
lines changed

driver/main.c

+22-122
Original file line numberDiff line numberDiff line change
@@ -277,10 +277,6 @@ static bool verbose = 0;
277277

278278
static unsigned int max_consumers = 5;
279279

280-
#if(LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0))
281-
static enum cpuhp_state hp_state = 0;
282-
#endif
283-
284280
#define vpr_info(fmt, ...) \
285281
do { \
286282
if(verbose) \
@@ -446,6 +442,7 @@ static int ppm_open(struct inode *inode, struct file *filp) {
446442
consumer->consumer_id = consumer_id;
447443
consumer->buffer_bytes_dim = g_buffer_bytes_dim;
448444
consumer->tracepoints_attached = 0; /* Start with no tracepoints */
445+
consumer->hotplug_cpu = -1;
449446

450447
/*
451448
* Initialize the ring buffers array
@@ -476,14 +473,6 @@ static int ppm_open(struct inode *inode, struct file *filp) {
476473
ring->info = NULL;
477474
}
478475

479-
/*
480-
* If a cpu is offline when the consumer is first created, we
481-
* will never get events for that cpu even if it later comes
482-
* online via hotplug. We could allocate these rings on-demand
483-
* later in this function if needed for hotplug, but that
484-
* requires the consumer to know to call open again, and that is
485-
* not supported.
486-
*/
487476
for_each_online_cpu(cpu) {
488477
ring = per_cpu_ptr(consumer->ring_buffers, cpu);
489478

@@ -1820,6 +1809,27 @@ static int record_event_consumer(struct ppm_consumer_t *consumer,
18201809
ASSERT(ring);
18211810

18221811
ring_info = ring->info;
1812+
if(!ring_info) {
1813+
// If we haven't got the ring info, it means
1814+
// the event was generated by a CPU that was not
1815+
// online when the ring buffers were initialized.
1816+
// Store info about hotplugged CPU here to later
1817+
// send hotplug events on cpu0.
1818+
consumer->hotplug_cpu = cpu;
1819+
put_cpu();
1820+
return res;
1821+
}
1822+
1823+
// Manage hotplug on cpu 0
1824+
if(consumer->hotplug_cpu != -1 && cpu == 0) {
1825+
event_type = PPME_CPU_HOTPLUG_E;
1826+
drop_flags = UF_NEVER_DROP;
1827+
tp_type = INTERNAL_EVENTS;
1828+
event_datap->category = PPMC_CONTEXT_SWITCH;
1829+
event_datap->event_info.context_data.sched_prev = (void *)(long)consumer->hotplug_cpu;
1830+
event_datap->event_info.context_data.sched_next = (void *)(long)0;
1831+
}
1832+
18231833
if(event_datap->category == PPMC_CONTEXT_SWITCH &&
18241834
event_datap->event_info.context_data.sched_prev != NULL) {
18251835
if(event_type != PPME_SCAPEVENT_E && event_type != PPME_CPU_HOTPLUG_E) {
@@ -2771,96 +2781,12 @@ static char *ppm_devnode(struct device *dev, mode_t *mode)
27712781
}
27722782
#endif /* LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20) */
27732783

2774-
static int do_cpu_callback(unsigned long cpu, long sd_action) {
2775-
struct ppm_ring_buffer_context *ring;
2776-
struct ppm_consumer_t *consumer;
2777-
struct event_data_t event_data;
2778-
2779-
if(sd_action != 0) {
2780-
rcu_read_lock();
2781-
2782-
list_for_each_entry_rcu(consumer, &g_consumer_list, node) {
2783-
ring = per_cpu_ptr(consumer->ring_buffers, cpu);
2784-
if(sd_action == 1) {
2785-
/*
2786-
* If the cpu was offline when the consumer was created,
2787-
* this won't do anything because we never created a ring
2788-
* buffer. We can't safely create one here because we're
2789-
* in atomic context, and the consumer needs to call open
2790-
* on this device anyways, so do it in ppm_open.
2791-
*/
2792-
ring->cpu_online = true;
2793-
} else if(sd_action == 2) {
2794-
ring->cpu_online = false;
2795-
}
2796-
}
2797-
2798-
rcu_read_unlock();
2799-
2800-
event_data.category = PPMC_CONTEXT_SWITCH;
2801-
event_data.event_info.context_data.sched_prev = (void *)cpu;
2802-
event_data.event_info.context_data.sched_next = (void *)sd_action;
2803-
record_event_all_consumers(PPME_CPU_HOTPLUG_E, UF_NEVER_DROP, &event_data, INTERNAL_EVENTS);
2804-
}
2805-
return 0;
2806-
}
2807-
2808-
#if(LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0))
2809-
static int scap_cpu_online(unsigned int cpu) {
2810-
vpr_info("scap_cpu_online on cpu %d\n", cpu);
2811-
return do_cpu_callback(cpu, 1);
2812-
}
2813-
2814-
static int scap_cpu_offline(unsigned int cpu) {
2815-
vpr_info("scap_cpu_offline on cpu %d\n", cpu);
2816-
return do_cpu_callback(cpu, 2);
2817-
}
2818-
#else /* LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0)) */
2819-
/*
2820-
* This gets called every time a CPU is added or removed
2821-
*/
2822-
static int cpu_callback(struct notifier_block *self, unsigned long action, void *hcpu) {
2823-
unsigned long cpu = (unsigned long)hcpu;
2824-
long sd_action = 0;
2825-
2826-
switch(action) {
2827-
case CPU_UP_PREPARE:
2828-
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
2829-
case CPU_UP_PREPARE_FROZEN:
2830-
#endif
2831-
sd_action = 1;
2832-
break;
2833-
case CPU_DOWN_PREPARE:
2834-
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
2835-
case CPU_DOWN_PREPARE_FROZEN:
2836-
#endif
2837-
sd_action = 2;
2838-
break;
2839-
default:
2840-
break;
2841-
}
2842-
2843-
if(do_cpu_callback(cpu, sd_action) < 0)
2844-
return NOTIFY_BAD;
2845-
else
2846-
return NOTIFY_OK;
2847-
}
2848-
2849-
static struct notifier_block cpu_notifier = {
2850-
.notifier_call = &cpu_callback,
2851-
.next = NULL,
2852-
};
2853-
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) */
2854-
28552784
static int scap_init(void) {
28562785
dev_t dev;
28572786
unsigned int cpu;
28582787
unsigned int num_cpus;
28592788
int ret;
28602789
int acrret = 0;
2861-
#if(LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0))
2862-
int hp_ret;
2863-
#endif
28642790
int j;
28652791
int n_created_devices = 0;
28662792
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
@@ -2964,25 +2890,6 @@ static int scap_init(void) {
29642890
goto init_module_err;
29652891
}
29662892

2967-
/*
2968-
* Set up our callback in case we get a hotplug event while we are
2969-
* initializing the cpu structures
2970-
*/
2971-
#if(LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0))
2972-
hp_ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
2973-
DRIVER_NAME "/driver:online",
2974-
scap_cpu_online,
2975-
scap_cpu_offline);
2976-
if(hp_ret <= 0) {
2977-
pr_err("error registering cpu hotplug callback\n");
2978-
ret = hp_ret;
2979-
goto init_module_err;
2980-
}
2981-
hp_state = hp_ret;
2982-
#else
2983-
register_cpu_notifier(&cpu_notifier);
2984-
#endif
2985-
29862893
// Initialize globals
29872894
g_tracepoints_attached = 0;
29882895
for(j = 0; j < KMOD_PROG_ATTACHED_MAX; j++) {
@@ -3041,13 +2948,6 @@ static void scap_exit(void) {
30412948
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
30422949
tracepoint_synchronize_unregister();
30432950
#endif
3044-
3045-
#if(LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0))
3046-
if(hp_state > 0)
3047-
cpuhp_remove_state_nocalls(hp_state);
3048-
#else
3049-
unregister_cpu_notifier(&cpu_notifier);
3050-
#endif
30512951
}
30522952

30532953
module_init(scap_init);

driver/ppm_consumer.h

+1
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ or GPL2.txt for full copies of the license.
1616
struct ppm_consumer_t {
1717
unsigned int id; // numeric id for the consumer (ie: registration index)
1818
struct task_struct *consumer_id;
19+
int16_t hotplug_cpu;
1920
#ifdef __percpu
2021
struct ppm_ring_buffer_context __percpu *ring_buffers;
2122
#else

0 commit comments

Comments
 (0)