From a079feb820e78f2adb5a0d956e47a220ebd88f14 Mon Sep 17 00:00:00 2001 From: Dave Tucker Date: Tue, 23 Jul 2024 11:57:36 +0100 Subject: [PATCH 1/3] feat(bpf): Send events via Ring Buffer This uses a ring buffer to send events from eBPF back to userspace. Doing so allows for our eBPF probes to complete quickly, and pushes all delta calculation and summary back into userspace. Moves the logic that resolves comm from procfs into its own pkg. In addition, a cache is added to avoid hitting procfs every time we update process metrics. Removes the bpftest pkg and includes the test programs in the main kepler.bpf.c file. This prevents drift in the go generate flags and generally makes it easier to write tests. Signed-off-by: Dave Tucker --- Makefile | 7 +- bpf/kepler.bpf.c | 207 +++++++- bpf/kepler.bpf.h | 328 ++----------- bpf/test.bpf.c | 28 -- cmd/exporter/exporter.go | 7 + go.mod | 2 +- go.sum | 14 +- pkg/bpf/bpf_suite_test.go | 290 +++++++++++ pkg/bpf/exporter.go | 268 +++++++++-- pkg/bpf/exporter_test.go | 45 ++ pkg/bpf/gen.go | 2 +- pkg/bpf/kepler_bpfeb.go | 81 ++-- pkg/bpf/kepler_bpfeb.o | Bin 15224 -> 16488 bytes pkg/bpf/kepler_bpfel.go | 81 ++-- pkg/bpf/kepler_bpfel.o | Bin 15224 -> 16488 bytes pkg/bpf/test_utils.go | 32 +- pkg/bpf/types.go | 33 +- pkg/bpftest/bpf_suite_test.go | 450 ------------------ pkg/bpftest/gen.go | 3 - pkg/bpftest/test_bpfeb.go | 159 ------- pkg/bpftest/test_bpfeb.o | Bin 13848 -> 0 bytes pkg/bpftest/test_bpfel.go | 159 ------- pkg/bpftest/test_bpfel.o | Bin 13848 -> 0 bytes .../bpf/process_bpf_collector.go | 47 +- pkg/comm/resolve_comm.go | 105 ++++ pkg/comm/resolve_comm_test.go | 83 ++++ pkg/config/config.go | 2 - vendor/github.com/cilium/ebpf/.vimto.toml | 12 + vendor/github.com/cilium/ebpf/CODEOWNERS | 2 + vendor/github.com/cilium/ebpf/Makefile | 2 +- vendor/github.com/cilium/ebpf/btf/btf.go | 22 +- vendor/github.com/cilium/ebpf/btf/handle.go | 58 ++- vendor/github.com/cilium/ebpf/btf/types.go | 4 + vendor/github.com/cilium/ebpf/collection.go | 2 +- vendor/github.com/cilium/ebpf/elf_reader.go | 3 + vendor/github.com/cilium/ebpf/info.go | 50 +- .../cilium/ebpf/internal/epoll/poller.go | 278 +++++++++++ .../github.com/cilium/ebpf/internal/errors.go | 29 +- .../cilium/ebpf/internal/sys/types.go | 23 +- vendor/github.com/cilium/ebpf/link/cgroup.go | 19 + vendor/github.com/cilium/ebpf/link/kprobe.go | 10 +- .../cilium/ebpf/link/kprobe_multi.go | 21 +- vendor/github.com/cilium/ebpf/link/link.go | 115 +---- .../github.com/cilium/ebpf/link/netfilter.go | 20 + vendor/github.com/cilium/ebpf/link/netkit.go | 18 + vendor/github.com/cilium/ebpf/link/netns.go | 19 + .../github.com/cilium/ebpf/link/perf_event.go | 102 +++- vendor/github.com/cilium/ebpf/link/tcx.go | 18 + .../github.com/cilium/ebpf/link/tracepoint.go | 2 + vendor/github.com/cilium/ebpf/link/tracing.go | 19 + vendor/github.com/cilium/ebpf/link/uprobe.go | 4 + .../cilium/ebpf/link/uprobe_multi.go | 8 - vendor/github.com/cilium/ebpf/link/xdp.go | 28 +- vendor/github.com/cilium/ebpf/map.go | 68 ++- vendor/github.com/cilium/ebpf/prog.go | 98 ++-- vendor/github.com/cilium/ebpf/ringbuf/doc.go | 6 + .../github.com/cilium/ebpf/ringbuf/reader.go | 197 ++++++++ vendor/github.com/cilium/ebpf/ringbuf/ring.go | 137 ++++++ vendor/github.com/cilium/ebpf/run-tests.sh | 144 ------ vendor/modules.txt | 6 +- 60 files changed, 2330 insertions(+), 1647 deletions(-) delete mode 100644 bpf/test.bpf.c create mode 100644 pkg/bpf/exporter_test.go delete mode 100644 pkg/bpftest/bpf_suite_test.go delete mode 100644 pkg/bpftest/gen.go 
delete mode 100644 pkg/bpftest/test_bpfeb.go delete mode 100644 pkg/bpftest/test_bpfeb.o delete mode 100644 pkg/bpftest/test_bpfel.go delete mode 100644 pkg/bpftest/test_bpfel.o create mode 100644 pkg/comm/resolve_comm.go create mode 100644 pkg/comm/resolve_comm_test.go create mode 100644 vendor/github.com/cilium/ebpf/.vimto.toml create mode 100644 vendor/github.com/cilium/ebpf/internal/epoll/poller.go create mode 100644 vendor/github.com/cilium/ebpf/ringbuf/doc.go create mode 100644 vendor/github.com/cilium/ebpf/ringbuf/reader.go create mode 100644 vendor/github.com/cilium/ebpf/ringbuf/ring.go delete mode 100644 vendor/github.com/cilium/ebpf/run-tests.sh diff --git a/Makefile b/Makefile index d6c281b7a8..df3774b349 100644 --- a/Makefile +++ b/Makefile @@ -176,7 +176,6 @@ build: clean_build_local _build_local copy_build_local ## Build binary and copy .PHONY: generate generate: ## Generate BPF code locally. +@$(GOENV) go generate ./pkg/bpf - +@$(GOENV) go generate ./pkg/bpftest _build_local: generate ## Build Kepler binary locally. @echo TAGS=$(GO_BUILD_TAGS) @@ -275,7 +274,7 @@ container_test: TEST_PKGS := $(shell go list -tags $(GO_BUILD_TAGS) ./... | grep -v pkg/bpf | grep -v e2e) SUDO?=sudo -SUDO_TEST_PKGS := $(shell go list -tags $(GO_BUILD_TAGS) ./... | grep pkg/bpftest) +SUDO_TEST_PKGS := $(shell go list -tags $(GO_BUILD_TAGS) ./... | grep pkg/bpf) ##@ testing @@ -305,9 +304,9 @@ bpf-test: generate ginkgo-set ## Run BPF tests.$(GOBIN) -tags $(GO_TEST_TAGS) \ -cover \ --covermode=atomic \ - ./pkg/bpftest + ./pkg/bpf $(SUDO) $(ENVTEST_ASSETS_DIR)/ginkgo \ - ./pkg/bpftest/bpftest.test + ./pkg/bpf/bpf.test escapes_detect: tidy-vendor @$(GOENV) go build -tags $(GO_BUILD_TAGS) -gcflags="-m -l" ./... 2>&1 | grep "escapes to heap" || true diff --git a/bpf/kepler.bpf.c b/bpf/kepler.bpf.c index 80fb332471..251f1bcfec 100644 --- a/bpf/kepler.bpf.c +++ b/bpf/kepler.bpf.c @@ -3,6 +3,168 @@ #include "kepler.bpf.h" +// Ring buffer sizing +// 256kB is sufficient to store around 1000 events/sec for 5 seconds +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 256 * 1024); // 256 KB +} rb SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __type(key, int); + __type(value, u32); +} cpu_cycles_event_reader SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __type(key, int); + __type(value, u32); +} cpu_instructions_event_reader SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __type(key, int); + __type(value, u32); +} cache_miss_event_reader SEC(".maps"); + +SEC(".rodata.config") +__attribute__((btf_decl_tag("Hardware Events Enabled"))) volatile const int HW = 1; + +static __always_inline u64 get_on_cpu_cycles(u32 *cpu_id) +{ + long error; + struct bpf_perf_event_value c = {}; + + error = bpf_perf_event_read_value( + &cpu_cycles_event_reader, *cpu_id, &c, sizeof(c)); + if (error) + return 0; + + return c.counter; +} + +static __always_inline u64 get_on_cpu_instr(u32 *cpu_id) +{ + long error; + struct bpf_perf_event_value c = {}; + + error = bpf_perf_event_read_value( + &cpu_instructions_event_reader, *cpu_id, &c, sizeof(c)); + if (error) + return 0; + + return c.counter; +} + +static __always_inline u64 get_on_cpu_cache_miss(u32 *cpu_id) +{ + long error; + struct bpf_perf_event_value c = {}; + + error = bpf_perf_event_read_value( + &cache_miss_event_reader, *cpu_id, &c, sizeof(c)); + if (error) + return 0; + + return c.counter; +} + +// Wake up userspace if there are at least 1000 events unprocessed +const long 
wakeup_data_size = sizeof(struct event) * 1000; + +// Get the flags for the ring buffer submit +static inline long get_flags() +{ + long sz; + + if (!wakeup_data_size) + return 0; + + sz = bpf_ringbuf_query(&rb, BPF_RB_AVAIL_DATA); + return sz >= wakeup_data_size ? BPF_RB_FORCE_WAKEUP : BPF_RB_NO_WAKEUP; +} + +static inline int do_kepler_sched_switch_trace( + u32 prev_pid, u32 prev_tgid, u32 next_pid, u32 next_tgid) +{ + struct event *e; + u64 cpu_cycles, cpu_instr, cache_miss = 0; + + e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); + if (!e) + return 0; + + e->ts = bpf_ktime_get_ns(); + e->event_type = SCHED_SWITCH; + e->cpu_id = bpf_get_smp_processor_id(); + e->pid = next_tgid; + e->tid = next_pid; + e->offcpu_pid = prev_tgid; + e->offcpu_tid = prev_pid; + if (HW) { + e->cpu_cycles = get_on_cpu_cycles(&e->cpu_id); + e->cpu_instr = get_on_cpu_instr(&e->cpu_id); + e->cache_miss = get_on_cpu_cache_miss(&e->cpu_id); + } + e->offcpu_cgroup_id = bpf_get_current_cgroup_id(); + + bpf_ringbuf_submit(e, get_flags()); + + return 0; +} + +static inline int do_kepler_irq_trace(u32 vec) +{ + struct event *e; + + // We are interested in NET_TX, NET_RX, and BLOCK + if (vec == NET_TX || vec == NET_RX || vec == BLOCK) { + e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); + if (!e) + return 0; + e->event_type = IRQ; + e->ts = bpf_ktime_get_ns(); + e->cpu_id = bpf_get_smp_processor_id(); + e->pid = bpf_get_current_pid_tgid() >> 32; + e->tid = (u32)bpf_get_current_pid_tgid(); + e->irq_number = vec; + + bpf_ringbuf_submit(e, get_flags()); + } + + return 0; +} + +static inline int do_page_cache_hit_increment(u32 curr_tgid) +{ + struct event *e; + + e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); + if (!e) + return 0; + e->event_type = PAGE_CACHE_HIT; + e->ts = bpf_ktime_get_ns(); + e->pid = curr_tgid; + + bpf_ringbuf_submit(e, get_flags()); + + return 0; +} + +static inline int do_process_free(u32 curr_tgid) +{ + struct event *e; + + e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); + if (!e) + return 0; + e->event_type = FREE; + e->ts = bpf_ktime_get_ns(); + e->pid = curr_tgid; + + bpf_ringbuf_submit(e, get_flags()); + + return 0; +} + SEC("tp_btf/sched_switch") int kepler_sched_switch_trace(u64 *ctx) { @@ -12,21 +174,17 @@ int kepler_sched_switch_trace(u64 *ctx) next_task = (struct task_struct *)ctx[2]; return do_kepler_sched_switch_trace( - prev_task->pid, next_task->pid, prev_task->tgid, next_task->tgid); + prev_task->pid, prev_task->tgid, next_task->pid, next_task->tgid); } SEC("tp_btf/softirq_entry") int kepler_irq_trace(u64 *ctx) { - u32 curr_tgid; - struct process_metrics_t *process_metrics; unsigned int vec; - - curr_tgid = bpf_get_current_pid_tgid() >> 32; vec = (unsigned int)ctx[0]; - process_metrics = bpf_map_lookup_elem(&processes, &curr_tgid); - if (process_metrics != 0 && vec < 10) - process_metrics->vec_nr[vec] += 1; + + do_kepler_irq_trace(vec); + return 0; } @@ -52,4 +210,37 @@ int kepler_write_page_trace(void *ctx) return 0; } +SEC("tp_btf/sched_process_free") +int kepler_sched_process_free(u64 *ctx) +{ + struct task_struct *task; + task = (struct task_struct *)ctx[0]; + do_process_free(task->tgid); + return 0; +} + +// TEST PROGRAMS - These programs are never attached in production + +SEC("raw_tp") +int test_kepler_write_page_trace(void *ctx) +{ + do_page_cache_hit_increment(42); + return 0; +} + +SEC("raw_tp") +int test_kepler_sched_switch_trace(u64 *ctx) +{ + // 42 going offcpu, 43 going on cpu + do_kepler_sched_switch_trace(42, 42, 43, 43); + return 0; +} + +SEC("raw_tp") +int 
test_kepler_sched_process_free(u64 *ctx) +{ + do_process_free(42); + return 0; +} + char __license[] SEC("license") = "Dual BSD/GPL"; diff --git a/bpf/kepler.bpf.h b/bpf/kepler.bpf.h index cb2da309a5..6eb4b96e5b 100644 --- a/bpf/kepler.bpf.h +++ b/bpf/kepler.bpf.h @@ -27,16 +27,7 @@ typedef struct pid_time_t { __u32 pid; } pid_time_t; -#ifndef NUM_CPUS -# define NUM_CPUS 128 -#endif - -#ifndef MAP_SIZE -# define MAP_SIZE 32768 -#endif - #include -#include enum bpf_map_type { BPF_MAP_TYPE_UNSPEC = 0, @@ -77,6 +68,29 @@ enum { BPF_F_LOCK = 4, }; +/* BPF_FUNC_bpf_ringbuf_commit, BPF_FUNC_bpf_ringbuf_discard, and + * BPF_FUNC_bpf_ringbuf_output flags. + */ +enum { + BPF_RB_NO_WAKEUP = (1ULL << 0), + BPF_RB_FORCE_WAKEUP = (1ULL << 1), +}; + +/* BPF_FUNC_bpf_ringbuf_query flags */ +enum { + BPF_RB_AVAIL_DATA = 0, + BPF_RB_RING_SIZE = 1, + BPF_RB_CONS_POS = 2, + BPF_RB_PROD_POS = 3, +}; + +enum irq_type { + NET_TX = 2, + NET_RX = 3, + BLOCK = 4 +}; +const enum irq_type *unused2 __attribute__((unused)); + enum { BPF_F_INDEX_MASK = 0xffffffffULL, BPF_F_CURRENT_CPU = BPF_F_INDEX_MASK, @@ -90,285 +104,35 @@ struct bpf_perf_event_value { __u64 running; }; -typedef struct process_metrics_t { - u64 cgroup_id; - u64 pid; // pid is the kernel space view of the thread id - u64 process_run_time; +enum event_type { + SCHED_SWITCH = 1, + IRQ = 2, + PAGE_CACHE_HIT = 3, + FREE = 4 +}; + +// Force emitting enum event_type into the ELF. +const enum event_type *unused_event_type __attribute__((unused)); + +struct event { + u64 event_type; + u64 ts; + u32 pid; // kernel tgid == userspace pid + u32 tid; // kernel pid == userspace tid + u32 offcpu_pid; // kernel tgid == userspace pid + u32 offcpu_tid; // kernel pid == userspace tid + u64 offcpu_cgroup_id; // cgroup id is only known for processes going off cpu u64 cpu_cycles; u64 cpu_instr; u64 cache_miss; - u64 page_cache_hit; - u16 vec_nr[10]; - char comm[16]; -} process_metrics_t; - -struct { - __uint(type, BPF_MAP_TYPE_LRU_HASH); - __type(key, u32); - __type(value, process_metrics_t); - __uint(max_entries, MAP_SIZE); -} processes SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_LRU_HASH); - __type(key, u32); - __type(value, u64); - __uint(max_entries, MAP_SIZE); -} pid_time_map SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); - __type(key, int); - __type(value, u32); - __uint(max_entries, NUM_CPUS); -} cpu_cycles_event_reader SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __type(key, u32); - __type(value, u64); - __uint(max_entries, NUM_CPUS); -} cpu_cycles SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); - __type(key, int); - __type(value, u32); - __uint(max_entries, NUM_CPUS); -} cpu_instructions_event_reader SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __type(key, u32); - __type(value, u64); - __uint(max_entries, NUM_CPUS); -} cpu_instructions SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); - __type(key, int); - __type(value, u32); - __uint(max_entries, NUM_CPUS); -} cache_miss_event_reader SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __type(key, u32); - __type(value, u64); - __uint(max_entries, NUM_CPUS); -} cache_miss SEC(".maps"); - -// Test mode skips unsupported helpers -SEC(".rodata.config") -__attribute__((btf_decl_tag("Test"))) static volatile const int TEST = 0; - -// Test mode skips unsupported helpers -SEC(".rodata.config") -__attribute__((btf_decl_tag( - "Hardware Events Enabled"))) static volatile const int HW = 1; - 
-// The sampling rate should be disabled by default because its impact on the -// measurements is unknown. -SEC(".rodata.config") -__attribute__(( - btf_decl_tag("Sample Rate"))) static volatile const int SAMPLE_RATE = 0; + u32 cpu_id; + u32 irq_number; // one of NET_TX, NET_RX, BLOCK +}; -int counter_sched_switch = 0; +// Force emitting struct event into the ELF. +const struct event *unused_event __attribute__((unused)); struct task_struct { int pid; unsigned int tgid; } __attribute__((preserve_access_index)); - -static inline u64 calc_delta(u64 *prev_val, u64 val) -{ - u64 delta = 0; - // Probably a clock issue where the recorded on-CPU event had a - // timestamp later than the recorded off-CPU event, or vice versa. - if (prev_val && val > *prev_val) - delta = val - *prev_val; - - return delta; -} - -static inline u64 get_on_cpu_elapsed_time_us(u32 prev_pid, u64 curr_ts) -{ - u64 cpu_time = 0; - u64 *prev_ts; - - prev_ts = bpf_map_lookup_elem(&pid_time_map, &prev_pid); - if (prev_ts) { - cpu_time = calc_delta(prev_ts, curr_ts) / 1000; - bpf_map_delete_elem(&pid_time_map, &prev_pid); - } - - return cpu_time; -} - -static inline u64 get_on_cpu_cycles(u32 *cpu_id) -{ - u64 delta, val, *prev_val; - long error; - struct bpf_perf_event_value c = {}; - - error = bpf_perf_event_read_value( - &cpu_cycles_event_reader, *cpu_id, &c, sizeof(c)); - if (error) - return 0; - - val = c.counter; - prev_val = bpf_map_lookup_elem(&cpu_cycles, cpu_id); - delta = calc_delta(prev_val, val); - bpf_map_update_elem(&cpu_cycles, cpu_id, &val, BPF_ANY); - - return delta; -} - -static inline u64 get_on_cpu_instr(u32 *cpu_id) -{ - u64 delta, val, *prev_val; - long error; - struct bpf_perf_event_value c = {}; - - error = bpf_perf_event_read_value( - &cpu_instructions_event_reader, *cpu_id, &c, sizeof(c)); - if (error) - return 0; - - val = c.counter; - prev_val = bpf_map_lookup_elem(&cpu_instructions, cpu_id); - delta = calc_delta(prev_val, val); - bpf_map_update_elem(&cpu_instructions, cpu_id, &val, BPF_ANY); - - return delta; -} - -static inline u64 get_on_cpu_cache_miss(u32 *cpu_id) -{ - u64 delta, val, *prev_val; - long error; - struct bpf_perf_event_value c = {}; - - error = bpf_perf_event_read_value( - &cache_miss_event_reader, *cpu_id, &c, sizeof(c)); - if (error) - return 0; - val = c.counter; - prev_val = bpf_map_lookup_elem(&cache_miss, cpu_id); - delta = calc_delta(prev_val, val); - bpf_map_update_elem(&cache_miss, cpu_id, &val, BPF_ANY); - - return delta; -} - -static inline void register_new_process_if_not_exist(u32 tgid) -{ - u64 cgroup_id; - struct process_metrics_t *curr_tgid_metrics; - - // create new process metrics - curr_tgid_metrics = bpf_map_lookup_elem(&processes, &tgid); - if (!curr_tgid_metrics) { - cgroup_id = bpf_get_current_cgroup_id(); - // the Kernel tgid is the user-space PID, and the Kernel pid is the - // user-space TID - process_metrics_t new_process = { - .pid = tgid, - .cgroup_id = cgroup_id, - }; - - if (!TEST) - bpf_get_current_comm( - &new_process.comm, sizeof(new_process.comm)); - - bpf_map_update_elem(&processes, &tgid, &new_process, BPF_NOEXIST); - } -} - -static inline void collect_metrics_and_reset_counters( - struct process_metrics_t *buf, u32 prev_pid, u64 curr_ts, u32 cpu_id) -{ - if (HW) { - buf->cpu_cycles = get_on_cpu_cycles(&cpu_id); - buf->cpu_instr = get_on_cpu_instr(&cpu_id); - buf->cache_miss = get_on_cpu_cache_miss(&cpu_id); - } - // Get current time to calculate the previous task on-CPU time - buf->process_run_time = get_on_cpu_elapsed_time_us(prev_pid, 
curr_ts); -} - -static inline void do_page_cache_hit_increment(u32 curr_pid) -{ - struct process_metrics_t *process_metrics; - - process_metrics = bpf_map_lookup_elem(&processes, &curr_pid); - if (process_metrics) - process_metrics->page_cache_hit++; -} - -static inline int do_kepler_sched_switch_trace( - u32 prev_pid, u32 next_pid, u32 prev_tgid, u32 next_tgid) -{ - u32 cpu_id; - u64 curr_ts = bpf_ktime_get_ns(); - - struct process_metrics_t *curr_tgid_metrics, *prev_tgid_metrics; - struct process_metrics_t buf = {}; - - cpu_id = bpf_get_smp_processor_id(); - - // Collect metrics - // Regardless of skipping the collection, we need to update the hardware - // counter events to keep the metrics map current. - collect_metrics_and_reset_counters(&buf, prev_pid, curr_ts, cpu_id); - - // Skip some samples to minimize overhead - // Note that we can only skip samples after updating the metric maps to - // collect the right values - if (SAMPLE_RATE > 0) { - if (counter_sched_switch > 0) { - counter_sched_switch--; - return 0; - } - counter_sched_switch = SAMPLE_RATE; - } - - // The process_run_time is 0 if we do not have the previous timestamp of - // the task or due to a clock issue. In either case, we skip collecting - // all metrics to avoid discrepancies between the hardware counter and CPU - // time. - if (buf.process_run_time > 0) { - prev_tgid_metrics = bpf_map_lookup_elem(&processes, &prev_tgid); - if (prev_tgid_metrics) { - prev_tgid_metrics->process_run_time += buf.process_run_time; - prev_tgid_metrics->cpu_cycles += buf.cpu_cycles; - prev_tgid_metrics->cpu_instr += buf.cpu_instr; - prev_tgid_metrics->cache_miss += buf.cache_miss; - } - } - - // Add task on-cpu running start time - bpf_map_update_elem(&pid_time_map, &next_pid, &curr_ts, BPF_ANY); - - // create new process metrics - register_new_process_if_not_exist(next_tgid); - - return 0; -} - -static __always_inline void * -bpf_map_lookup_or_try_init(void *map, const void *key, const void *init) -{ - void *val; - int err; - - val = bpf_map_lookup_elem(map, key); - if (val) - return val; - - err = bpf_map_update_elem(map, key, init, BPF_NOEXIST); - if (err && err != -17) - return 0; - - return bpf_map_lookup_elem(map, key); -} diff --git a/bpf/test.bpf.c b/bpf/test.bpf.c deleted file mode 100644 index 83b84ae3c1..0000000000 --- a/bpf/test.bpf.c +++ /dev/null @@ -1,28 +0,0 @@ -// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) -// Copyright 2021. 
- -#include "kepler.bpf.h" - -SEC("raw_tp") -int test_kepler_write_page_trace(void *ctx) -{ - do_page_cache_hit_increment(0); - return 0; -} - -SEC("raw_tp") -int test_register_new_process_if_not_exist(void *ctx) -{ - register_new_process_if_not_exist(42); - return 0; -} - -SEC("raw_tp/sched_switch") -int test_kepler_sched_switch_trace(u64 *ctx) -{ - do_kepler_sched_switch_trace(42, 43, 42, 43); - - return 0; -} - -char __license[] SEC("license") = "Dual BSD/GPL"; diff --git a/cmd/exporter/exporter.go b/cmd/exporter/exporter.go index 64f62b9525..a30c9d6fda 100644 --- a/cmd/exporter/exporter.go +++ b/cmd/exporter/exporter.go @@ -150,6 +150,11 @@ func main() { klog.Fatalf("failed to create eBPF exporter: %v", err) } defer bpfExporter.Detach() + stopCh := make(chan struct{}) + bpfErrCh := make(chan error) + go func() { + bpfErrCh <- bpfExporter.Start(stopCh) + }() m := manager.New(bpfExporter) @@ -199,6 +204,8 @@ func main() { select { case err := <-errChan: klog.Fatalf("%s", fmt.Sprintf("failed to listen and serve: %v", err)) + case err := <-bpfErrCh: + klog.Fatalf("%s", fmt.Sprintf("failed to start eBPF exporter: %v", err)) case <-signalChan: klog.Infof("Received shutdown signal") ctx, cancel := context.WithDeadline(ctx, time.Now().Add(5*time.Second)) diff --git a/go.mod b/go.mod index 3eb68a2c6d..09ddd13da4 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,7 @@ require ( github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f github.com/NVIDIA/go-nvml v0.12.0-1 github.com/beevik/etree v1.4.0 - github.com/cilium/ebpf v0.15.0 + github.com/cilium/ebpf v0.16.0 github.com/digitalocean/go-libvirt v0.0.0-20240709142323-d8406205c752 github.com/jaypipes/ghw v0.12.0 github.com/joho/godotenv v1.5.1 diff --git a/go.sum b/go.sum index 9848db4c72..a63f42cf01 100644 --- a/go.sum +++ b/go.sum @@ -18,8 +18,8 @@ github.com/bits-and-blooms/bitset v1.13.0 h1:bAQ9OPNFYbGHV6Nez0tmNI0RiEu7/hxlYJR github.com/bits-and-blooms/bitset v1.13.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/cilium/ebpf v0.15.0 h1:7NxJhNiBT3NG8pZJ3c+yfrVdHY8ScgKD27sScgjLMMk= -github.com/cilium/ebpf v0.15.0/go.mod h1:DHp1WyrLeiBh19Cf/tfiSMhqheEiK8fXFZ4No0P1Hso= +github.com/cilium/ebpf v0.16.0 h1:+BiEnHL6Z7lXnlGUsXQPPAE7+kenAd4ES8MQ5min0Ok= +github.com/cilium/ebpf v0.16.0/go.mod h1:L7u2Blt2jMM/vLAVgjxluxtBKlz3/GWjB0dMOEngfwE= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= @@ -81,8 +81,12 @@ github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0= github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/josharian/native v1.1.0 h1:uuaP0hAbW7Y4l0ZRQ6C9zfb7Mg1mbFKry/xzDAfmtLA= +github.com/josharian/native v1.1.0/go.mod h1:7X/raswPFr05uY3HiLlYeyQntB6OO7E/d2Cu7qoaN2w= github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA= github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= +github.com/jsimonetti/rtnetlink/v2 v2.0.1 
h1:xda7qaHDSVOsADNouv7ukSuicKZO7GgVUCXxpaIEIlM= +github.com/jsimonetti/rtnetlink/v2 v2.0.1/go.mod h1:7MoNYNbb3UaDHtF8udiJo/RH6VsTKP1pqKLUTVCvToE= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/jszwec/csvutil v1.10.0 h1:upMDUxhQKqZ5ZDCs/wy+8Kib8rZR8I8lOR34yJkdqhI= @@ -99,6 +103,10 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/mdlayher/netlink v1.7.2 h1:/UtM3ofJap7Vl4QWCPDGXY8d3GIY2UGSDbK+QWmY8/g= +github.com/mdlayher/netlink v1.7.2/go.mod h1:xraEF7uJbxLhc5fpHL4cPe221LI2bdttWlU+ZGLfQSw= +github.com/mdlayher/socket v0.4.1 h1:eM9y2/jlbs1M615oshPQOHZzj6R6wMT7bX5NPiQvn2U= +github.com/mdlayher/socket v0.4.1/go.mod h1:cAqeGjoufqdxWkD7DkpyS+wcefOtmu5OQ8KuoJGIReA= github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y= github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= github.com/moby/spdystream v0.2.0 h1:cjW1zVyyoiM0T7b6UoySUFqzXMoqRckQtXwGPiBhOM8= @@ -170,6 +178,8 @@ golang.org/x/oauth2 v0.21.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbht golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M= +golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= diff --git a/pkg/bpf/bpf_suite_test.go b/pkg/bpf/bpf_suite_test.go index 8fc64463b8..8baecb3e15 100644 --- a/pkg/bpf/bpf_suite_test.go +++ b/pkg/bpf/bpf_suite_test.go @@ -1,13 +1,303 @@ package bpf import ( + "bytes" + "encoding/binary" "testing" + "time" + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/ringbuf" + "github.com/cilium/ebpf/rlimit" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" + "github.com/onsi/gomega/gmeasure" + "golang.org/x/sys/unix" ) func TestBpf(t *testing.T) { RegisterFailHandler(Fail) RunSpecs(t, "Bpf Suite") } + +var _ = Describe("BPF Exporter", func() { + It("should send a page cache hit event", func() { + // Remove resource limits for kernels <5.11. 
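+		// (Before kernel 5.11, BPF map memory was charged against
+		// RLIMIT_MEMLOCK; rlimit.RemoveMemlock lifts that limit where needed.
+		// Newer kernels account BPF memory to the memory cgroup instead.)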
+ err := rlimit.RemoveMemlock() + Expect(err).NotTo(HaveOccurred()) + + // Load eBPF Specs + specs, err := loadKepler() + Expect(err).NotTo(HaveOccurred()) + + var obj keplerObjects + // Load eBPF objects + err = specs.LoadAndAssign(&obj, nil) + Expect(err).NotTo(HaveOccurred()) + + out, err := obj.TestKeplerWritePageTrace.Run(&ebpf.RunOptions{}) + Expect(err).NotTo(HaveOccurred()) + Expect(out).To(Equal(uint32(0))) + + // Read the event from the ring buffer + rd, err := ringbuf.NewReader(obj.Rb) + Expect(err).NotTo(HaveOccurred()) + defer rd.Close() + + var event keplerEvent + record, err := rd.Read() + Expect(err).NotTo(HaveOccurred()) + + err = binary.Read(bytes.NewBuffer(record.RawSample), binary.NativeEndian, &event) + Expect(err).NotTo(HaveOccurred()) + Expect(event.Pid).To(Equal(uint32(42))) + Expect(event.Ts).To(BeNumerically(">", uint64(0))) + Expect(event.EventType).To(Equal(uint64(keplerEventTypePAGE_CACHE_HIT))) + }) + + It("should send a process free event", func() { + // Remove resource limits for kernels <5.11. + err := rlimit.RemoveMemlock() + Expect(err).NotTo(HaveOccurred()) + + // Load eBPF Specs + specs, err := loadKepler() + Expect(err).NotTo(HaveOccurred()) + + var obj keplerObjects + // Load eBPF objects + err = specs.LoadAndAssign(&obj, nil) + Expect(err).NotTo(HaveOccurred()) + + out, err := obj.TestKeplerSchedProcessFree.Run(&ebpf.RunOptions{ + Flags: uint32(1), // BPF_F_TEST_RUN_ON_CPU + CPU: uint32(0), + }) + Expect(err).NotTo(HaveOccurred()) + Expect(out).To(Equal(uint32(0))) + + // Read the event from the ring buffer + rd, err := ringbuf.NewReader(obj.Rb) + Expect(err).NotTo(HaveOccurred()) + defer rd.Close() + + var event keplerEvent + record, err := rd.Read() + Expect(err).NotTo(HaveOccurred()) + + err = binary.Read(bytes.NewBuffer(record.RawSample), binary.NativeEndian, &event) + Expect(err).NotTo(HaveOccurred()) + Expect(event.Pid).To(Equal(uint32(42))) + Expect(event.Ts).To(BeNumerically(">", uint64(0))) + Expect(event.EventType).To(Equal(uint64(keplerEventTypeFREE))) + }) + + It("should increment the page hit counter efficiently", func() { + experiment := gmeasure.NewExperiment("Increment the page hit counter") + AddReportEntry(experiment.Name, experiment) + // Remove resource limits for kernels <5.11. + err := rlimit.RemoveMemlock() + Expect(err).NotTo(HaveOccurred()) + + // Load eBPF Specs + specs, err := loadKepler() + Expect(err).NotTo(HaveOccurred()) + + var obj keplerObjects + // Load eBPF objects + err = specs.LoadAndAssign(&obj, nil) + Expect(err).NotTo(HaveOccurred()) + + experiment.Sample(func(idx int) { + experiment.MeasureDuration("page hit counter increment", func() { + out, err := obj.TestKeplerWritePageTrace.Run(&ebpf.RunOptions{}) + Expect(err).NotTo(HaveOccurred()) + Expect(out).To(Equal(uint32(0))) + }, gmeasure.Precision(time.Nanosecond)) + }, gmeasure.SamplingConfig{N: 1000000, Duration: 10 * time.Second}) + }) + + It("collects hardware counter metrics for sched_switch events", Label("perf_event"), func() { + // Remove resource limits for kernels <5.11. 
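+		// The test_* programs are driven through BPF_PROG_TEST_RUN pinned to
+		// CPU 0 (BPF_F_TEST_RUN_ON_CPU below), and software CPU-clock perf
+		// events stand in for the hardware cycle/instruction/cache-miss
+		// counters, so this test does not need real PMU access.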
+ err := rlimit.RemoveMemlock() + Expect(err).NotTo(HaveOccurred()) + + // Load eBPF Specs + specs, err := loadKepler() + Expect(err).NotTo(HaveOccurred()) + + var obj keplerObjects + // Load eBPF objects + err = specs.LoadAndAssign(&obj, nil) + Expect(err).NotTo(HaveOccurred()) + + perfEvents, err := createTestHardwarePerfEvents( + obj.CpuInstructionsEventReader, + obj.CpuCyclesEventReader, + obj.CacheMissEventReader, + ) + Expect(err).NotTo(HaveOccurred()) + defer func() { + for _, fd := range perfEvents { + unix.Close(fd) + } + }() + + out, err := obj.TestKeplerSchedSwitchTrace.Run(&ebpf.RunOptions{ + Flags: uint32(1), // BPF_F_TEST_RUN_ON_CPU + CPU: uint32(0), + }) + Expect(err).NotTo(HaveOccurred()) + Expect(out).To(Equal(uint32(0))) + + // Read the event from the ring buffer + rd, err := ringbuf.NewReader(obj.Rb) + Expect(err).NotTo(HaveOccurred()) + defer rd.Close() + + var event keplerEvent + record := new(ringbuf.Record) + + err = rd.ReadInto(record) + Expect(err).NotTo(HaveOccurred()) + + err = binary.Read(bytes.NewBuffer(record.RawSample), binary.NativeEndian, &event) + Expect(err).NotTo(HaveOccurred()) + Expect(event.Pid).To(Equal(uint32(43))) + Expect(event.Tid).To(Equal(uint32(43))) + Expect(event.Ts).To(BeNumerically(">", uint64(0))) + Expect(event.EventType).To(Equal(uint64(keplerEventTypeSCHED_SWITCH))) + Expect(event.CpuCycles).To(BeNumerically(">", uint64(0))) + Expect(event.CpuInstr).To(BeNumerically(">", uint64(0))) + Expect(event.CacheMiss).To(BeNumerically(">", uint64(0))) + Expect(event.OffcpuPid).To(Equal(uint32(42))) + Expect(event.OffcpuTid).To(Equal(uint32(42))) + }) + + It("collects metrics for sched_switch events when no hardware events are enabled", Label("perf_event"), func() { + // Remove resource limits for kernels <5.11. + err := rlimit.RemoveMemlock() + Expect(err).NotTo(HaveOccurred()) + + // Load eBPF Specs + specs, err := loadKepler() + Expect(err).NotTo(HaveOccurred()) + + err = specs.RewriteConstants(map[string]interface{}{ + "HW": int32(-1), + }) + Expect(err).NotTo(HaveOccurred()) + + var obj keplerObjects + // Load eBPF objects + err = specs.LoadAndAssign(&obj, nil) + Expect(err).NotTo(HaveOccurred()) + + out, err := obj.TestKeplerSchedSwitchTrace.Run(&ebpf.RunOptions{ + Flags: uint32(1), // BPF_F_TEST_RUN_ON_CPU + CPU: uint32(0), + }) + Expect(err).NotTo(HaveOccurred()) + Expect(out).To(Equal(uint32(0))) + + // Read the event from the ring buffer + rd, err := ringbuf.NewReader(obj.Rb) + Expect(err).NotTo(HaveOccurred()) + defer rd.Close() + + var event keplerEvent + record := new(ringbuf.Record) + + err = rd.ReadInto(record) + Expect(err).NotTo(HaveOccurred()) + + err = binary.Read(bytes.NewBuffer(record.RawSample), binary.NativeEndian, &event) + Expect(err).NotTo(HaveOccurred()) + Expect(event.Pid).To(Equal(uint32(43))) + Expect(event.Tid).To(Equal(uint32(43))) + Expect(event.Ts).To(BeNumerically(">", uint64(0))) + Expect(event.EventType).To(Equal(uint64(keplerEventTypeSCHED_SWITCH))) + Expect(event.CpuCycles).To(BeNumerically("==", uint64(0))) + Expect(event.CpuInstr).To(BeNumerically("==", uint64(0))) + Expect(event.CacheMiss).To(BeNumerically("==", uint64(0))) + Expect(event.OffcpuPid).To(Equal(uint32(42))) + Expect(event.OffcpuTid).To(Equal(uint32(42))) + }) + + It("efficiently collects hardware counter metrics for sched_switch events", Label("perf_event"), func() { + experiment := gmeasure.NewExperiment("sched_switch tracepoint") + AddReportEntry(experiment.Name, experiment) + // Remove resource limits for kernels <5.11. 
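+		// Note: nothing drains the ring buffer while this benchmark runs, so
+		// once rb fills (after a few thousand 64-byte events), bpf_ringbuf_reserve
+		// starts failing and later samples measure the reserve-failure path.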
+ err := rlimit.RemoveMemlock() + Expect(err).NotTo(HaveOccurred()) + + // Load eBPF Specs + specs, err := loadKepler() + Expect(err).NotTo(HaveOccurred()) + + var obj keplerObjects + // Load eBPF objects + err = specs.LoadAndAssign(&obj, nil) + Expect(err).NotTo(HaveOccurred()) + + perfEvents, err := createTestHardwarePerfEvents( + obj.CpuInstructionsEventReader, + obj.CpuCyclesEventReader, + obj.CacheMissEventReader, + ) + Expect(err).NotTo(HaveOccurred()) + defer func() { + for _, fd := range perfEvents { + unix.Close(fd) + } + }() + experiment.Sample(func(idx int) { + experiment.MeasureDuration("sched_switch tracepoint", func() { + runSchedSwitchTracepoint(&obj) + }, gmeasure.Precision(time.Nanosecond)) + Expect(err).NotTo(HaveOccurred()) + }, gmeasure.SamplingConfig{N: 1000000, Duration: 10 * time.Second}) + }) +}) + +func runSchedSwitchTracepoint(obj *keplerObjects) { + out, err := obj.TestKeplerSchedSwitchTrace.Run(&ebpf.RunOptions{ + Flags: uint32(1), // BPF_F_TEST_RUN_ON_CPU + CPU: uint32(0), + }) + Expect(err).NotTo(HaveOccurred()) + Expect(out).To(Equal(uint32(0))) +} + +// This function is used to create hardware perf events for CPU cycles, instructions and cache misses. +// Instead of using hardware perf events, we use the software perf event for testing purposes. +func createTestHardwarePerfEvents(cpuCyclesMap, cpuInstructionsMap, cacheMissMap *ebpf.Map) ([]int, error) { + cpuCyclesFd, err := unixOpenPerfEvent(unix.PERF_TYPE_SOFTWARE, unix.PERF_COUNT_SW_CPU_CLOCK, 1) + if err != nil { + return nil, err + } + err = cpuCyclesMap.Update(uint32(0), uint32(cpuCyclesFd[0]), ebpf.UpdateAny) + if err != nil { + return nil, err + } + + cpuInstructionsFd, err := unixOpenPerfEvent(unix.PERF_TYPE_SOFTWARE, unix.PERF_COUNT_SW_CPU_CLOCK, 1) + if err != nil { + return nil, err + } + err = cpuInstructionsMap.Update(uint32(0), uint32(cpuInstructionsFd[0]), ebpf.UpdateAny) + if err != nil { + return nil, err + } + + cacheMissFd, err := unixOpenPerfEvent(unix.PERF_TYPE_SOFTWARE, unix.PERF_COUNT_SW_CPU_CLOCK, 1) + if err != nil { + return nil, err + } + err = cacheMissMap.Update(uint32(0), uint32(cacheMissFd[0]), ebpf.UpdateAny) + if err != nil { + return nil, err + } + + return []int{cpuCyclesFd[0], cpuInstructionsFd[0], cacheMissFd[0]}, nil +} diff --git a/pkg/bpf/exporter.go b/pkg/bpf/exporter.go index ded8a88887..c9814fcc69 100644 --- a/pkg/bpf/exporter.go +++ b/pkg/bpf/exporter.go @@ -17,15 +17,18 @@ limitations under the License. package bpf import ( + "bytes" + "encoding/binary" "errors" "fmt" "os" "runtime" - "time" + "sync" "unsafe" "github.com/cilium/ebpf" "github.com/cilium/ebpf/link" + "github.com/cilium/ebpf/ringbuf" "github.com/cilium/ebpf/rlimit" "github.com/jaypipes/ghw" "github.com/sustainable-computing-io/kepler/pkg/config" @@ -35,23 +38,38 @@ import ( ) type exporter struct { - bpfObjects keplerObjects - + bpfObjects keplerObjects + cpus int schedSwitchLink link.Link irqLink link.Link pageWriteLink link.Link pageReadLink link.Link + processFreeLink link.Link perfEvents *hardwarePerfEvents enabledHardwareCounters sets.Set[string] enabledSoftwareCounters sets.Set[string] + + // Locks processMetrics and freedPIDs. + // Acquired in CollectProcesses - to prevent new events from being processed + // while summarizing the metrics and resetting the counters. + // Acquired in handleEvents - to prevent CollectProcesses from summarizing + // the metrics while we're handling an event from the ring buffer. 
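+	// handleEvent runs on the single goroutine started in Start, so in
+	// practice the lock is contended only between that goroutine and
+	// callers of CollectProcesses.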
+ // Note: Release this lock as soon as possible as it will block the + // processing of new events from the ring buffer. + mu *sync.Mutex + processMetrics map[uint32]*bpfMetrics + freedPIDs []int } func NewExporter() (Exporter, error) { e := &exporter{ + cpus: ebpf.MustPossibleCPU(), enabledHardwareCounters: sets.New[string](), enabledSoftwareCounters: sets.New[string](), + mu: &sync.Mutex{}, + processMetrics: make(map[uint32]*bpfMetrics), } err := e.attach() if err != nil { @@ -89,20 +107,20 @@ func (e *exporter) attach() error { } } - // Set program global variables - err = specs.RewriteConstants(map[string]interface{}{ - "SAMPLE_RATE": int32(config.BPFSampleRate), - }) - if err != nil { - return fmt.Errorf("error rewriting program constants: %v", err) - } - // Load the eBPF program(s) if err := specs.LoadAndAssign(&e.bpfObjects, nil); err != nil { return fmt.Errorf("error loading eBPF objects: %v", err) } // Attach the eBPF program(s) + e.processFreeLink, err = link.AttachTracing(link.TracingOptions{ + Program: e.bpfObjects.KeplerSchedProcessFree, + AttachType: ebpf.AttachTraceRawTp, + }) + if err != nil { + return fmt.Errorf("error attaching sched_process_free tracepoint: %v", err) + } + e.schedSwitchLink, err = link.AttachTracing(link.TracingOptions{ Program: e.bpfObjects.KeplerSchedSwitchTrace, AttachType: ebpf.AttachTraceRawTp, @@ -192,38 +210,212 @@ func (e *exporter) Detach() { } // Perf events - e.perfEvents.close() - e.perfEvents = nil + if e.perfEvents != nil { + e.perfEvents.close() + e.perfEvents = nil + } // Objects e.bpfObjects.Close() } -func (e *exporter) CollectProcesses() ([]ProcessMetrics, error) { - start := time.Now() - // Get the max number of entries in the map - maxEntries := e.bpfObjects.Processes.MaxEntries() - total := 0 - deleteKeys := make([]uint32, maxEntries) - deleteValues := make([]ProcessMetrics, maxEntries) - var cursor ebpf.MapBatchCursor +func (e *exporter) Start(stopChan <-chan struct{}) error { + rd, err := ringbuf.NewReader(e.bpfObjects.Rb) + if err != nil { + return fmt.Errorf("failed to create ring buffer reader: %w", err) + } + defer rd.Close() + for { - count, err := e.bpfObjects.Processes.BatchLookupAndDelete( - &cursor, - deleteKeys, - deleteValues, - &ebpf.BatchOptions{}, - ) - total += count - if errors.Is(err, ebpf.ErrKeyNotExist) { - break + var record *ringbuf.Record + + select { + case <-stopChan: + if err := rd.Close(); err != nil { + return fmt.Errorf("closing ring buffer reader: %w", err) + } + return nil + default: + var event keplerEvent + record = new(ringbuf.Record) + + err := rd.ReadInto(record) + if err != nil { + if errors.Is(err, ringbuf.ErrClosed) { + return nil + } + if errors.Is(err, ringbuf.ErrFlushed) { + record.RawSample = record.RawSample[:0] + } + klog.Errorf("reading from reader: %s", err) + continue + } + + if err := binary.Read(bytes.NewBuffer(record.RawSample), binary.NativeEndian, &event); err != nil { + klog.Errorf("parsing ringbuf event: %s", err) + continue + } + + // Process the event + e.handleEvent(event) } - if err != nil { - return nil, fmt.Errorf("failed to batch lookup and delete: %v", err) + } +} + +type bpfMetrics struct { + CGroupID uint64 + CPUCyles PerCPUCounter + CPUInstructions PerCPUCounter + CacheMiss PerCPUCounter + CPUTime PerCPUCounter + TxIRQ uint64 + RxIRQ uint64 + BlockIRQ uint64 + PageCacheHit uint64 +} + +func (p *bpfMetrics) Reset() { + p.CPUCyles.Reset() + p.CPUInstructions.Reset() + p.CacheMiss.Reset() + p.CPUTime.Reset() + p.TxIRQ = 0 + p.RxIRQ = 0 + p.BlockIRQ = 0 + p.PageCacheHit 
= 0
+}
+
+func newBpfMetrics() *bpfMetrics {
+	return &bpfMetrics{
+		CPUCyles:        NewPerCPUCounter(),
+		CPUInstructions: NewPerCPUCounter(),
+		CacheMiss:       NewPerCPUCounter(),
+		CPUTime:         NewPerCPUCounter(),
+	}
+}
+
+type PerCPUCounter struct {
+	Values map[uint64]uint64
+	Total  uint64
+}
+
+func NewPerCPUCounter() PerCPUCounter {
+	return PerCPUCounter{
+		Values: make(map[uint64]uint64),
+	}
+}
+
+func (p *PerCPUCounter) Start(cpu, taskID uint32, value uint64) {
+	key := uint64(cpu)<<32 | uint64(taskID)
+
+	// TODO: The eBPF code would blindly overwrite the value if it already exists.
+	// We will preserve the old behavior for now, but we should consider
+	// returning an error if the value already exists.
+	p.Values[key] = value
+}
+
+func (p *PerCPUCounter) Stop(cpu, taskID uint32, value uint64) {
+	if value == 0 {
+		return
+	}
+
+	key := uint64(cpu)<<32 | uint64(taskID)
+
+	if _, ok := p.Values[key]; !ok {
+		return
+	}
+
+	delta := uint64(0)
+
+	// Probably a clock issue where the recorded on-CPU event had a
+	// timestamp later than the recorded off-CPU event, or vice versa.
+	if value > p.Values[key] {
+		delta = value - p.Values[key]
+	}
+
+	p.Total += delta
+
+	delete(p.Values, key)
+}
+
+func (p *PerCPUCounter) Reset() {
+	// Leave values in place since we may have in-flight measurements for
+	// tasks that are still on CPU.
+	p.Total = 0
+}
+
+func (e *exporter) handleEvent(event keplerEvent) {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	var p *bpfMetrics
+
+	if _, ok := e.processMetrics[event.Pid]; !ok {
+		e.processMetrics[event.Pid] = newBpfMetrics()
+	}
+	p = e.processMetrics[event.Pid]
+
+	switch event.EventType {
+	case uint64(keplerEventTypeSCHED_SWITCH):
+		// Handle the new task going on CPU
+		p.CPUCyles.Start(event.CpuId, event.Tid, event.CpuCycles)
+		p.CPUInstructions.Start(event.CpuId, event.Tid, event.CpuInstr)
+		p.CacheMiss.Start(event.CpuId, event.Tid, event.CacheMiss)
+		p.CPUTime.Start(event.CpuId, event.Tid, event.Ts)
+
+		// Handle the task going OFF CPU
+		if _, ok := e.processMetrics[event.OffcpuPid]; !ok {
+			e.processMetrics[event.OffcpuPid] = newBpfMetrics()
+		}
+		offcpu := e.processMetrics[event.OffcpuPid]
+		offcpu.CPUCyles.Stop(event.CpuId, event.OffcpuTid, event.CpuCycles)
+		offcpu.CPUInstructions.Stop(event.CpuId, event.OffcpuTid, event.CpuInstr)
+		offcpu.CacheMiss.Stop(event.CpuId, event.OffcpuTid, event.CacheMiss)
+		offcpu.CPUTime.Stop(event.CpuId, event.OffcpuTid, event.Ts)
+		offcpu.CGroupID = event.OffcpuCgroupId
+	case uint64(keplerEventTypePAGE_CACHE_HIT):
+		p.PageCacheHit += 1
+	case uint64(keplerEventTypeIRQ):
+		switch event.IrqNumber {
+		case uint32(keplerIrqTypeNET_TX):
+			p.TxIRQ += 1
+		case uint32(keplerIrqTypeNET_RX):
+			p.RxIRQ += 1
+		case uint32(keplerIrqTypeBLOCK):
+			p.BlockIRQ += 1
+		}
+		return
+	case uint64(keplerEventTypeFREE):
+		e.freedPIDs = append(e.freedPIDs, int(event.Pid))
+	}
+}
+
+func (e *exporter) CollectProcesses() (ProcessMetricsCollection, error) {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	result := ProcessMetricsCollection{
+		Metrics:   make([]ProcessMetrics, 0, len(e.processMetrics)),
+		FreedPIDs: e.freedPIDs,
 	}
-	klog.V(5).Infof("collected %d process samples in %v", total, time.Since(start))
-	return deleteValues[:total], nil
+	for pid, m := range e.processMetrics {
+		result.Metrics = append(result.Metrics, ProcessMetrics{
+			CGroupID:        m.CGroupID,
+			Pid:             uint64(pid),
+			ProcessRunTime:  m.CPUTime.Total / 1000, // convert nanoseconds to microseconds
+			CPUCyles:        m.CPUCyles.Total,
+			CPUInstructions: m.CPUInstructions.Total,
+			CacheMiss:       m.CacheMiss.Total,
+			PageCacheHit:    m.PageCacheHit,
+			NetTxIRQ:
m.TxIRQ,
+			NetRxIRQ:    m.RxIRQ,
+			NetBlockIRQ: m.BlockIRQ,
+		})
+		m.Reset()
+	}
+	// Clear the cache of any PIDs freed this sample period
+	e.freedPIDs = []int{}
+
+	return result, nil
 }
 
 ///////////////////////////////////////////////////////////////////////////
@@ -281,12 +473,12 @@ func (h *hardwarePerfEvents) close() {
 func createHardwarePerfEvents(cpuInstructionsMap, cpuCyclesMap, cacheMissMap *ebpf.Map, numCPU int) (*hardwarePerfEvents, error) {
 	var err error
 	events := &hardwarePerfEvents{
-		cpuCyclesPerfEvents:       []int{},
-		cpuInstructionsPerfEvents: []int{},
-		cacheMissPerfEvents:       []int{},
+		cpuCyclesPerfEvents:       make([]int, 0, numCPU),
+		cpuInstructionsPerfEvents: make([]int, 0, numCPU),
+		cacheMissPerfEvents:       make([]int, 0, numCPU),
 	}
 	defer func() {
-		if err != nil {
+		if err != nil && events != nil {
 			unixClosePerfEvents(events.cpuCyclesPerfEvents)
 			unixClosePerfEvents(events.cpuInstructionsPerfEvents)
 			unixClosePerfEvents(events.cacheMissPerfEvents)
diff --git a/pkg/bpf/exporter_test.go b/pkg/bpf/exporter_test.go
new file mode 100644
index 0000000000..57736a25dc
--- /dev/null
+++ b/pkg/bpf/exporter_test.go
@@ -0,0 +1,45 @@
+package bpf
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("PerCPUCounter", func() {
+
+	var counter PerCPUCounter
+	BeforeEach(func() {
+		counter = NewPerCPUCounter()
+	})
+
+	It("should record the correct delta for one time period", func() {
+		counter.Start(1, 1, 10)
+		key := uint64(1)<<32 | uint64(1)
+		Expect(counter.Values[key]).To(Equal(uint64(10)))
+		counter.Stop(1, 1, 21)
+		Expect(counter.Values).NotTo(HaveKey(key))
+		Expect(counter.Total).To(Equal(uint64(11)))
+	})
+
+	It("should record the correct delta for an additional time period", func() {
+		counter.Start(1, 1, 10)
+		key := uint64(1)<<32 | uint64(1)
+		Expect(counter.Values[key]).To(Equal(uint64(10)))
+		counter.Stop(1, 1, 21)
+
+		Expect(counter.Values).NotTo(HaveKey(key))
+		Expect(counter.Total).To(Equal(uint64(11)))
+		counter.Start(1, 1, 30)
+
+		Expect(counter.Values[key]).To(Equal(uint64(30)))
+		counter.Stop(1, 1, 42)
+		Expect(counter.Values).NotTo(HaveKey(key))
+		Expect(counter.Total).To(Equal(uint64(23)))
+	})
+
+	It("should not increment Total if Start() has not been called", func() {
+		counter.Stop(1, 1, 42)
+		Expect(counter.Total).To(Equal(uint64(0)))
+	})
+
+})
diff --git a/pkg/bpf/gen.go b/pkg/bpf/gen.go
index 95ac1f552e..b22c0ca7bb 100644
--- a/pkg/bpf/gen.go
+++ b/pkg/bpf/gen.go
@@ -1,3 +1,3 @@
 package bpf
 
-//go:generate go run github.com/cilium/ebpf/cmd/bpf2go@v0.15.0 kepler ../../bpf/kepler.bpf.c -- -I../../bpf/include
+//go:generate go run github.com/cilium/ebpf/cmd/bpf2go@v0.15.0 -type event -type event_type -type irq_type kepler ../../bpf/kepler.bpf.c -- -I../../bpf/include
diff --git a/pkg/bpf/kepler_bpfeb.go b/pkg/bpf/kepler_bpfeb.go
index 1f82307638..eda2abc0f9 100644
--- a/pkg/bpf/kepler_bpfeb.go
+++ b/pkg/bpf/kepler_bpfeb.go
@@ -12,19 +12,38 @@ import (
 	"github.com/cilium/ebpf"
 )
 
-type keplerProcessMetricsT struct {
-	CgroupId       uint64
-	Pid            uint64
-	ProcessRunTime uint64
+type keplerEvent struct {
+	EventType      uint64
+	Ts             uint64
+	Pid            uint32
+	Tid            uint32
+	OffcpuPid      uint32
+	OffcpuTid      uint32
+	OffcpuCgroupId uint64
 	CpuCycles      uint64
 	CpuInstr       uint64
 	CacheMiss      uint64
-	PageCacheHit   uint64
-	VecNr          [10]uint16
-	Comm           [16]int8
-	_              [4]byte
+	CpuId          uint32
+	IrqNumber      uint32
 }
 
+type keplerEventType uint32
+
+const (
+	keplerEventTypeSCHED_SWITCH   keplerEventType = 1
+	keplerEventTypeIRQ            keplerEventType = 2
+
keplerEventTypePAGE_CACHE_HIT keplerEventType = 3 + keplerEventTypeFREE keplerEventType = 4 +) + +type keplerIrqType uint32 + +const ( + keplerIrqTypeNET_TX keplerIrqType = 2 + keplerIrqTypeNET_RX keplerIrqType = 3 + keplerIrqTypeBLOCK keplerIrqType = 4 +) + // loadKepler returns the embedded CollectionSpec for kepler. func loadKepler() (*ebpf.CollectionSpec, error) { reader := bytes.NewReader(_KeplerBytes) @@ -66,24 +85,24 @@ type keplerSpecs struct { // // It can be passed ebpf.CollectionSpec.Assign. type keplerProgramSpecs struct { - KeplerIrqTrace *ebpf.ProgramSpec `ebpf:"kepler_irq_trace"` - KeplerReadPageTrace *ebpf.ProgramSpec `ebpf:"kepler_read_page_trace"` - KeplerSchedSwitchTrace *ebpf.ProgramSpec `ebpf:"kepler_sched_switch_trace"` - KeplerWritePageTrace *ebpf.ProgramSpec `ebpf:"kepler_write_page_trace"` + KeplerIrqTrace *ebpf.ProgramSpec `ebpf:"kepler_irq_trace"` + KeplerReadPageTrace *ebpf.ProgramSpec `ebpf:"kepler_read_page_trace"` + KeplerSchedProcessFree *ebpf.ProgramSpec `ebpf:"kepler_sched_process_free"` + KeplerSchedSwitchTrace *ebpf.ProgramSpec `ebpf:"kepler_sched_switch_trace"` + KeplerWritePageTrace *ebpf.ProgramSpec `ebpf:"kepler_write_page_trace"` + TestKeplerSchedProcessFree *ebpf.ProgramSpec `ebpf:"test_kepler_sched_process_free"` + TestKeplerSchedSwitchTrace *ebpf.ProgramSpec `ebpf:"test_kepler_sched_switch_trace"` + TestKeplerWritePageTrace *ebpf.ProgramSpec `ebpf:"test_kepler_write_page_trace"` } // keplerMapSpecs contains maps before they are loaded into the kernel. // // It can be passed ebpf.CollectionSpec.Assign. type keplerMapSpecs struct { - CacheMiss *ebpf.MapSpec `ebpf:"cache_miss"` CacheMissEventReader *ebpf.MapSpec `ebpf:"cache_miss_event_reader"` - CpuCycles *ebpf.MapSpec `ebpf:"cpu_cycles"` CpuCyclesEventReader *ebpf.MapSpec `ebpf:"cpu_cycles_event_reader"` - CpuInstructions *ebpf.MapSpec `ebpf:"cpu_instructions"` CpuInstructionsEventReader *ebpf.MapSpec `ebpf:"cpu_instructions_event_reader"` - PidTimeMap *ebpf.MapSpec `ebpf:"pid_time_map"` - Processes *ebpf.MapSpec `ebpf:"processes"` + Rb *ebpf.MapSpec `ebpf:"rb"` } // keplerObjects contains all objects after they have been loaded into the kernel. @@ -105,26 +124,18 @@ func (o *keplerObjects) Close() error { // // It can be passed to loadKeplerObjects or ebpf.CollectionSpec.LoadAndAssign. type keplerMaps struct { - CacheMiss *ebpf.Map `ebpf:"cache_miss"` CacheMissEventReader *ebpf.Map `ebpf:"cache_miss_event_reader"` - CpuCycles *ebpf.Map `ebpf:"cpu_cycles"` CpuCyclesEventReader *ebpf.Map `ebpf:"cpu_cycles_event_reader"` - CpuInstructions *ebpf.Map `ebpf:"cpu_instructions"` CpuInstructionsEventReader *ebpf.Map `ebpf:"cpu_instructions_event_reader"` - PidTimeMap *ebpf.Map `ebpf:"pid_time_map"` - Processes *ebpf.Map `ebpf:"processes"` + Rb *ebpf.Map `ebpf:"rb"` } func (m *keplerMaps) Close() error { return _KeplerClose( - m.CacheMiss, m.CacheMissEventReader, - m.CpuCycles, m.CpuCyclesEventReader, - m.CpuInstructions, m.CpuInstructionsEventReader, - m.PidTimeMap, - m.Processes, + m.Rb, ) } @@ -132,18 +143,26 @@ func (m *keplerMaps) Close() error { // // It can be passed to loadKeplerObjects or ebpf.CollectionSpec.LoadAndAssign. 
type keplerPrograms struct {
-	KeplerIrqTrace         *ebpf.Program `ebpf:"kepler_irq_trace"`
-	KeplerReadPageTrace    *ebpf.Program `ebpf:"kepler_read_page_trace"`
-	KeplerSchedSwitchTrace *ebpf.Program `ebpf:"kepler_sched_switch_trace"`
-	KeplerWritePageTrace   *ebpf.Program `ebpf:"kepler_write_page_trace"`
+	KeplerIrqTrace             *ebpf.Program `ebpf:"kepler_irq_trace"`
+	KeplerReadPageTrace        *ebpf.Program `ebpf:"kepler_read_page_trace"`
+	KeplerSchedProcessFree     *ebpf.Program `ebpf:"kepler_sched_process_free"`
+	KeplerSchedSwitchTrace     *ebpf.Program `ebpf:"kepler_sched_switch_trace"`
+	KeplerWritePageTrace       *ebpf.Program `ebpf:"kepler_write_page_trace"`
+	TestKeplerSchedProcessFree *ebpf.Program `ebpf:"test_kepler_sched_process_free"`
+	TestKeplerSchedSwitchTrace *ebpf.Program `ebpf:"test_kepler_sched_switch_trace"`
+	TestKeplerWritePageTrace   *ebpf.Program `ebpf:"test_kepler_write_page_trace"`
 }
 
 func (p *keplerPrograms) Close() error {
 	return _KeplerClose(
 		p.KeplerIrqTrace,
 		p.KeplerReadPageTrace,
+		p.KeplerSchedProcessFree,
 		p.KeplerSchedSwitchTrace,
 		p.KeplerWritePageTrace,
+		p.TestKeplerSchedProcessFree,
+		p.TestKeplerSchedSwitchTrace,
+		p.TestKeplerWritePageTrace,
 	)
 }
diff --git a/pkg/bpf/kepler_bpfeb.o b/pkg/bpf/kepler_bpfeb.o
index be2eedd1dfdb270336320a50a2f9197fc7a5fc55..c624302b7a1fa8e9625fb67d9ad424fa61a121b7 100644
GIT binary patch
[base85-encoded binary data omitted; compiled BPF object changes from 15224 to 16488 bytes]
z3K#14&79et9qw9^Q=or3*x8xyoSE~OJ9B34Oh zP}Cw%uOHU*>GgSCl1`2D>+pw^c;g*S-wC6QWJ9;|Z&n*T-8yOH8*|moIz!)HmDH8# zZ_XJ#-@GB^tS*P3CqRpKV5E@Q`e zwL>?Y`kv7!s@$Z}Qt~6e*yoVW|^J*K_sLwkd_Z^2}EnfYi{@ZyoeM#oaC7Cam%yo=^ zV8-WuH@Q zHoqnOCWEJ{nhiZ}=?1%Y&A={qe&>Phy70%#j~`R-_v<*9*tK?kmy18v-}CBmZ4hw} z^7N%u6Q?h&NO@PvWhw7Sxg_PHlnYYcl2X?3dNZHn?;E?OcbRqV`(x+L`6?3_=ifKu zw6032f8#jdI&-9HCrfwS=$|`o<<<+w3!)0~_9p5~ziSFLFL9mn8uMFZV$x~P=6?JkrK+CQCH*D+a+ATnot5l+ zaJ$TUnyQMH=$mdb`2SiTjiUempUeoHnTU@Br_Y_~_;OD-DewJjl|KKl{x_II%ttp8 z=~T_DE90kEQc!?OUwAAdeX`*ktVKq$?F_+RBsax89ab7VpMCSHeOR5->X zDDm6afjaNY@T=ka>_)cN>k(8)3y_)6zC^|!V^L7XVZ-LX?OM|IC>TG+!e!g1rL8NA z{BIy|k^hYFQLH~+tT#~bpE9^%T|B1?rT*ODEzbkvIB;*lYIU6(hKub?EoF)r|IT|s#E~_7o`pBd!70L zr{3(;Tb=qLr{3<=AA>F!s&#QZm}moiXUR-Ic|BP_V z=f3i_vtjS6J!ivEO&mE4I{zoevZCAW{nazfy$y_Ft$c8zU7|M)Kq{GyJ zN;3;TGcgwCQlm+gNKb@`sl-S!tE;K8Y%Ze`@x*X498G1jDjgq6h6WF(a_UMl5ssn3 z_~@vbIChjK;J;wP$oSY$z&|z7d5lc_7ns@M@eJ%4j%QSEDxFl9lT+$Sd}Jc2M&pxV zats#33bBy2)PS}z9F3>5*h6 z%)V3a_~aX zjKPFm0bwZh)|NzW^6Q&3O`P`OT_Nsc5(n;oML1;z~OrOZOl0|x>MI2p8B5;QdoBu8>_C?l(_$U0Y?Nm;%Yd1^x^HdPQ-Sac;&(5}}?u zW`gCiaL=sAmY^f(j7Fm}|IVHI<(}@d;VY+l&tmRHjcQIzncO8ot=OWW9J{TJv#T5j zCOL{k^~H(7c7InXYA_>-P-AzbN^`Cd_U(X5d}=7JFH)=+?-qSjSlvpVZdZkl*9)^bB5+Y zM&1rOr_P==63n0%4iqAgQJ+n@DHnJ;>|?0C(+GB`t9E-p_i0{_cKCogRUZP@s(a6# z>t*0XlyiC@RF94pqQ1}2n?Qpex)PJn2a&(RkQxJ%SXc+N0fDQ}jOWY0a<=a)Fb9_< zlgv$I#)5(Iu;N*v*&ZWOnb*D3!}ws1L>_de6b@^F2r0B|=7!sf;{jYR41SqW))N84 z)!Y)CJQ*B049jpXL&N66SqtLzIU~w*hIbi6;@Yt03`RkHF{qVX%3#k6K{ViQ2gN54 zig>j=J#8n=Y4PhQUgI?HEDj+8mw2!DXMM^vXRX2HWGdG&8qZt~H8q~#fi^kt`Q5;9 zRq&#$(WNKeg9i(iUd^Ns!5vd`=?+c17*AXd2U3~bR5&<3k{VZ>zN$Bw&8e=hs;+ou z;A%XR49@c0nhnm5#V?M)A19QmbMaA}J%d+p2Ef@`pGQhUHjdIGsYG%tn^f%=aVl@e z#lwGQ9H)jjmc`g$YDn1@d@dH~I|!xDpk)4V9o|tOccadGj^D%$5`H>G@qgRgJ8FCc z|35YNhZ-Nl|95aNX4~^=ET894#%sETQooP-A0h9@4H4xNe?ko5Y)iZwH*znYFz+la zZUpANmBsDAlyC9lz(}Ldvf5U2xx+eIqf$QF`(a%WO-{M8# zFM;-WPeeagA^$&+AG;7ytHMWN5^N{G6*qM2`ywhT_+{X_{)mbR{u=os5!L13I^aGB zlix2GRkbM{QF*~10B>HYR@WT76L`kKb->pJ-)8%tRjXTqt+`8r{{j5!$%tAJ`~}=- z?z;j11Z)4_jVQi1qJK68g0;UN!XLrf-;W}yN3izy7W{K?9q_P&*8`^o_d@@sk0a`u zVC(OUgX!;e2h-nK!3BQ>TYHxT&p>`P8m$T@{c*gQYr;6tza{V=`&mQr> zA-{TKjjD%V?)7UDJOTd3-4WF+_z!{W;Cxgt?VIDtyM7x?*#4={2l7+ zsa1V~{}OmdZ>^dXT!nbI4$QpZ-N1YMYt?nZJ;2QuYSo;_za@AE{5=D;YDw^4vHx(b zS`qwn;OaMPRS)#ipXch8dh}V$2f>rP-k2>u=5%^2)S!5;%}@2*jc4&Dd6BzPJ8 z4LvpLj>lgXyaIlGe~nso@H$`xyJ*jUgTECQuSswRVtma&jcOL$1sqA&sD5C^+us7N zM)2icpr1Wq!jb+XU~U``r|~Lq<3KU46Sn~Gbod*A1Hn&WFL`v>yc3~3o1jX|I|Kf< z5%Ug&{NDinB7%CK@NIjD(Csasai8t~0PQ0=rSc9AfUgT)0>66F>}haSC-&2Ryj$v5 z3vhrIF25O=;Ju3&Phj@fdOtLK=Ni3>o?hTU`f0m> zn}I*K-X(K=xJTDOP7^XeZ(yuv3XqBAQy%@k%H{LgxO~9ROc6fq^wP)BU8Oc?{bbP& zChk9cI*L$RlKw^LhVQVn&4@FaU!->n9U-hF^$LB?qi?o!Pwxq#Gmg3GCt}ZxM?WL& zXFdAgqCNes7a^9-{g?hSZn=ri`AKKIbJOEO=el&$k3`O0ul>B}wQa55qar8pn{K~O&v%an`3xGR>40wEP@z2*kC$G8Ut1x`rnH#w9tc|y zp<==ar2JKsr1uLBq3P9{lwU$g`mEqU$`ne{7X$}V^8F3z%Yp+bZ=xjKVq5-1 z(>LPZBzs+Y0omTBKq^mbe~Z`UhR`Q9y*U1idrbYBUL0pjKM493{pE&fXEj|j5OY-C zX|saFg6}{mkBP-@WS%3LEath;V!r!KJDBe%mpqK4gHG(vcZWO|F%bvIJjXCu+#~}; zya&0t&w6;y z!wVi>^6;{US3Inp0L|LdL?av=csT0en1_2j-0$JEhw~nu@$jsN=RCaN;Uy0*dw9je zO4e(^e-GRBZuwD_j@?);k<`uJUr{+IS(&*c*(=d9$xXVp7_puxAC{&KPwJD>fxA&dpz9l z;k1YI9-i^=tcT}3yx`#_4=;Oo#l!XpL^78~lk6832Of@kIOgFV58M66%1e9vyoYBz zJnP{(4=;Fl$-~PYUh%MU;=A_W!-0pR9`155PRWvCp29DWCgSRQ7+1?RmiB9*^JeVS8S+?eiZ0x`Vkt-th1( z4=;N7j)(7hc-6z(>d@Otm}@KFz+@$mlkj_B6Nrnb(5Pab%xxuq4)pYdK>K8+}O zxzV1(mkpZTe!BOib^_;<66Xna;l-JJz2SWQQG9FX^euT~`~cs6m@ivOd+E0){EPw5 z(8YxckMYYsiFciLz7$dIBO_Nv!}!2J2A|{jkLHVy@BMc^_BDw6ZHc@$lBR23J5+da zLc=P)JZZAwIdvr>N>A8a34ZWgMaqsF_tSb<_H*IG;&6N9$_(ofsJ 
z$*>i}eL4Me5*KbrJ6qWP7PD_T{fn)ph1Em-)Mfiy`nMsVoc=y(m^NYx|F*v+{5Cq5 z(|@Ooe!67)Tm6fmmD9iYfckHve>wek%jjo2U;l3L-QZg4Xdj%{!2pV{Nsm~)_!~a_k-rSsC%S&^NiGMI~b_XDN@x%fd6v(7Y|A+ z=}-MzkgYhYzY)qK-S@`9$|D!jqQoED&N6&k{{#eo?y&9RlcvXlTp0Fatg`xd8n*f$ D;7UwF diff --git a/pkg/bpf/test_utils.go b/pkg/bpf/test_utils.go index 4105f64b96..ef71b56b07 100644 --- a/pkg/bpf/test_utils.go +++ b/pkg/bpf/test_utils.go @@ -36,6 +36,10 @@ func NewMockExporter(bpfSupportedMetrics SupportedMetrics) Exporter { } } +func (m *mockExporter) Start(<-chan struct{}) error { + return nil +} + func (m *mockExporter) SupportedMetrics() SupportedMetrics { return SupportedMetrics{ HardwareCounters: m.hardwareCounters, @@ -45,18 +49,22 @@ func (m *mockExporter) SupportedMetrics() SupportedMetrics { func (m *mockExporter) Detach() {} -func (m *mockExporter) CollectProcesses() ([]ProcessMetrics, error) { - return []ProcessMetrics{ - { - CgroupId: 0, - Pid: 0, - ProcessRunTime: 0, - CpuCycles: 0, - CpuInstr: 0, - CacheMiss: 0, - PageCacheHit: 0, - VecNr: [10]uint16{}, - Comm: [16]int8{}, +func (m *mockExporter) CollectProcesses() (ProcessMetricsCollection, error) { + return ProcessMetricsCollection{ + Metrics: []ProcessMetrics{ + { + CGroupID: 0, + Pid: 0, + ProcessRunTime: 0, + CPUCyles: 0, + CPUInstructions: 0, + CacheMiss: 0, + PageCacheHit: 0, + NetTxIRQ: 0, + NetRxIRQ: 0, + NetBlockIRQ: 0, + }, }, + FreedPIDs: []int{0}, }, nil } diff --git a/pkg/bpf/types.go b/pkg/bpf/types.go index 840a3d9bb0..077b69cd3e 100644 --- a/pkg/bpf/types.go +++ b/pkg/bpf/types.go @@ -20,22 +20,29 @@ import ( "k8s.io/apimachinery/pkg/util/sets" ) -const ( - // Per /sys/kernel/debug/tracing/events/irq/softirq_entry/format - // { 0, "HI" }, { 1, "TIMER" }, { 2, "NET_TX" }, { 3, "NET_RX" }, { 4, "BLOCK" }, { 5, "IRQ_POLL" }, { 6, "TASKLET" }, { 7, "SCHED" }, { 8, "HRTIMER" }, { 9, "RCU" } - - // IRQ vector to IRQ number - IRQNetTX = 2 - IRQNetRX = 3 - IRQBlock = 4 -) - -type ProcessMetrics = keplerProcessMetricsT - type Exporter interface { SupportedMetrics() SupportedMetrics Detach() - CollectProcesses() ([]ProcessMetrics, error) + CollectProcesses() (ProcessMetricsCollection, error) + Start(<-chan struct{}) error +} + +type ProcessMetrics struct { + CGroupID uint64 + Pid uint64 + ProcessRunTime uint64 + CPUCyles uint64 + CPUInstructions uint64 + CacheMiss uint64 + PageCacheHit uint64 + NetTxIRQ uint64 + NetRxIRQ uint64 + NetBlockIRQ uint64 +} + +type ProcessMetricsCollection struct { + Metrics []ProcessMetrics + FreedPIDs []int } type SupportedMetrics struct { diff --git a/pkg/bpftest/bpf_suite_test.go b/pkg/bpftest/bpf_suite_test.go deleted file mode 100644 index 74b911431f..0000000000 --- a/pkg/bpftest/bpf_suite_test.go +++ /dev/null @@ -1,450 +0,0 @@ -package bpftest - -import ( - "fmt" - "syscall" - "testing" - "time" - "unsafe" - - "github.com/cilium/ebpf" - "github.com/cilium/ebpf/rlimit" - . "github.com/onsi/ginkgo/v2" - . "github.com/onsi/gomega" - "github.com/onsi/gomega/gmeasure" - "golang.org/x/sys/unix" -) - -func TestBpf(t *testing.T) { - RegisterFailHandler(Fail) - RunSpecs(t, "Bpf Suite") -} - -var _ = Describe("BPF Exporter", func() { - It("should increment the page cache hit counter", func() { - // Remove resource limits for kernels <5.11. 
- err := rlimit.RemoveMemlock() - Expect(err).NotTo(HaveOccurred()) - - // Load eBPF Specs - specs, err := loadTest() - Expect(err).NotTo(HaveOccurred()) - - err = specs.RewriteConstants(map[string]interface{}{ - "TEST": int32(1), - }) - Expect(err).NotTo(HaveOccurred()) - - var obj testObjects - // Load eBPF objects - err = specs.LoadAndAssign(&obj, nil) - Expect(err).NotTo(HaveOccurred()) - - key := uint32(0) - - err = obj.Processes.Put(key, testProcessMetricsT{ - CgroupId: 0, - Pid: 0, - ProcessRunTime: 0, - CpuCycles: 0, - CpuInstr: 0, - CacheMiss: 0, - PageCacheHit: 0, - VecNr: [10]uint16{}, - Comm: [16]int8{}, - }) - Expect(err).NotTo(HaveOccurred()) - - out, err := obj.TestKeplerWritePageTrace.Run(&ebpf.RunOptions{}) - Expect(err).NotTo(HaveOccurred()) - Expect(out).To(Equal(uint32(0))) - - // Read the page cache hit counter - var res testProcessMetricsT - err = obj.Processes.Lookup(key, &res) - Expect(err).NotTo(HaveOccurred()) - - Expect(res.PageCacheHit).To(BeNumerically("==", uint64(1))) - - err = obj.Processes.Delete(key) - Expect(err).NotTo(HaveOccurred()) - }) - - It("should register a new process if one doesn't exist", func() { - // Remove resource limits for kernels <5.11. - err := rlimit.RemoveMemlock() - Expect(err).NotTo(HaveOccurred()) - - // Load eBPF Specs - specs, err := loadTest() - Expect(err).NotTo(HaveOccurred()) - - err = specs.RewriteConstants(map[string]interface{}{ - "TEST": int32(1), - }) - Expect(err).NotTo(HaveOccurred()) - - var obj testObjects - // Load eBPF objects - err = specs.LoadAndAssign(&obj, nil) - Expect(err).NotTo(HaveOccurred()) - - out, err := obj.TestRegisterNewProcessIfNotExist.Run(&ebpf.RunOptions{}) - Expect(err).NotTo(HaveOccurred()) - Expect(out).To(BeNumerically("==", uint32(0))) - - // Read the page cache hit counter - var res testProcessMetricsT - key := uint32(42) // Kernel TGID - err = obj.Processes.Lookup(key, &res) - Expect(err).NotTo(HaveOccurred()) - - Expect(res.Pid).To(BeNumerically("==", uint64(42))) - - err = obj.Processes.Delete(key) - Expect(err).NotTo(HaveOccurred()) - }) - - It("should increment the page hit counter efficiently", func() { - experiment := gmeasure.NewExperiment("Increment the page hit counter") - AddReportEntry(experiment.Name, experiment) - // Remove resource limits for kernels <5.11. - err := rlimit.RemoveMemlock() - Expect(err).NotTo(HaveOccurred()) - - // Load eBPF Specs - specs, err := loadTest() - Expect(err).NotTo(HaveOccurred()) - - err = specs.RewriteConstants(map[string]interface{}{ - "TEST": int32(1), - }) - Expect(err).NotTo(HaveOccurred()) - - var obj testObjects - // Load eBPF objects - err = specs.LoadAndAssign(&obj, nil) - Expect(err).NotTo(HaveOccurred()) - - key := uint32(0) - - err = obj.Processes.Put(key, testProcessMetricsT{ - CgroupId: 0, - Pid: 0, - ProcessRunTime: 0, - CpuCycles: 0, - CpuInstr: 0, - CacheMiss: 0, - PageCacheHit: 0, - VecNr: [10]uint16{}, - Comm: [16]int8{}, - }) - Expect(err).NotTo(HaveOccurred()) - - experiment.Sample(func(idx int) { - experiment.MeasureDuration("page hit counter increment", func() { - out, err := obj.TestKeplerWritePageTrace.Run(&ebpf.RunOptions{}) - Expect(err).NotTo(HaveOccurred()) - Expect(out).To(Equal(uint32(0))) - }, gmeasure.Precision(time.Nanosecond)) - }, gmeasure.SamplingConfig{N: 1000000, Duration: 10 * time.Second}) - }) - - It("collects hardware counter metrics for sched_switch events", Label("perf_event"), func() { - // Remove resource limits for kernels <5.11. 
- err := rlimit.RemoveMemlock() - Expect(err).NotTo(HaveOccurred()) - - // Load eBPF Specs - specs, err := loadTest() - Expect(err).NotTo(HaveOccurred()) - - err = specs.RewriteConstants(map[string]interface{}{ - "TEST": int32(1), - "HW": int32(1), - }) - Expect(err).NotTo(HaveOccurred()) - - var obj testObjects - // Load eBPF objects - err = specs.LoadAndAssign(&obj, nil) - Expect(err).NotTo(HaveOccurred()) - - perfEvents, err := createHardwarePerfEvents( - obj.CpuInstructionsEventReader, - obj.CpuCyclesEventReader, - obj.CacheMissEventReader, - ) - Expect(err).NotTo(HaveOccurred()) - defer func() { - for _, fd := range perfEvents { - unix.Close(fd) - } - }() - - // Register TGID 42 - This would be done by register_new_process_if_not_exist - // when we get a sched_switch event for a new process - key := uint32(42) - nsecs := getNSecs() - err = obj.Processes.Put(key, testProcessMetricsT{ - CgroupId: 0, - Pid: 42, - ProcessRunTime: nsecs, - CpuCycles: 0, - CpuInstr: 0, - CacheMiss: 0, - PageCacheHit: 0, - VecNr: [10]uint16{}, - Comm: [16]int8{}, - }) - Expect(err).NotTo(HaveOccurred()) - err = obj.PidTimeMap.Put(key, nsecs) - Expect(err).NotTo(HaveOccurred()) - - out, err := obj.TestKeplerSchedSwitchTrace.Run(&ebpf.RunOptions{ - Flags: uint32(1), // BPF_F_TEST_RUN_ON_CPU - CPU: uint32(0), - }) - Expect(err).NotTo(HaveOccurred()) - Expect(out).To(Equal(uint32(0))) - - var res testProcessMetricsT - err = obj.Processes.Lookup(key, &res) - Expect(err).NotTo(HaveOccurred()) - Expect(res.CpuCycles).To(BeNumerically(">", uint64(0))) - Expect(res.CpuInstr).To(BeNumerically(">", uint64(0))) - Expect(res.CacheMiss).To(BeNumerically(">", uint64(0))) - - err = obj.Processes.Delete(key) - Expect(err).NotTo(HaveOccurred()) - }) - - It("collects metrics for sched_switch events when no hardware events are enabled", Label("perf_event"), func() { - // Remove resource limits for kernels <5.11. 
- err := rlimit.RemoveMemlock() - Expect(err).NotTo(HaveOccurred()) - - // Load eBPF Specs - specs, err := loadTest() - Expect(err).NotTo(HaveOccurred()) - - err = specs.RewriteConstants(map[string]interface{}{ - "TEST": int32(1), - "HW": int32(0), - }) - Expect(err).NotTo(HaveOccurred()) - - var obj testObjects - // Load eBPF objects - err = specs.LoadAndAssign(&obj, nil) - Expect(err).NotTo(HaveOccurred()) - - // Register TGID 42 - This would be done by register_new_process_if_not_exist - // when we get a sched_switch event for a new process - key := uint32(42) - nsecs := getNSecs() - err = obj.Processes.Put(key, testProcessMetricsT{ - CgroupId: 0, - Pid: 42, - ProcessRunTime: nsecs, - CpuCycles: 0, - CpuInstr: 0, - CacheMiss: 0, - PageCacheHit: 0, - VecNr: [10]uint16{}, - Comm: [16]int8{}, - }) - Expect(err).NotTo(HaveOccurred()) - err = obj.PidTimeMap.Put(key, nsecs) - Expect(err).NotTo(HaveOccurred()) - - out, err := obj.TestKeplerSchedSwitchTrace.Run(&ebpf.RunOptions{ - Flags: uint32(1), // BPF_F_TEST_RUN_ON_CPU - CPU: uint32(0), - }) - Expect(err).NotTo(HaveOccurred()) - Expect(out).To(Equal(uint32(0))) - - var res testProcessMetricsT - err = obj.Processes.Lookup(key, &res) - Expect(err).NotTo(HaveOccurred()) - Expect(res.CpuCycles).To(BeNumerically("==", uint64(0))) - Expect(res.ProcessRunTime).To(BeNumerically(">", uint64(0))) - - err = obj.Processes.Delete(key) - Expect(err).NotTo(HaveOccurred()) - }) - - It("efficiently collects hardware counter metrics for sched_switch events", Label("perf_event"), func() { - experiment := gmeasure.NewExperiment("sched_switch tracepoint") - AddReportEntry(experiment.Name, experiment) - // Remove resource limits for kernels <5.11. - err := rlimit.RemoveMemlock() - Expect(err).NotTo(HaveOccurred()) - - // Load eBPF Specs - specs, err := loadTest() - Expect(err).NotTo(HaveOccurred()) - - err = specs.RewriteConstants(map[string]interface{}{ - "TEST": int32(1), - }) - Expect(err).NotTo(HaveOccurred()) - - var obj testObjects - // Load eBPF objects - err = specs.LoadAndAssign(&obj, nil) - Expect(err).NotTo(HaveOccurred()) - - perfEvents, err := createHardwarePerfEvents( - obj.CpuInstructionsEventReader, - obj.CpuCyclesEventReader, - obj.CacheMissEventReader, - ) - Expect(err).NotTo(HaveOccurred()) - defer func() { - for _, fd := range perfEvents { - unix.Close(fd) - } - }() - experiment.Sample(func(idx int) { - preRunSchedSwitchTracepoint(&obj) - experiment.MeasureDuration("sampled sched_switch tracepoint", func() { - runSchedSwitchTracepoint(&obj) - }, gmeasure.Precision(time.Nanosecond)) - err = obj.Processes.Delete(uint32(42)) - Expect(err).NotTo(HaveOccurred()) - }, gmeasure.SamplingConfig{N: 1000000, Duration: 10 * time.Second}) - }) - - It("uses sample rate to reduce CPU time", Label("perf_event"), func() { - experiment := gmeasure.NewExperiment("sampled sched_switch tracepoint") - AddReportEntry(experiment.Name, experiment) - // Remove resource limits for kernels <5.11. 
- err := rlimit.RemoveMemlock() - Expect(err).NotTo(HaveOccurred()) - - // Load eBPF Specs - specs, err := loadTest() - Expect(err).NotTo(HaveOccurred()) - - err = specs.RewriteConstants(map[string]interface{}{ - "TEST": int32(1), - "SAMPLE_RATE": int32(1000), - }) - Expect(err).NotTo(HaveOccurred()) - - var obj testObjects - // Load eBPF objects - err = specs.LoadAndAssign(&obj, nil) - Expect(err).NotTo(HaveOccurred()) - - perfEvents, err := createHardwarePerfEvents( - obj.CpuInstructionsEventReader, - obj.CpuCyclesEventReader, - obj.CacheMissEventReader, - ) - Expect(err).NotTo(HaveOccurred()) - defer func() { - for _, fd := range perfEvents { - unix.Close(fd) - } - }() - experiment.Sample(func(idx int) { - preRunSchedSwitchTracepoint(&obj) - experiment.MeasureDuration("sampled sched_switch tracepoint", func() { - runSchedSwitchTracepoint(&obj) - }, gmeasure.Precision(time.Nanosecond)) - err = obj.Processes.Delete(uint32(42)) - Expect(err).NotTo(HaveOccurred()) - }, gmeasure.SamplingConfig{N: 1000000, Duration: 10 * time.Second}) - }) -}) - -func getNSecs() uint64 { - var ts syscall.Timespec - _, _, err := syscall.Syscall(syscall.SYS_CLOCK_GETTIME, 4, uintptr(unsafe.Pointer(&ts)), 0) - if err != 0 { - panic(err) - } - return uint64(ts.Sec*1e9 + ts.Nsec) -} - -func preRunSchedSwitchTracepoint(obj *testObjects) { - // Register TGID 42 - This would be done by register_new_process_if_not_exist - // when we get a sched_switch event for a new process - key := uint32(42) - nsecs := getNSecs() - err := obj.Processes.Put(key, testProcessMetricsT{ - CgroupId: 0, - Pid: 42, - ProcessRunTime: nsecs, - CpuCycles: 0, - CpuInstr: 0, - CacheMiss: 0, - PageCacheHit: 0, - VecNr: [10]uint16{}, - Comm: [16]int8{}, - }) - Expect(err).NotTo(HaveOccurred()) - err = obj.PidTimeMap.Put(key, nsecs) - Expect(err).NotTo(HaveOccurred()) -} - -func runSchedSwitchTracepoint(obj *testObjects) { - out, err := obj.TestKeplerSchedSwitchTrace.Run(&ebpf.RunOptions{ - Flags: uint32(1), // BPF_F_TEST_RUN_ON_CPU - CPU: uint32(0), - }) - Expect(err).NotTo(HaveOccurred()) - Expect(out).To(Equal(uint32(0))) -} - -func unixOpenPerfEvent(typ, conf int) (int, error) { - sysAttr := &unix.PerfEventAttr{ - Type: uint32(typ), - Size: uint32(unsafe.Sizeof(unix.PerfEventAttr{})), - Config: uint64(conf), - } - - cloexecFlags := unix.PERF_FLAG_FD_CLOEXEC - fd, err := unix.PerfEventOpen(sysAttr, -1, 0, -1, cloexecFlags) - if fd < 0 { - return 0, fmt.Errorf("failed to open bpf perf event on cpu 0: %w", err) - } - - return fd, nil -} - -// This function is used to create hardware perf events for CPU cycles, instructions and cache misses. -// Instead of using hardware perf events, we use the software perf event for testing purposes. 
-func createHardwarePerfEvents(cpuCyclesMap, cpuInstructionsMap, cacheMissMap *ebpf.Map) ([]int, error) { - cpuCyclesFd, err := unixOpenPerfEvent(unix.PERF_TYPE_SOFTWARE, unix.PERF_COUNT_SW_CPU_CLOCK) - if err != nil { - return nil, err - } - err = cpuCyclesMap.Update(uint32(0), uint32(cpuCyclesFd), ebpf.UpdateAny) - if err != nil { - return nil, err - } - - cpuInstructionsFd, err := unixOpenPerfEvent(unix.PERF_TYPE_SOFTWARE, unix.PERF_COUNT_SW_CPU_CLOCK) - if err != nil { - return nil, err - } - err = cpuInstructionsMap.Update(uint32(0), uint32(cpuInstructionsFd), ebpf.UpdateAny) - if err != nil { - return nil, err - } - - cacheMissFd, err := unixOpenPerfEvent(unix.PERF_TYPE_SOFTWARE, unix.PERF_COUNT_SW_CPU_CLOCK) - if err != nil { - return nil, err - } - err = cacheMissMap.Update(uint32(0), uint32(cacheMissFd), ebpf.UpdateAny) - if err != nil { - return nil, err - } - - return []int{cpuCyclesFd, cpuInstructionsFd, cacheMissFd}, nil -} diff --git a/pkg/bpftest/gen.go b/pkg/bpftest/gen.go deleted file mode 100644 index 94def23250..0000000000 --- a/pkg/bpftest/gen.go +++ /dev/null @@ -1,3 +0,0 @@ -package bpftest - -//go:generate go run github.com/cilium/ebpf/cmd/bpf2go@v0.15.0 test ../../bpf/test.bpf.c -- -I../../bpf/include diff --git a/pkg/bpftest/test_bpfeb.go b/pkg/bpftest/test_bpfeb.go deleted file mode 100644 index db3880c877..0000000000 --- a/pkg/bpftest/test_bpfeb.go +++ /dev/null @@ -1,159 +0,0 @@ -// Code generated by bpf2go; DO NOT EDIT. -//go:build mips || mips64 || ppc64 || s390x - -package bpftest - -import ( - "bytes" - _ "embed" - "fmt" - "io" - - "github.com/cilium/ebpf" -) - -type testProcessMetricsT struct { - CgroupId uint64 - Pid uint64 - ProcessRunTime uint64 - CpuCycles uint64 - CpuInstr uint64 - CacheMiss uint64 - PageCacheHit uint64 - VecNr [10]uint16 - Comm [16]int8 - _ [4]byte -} - -// loadTest returns the embedded CollectionSpec for test. -func loadTest() (*ebpf.CollectionSpec, error) { - reader := bytes.NewReader(_TestBytes) - spec, err := ebpf.LoadCollectionSpecFromReader(reader) - if err != nil { - return nil, fmt.Errorf("can't load test: %w", err) - } - - return spec, err -} - -// loadTestObjects loads test and converts it into a struct. -// -// The following types are suitable as obj argument: -// -// *testObjects -// *testPrograms -// *testMaps -// -// See ebpf.CollectionSpec.LoadAndAssign documentation for details. -func loadTestObjects(obj interface{}, opts *ebpf.CollectionOptions) error { - spec, err := loadTest() - if err != nil { - return err - } - - return spec.LoadAndAssign(obj, opts) -} - -// testSpecs contains maps and programs before they are loaded into the kernel. -// -// It can be passed ebpf.CollectionSpec.Assign. -type testSpecs struct { - testProgramSpecs - testMapSpecs -} - -// testSpecs contains programs before they are loaded into the kernel. -// -// It can be passed ebpf.CollectionSpec.Assign. -type testProgramSpecs struct { - TestKeplerSchedSwitchTrace *ebpf.ProgramSpec `ebpf:"test_kepler_sched_switch_trace"` - TestKeplerWritePageTrace *ebpf.ProgramSpec `ebpf:"test_kepler_write_page_trace"` - TestRegisterNewProcessIfNotExist *ebpf.ProgramSpec `ebpf:"test_register_new_process_if_not_exist"` -} - -// testMapSpecs contains maps before they are loaded into the kernel. -// -// It can be passed ebpf.CollectionSpec.Assign. 
-type testMapSpecs struct { - CacheMiss *ebpf.MapSpec `ebpf:"cache_miss"` - CacheMissEventReader *ebpf.MapSpec `ebpf:"cache_miss_event_reader"` - CpuCycles *ebpf.MapSpec `ebpf:"cpu_cycles"` - CpuCyclesEventReader *ebpf.MapSpec `ebpf:"cpu_cycles_event_reader"` - CpuInstructions *ebpf.MapSpec `ebpf:"cpu_instructions"` - CpuInstructionsEventReader *ebpf.MapSpec `ebpf:"cpu_instructions_event_reader"` - PidTimeMap *ebpf.MapSpec `ebpf:"pid_time_map"` - Processes *ebpf.MapSpec `ebpf:"processes"` -} - -// testObjects contains all objects after they have been loaded into the kernel. -// -// It can be passed to loadTestObjects or ebpf.CollectionSpec.LoadAndAssign. -type testObjects struct { - testPrograms - testMaps -} - -func (o *testObjects) Close() error { - return _TestClose( - &o.testPrograms, - &o.testMaps, - ) -} - -// testMaps contains all maps after they have been loaded into the kernel. -// -// It can be passed to loadTestObjects or ebpf.CollectionSpec.LoadAndAssign. -type testMaps struct { - CacheMiss *ebpf.Map `ebpf:"cache_miss"` - CacheMissEventReader *ebpf.Map `ebpf:"cache_miss_event_reader"` - CpuCycles *ebpf.Map `ebpf:"cpu_cycles"` - CpuCyclesEventReader *ebpf.Map `ebpf:"cpu_cycles_event_reader"` - CpuInstructions *ebpf.Map `ebpf:"cpu_instructions"` - CpuInstructionsEventReader *ebpf.Map `ebpf:"cpu_instructions_event_reader"` - PidTimeMap *ebpf.Map `ebpf:"pid_time_map"` - Processes *ebpf.Map `ebpf:"processes"` -} - -func (m *testMaps) Close() error { - return _TestClose( - m.CacheMiss, - m.CacheMissEventReader, - m.CpuCycles, - m.CpuCyclesEventReader, - m.CpuInstructions, - m.CpuInstructionsEventReader, - m.PidTimeMap, - m.Processes, - ) -} - -// testPrograms contains all programs after they have been loaded into the kernel. -// -// It can be passed to loadTestObjects or ebpf.CollectionSpec.LoadAndAssign. -type testPrograms struct { - TestKeplerSchedSwitchTrace *ebpf.Program `ebpf:"test_kepler_sched_switch_trace"` - TestKeplerWritePageTrace *ebpf.Program `ebpf:"test_kepler_write_page_trace"` - TestRegisterNewProcessIfNotExist *ebpf.Program `ebpf:"test_register_new_process_if_not_exist"` -} - -func (p *testPrograms) Close() error { - return _TestClose( - p.TestKeplerSchedSwitchTrace, - p.TestKeplerWritePageTrace, - p.TestRegisterNewProcessIfNotExist, - ) -} - -func _TestClose(closers ...io.Closer) error { - for _, closer := range closers { - if err := closer.Close(); err != nil { - return err - } - } - return nil -} - -// Do not access this directly. 
-// -//go:embed test_bpfeb.o -var _TestBytes []byte diff --git a/pkg/bpftest/test_bpfeb.o b/pkg/bpftest/test_bpfeb.o deleted file mode 100644 index f190a3ab7bac5bef9e2497fb95beba4b46cd3b80..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 13848 zcmeHNU2I&(b)LI-cSW*EMJI8h*oku`D-`udk+Nc|{>w@b9R*R+jU+d6Vf1pfydt+I zmz&+CL|RR|eh8odPAUX$BNPU*HWEiEQhu>;P|eCM1wbIzHcJ9jSsX#B);sZ^laDNw&c-66C&sEpmxC?~%U zsNmY5uB+l3>mRUoV~@dGBBj>X*FV@Md_z_0o2C*xs8q=6;-DI3xy8r@7XyQ@k%;ss zrclz)p8M_Rd8Ix>71D3s+I#Vh^;ObQ-j;GjO6?=)%Ytu8*^=_QluJ@x)n#-23d_Pn zCJt!dB=-F;JV2^a&kw3J%Nq~z*Z7Zky>~6ra8T?W^y7KsNzGSVMs<9z`1-HqG(V8` zsUC6Mi#T4Uq!ZUyiKSiUT7wAjX|7kKY_Jr5-wjPws`#Vz*GLyTv`@s=Pw_`xd~%x z>iPVnO-KQMj?EWlqNAsd9r!~21o8hTrT+cbj@%QqQ4$Md}Tdmyxj>1TB=`L&ozpHG%SVv4Q(XQ|iBo z`deC?QY7V&e=58|)c;EAJU#xk$T6Jcxv}FO&)Ct|pNkx0t(!YB=3YlR-JXh&^p4}m zkPMjyd8|-ALjFf&QLX+Nb!@TP-W8OLX9M{c$av(a7V`Ua4v9|=-A?+`@xDjB+$9^x zuHFws??ay6PXoI#5kIv3cK&D+#Ez(ke1mk)zE6>cM8CNo=8#vA8G{Bg?c@7K3$FHy zecORuzjp#}W}i04>+ey@BKdK4<01X2g7o)U$m{+*UH4=AcG0z~hpzjuEBL(V+TBCf z{n(WkU3+`zx(~b5dC|4s>XP{YsCf8b3_%K}Q64}!g9%I8DDo!!)U5CiZocdW5-U@Wv zeGYx3Qy+K}eOYr43mrD7Cy`k)KaEU#XxrP4%{kDRKCDXRx{Bi?Cr=)EAwKoR-yM%n zzmOk~V>SQSLD2a-U#^v=%E@Fz(Mol$nAB?VY*Me5inX||ic{6O`AS@xR2637SLe%d zy)>JsVr4!qE*58!ny!}0wR%+*3&rUqo-NgCs#2IrVuPnkb#*=|#$^c1&CaU%#}Cm2 z{I*S)nJZ64{895`kCTbtHnTQ8SA{*(g{rDARuc76vZ&4%X66$$TUdybGAxD_Vj*j# zNo`|1Tc|j8#>sie#?_=SnZOd*uZ^BB)=P8c+MT(MXP#hvLDiF5J$@;v%p}$LLbX&+ zV(omrS|}!}TDTC`D{5rq0RGNa&K;oi2r45*j!|@{#>405N|Vu^Vtrw^qrmi}RZOdX zJ1{bE(K8Wr)AM*{Ztf)vU^0`;4iB}Xvo9Jd&R453g0%Z7)n7Un4SOnftA1bW{%5;~ zbMIa>ttR#PYB?Hx$^@XAOqFVNgrJ;Uh^@-fxwt%6kCO$kI_}<8)OEaHn_Hj4px%d% zl(+tE-=JZ4bP-OrXSf7yDyhd5G4yi z8PNXzA*aO%*TKH1R(d&^J2&hzcIz(tIgVW>q0KN<+3}sPOcpTLHjb}0AaeWL7ah$% z7k}lq$IpBb=E#7_jICkGPR6whrFwBXcIWH?m(v-%p3R1_qGhu7vudJVliV*1>rih5QITWt4UW`MXJhJG zsrz6Wm;59bKs)qB9qXi8ovVr^j%w}{vDS~U(-19((F6_BMU3Kn88aT;R;tN)ii(mA z%dcb9zQ~wiCuh6m&`?AL&qjMJiSuSMnW-0`m8|_87P{PY8f@$b?xL-k07t&^f}UDB zMm;ybJ4MQxb|*hw#jdwK4c@H5=4Q7aFZVpOcQv@J^6sW?5Z%N`-Cx~K`VI5IeKeu9 zmhRe#F^%dqvpv~(VG1Ojb?!XdJO%w6e6BLi&B9DE)^i8DyVeQ!xb0K6(*WCU@-a0v(C|&Di_*a|Le3fk{qB()HQ-b0hXaVb2wv z9*o`hx$U3L2!+7$crFdmu}>poTXKigS6(Ozszu&zU)W1F?vV868V>Al*s2R;AY zE;XTNi}o~ZH)&JQ#?G4Vr$hIwDJKhc9sn?bWKy{M10PxHG)^h%_}A3&LUr;&p_)YF zJUrE+@p9qpOfspQ(Nw1jvpD-iC-F4G0Z(sd-&htIf7&6~q9>bCN8I<-6|2fq0 z?yqI}o{kh;M*g8v`gIKVH^Dy`y8ev}4sNS-3-=fb=Wp^;yHxs}EO1WnGb(+P=lwIl z(3d)^)G%#<4I00un<~{X^5B?sN!tItN+0GuMAP7Ki%LID{Z|F=Q0W|w_3FCdFRSzk z*dO#c81H}qeS>}B*Ht=?yPiP&2`Nv0B>Y!YdV+1mpO6^$NkLBd-%;sl-r?j0w^Vu( z?-qO~)cXHPr7L(R37W{_?=kqRV~rRvk)HmlQl|+b(=PpoD*Ya8v~kqfiDUW=m0sf= z6?Cwl^t&qkD*4dWg|7(yxeD{Vk604?D;2)Tdytz3qh`{w!I^z3JcCzIP6z7Cd6zoh&n z!#BpF-ZVJ#Zz^0sTzQv9d$SL#@G8eq{7b|C*Wo|rr{=fYuNq8$e*k+^)*tx$6ZD^o zgb)9&lRxZW@((+h{G7qr-%;UT!rzqZAM8&}h`jilnszYtRRoK_sY?bcNvWp6+3%|G z9r&BNX)yh}4gXRr!Uz5(#y2JYwcD>5KK)&Vy`kgp8u||-;lHTTyn+Zj{-zcA9e)i! 
zXXLkF9j4)L==mE?h(7T*^!yDye?!mTuqpbnZ+r;;hPMn>lEM|?Ur^zvaD5W434R-{ zz+NmxGk>N3^k8@U^frft59PpiyM z>=Wr*9u|B1eotkF(RX@H_=_rY2=|m3v9FI9{0uVZbKm!|1Yu7`#-p#LGN*B0nmKOx z{h`X7L4R1MTE71gm6_N9JT3U7%DjlSnTE%|B>1~3Gr1l7rr>wblY5dUms za7y;!>|ulLKAg=7mVG!Y_UU~U`*8M(@MWLPN`H1A&R+HS5>LJF_M!bv;mbapT^20+ zaCSv-%k0D1HNmnE^S>gv-}1RL34iXKFd^{gr{F(9vGg+?Zg8tZ=e}f9&9C3r7zzQ-sXBTdRKpm?=X3V~5@VW{Qx%>d>3O zOcC-0hu#8aijbdj=qtcX5%RJ_CuWL}{{?hAPNXq$ymo;uSJ;M#U(PkA2$|n10%WF3 z$PNx7;P!VRh&6y6zB<#*SDcePpo%`8KNQb77B@Z2@#VQl_^BM~{9a;l)5Dxs5wh^Z z9O~O0-1P7Y@D5~==kuluF919KrI&yob>!Kf!_Uwk$N$U4d-(sHa8DEPd)EQrk`Od+3T6`FF#?#_ss7DUYqt5tQ{xs^e zSJqRu?%{@quK@2vmiGFchq1Q!5b7>I28`!un}5dRvwv59!NZ)NE}#9o_^OAmd-$e@ zZ+ZB(hu69>-(%eVc>eCf!yZ2D;o}}Y)WbOs z=RG{(;fjYF9=`11riYh2-16|UhgUqj+Ku%xcKql0-JK8V$it%^&UrZR;Rz2{Jlyc` zWe+z!yyW4QhnGFP;^Eb9td9eZ|9n4i{m(=m9`$g}!+8%+c(~%>hKDbExar{~54SwL z?BNv;uXba)?)sY@^l;?iQ4i-locHjAhbta#c=)o1n;u^BaLdEX9$xYAYB$!$O=tY| z^V`LdmEW=Rvzfk+j6Aw;XxH%WJ$uzi9e0(uD@)5_; zmCwG;XXDOguhX{cL&H6MG&Y~ayIawE`3&sd|8IU*+7a&l{x&i*bAC21Oiotu)!x56 z?Od?BpOf92d-<~GvNwFH9>Ev3df}`ZsV&a3#PzSf3%1eky7WD9^ytC(;BP5ij!F5r zfd{3;M|{oxoRFm))8$xvh~%;OF~faKm!t7Pfl?l{<(Q$5*>V(9=GT#eS!j`Sny>G` zHA6}9Z0IJQm26_&V!hBU17*&syH$IxBkk{y@6tZtdUR8t1O5he`(BcB=%^0yT*Q<^ z9z?c&+V-UVH*$~ugH`a5w&Zlt`LXRuc^|n~|2$VA1v&h%Fa01uY1@O=KKp^J^g)Rg zlln)cf;+LU5v-XQ#1 zd)f3`{AqK#Dg)B~ffu^S-S&SIydLr2bxYbUqZ~x$T|1NQ-_m;Y|L{+YxO?BADNg(p zH}@X>4_}q`EZCQgqtkydFt>g}$Gt(Ru5>`!oB!!|SdP8HJm}uMv64GhOf6hM$u`>z JZu`IOe*v@C&$a*n diff --git a/pkg/bpftest/test_bpfel.go b/pkg/bpftest/test_bpfel.go deleted file mode 100644 index 7317a75a1b..0000000000 --- a/pkg/bpftest/test_bpfel.go +++ /dev/null @@ -1,159 +0,0 @@ -// Code generated by bpf2go; DO NOT EDIT. -//go:build 386 || amd64 || arm || arm64 || loong64 || mips64le || mipsle || ppc64le || riscv64 - -package bpftest - -import ( - "bytes" - _ "embed" - "fmt" - "io" - - "github.com/cilium/ebpf" -) - -type testProcessMetricsT struct { - CgroupId uint64 - Pid uint64 - ProcessRunTime uint64 - CpuCycles uint64 - CpuInstr uint64 - CacheMiss uint64 - PageCacheHit uint64 - VecNr [10]uint16 - Comm [16]int8 - _ [4]byte -} - -// loadTest returns the embedded CollectionSpec for test. -func loadTest() (*ebpf.CollectionSpec, error) { - reader := bytes.NewReader(_TestBytes) - spec, err := ebpf.LoadCollectionSpecFromReader(reader) - if err != nil { - return nil, fmt.Errorf("can't load test: %w", err) - } - - return spec, err -} - -// loadTestObjects loads test and converts it into a struct. -// -// The following types are suitable as obj argument: -// -// *testObjects -// *testPrograms -// *testMaps -// -// See ebpf.CollectionSpec.LoadAndAssign documentation for details. -func loadTestObjects(obj interface{}, opts *ebpf.CollectionOptions) error { - spec, err := loadTest() - if err != nil { - return err - } - - return spec.LoadAndAssign(obj, opts) -} - -// testSpecs contains maps and programs before they are loaded into the kernel. -// -// It can be passed ebpf.CollectionSpec.Assign. -type testSpecs struct { - testProgramSpecs - testMapSpecs -} - -// testSpecs contains programs before they are loaded into the kernel. -// -// It can be passed ebpf.CollectionSpec.Assign. -type testProgramSpecs struct { - TestKeplerSchedSwitchTrace *ebpf.ProgramSpec `ebpf:"test_kepler_sched_switch_trace"` - TestKeplerWritePageTrace *ebpf.ProgramSpec `ebpf:"test_kepler_write_page_trace"` - TestRegisterNewProcessIfNotExist *ebpf.ProgramSpec `ebpf:"test_register_new_process_if_not_exist"` -} - -// testMapSpecs contains maps before they are loaded into the kernel. 
-// -// It can be passed ebpf.CollectionSpec.Assign. -type testMapSpecs struct { - CacheMiss *ebpf.MapSpec `ebpf:"cache_miss"` - CacheMissEventReader *ebpf.MapSpec `ebpf:"cache_miss_event_reader"` - CpuCycles *ebpf.MapSpec `ebpf:"cpu_cycles"` - CpuCyclesEventReader *ebpf.MapSpec `ebpf:"cpu_cycles_event_reader"` - CpuInstructions *ebpf.MapSpec `ebpf:"cpu_instructions"` - CpuInstructionsEventReader *ebpf.MapSpec `ebpf:"cpu_instructions_event_reader"` - PidTimeMap *ebpf.MapSpec `ebpf:"pid_time_map"` - Processes *ebpf.MapSpec `ebpf:"processes"` -} - -// testObjects contains all objects after they have been loaded into the kernel. -// -// It can be passed to loadTestObjects or ebpf.CollectionSpec.LoadAndAssign. -type testObjects struct { - testPrograms - testMaps -} - -func (o *testObjects) Close() error { - return _TestClose( - &o.testPrograms, - &o.testMaps, - ) -} - -// testMaps contains all maps after they have been loaded into the kernel. -// -// It can be passed to loadTestObjects or ebpf.CollectionSpec.LoadAndAssign. -type testMaps struct { - CacheMiss *ebpf.Map `ebpf:"cache_miss"` - CacheMissEventReader *ebpf.Map `ebpf:"cache_miss_event_reader"` - CpuCycles *ebpf.Map `ebpf:"cpu_cycles"` - CpuCyclesEventReader *ebpf.Map `ebpf:"cpu_cycles_event_reader"` - CpuInstructions *ebpf.Map `ebpf:"cpu_instructions"` - CpuInstructionsEventReader *ebpf.Map `ebpf:"cpu_instructions_event_reader"` - PidTimeMap *ebpf.Map `ebpf:"pid_time_map"` - Processes *ebpf.Map `ebpf:"processes"` -} - -func (m *testMaps) Close() error { - return _TestClose( - m.CacheMiss, - m.CacheMissEventReader, - m.CpuCycles, - m.CpuCyclesEventReader, - m.CpuInstructions, - m.CpuInstructionsEventReader, - m.PidTimeMap, - m.Processes, - ) -} - -// testPrograms contains all programs after they have been loaded into the kernel. -// -// It can be passed to loadTestObjects or ebpf.CollectionSpec.LoadAndAssign. -type testPrograms struct { - TestKeplerSchedSwitchTrace *ebpf.Program `ebpf:"test_kepler_sched_switch_trace"` - TestKeplerWritePageTrace *ebpf.Program `ebpf:"test_kepler_write_page_trace"` - TestRegisterNewProcessIfNotExist *ebpf.Program `ebpf:"test_register_new_process_if_not_exist"` -} - -func (p *testPrograms) Close() error { - return _TestClose( - p.TestKeplerSchedSwitchTrace, - p.TestKeplerWritePageTrace, - p.TestRegisterNewProcessIfNotExist, - ) -} - -func _TestClose(closers ...io.Closer) error { - for _, closer := range closers { - if err := closer.Close(); err != nil { - return err - } - } - return nil -} - -// Do not access this directly. -// -//go:embed test_bpfel.o -var _TestBytes []byte diff --git a/pkg/bpftest/test_bpfel.o b/pkg/bpftest/test_bpfel.o deleted file mode 100644 index 5c8972722c3cd9d16b12680db8a5487ce2871ca9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 13848 zcmeHNU2I&(bsln;zt#`sm)oY9noYYY5I_Oklnq=&FbsrN(nKgyfPRt;wBQDHYqd{oUaA5ux&@3tK#PJ1(nb$1 z)bE=)b9Z*Ql*G1=9dPc)Sj#i~UzJLFIpU3Q*AhW4gC3MI6mSJ6XppoqhJqF*@txEl| zl$>Wj>YJVCoO)Vw;JjnVR^El3Yg*ogop+_INqI-gRVi;vS(WmZl*>}yFlF~OU8dhP zc39ZCq_MAmdWRNAooh+z%F4S2Q~!@(ubZ%251J@1OW&R=-+>hJS+_1S1W z$$|5dbNWZd-W%D=<+mC8oKsJ58udT5$3yLq)$AMmnf7(%eH8tz-z;1bf4L<7a>=Y? 
z`ns`?%eR>weE*2`|B?8^8=Lcob^5<9eqh{8tz7;;(jR_i;-Hb!^`~ZfT7T_T+qD4m zvTq+{93%An_s#xak@)}g>qZayKVAOdh;e+oAI`NJyu!_g^a6s#T`*#NkdUI=ZG4zS zdw`Y?rM!v0sH*8tY}NhN#m~oevzo;G0jeKLIlYI{Dt+$2t?p-bX^2?%1zqVwzlQmD z8@lb++xZFI;M(QQ2kie|W#pT0F^bM#H*saV%sb!8=&jWD-`9m&2n=56k^U`#CT#yB zq4yX(-!fp@)3)7USFS~G-&BC@zVPiuKWDP>gK^Nr|MmXeZ2wvRIj5eA`HZ2qaOs{o z7cQ+yc~{Dsly{_DmGZWfRVi;tDf_tF_;dPQBiF)Cv#))9tlu;{KmUz(z&g{?l$)h% zH|qB&aRDZ(+Y5Cjdk0EI9%!xx^7&~QJ{{Q*Ce{Ip{E-i5tXFT)!-LYL- zxl6@<`UVul&|Cg&c(gkkCTnA9;x zavb&J$Xk$^xR9);km={FSCG?EucFLJor*B5>;9;JL+TaO7g1-Zs-pf~lssL>q1-zt zkvbVYmRLi58y-K@RnpQpCR`mSCM~? zjI@o%pdE*5D{aPvIUdTFYh6Kh`F;$Hv~4FIE?@Bl(i-&8y08^WEk?ZrqYu^UEHY3-V@Z$N^Ovla0CD*ZS2OVCh+GQaBtpro3 z9735xiN7tws5ANWX>=z2upu|lxn56UF-U(-=qS2hsMotV*NY}jZhKzZQwHxi0*s^U z{-{SkDfC|x`nX5W3H>FZ&k3D#K^B{|AKI9}xbz3o@AIg$-*I>w{G79RbchHX5 zqTA)VrvkKpTKcfQ+o|_E^#P~8->DBe^&zKz9dkiYJs0i4L>Z_%OJ?fH_{sWvj?5`w zCVgYe=F2J!kDWe!?A7qh7k_IsJp1a{Xc(&5XAh&D&)Iw_JDHCr0(LDFXEIT#6i!Fw zVm4C>%PKQjoS7|z*$Gu(7J6|uAC|Mzk;)Wi!_0gp7nO81n=h4%DwEDkMd5U|R8ocX zWE2`al`X5wQ6|iz!_4%wntlE`ihySDTuc}|t1QaOArD&(SKc%_&vN1;}}Tuf&oRZL$A%LO$wbO?VJ3KtKt_Yf*W83n@! z4;nmhc_up%?9Y_v_Bjm9Oq#`%>aq(X3l|&>z?+(fxtW>Quz*o6njYwntuq+(XJ(7V z5JuYfqUy?C3SdxYh{ z(GtVSu4@Q3MV(U*0@Jj+&W@frOM`ZG^*bYmxDN({Qud8#=Hh@)*r%uL;yN~ngfRnP zW!HDMFp-9@tzTa)K;X_d7#tsaCH(SljGq4@#E}IPk1b)#PK2c^*>Ywobba=aOQ|p3 zhQkK1qs3XfST$BJiE&?J)X@6mOWLM-1JYhr&(d_k`prxcaYy%}5l?90wAL>95WVi-j=Sub015?9J9fWAy@)O(uvFS6~)={xIQxr)Y){GRP<`1(|5Zw=} zi8l1kU=?Tc@OVsHC`OmrRhX>Xes!S+10#lY&Uj>he?SH=2m5Uk{bnM{mDAu#(!n|j zQ#N&j_47boj5Q8$?8~oe*U~oHF!(*lQdYDF>FF|dwcSwQW&@gw9Um``ytEG$IA-~9 zQP*{CBBY+LY@L3cAGk*oM(f@Ukr-}NE}8IT?FAQz>igU|+dKwcTzu{_`er(p3AOJa zx@(?Lk71w84+SH`!wB2jr_UVw(%7ld@bs~>qwu|9Gc}!EoI7-*K)pv-GxEWM+=0y~ z5XqTHroJ$HamXGN_Fhq+!N|SIupbZ7eoi&w7JGUa4b6^qPnCl&*Edga0AUHz)mx}o zvoY)Bmyv$^M&mhy_J4Ou-LT<^*bU<$aSO&+uW39T8oeeT&6RlpfCGtBxbcBUmO6`D ziaPmKbuwL?xRNeL!6+|JrC>ClzL1M1l(U-ZOnMr3pWrl3BV6!wNNn&L+~;!HOq4H0 zYUl#4hC|pEcot`Hex(sa^B1#|%J#thmG4#pUSyGYPiFq89lzr{*kbO!-@-+m_j{%h zjjV1NFXO|ub{TrZnjzb>@cL1jZe;f42r;=(+ za1}T?k<`Bl$?qS469veNdaUmWV{6)giS1wE50bNWJN_7^ryxWYFY3vfuHAzyu;4uCTMLQUPCo$NzSgSF3!S(Rc-+D5!2E5^`Tv^zf6=P0 z3Kns-T}l5J&|Bvc`du3Fr{VK^Za_cjul0W?q3(L)*92>Qe*k^pU$MSFPN+bz*5?-V zb8tKG5eIhzrv(2d=HL3~33Xnu)py*%)OX6k)VClw)>m+>ui)>1f9nU(Pq5YxA5m%+ zgZ|_%5(+G}K6d;)!CK!n=!+$e^}Ppu1#5j(QVlq`9r%cYyMa@JI}l&CCgG2QV|^X0 z^>wh;S8%MaU@PyE;G^K*x+SUZc=VcJe*4|>nS_E0TA%lT+o=rvocdY+>Je=D_X++9 z`X`_)LQC9V%P08fpdZ+qP;-L!c9MUay5Zqv!7qW{vA0d#@o-J>Z-d@9(5CJQo(JCf zT$}2FA5&i9KHxsV-ve&PCd3d;uL6JSY@0eMIDr?d9p~HhKIQnQfOn6#sVSkK1|Gno zRs>u6Rl(l@eb+>rS`z#L$766+@IBzx-)U0;{EGZ-96lo0#^ID;8;4H{wsH8HgBgdH z1lu@#!=oZTQ$oS;Ui!N5vEyy|z8;UY@&=oA?<7M9X}i1qV|8AxgGa1P4-n1tr^;1P4-H zLCN;2;6Tb3QL?=zIFOQe1h!jj%YV}CcC~cqMGNkL&})bO@}r@A6<6_ikoC8^tA>m< zH#lGgi3Pt6RUFLs<#zvBI^|*_<~M_3r$4{>EO~g%!TMW`_&1yR&1A{LJUf|450F#h z_r$x9mpoi^@NUri#DUqLm}dzSG4Ywf#p z;oBa*Qyhoq%ust7a|EnJTnunJiO#4c-X@!5080x+`|PAS3JDv;Uy0*d${W1RS(xZyyjsX5smtL zIPmbWhf^NL&%XwK;~vJ(vj%#_!;2nX^6;{Us~%qUaLvPO9@dTyiL5{H?#^U!;Nf8p zr#w97;c*WaJY4bcqKB6}yzJqshgUsZ^YEI7l@pJ-ANbpViQR|;58Ly>(o-HCKOc0m z^l=XtJY4bcqKB6}yzJqshgUsZ^YEI7O%SN-+oOX*9S0sB_HfF>?s-GfxJNH|xZ>eO z4=;Im*~3*2uX?!V;WZE2tEOEqdwyFS$o@@3B@ulrOzgfDF6^aV9knxAHg z*T4E*u(f`}OW#w+j~@;X|EkjEh?LJ8cvwn&#Mk7{NwbtAx*Q3gWAjM(tf4-u%i-{_ zKq(K~a>TTc*mBqeg8$`n8t1o!>;_IZ4;lERoM)CO$GwI6j>a}|`?x(7*BUGIb2t4B zz=Ouy`-mOWX)99x6-vC;P$|K-INxt-w!H_CdfM^CO#g}8Y<_!>vA^GRXFTk9i}`Ne9w8a$h)3@I%ppd7(hA+u@zrT<7$^09Q*Z)fs z&u(OOp!#(H%I}XKlle8#8DRCdP5d^&@qQY^|Mf7v%72D*k}v-^F}Rui?>u7$t;$&1 zoBt1L=ePD}TeJCpEdSi(zi;s8XZUV5|BV3rDR*LZZPQ diff --git 
a/pkg/collector/resourceutilization/bpf/process_bpf_collector.go b/pkg/collector/resourceutilization/bpf/process_bpf_collector.go index d45bc59c3f..2a132ab553 100644 --- a/pkg/collector/resourceutilization/bpf/process_bpf_collector.go +++ b/pkg/collector/resourceutilization/bpf/process_bpf_collector.go @@ -18,11 +18,10 @@ package bpf import "C" import ( - "unsafe" - "github.com/sustainable-computing-io/kepler/pkg/bpf" "github.com/sustainable-computing-io/kepler/pkg/cgroup" "github.com/sustainable-computing-io/kepler/pkg/collector/stats" + "github.com/sustainable-computing-io/kepler/pkg/comm" "github.com/sustainable-computing-io/kepler/pkg/config" "github.com/sustainable-computing-io/kepler/pkg/libvirt" "github.com/sustainable-computing-io/kepler/pkg/utils" @@ -32,6 +31,8 @@ import ( type ProcessBPFMetrics = bpf.ProcessMetrics +var commResolver = comm.NewCommResolver() + // update software counter metrics func updateSWCounters(key uint64, ct *ProcessBPFMetrics, processStats map[uint64]*stats.ProcessStats, bpfSupportedMetrics bpf.SupportedMetrics) { // update ebpf metrics @@ -43,11 +44,11 @@ func updateSWCounters(key uint64, ct *ProcessBPFMetrics, processStats map[uint64 case config.PageCacheHit: processStats[key].ResourceUsage[config.PageCacheHit].AddDeltaStat(utils.GenericSocketID, ct.PageCacheHit/(1000*1000)) case config.IRQNetTXLabel: - processStats[key].ResourceUsage[config.IRQNetTXLabel].AddDeltaStat(utils.GenericSocketID, uint64(ct.VecNr[bpf.IRQNetTX])) + processStats[key].ResourceUsage[config.IRQNetTXLabel].AddDeltaStat(utils.GenericSocketID, ct.NetTxIRQ) case config.IRQNetRXLabel: - processStats[key].ResourceUsage[config.IRQNetRXLabel].AddDeltaStat(utils.GenericSocketID, uint64(ct.VecNr[bpf.IRQNetRX])) + processStats[key].ResourceUsage[config.IRQNetRXLabel].AddDeltaStat(utils.GenericSocketID, ct.NetRxIRQ) case config.IRQBlockLabel: - processStats[key].ResourceUsage[config.IRQBlockLabel].AddDeltaStat(utils.GenericSocketID, uint64(ct.VecNr[bpf.IRQBlock])) + processStats[key].ResourceUsage[config.IRQBlockLabel].AddDeltaStat(utils.GenericSocketID, ct.NetBlockIRQ) default: klog.Errorf("counter %s is not supported\n", counterKey) } @@ -61,13 +62,13 @@ func updateHWCounters(key uint64, ct *ProcessBPFMetrics, processStats map[uint64 var event string switch counterKey { case config.CPUCycle: - val = ct.CpuCycles + val = ct.CPUCyles event = config.CPUCycle case config.CPURefCycle: - val = ct.CpuCycles + val = ct.CPUCyles event = config.CPURefCycle case config.CPUInstruction: - val = ct.CpuInstr + val = ct.CPUInstructions event = config.CPUInstruction case config.CacheMiss: val = ct.CacheMiss @@ -86,12 +87,23 @@ func UpdateProcessBPFMetrics(bpfExporter bpf.Exporter, processStats map[uint64]* klog.Errorln("could not collect ebpf metrics") return } - for _, ct := range processesData { - comm := C.GoString((*C.char)(unsafe.Pointer(&ct.Comm))) + + // Clear the cache of any PIDs freed this sample period. + // This is safe given that the *stats.ProcessStats.Command is only updated if it is not already known. + // If it is a long-running process, the comm will be preserved from the previous sample period. 
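+	// Note that failed lookups are cached as well (a PID whose /proc entry has
+	// already vanished is remembered in cacheNotExist), so both the positive and
+	// the negative cache must be dropped for freed PIDs before they are recycled.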
+	commResolver.Clear(processesData.FreedPIDs)
+
+	for _, ct := range processesData.Metrics {
+		processComm, err := commResolver.ResolveComm(int(ct.Pid))
+		if err != nil {
+			// skip process that is not running
+			klog.V(6).Infof("failed to resolve comm for PID %v: %v, skipping process", ct.Pid, err)
+			continue
+		}
 		if ct.Pid != 0 {
 			klog.V(6).Infof("process %s (pid=%d, cgroup=%d) has %d CPU cycles, %d instructions, %d cache misses, %d page cache hits",
-				comm, ct.Pid, ct.CgroupId, ct.CpuCycles, ct.CpuInstr, ct.CacheMiss, ct.PageCacheHit)
+				processComm, ct.Pid, ct.CGroupID, ct.CPUCyles, ct.CPUInstructions, ct.CacheMiss, ct.PageCacheHit)
 		}
 		// skip process without resource utilization
 		if ct.CacheMiss == 0 && ct.PageCacheHit == 0 {
@@ -99,9 +111,9 @@ func UpdateProcessBPFMetrics(bpfExporter bpf.Exporter, processStats map[uint64]*
 		}
 
 		// if the pid is within a container, it will have a container ID
-		containerID, err := cgroup.GetContainerID(ct.CgroupId, ct.Pid, config.EnabledEBPFCgroupID)
+		containerID, err := cgroup.GetContainerID(ct.CGroupID, ct.Pid, config.EnabledEBPFCgroupID)
 		if err != nil {
-			klog.V(6).Infof("failed to resolve container for PID %v (command=%s): %v, set containerID=%s", ct.Pid, comm, err, utils.SystemProcessName)
+			klog.V(6).Infof("failed to resolve container for PID %v (command=%s): %v, set containerID=%s", ct.Pid, processComm, err, utils.SystemProcessName)
 		}
 
 		// if the pid is within a VM, it will have an VM ID
@@ -109,12 +121,12 @@ func UpdateProcessBPFMetrics(bpfExporter bpf.Exporter, processStats map[uint64]*
 		if config.IsExposeVMStatsEnabled() {
 			vmID, err = libvirt.GetVMID(ct.Pid)
 			if err != nil {
-				klog.V(6).Infof("failed to resolve VM ID for PID %v (command=%s): %v", ct.Pid, comm, err)
+				klog.V(6).Infof("failed to resolve VM ID for PID %v (command=%s): %v", ct.Pid, processComm, err)
 			}
 		}
 
 		mapKey := ct.Pid
-		if ct.CgroupId == 1 && config.EnabledEBPFCgroupID {
+		if ct.CGroupID == 1 && config.EnabledEBPFCgroupID {
 			// we aggregate all kernel process to minimize overhead
 			// all kernel process has cgroup id as 1 and pid 1 is also a kernel process
 			mapKey = 1
@@ -124,11 +136,12 @@ func UpdateProcessBPFMetrics(bpfExporter bpf.Exporter, processStats map[uint64]*
 		var ok bool
 		var pStat *stats.ProcessStats
 		if pStat, ok = processStats[mapKey]; !ok {
-			pStat = stats.NewProcessStats(ct.Pid, ct.CgroupId, containerID, vmID, comm, bpfSupportedMetrics)
+			pStat = stats.NewProcessStats(ct.Pid, ct.CGroupID, containerID, vmID, processComm, bpfSupportedMetrics)
 			processStats[mapKey] = pStat
 		} else if pStat.Command == "" {
-			pStat.Command = comm
+			pStat.Command = processComm
 		}
+
 		// when the process metrics are updated, reset the idle counter
 		pStat.IdleCounter = 0
diff --git a/pkg/comm/resolve_comm.go b/pkg/comm/resolve_comm.go
new file mode 100644
index 0000000000..7a4ffe6053
--- /dev/null
+++ b/pkg/comm/resolve_comm.go
@@ -0,0 +1,105 @@
+package comm
+
+import (
+	"bytes"
+	"fmt"
+	"os"
+	"strconv"
+	"strings"
+
+	"golang.org/x/sys/unix"
+)
+
+const unknownComm = "unknown"
+
+type CommResolver struct {
+	cacheExist     map[int]string
+	cacheNotExist  map[int]struct{}
+	procFsResolver func(pid int) (string, error)
+}
+
+func NewCommResolver() *CommResolver {
+	return &CommResolver{
+		cacheExist:     map[int]string{},
+		cacheNotExist:  map[int]struct{}{},
+		procFsResolver: readCommandFromProcFs,
+	}
+}
+
+func NewTestCommResolver(procFsResolver func(pid int) (string, error)) *CommResolver {
+	return &CommResolver{
+		cacheExist:     map[int]string{},
+		cacheNotExist:  map[int]struct{}{},
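+		// procFsResolver is supplied by the test so that procfs access can be
+		// stubbed out entirely; the production constructor above wires in
+		// readCommandFromProcFs instead.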
+		procFsResolver: procFsResolver,
+	}
+}
+
+func (r *CommResolver) ResolveComm(pid int) (string, error) {
+	if comm, ok := r.cacheExist[pid]; ok {
+		return comm, nil
+	}
+	if _, ok := r.cacheNotExist[pid]; ok {
+		return unknownComm, fmt.Errorf("process not running")
+	}
+
+	comm, err := r.procFsResolver(pid)
+	if err != nil && os.IsNotExist(err) {
+		// skip process that is not running
+		r.cacheNotExist[pid] = struct{}{}
+		return unknownComm, fmt.Errorf("process not running: %w", err)
+	}
+
+	r.cacheExist[pid] = comm
+	return comm, nil
+}
+
+func (r *CommResolver) Clear(freed []int) {
+	for _, pid := range freed {
+		delete(r.cacheExist, pid)
+		delete(r.cacheNotExist, pid)
+	}
+}
+
+func readCommandFromProcFs(pid int) (string, error) {
+	if _, err := os.Stat("/proc/" + strconv.Itoa(pid)); os.IsNotExist(err) {
+		return "", err
+	}
+	var comm string
+	if cmdLineBytes, err := os.ReadFile("/proc/" + strconv.Itoa(pid) + "/cmdline"); err == nil {
+		comm = readCommandFromProcFsCmdline(cmdLineBytes)
+	}
+	if comm != "" {
+		return comm, nil
+	}
+	if commBytes, err := os.ReadFile("/proc/" + strconv.Itoa(pid) + "/comm"); err == nil {
+		comm = readCommandFromProcFsComm(commBytes)
+	}
+	if comm != "" {
+		return comm, nil
+	}
+	return unknownComm, nil
+}
+
+// This gives the same output as the `ps -o comm` command
+func readCommandFromProcFsCmdline(b []byte) string {
+	// replace null bytes with newlines
+	buf := bytes.ReplaceAll(b, []byte{0x0}, []byte{0x0a})
+	// Using all the parts would be nice, but as these become prometheus labels
+	// we need to be careful about the cardinality. Just use the first part.
+	parts := strings.Split(strings.TrimSpace(unix.ByteSliceToString(buf)), "\n")
+	if len(parts) > 0 && parts[0] != "" {
+		return parts[0]
+	}
+	return ""
+}
+
+// This is a fallback method when we can't read the executable name from
+// the cmdline, e.g. for kernel threads
+func readCommandFromProcFsComm(b []byte) string {
+	comm := strings.TrimSpace(unix.ByteSliceToString(b))
+	if comm != "" {
+		// return the command in square brackets, like ps does
+		return "[" + comm + "]"
+	}
+	return ""
+}
diff --git a/pkg/comm/resolve_comm_test.go b/pkg/comm/resolve_comm_test.go
new file mode 100644
index 0000000000..7efd98195c
--- /dev/null
+++ b/pkg/comm/resolve_comm_test.go
@@ -0,0 +1,83 @@
+package comm
+
+import (
+	"os"
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
"github.com/onsi/gomega" +) + +func TestCollector(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Comm Resolver Suite") +} + +func resolveFromCmdline(pid int) (string, error) { + return readCommandFromProcFsCmdline([]byte("cmdline\x00is\x00a\x000test")), nil +} + +func resolveFromComm(pid int) (string, error) { + return readCommandFromProcFsComm([]byte("comm")), nil +} + +func resolveNotExist(pid int) (string, error) { + return unknownComm, os.ErrNotExist +} + +var _ = Describe("CommResolver", func() { + Describe("ResolveComm", func() { + Context("when the process ID exists", func() { + It("should return the resolved command name from cmdline", func() { + resolver := NewTestCommResolver(resolveFromCmdline) + resolvedComm, err := resolver.ResolveComm(1234) + Expect(err).ToNot(HaveOccurred()) + Expect(resolvedComm).To(Equal("cmdline")) + // Verify that the resolved command name is cached + Expect(resolver.cacheExist).To(HaveKey(1234)) + Expect(resolver.cacheExist[1234]).To(Equal("cmdline")) + }) + + It("should return the resolved command name from comm", func() { + resolver := NewTestCommResolver(resolveFromComm) + resolvedComm, err := resolver.ResolveComm(1234) + Expect(err).ToNot(HaveOccurred()) + Expect(resolvedComm).To(Equal("[comm]")) + // Verify that the resolved command name is cached + Expect(resolver.cacheExist).To(HaveKey(1234)) + Expect(resolver.cacheExist[1234]).To(Equal("[comm]")) + }) + }) + + Context("when the process ID does not exist", func() { + It("should return an error", func() { + resolver := NewTestCommResolver(resolveNotExist) + resolvedComm, err := resolver.ResolveComm(54321) + Expect(err).To(HaveOccurred()) + Expect(resolvedComm) + // Verify that the process ID is cached as non-existent + Expect(resolver.cacheNotExist).To(HaveKey(54321)) + }) + }) + }) + + Describe("Clear", func() { + It("should clear the cache for freed process IDs", func() { + freed := []int{123, 456, 789} + resolver := NewTestCommResolver(resolveFromCmdline) + + // Add some entries to the cache + for _, pid := range freed { + _, err := resolver.ResolveComm(pid) + Expect(err).ToNot(HaveOccurred()) + } + + // Clear the cache + resolver.Clear(freed) + + // Verify that the cache is empty for the freed process IDs + Expect(resolver.cacheExist).To(HaveLen(0)) + Expect(resolver.cacheNotExist).To(HaveLen(0)) + }) + }) +}) diff --git a/pkg/config/config.go b/pkg/config/config.go index c570c95bdd..4af4741eff 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -82,7 +82,6 @@ var ( BindAddressKey = "BIND_ADDRESS" CPUArchOverride = getConfig("CPU_ARCH_OVERRIDE", "") MaxLookupRetry = getIntConfig("MAX_LOOKUP_RETRY", defaultMaxLookupRetry) - BPFSampleRate = getIntConfig("EXPERIMENTAL_BPF_SAMPLE_RATE", 0) EstimatorModel = getConfig("ESTIMATOR_MODEL", defaultMetricValue) // auto-select EstimatorSelectFilter = getConfig("ESTIMATOR_SELECT_FILTER", defaultMetricValue) // no filter @@ -157,7 +156,6 @@ func logBoolConfigs() { klog.V(5).Infof("EXPOSE_BPF_METRICS: %t", ExposeBPFMetrics) klog.V(5).Infof("EXPOSE_COMPONENT_POWER: %t", ExposeComponentPower) klog.V(5).Infof("EXPOSE_ESTIMATED_IDLE_POWER_METRICS: %t. This only impacts when the power is estimated using pre-prained models. 
Estimated idle power is meaningful only when Kepler is running on bare-metal or with a single virtual machine (VM) on the node.", ExposeIdlePowerMetrics) - klog.V(5).Infof("EXPERIMENTAL_BPF_SAMPLE_RATE: %d", BPFSampleRate) } } diff --git a/vendor/github.com/cilium/ebpf/.vimto.toml b/vendor/github.com/cilium/ebpf/.vimto.toml new file mode 100644 index 0000000000..49a12dbc09 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/.vimto.toml @@ -0,0 +1,12 @@ +kernel="ghcr.io/cilium/ci-kernels:stable" +smp="cpus=2" +memory="1G" +user="root" +setup=[ + "mount -t cgroup2 -o nosuid,noexec,nodev cgroup2 /sys/fs/cgroup", + "/bin/sh -c 'modprobe bpf_testmod || true'", + "dmesg --clear", +] +teardown=[ + "dmesg --read-clear", +] diff --git a/vendor/github.com/cilium/ebpf/CODEOWNERS b/vendor/github.com/cilium/ebpf/CODEOWNERS index ad13437ea2..ca65d23c09 100644 --- a/vendor/github.com/cilium/ebpf/CODEOWNERS +++ b/vendor/github.com/cilium/ebpf/CODEOWNERS @@ -7,3 +7,5 @@ perf/ @florianl ringbuf/ @florianl btf/ @dylandreimerink + +cmd/bpf2go/ @mejedi diff --git a/vendor/github.com/cilium/ebpf/Makefile b/vendor/github.com/cilium/ebpf/Makefile index c55a93d9cb..d355eea71c 100644 --- a/vendor/github.com/cilium/ebpf/Makefile +++ b/vendor/github.com/cilium/ebpf/Makefile @@ -106,7 +106,7 @@ testdata/loader-%-eb.elf: testdata/loader.c $(STRIP) -g $@ .PHONY: update-kernel-deps -update-kernel-deps: export KERNEL_VERSION?=6.7 +update-kernel-deps: export KERNEL_VERSION?=6.8 update-kernel-deps: ./testdata/sh/update-kernel-deps.sh $(MAKE) container-all diff --git a/vendor/github.com/cilium/ebpf/btf/btf.go b/vendor/github.com/cilium/ebpf/btf/btf.go index 204757dbf6..671f680b2a 100644 --- a/vendor/github.com/cilium/ebpf/btf/btf.go +++ b/vendor/github.com/cilium/ebpf/btf/btf.go @@ -66,7 +66,7 @@ func (s *immutableTypes) typeByID(id TypeID) (Type, bool) { // mutableTypes is a set of types which may be changed. type mutableTypes struct { imm immutableTypes - mu *sync.RWMutex // protects copies below + mu sync.RWMutex // protects copies below copies map[Type]Type // map[orig]copy copiedTypeIDs map[Type]TypeID // map[copy]origID } @@ -94,10 +94,14 @@ func (mt *mutableTypes) add(typ Type, typeIDs map[Type]TypeID) Type { } // copy a set of mutable types. -func (mt *mutableTypes) copy() mutableTypes { - mtCopy := mutableTypes{ +func (mt *mutableTypes) copy() *mutableTypes { + if mt == nil { + return nil + } + + mtCopy := &mutableTypes{ mt.imm, - &sync.RWMutex{}, + sync.RWMutex{}, make(map[Type]Type, len(mt.copies)), make(map[Type]TypeID, len(mt.copiedTypeIDs)), } @@ -169,7 +173,7 @@ func (mt *mutableTypes) anyTypesByName(name string) ([]Type, error) { // Spec allows querying a set of Types and loading the set into the // kernel. type Spec struct { - mutableTypes + *mutableTypes // String table from ELF. strings *stringTable @@ -339,7 +343,7 @@ func loadRawSpec(btf io.ReaderAt, bo binary.ByteOrder, base *Spec) (*Spec, error typeIDs, typesByName := indexTypes(types, firstTypeID) return &Spec{ - mutableTypes{ + &mutableTypes{ immutableTypes{ types, typeIDs, @@ -347,7 +351,7 @@ func loadRawSpec(btf io.ReaderAt, bo binary.ByteOrder, base *Spec) (*Spec, error typesByName, bo, }, - &sync.RWMutex{}, + sync.RWMutex{}, make(map[Type]Type), make(map[Type]TypeID), }, @@ -522,6 +526,10 @@ func fixupDatasecLayout(ds *Datasec) error { // Copy creates a copy of Spec. 
func (s *Spec) Copy() *Spec { + if s == nil { + return nil + } + return &Spec{ s.mutableTypes.copy(), s.strings, diff --git a/vendor/github.com/cilium/ebpf/btf/handle.go b/vendor/github.com/cilium/ebpf/btf/handle.go index b6b3e87f50..adfa6fed4b 100644 --- a/vendor/github.com/cilium/ebpf/btf/handle.go +++ b/vendor/github.com/cilium/ebpf/btf/handle.go @@ -41,6 +41,8 @@ func NewHandle(b *Builder) (*Handle, error) { // // Returns an error wrapping ErrNotSupported if the kernel doesn't support BTF. func NewHandleFromRawBTF(btf []byte) (*Handle, error) { + const minLogSize = 64 * 1024 + if uint64(len(btf)) > math.MaxUint32 { return nil, errors.New("BTF exceeds the maximum size") } @@ -50,26 +52,54 @@ func NewHandleFromRawBTF(btf []byte) (*Handle, error) { BtfSize: uint32(len(btf)), } - fd, err := sys.BtfLoad(attr) - if err == nil { - return &Handle{fd, attr.BtfSize, false}, nil + var ( + logBuf []byte + err error + ) + for { + var fd *sys.FD + fd, err = sys.BtfLoad(attr) + if err == nil { + return &Handle{fd, attr.BtfSize, false}, nil + } + + if attr.BtfLogTrueSize != 0 && attr.BtfLogSize >= attr.BtfLogTrueSize { + // The log buffer already has the correct size. + break + } + + if attr.BtfLogSize != 0 && !errors.Is(err, unix.ENOSPC) { + // Up until at least kernel 6.0, the BTF verifier does not return ENOSPC + // if there are other verification errors. ENOSPC is only returned when + // the BTF blob is correct, a log was requested, and the provided buffer + // is too small. We're therefore not sure whether we got the full + // log or not. + break + } + + // Make an educated guess how large the buffer should be. Start + // at a reasonable minimum and then double the size. + logSize := uint32(max(len(logBuf)*2, minLogSize)) + if int(logSize) < len(logBuf) { + return nil, errors.New("overflow while probing log buffer size") + } + + if attr.BtfLogTrueSize != 0 { + // The kernel has given us a hint how large the log buffer has to be. + logSize = attr.BtfLogTrueSize + } + + logBuf = make([]byte, logSize) + attr.BtfLogSize = logSize + attr.BtfLogBuf = sys.NewSlicePointer(logBuf) + attr.BtfLogLevel = 1 } if err := haveBTF(); err != nil { return nil, err } - logBuf := make([]byte, 64*1024) - attr.BtfLogBuf = sys.NewSlicePointer(logBuf) - attr.BtfLogSize = uint32(len(logBuf)) - attr.BtfLogLevel = 1 - - // Up until at least kernel 6.0, the BTF verifier does not return ENOSPC - // if there are other verification errors. ENOSPC is only returned when - // the BTF blob is correct, a log was requested, and the provided buffer - // is too small. - _, ve := sys.BtfLoad(attr) - return nil, internal.ErrorWithLog("load btf", err, logBuf, errors.Is(ve, unix.ENOSPC)) + return nil, internal.ErrorWithLog("load btf", err, logBuf) } // NewHandleFromID returns the BTF handle for a given id. diff --git a/vendor/github.com/cilium/ebpf/btf/types.go b/vendor/github.com/cilium/ebpf/btf/types.go index 3cb9184f00..a3397460b9 100644 --- a/vendor/github.com/cilium/ebpf/btf/types.go +++ b/vendor/github.com/cilium/ebpf/btf/types.go @@ -682,6 +682,10 @@ func Copy(typ Type) Type { } func copyType(typ Type, ids map[Type]TypeID, copies map[Type]Type, copiedIDs map[Type]TypeID) Type { + if typ == nil { + return nil + } + cpy, ok := copies[typ] if ok { // This has been copied previously, no need to continue. 
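Both Spec.Copy and copyType above now tolerate nil inputs, so a CollectionSpec without BTF can be copied without guarding every call site. A minimal sketch of the nil-receiver pattern in isolation (the type name here is hypothetical, not the library's):

package main

import "fmt"

type Spec struct{ name string }

// Copy returns a copy of s. A nil receiver yields a nil copy, which lets
// callers copy optional fields without checking them first.
func (s *Spec) Copy() *Spec {
	if s == nil {
		return nil
	}
	cpy := *s
	return &cpy
}

func main() {
	var missing *Spec
	fmt.Println(missing.Copy() == nil) // prints true; no panic on a nil receiver
}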
diff --git a/vendor/github.com/cilium/ebpf/collection.go b/vendor/github.com/cilium/ebpf/collection.go index a5532220fd..b2cb214adc 100644 --- a/vendor/github.com/cilium/ebpf/collection.go +++ b/vendor/github.com/cilium/ebpf/collection.go @@ -57,7 +57,7 @@ func (cs *CollectionSpec) Copy() *CollectionSpec { Maps: make(map[string]*MapSpec, len(cs.Maps)), Programs: make(map[string]*ProgramSpec, len(cs.Programs)), ByteOrder: cs.ByteOrder, - Types: cs.Types, + Types: cs.Types.Copy(), } for name, spec := range cs.Maps { diff --git a/vendor/github.com/cilium/ebpf/elf_reader.go b/vendor/github.com/cilium/ebpf/elf_reader.go index d55ab88928..620037d80a 100644 --- a/vendor/github.com/cilium/ebpf/elf_reader.go +++ b/vendor/github.com/cilium/ebpf/elf_reader.go @@ -972,6 +972,9 @@ func mapSpecFromBTF(es *elfSection, vs *btf.VarSecinfo, def *btf.Struct, spec *b return nil, fmt.Errorf("resolving values contents: %w", err) } + case "map_extra": + return nil, fmt.Errorf("BTF map definition: field %s: %w", member.Name, ErrNotSupported) + default: return nil, fmt.Errorf("unrecognized field %s in BTF map definition", member.Name) } diff --git a/vendor/github.com/cilium/ebpf/info.go b/vendor/github.com/cilium/ebpf/info.go index 79b11c951f..04c60c64b8 100644 --- a/vendor/github.com/cilium/ebpf/info.go +++ b/vendor/github.com/cilium/ebpf/info.go @@ -20,6 +20,23 @@ import ( "github.com/cilium/ebpf/internal/unix" ) +// The *Info structs expose metadata about a program or map. Most +// fields are exposed via a getter: +// +// func (*MapInfo) ID() (MapID, bool) +// +// This is because the metadata available changes based on kernel version. +// The second boolean return value indicates whether a particular field is +// available on the current kernel. +// +// Always add new metadata as such a getter, unless you can somehow get the +// value of the field on all supported kernels. Also document which version +// a particular field first appeared in. +// +// Some metadata is a buffer which needs additional parsing. In this case, +// store the undecoded data in the Info struct and provide a getter which +// decodes it when necessary. See ProgramInfo.Instructions for an example. + // MapInfo describes a map. type MapInfo struct { Type MapType @@ -30,6 +47,8 @@ type MapInfo struct { Flags uint32 // Name as supplied by user space at load time. Available from 4.15. Name string + + btf btf.ID } func newMapInfoFromFd(fd *sys.FD) (*MapInfo, error) { @@ -50,6 +69,7 @@ func newMapInfoFromFd(fd *sys.FD) (*MapInfo, error) { info.MaxEntries, uint32(info.MapFlags), unix.ByteSliceToString(info.Name[:]), + btf.ID(info.BtfId), }, nil } @@ -77,12 +97,27 @@ func (mi *MapInfo) ID() (MapID, bool) { return mi.id, mi.id > 0 } +// BTFID returns the BTF ID associated with the Map. +// +// The ID is only valid as long as the associated Map is kept alive. +// Available from 4.18. +// +// The bool return value indicates whether this optional field is available and +// populated. (The field may be available but not populated if the kernel +// supports the field but the Map was loaded without BTF information.) +func (mi *MapInfo) BTFID() (btf.ID, bool) { + return mi.btf, mi.btf > 0 +} + // programStats holds statistics of a program. type programStats struct { // Total accumulated runtime of the program in ns. runtime time.Duration // Total number of times the program was called. runCount uint64 + // Total number of times the program was NOT called. + // Added in commit 9ed9e9ba2337 ("bpf: Count the number of times recursion was prevented").
+ recursionMisses uint64 } // ProgramInfo describes a program. @@ -125,8 +160,9 @@ func newProgramInfoFromFd(fd *sys.FD) (*ProgramInfo, error) { Name: unix.ByteSliceToString(info.Name[:]), btf: btf.ID(info.BtfId), stats: &programStats{ - runtime: time.Duration(info.RunTimeNs), - runCount: info.RunCnt, + runtime: time.Duration(info.RunTimeNs), + runCount: info.RunCnt, + recursionMisses: info.RecursionMisses, }, } @@ -259,6 +295,16 @@ func (pi *ProgramInfo) Runtime() (time.Duration, bool) { return time.Duration(0), false } +// RecursionMisses returns the total number of times the program was NOT called. +// This can happen when another bpf program is already running on the cpu, which +// is likely to happen for example when you interrupt bpf program execution. +func (pi *ProgramInfo) RecursionMisses() (uint64, bool) { + if pi.stats != nil { + return pi.stats.recursionMisses, true + } + return 0, false +} + // Instructions returns the 'xlated' instruction stream of the program // after it has been verified and rewritten by the kernel. These instructions // cannot be loaded back into the kernel as-is, this is mainly used for diff --git a/vendor/github.com/cilium/ebpf/internal/epoll/poller.go b/vendor/github.com/cilium/ebpf/internal/epoll/poller.go new file mode 100644 index 0000000000..ed1c3a3c8f --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/epoll/poller.go @@ -0,0 +1,278 @@ +package epoll + +import ( + "errors" + "fmt" + "math" + "os" + "runtime" + "slices" + "sync" + "time" + + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/unix" +) + +var ErrFlushed = errors.New("data was flushed") + +// Poller waits for readiness notifications from multiple file descriptors. +// +// The wait can be interrupted by calling Close. +type Poller struct { + // mutexes protect the fields declared below them. If you need to + // acquire both at once you must lock epollMu before eventMu. + epollMu sync.Mutex + epollFd int + + eventMu sync.Mutex + closeEvent *eventFd + flushEvent *eventFd +} + +func New() (_ *Poller, err error) { + closeFDOnError := func(fd int) { + if err != nil { + unix.Close(fd) + } + } + closeEventFDOnError := func(e *eventFd) { + if err != nil { + e.close() + } + } + + epollFd, err := unix.EpollCreate1(unix.EPOLL_CLOEXEC) + if err != nil { + return nil, fmt.Errorf("create epoll fd: %v", err) + } + defer closeFDOnError(epollFd) + + p := &Poller{epollFd: epollFd} + p.closeEvent, err = newEventFd() + if err != nil { + return nil, err + } + defer closeEventFDOnError(p.closeEvent) + + p.flushEvent, err = newEventFd() + if err != nil { + return nil, err + } + defer closeEventFDOnError(p.flushEvent) + + if err := p.Add(p.closeEvent.raw, 0); err != nil { + return nil, fmt.Errorf("add close eventfd: %w", err) + } + + if err := p.Add(p.flushEvent.raw, 0); err != nil { + return nil, fmt.Errorf("add flush eventfd: %w", err) + } + + runtime.SetFinalizer(p, (*Poller).Close) + return p, nil +} + +// Close the poller. +// +// Interrupts any calls to Wait. Multiple calls to Close are valid, but subsequent +// calls will return os.ErrClosed. +func (p *Poller) Close() error { + runtime.SetFinalizer(p, nil) + + // Interrupt Wait() via the closeEvent fd if it's currently blocked. + if err := p.wakeWaitForClose(); err != nil { + return err + } + + // Acquire the lock. This ensures that Wait isn't running. + p.epollMu.Lock() + defer p.epollMu.Unlock() + + // Prevent other calls to Close(). 
+ p.eventMu.Lock() + defer p.eventMu.Unlock() + + if p.epollFd != -1 { + unix.Close(p.epollFd) + p.epollFd = -1 + } + + if p.closeEvent != nil { + p.closeEvent.close() + p.closeEvent = nil + } + + if p.flushEvent != nil { + p.flushEvent.close() + p.flushEvent = nil + } + + return nil +} + +// Add an fd to the poller. +// +// id is returned by Wait in the unix.EpollEvent.Pad field and may be zero. It +// must not exceed math.MaxInt32. +// +// Add is blocked by Wait. +func (p *Poller) Add(fd int, id int) error { + if int64(id) > math.MaxInt32 { + return fmt.Errorf("unsupported id: %d", id) + } + + p.epollMu.Lock() + defer p.epollMu.Unlock() + + if p.epollFd == -1 { + return fmt.Errorf("epoll add: %w", os.ErrClosed) + } + + // The representation of EpollEvent isn't entirely accurate. + // Pad is fully usable, not just padding. Hence we stuff the + // id in there, which allows us to identify the event later (e.g., + // in case of perf events, which CPU sent it). + event := unix.EpollEvent{ + Events: unix.EPOLLIN, + Fd: int32(fd), + Pad: int32(id), + } + + if err := unix.EpollCtl(p.epollFd, unix.EPOLL_CTL_ADD, fd, &event); err != nil { + return fmt.Errorf("add fd to epoll: %v", err) + } + + return nil +} + +// Wait for events. +// +// Returns the number of pending events and any errors. +// +// - [os.ErrClosed] if interrupted by [Close]. +// - [ErrFlushed] if interrupted by [Flush]. +// - [os.ErrDeadlineExceeded] if deadline is reached. +func (p *Poller) Wait(events []unix.EpollEvent, deadline time.Time) (int, error) { + p.epollMu.Lock() + defer p.epollMu.Unlock() + + if p.epollFd == -1 { + return 0, fmt.Errorf("epoll wait: %w", os.ErrClosed) + } + + for { + timeout := int(-1) + if !deadline.IsZero() { + msec := time.Until(deadline).Milliseconds() + // Deadline is in the past, don't block. + msec = max(msec, 0) + // Deadline is too far in the future. + msec = min(msec, math.MaxInt) + + timeout = int(msec) + } + + n, err := unix.EpollWait(p.epollFd, events, timeout) + if temp, ok := err.(temporaryError); ok && temp.Temporary() { + // Retry the syscall if we were interrupted, see https://github.com/golang/go/issues/20400 + continue + } + + if err != nil { + return 0, err + } + + if n == 0 { + return 0, fmt.Errorf("epoll wait: %w", os.ErrDeadlineExceeded) + } + + for i := 0; i < n; { + event := events[i] + if int(event.Fd) == p.closeEvent.raw { + // Since we don't read p.closeEvent the event is never cleared and + // we'll keep getting this wakeup until Close() acquires the + // lock and sets p.epollFd = -1. + return 0, fmt.Errorf("epoll wait: %w", os.ErrClosed) + } + if int(event.Fd) == p.flushEvent.raw { + // read event to prevent it from continuing to wake + p.flushEvent.read() + err = ErrFlushed + events = slices.Delete(events, i, i+1) + n -= 1 + continue + } + i++ + } + + return n, err + } +} + +type temporaryError interface { + Temporary() bool +} + +// wakeWaitForClose unblocks Wait if it's blocked in epoll_wait. +func (p *Poller) wakeWaitForClose() error { + p.eventMu.Lock() + defer p.eventMu.Unlock() + + if p.closeEvent == nil { + return fmt.Errorf("epoll wake: %w", os.ErrClosed) + } + + return p.closeEvent.add(1) +} + +// Flush unblocks Wait if it's blocked in epoll_wait, for purposes of reading pending samples. +func (p *Poller) Flush() error { + p.eventMu.Lock() + defer p.eventMu.Unlock() + + if p.flushEvent == nil { + return fmt.Errorf("epoll wake: %w", os.ErrClosed) + } + + return p.flushEvent.add(1) +} + +// eventFd wraps a Linux eventfd.
+// +// An eventfd acts like a counter: writes add to the counter, reads retrieve +// the counter and reset it to zero. Reads also block if the counter is zero. +// +// See man 2 eventfd. +type eventFd struct { + file *os.File + // prefer raw over file.Fd(), since the latter puts the file into blocking + // mode. + raw int +} + +func newEventFd() (*eventFd, error) { + fd, err := unix.Eventfd(0, unix.O_CLOEXEC|unix.O_NONBLOCK) + if err != nil { + return nil, err + } + file := os.NewFile(uintptr(fd), "event") + return &eventFd{file, fd}, nil +} + +func (efd *eventFd) close() error { + return efd.file.Close() +} + +func (efd *eventFd) add(n uint64) error { + var buf [8]byte + internal.NativeEndian.PutUint64(buf[:], n) + _, err := efd.file.Write(buf[:]) + return err +} + +func (efd *eventFd) read() (uint64, error) { + var buf [8]byte + _, err := efd.file.Read(buf[:]) + return internal.NativeEndian.Uint64(buf[:]), err +} diff --git a/vendor/github.com/cilium/ebpf/internal/errors.go b/vendor/github.com/cilium/ebpf/internal/errors.go index bda01e2fde..83a371ad35 100644 --- a/vendor/github.com/cilium/ebpf/internal/errors.go +++ b/vendor/github.com/cilium/ebpf/internal/errors.go @@ -12,7 +12,7 @@ import ( // // The default error output is a summary of the full log. The latter can be // accessed via VerifierError.Log or by formatting the error, see Format. -func ErrorWithLog(source string, err error, log []byte, truncated bool) *VerifierError { +func ErrorWithLog(source string, err error, log []byte) *VerifierError { const whitespace = "\t\r\v\n " // Convert verifier log C string by truncating it on the first 0 byte @@ -23,7 +23,7 @@ func ErrorWithLog(source string, err error, log []byte, truncated bool) *Verifie log = bytes.Trim(log, whitespace) if len(log) == 0 { - return &VerifierError{source, err, nil, truncated} + return &VerifierError{source, err, nil, false} } logLines := bytes.Split(log, []byte{'\n'}) @@ -34,7 +34,7 @@ func ErrorWithLog(source string, err error, log []byte, truncated bool) *Verifie lines = append(lines, string(bytes.TrimRight(line, whitespace))) } - return &VerifierError{source, err, lines, truncated} + return &VerifierError{source, err, lines, false} } // VerifierError includes information from the eBPF verifier. @@ -46,7 +46,7 @@ type VerifierError struct { Cause error // The verifier output split into lines. Log []string - // Whether the log output is truncated, based on several heuristics. + // Deprecated: the log is never truncated anymore. Truncated bool } @@ -70,7 +70,7 @@ func (le *VerifierError) Error() string { } lines := log[n-1:] - if n >= 2 && (includePreviousLine(log[n-1]) || le.Truncated) { + if n >= 2 && includePreviousLine(log[n-1]) { // Add one more line of context if it aids understanding the error. 
lines = log[n-2:] } @@ -81,22 +81,9 @@ func (le *VerifierError) Error() string { } omitted := len(le.Log) - len(lines) - if omitted == 0 && !le.Truncated { - return b.String() - } - - b.WriteString(" (") - if le.Truncated { - b.WriteString("truncated") - } - if omitted > 0 { - if le.Truncated { - b.WriteString(", ") - } - fmt.Fprintf(&b, "%d line(s) omitted", omitted) + fmt.Fprintf(&b, " (%d line(s) omitted)", omitted) } - b.WriteString(")") return b.String() } @@ -188,10 +175,6 @@ func (le *VerifierError) Format(f fmt.State, verb rune) { } } - if le.Truncated { - fmt.Fprintf(f, "\n\t(truncated)") - } - default: fmt.Fprintf(f, "%%!%c(BADVERB)", verb) } diff --git a/vendor/github.com/cilium/ebpf/internal/sys/types.go b/vendor/github.com/cilium/ebpf/internal/sys/types.go index d2ae942668..70e754de71 100644 --- a/vendor/github.com/cilium/ebpf/internal/sys/types.go +++ b/vendor/github.com/cilium/ebpf/internal/sys/types.go @@ -359,7 +359,7 @@ const ( BPF_LINK_TYPE_TCX LinkType = 11 BPF_LINK_TYPE_UPROBE_MULTI LinkType = 12 BPF_LINK_TYPE_NETKIT LinkType = 13 - MAX_BPF_LINK_TYPE LinkType = 14 + __MAX_BPF_LINK_TYPE LinkType = 14 ) type MapType uint32 @@ -528,7 +528,7 @@ type LinkInfo struct { Id LinkID ProgId uint32 _ [4]byte - Extra [40]uint8 + Extra [48]uint8 } type MapInfo struct { @@ -1263,7 +1263,7 @@ type CgroupLinkInfo struct { _ [4]byte CgroupId uint64 AttachType AttachType - _ [28]byte + _ [36]byte } type IterLinkInfo struct { @@ -1287,6 +1287,7 @@ type KprobeLinkInfo struct { Offset uint32 Addr uint64 Missed uint64 + _ [8]byte } type KprobeMultiLinkInfo struct { @@ -1298,7 +1299,7 @@ type KprobeMultiLinkInfo struct { Count uint32 Flags uint32 Missed uint64 - _ [16]byte + _ [24]byte } type NetNsLinkInfo struct { @@ -1308,7 +1309,7 @@ type NetNsLinkInfo struct { _ [4]byte NetnsIno uint32 AttachType AttachType - _ [32]byte + _ [40]byte } type NetfilterLinkInfo struct { @@ -1320,7 +1321,7 @@ type NetfilterLinkInfo struct { Hooknum uint32 Priority int32 Flags uint32 - _ [24]byte + _ [32]byte } type NetkitLinkInfo struct { @@ -1330,7 +1331,7 @@ type NetkitLinkInfo struct { _ [4]byte Ifindex uint32 AttachType AttachType - _ [32]byte + _ [40]byte } type PerfEventLinkInfo struct { @@ -1348,7 +1349,7 @@ type RawTracepointLinkInfo struct { _ [4]byte TpName Pointer TpNameLen uint32 - _ [28]byte + _ [36]byte } type TcxLinkInfo struct { @@ -1358,7 +1359,7 @@ type TcxLinkInfo struct { _ [4]byte Ifindex uint32 AttachType AttachType - _ [32]byte + _ [40]byte } type TracingLinkInfo struct { @@ -1369,7 +1370,7 @@ type TracingLinkInfo struct { AttachType AttachType TargetObjId uint32 TargetBtfId TypeID - _ [28]byte + _ [36]byte } type XDPLinkInfo struct { @@ -1378,5 +1379,5 @@ type XDPLinkInfo struct { ProgId uint32 _ [4]byte Ifindex uint32 - _ [36]byte + _ [44]byte } diff --git a/vendor/github.com/cilium/ebpf/link/cgroup.go b/vendor/github.com/cilium/ebpf/link/cgroup.go index 79f3d2b7f4..f17d34f03c 100644 --- a/vendor/github.com/cilium/ebpf/link/cgroup.go +++ b/vendor/github.com/cilium/ebpf/link/cgroup.go @@ -6,6 +6,7 @@ import ( "os" "github.com/cilium/ebpf" + "github.com/cilium/ebpf/internal/sys" ) type cgroupAttachFlags uint32 @@ -187,3 +188,21 @@ func newLinkCgroup(cgroup *os.File, attach ebpf.AttachType, prog *ebpf.Program) return &linkCgroup{*link}, err } + +func (cg *linkCgroup) Info() (*Info, error) { + var info sys.CgroupLinkInfo + if err := sys.ObjInfo(cg.fd, &info); err != nil { + return nil, fmt.Errorf("cgroup link info: %s", err) + } + extra := &CgroupInfo{ + CgroupId: info.CgroupId, + 
AttachType: info.AttachType, + } + + return &Info{ + info.Type, + info.Id, + ebpf.ProgramID(info.ProgId), + extra, + }, nil +} diff --git a/vendor/github.com/cilium/ebpf/link/kprobe.go b/vendor/github.com/cilium/ebpf/link/kprobe.go index b54ca90853..fe3f17c371 100644 --- a/vendor/github.com/cilium/ebpf/link/kprobe.go +++ b/vendor/github.com/cilium/ebpf/link/kprobe.go @@ -59,6 +59,8 @@ func (ko *KprobeOptions) cookie() uint64 { // If attaching to symbol fails, automatically retries with the running // platform's syscall prefix (e.g. __x64_) to support attaching to syscalls // in a portable fashion. +// +// The returned Link may implement [PerfEvent]. func Kprobe(symbol string, prog *ebpf.Program, opts *KprobeOptions) (Link, error) { k, err := kprobe(symbol, prog, opts, false) if err != nil { @@ -90,6 +92,8 @@ func Kprobe(symbol string, prog *ebpf.Program, opts *KprobeOptions) (Link, error // // On kernels 5.10 and earlier, setting a kretprobe on a nonexistent symbol // incorrectly returns unix.EINVAL instead of os.ErrNotExist. +// +// The returned Link may implement [PerfEvent]. func Kretprobe(symbol string, prog *ebpf.Program, opts *KprobeOptions) (Link, error) { k, err := kprobe(symbol, prog, opts, true) if err != nil { @@ -274,7 +278,11 @@ func pmuProbe(args tracefs.ProbeArgs) (*perfEvent, error) { } } - rawFd, err := unix.PerfEventOpen(&attr, args.Pid, 0, -1, unix.PERF_FLAG_FD_CLOEXEC) + cpu := 0 + if args.Pid != perfAllThreads { + cpu = -1 + } + rawFd, err := unix.PerfEventOpen(&attr, args.Pid, cpu, -1, unix.PERF_FLAG_FD_CLOEXEC) // On some old kernels, kprobe PMU doesn't allow `.` in symbol names and // return -EINVAL. Return ErrNotSupported to allow falling back to tracefs. diff --git a/vendor/github.com/cilium/ebpf/link/kprobe_multi.go b/vendor/github.com/cilium/ebpf/link/kprobe_multi.go index 4d364d80eb..f7a8291f94 100644 --- a/vendor/github.com/cilium/ebpf/link/kprobe_multi.go +++ b/vendor/github.com/cilium/ebpf/link/kprobe_multi.go @@ -130,12 +130,23 @@ func (kml *kprobeMultiLink) Update(prog *ebpf.Program) error { return fmt.Errorf("update kprobe_multi: %w", ErrNotSupported) } -func (kml *kprobeMultiLink) Pin(string) error { - return fmt.Errorf("pin kprobe_multi: %w", ErrNotSupported) -} +func (kml *kprobeMultiLink) Info() (*Info, error) { + var info sys.KprobeMultiLinkInfo + if err := sys.ObjInfo(kml.fd, &info); err != nil { + return nil, fmt.Errorf("kprobe multi link info: %s", err) + } + extra := &KprobeMultiInfo{ + count: info.Count, + flags: info.Flags, + missed: info.Missed, + } -func (kml *kprobeMultiLink) Unpin() error { - return fmt.Errorf("unpin kprobe_multi: %w", ErrNotSupported) + return &Info{ + info.Type, + info.Id, + ebpf.ProgramID(info.ProgId), + extra, + }, nil } var haveBPFLinkKprobeMulti = internal.NewFeatureTest("bpf_link_kprobe_multi", "5.18", func() error { diff --git a/vendor/github.com/cilium/ebpf/link/link.go b/vendor/github.com/cilium/ebpf/link/link.go index 81428568f8..9c34616c9a 100644 --- a/vendor/github.com/cilium/ebpf/link/link.go +++ b/vendor/github.com/cilium/ebpf/link/link.go @@ -119,13 +119,15 @@ func wrapRawLink(raw *RawLink) (_ Link, err error) { case UprobeMultiType: return &uprobeMultiLink{*raw}, nil case PerfEventType: - return nil, fmt.Errorf("recovering perf event fd: %w", ErrNotSupported) + return &perfEventLink{*raw, nil}, nil case TCXType: return &tcxLink{*raw}, nil case NetfilterType: return &netfilterLink{*raw}, nil case NetkitType: return &netkitLink{*raw}, nil + case XDPType: + return &xdpLink{*raw}, nil default: return raw, nil } 
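The cpu argument fix in pmuProbe above encodes the perf_event_open(2) contract that pid and cpu may not both be -1. A small standalone sketch of that rule, mirroring the diff's logic (package and function names are illustrative; the constant value matches the library's perfAllThreads sentinel):

package perf

// Valid (pid, cpu) pairs for perf_event_open(2):
//   pid >= 0, cpu == -1  -> measure one process/thread on any CPU
//   pid == -1, cpu >= 0  -> measure all processes on a single CPU
//   pid == -1, cpu == -1 -> invalid; the kernel returns EINVAL
const perfAllThreads = -1

// perfEventCPU picks a cpu value that keeps the pair valid.
func perfEventCPU(pid int) int {
	if pid == perfAllThreads {
		return 0 // all threads: must pin to a concrete CPU
	}
	return -1 // specific pid: observe it on any CPU
}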
@@ -438,6 +440,9 @@ func (l *RawLink) UpdateArgs(opts RawLinkUpdateOptions) error { } // Info returns metadata about the link. +// +// Linktype specific metadata is not included and can be retrieved +// via the linktype specific Info() method. func (l *RawLink) Info() (*Info, error) { var info sys.LinkInfo @@ -445,117 +450,11 @@ func (l *RawLink) Info() (*Info, error) { return nil, fmt.Errorf("link info: %s", err) } - var extra interface{} - switch info.Type { - case CgroupType: - var cgroupInfo sys.CgroupLinkInfo - if err := sys.ObjInfo(l.fd, &cgroupInfo); err != nil { - return nil, fmt.Errorf("cgroup link info: %s", err) - } - extra = &CgroupInfo{ - CgroupId: cgroupInfo.CgroupId, - AttachType: cgroupInfo.AttachType, - } - case NetNsType: - var netnsInfo sys.NetNsLinkInfo - if err := sys.ObjInfo(l.fd, &netnsInfo); err != nil { - return nil, fmt.Errorf("netns link info: %s", err) - } - extra = &NetNsInfo{ - NetnsIno: netnsInfo.NetnsIno, - AttachType: netnsInfo.AttachType, - } - case TracingType: - var tracingInfo sys.TracingLinkInfo - if err := sys.ObjInfo(l.fd, &tracingInfo); err != nil { - return nil, fmt.Errorf("tracing link info: %s", err) - } - extra = &TracingInfo{ - TargetObjId: tracingInfo.TargetObjId, - TargetBtfId: tracingInfo.TargetBtfId, - AttachType: tracingInfo.AttachType, - } - case XDPType: - var xdpInfo sys.XDPLinkInfo - if err := sys.ObjInfo(l.fd, &xdpInfo); err != nil { - return nil, fmt.Errorf("xdp link info: %s", err) - } - extra = &XDPInfo{ - Ifindex: xdpInfo.Ifindex, - } - case RawTracepointType, IterType, UprobeMultiType: - // Extra metadata not supported. - case TCXType: - var tcxInfo sys.TcxLinkInfo - if err := sys.ObjInfo(l.fd, &tcxInfo); err != nil { - return nil, fmt.Errorf("tcx link info: %s", err) - } - extra = &TCXInfo{ - Ifindex: tcxInfo.Ifindex, - AttachType: tcxInfo.AttachType, - } - case NetfilterType: - var netfilterInfo sys.NetfilterLinkInfo - if err := sys.ObjInfo(l.fd, &netfilterInfo); err != nil { - return nil, fmt.Errorf("netfilter link info: %s", err) - } - extra = &NetfilterInfo{ - Pf: netfilterInfo.Pf, - Hooknum: netfilterInfo.Hooknum, - Priority: netfilterInfo.Priority, - Flags: netfilterInfo.Flags, - } - case NetkitType: - var netkitInfo sys.NetkitLinkInfo - if err := sys.ObjInfo(l.fd, &netkitInfo); err != nil { - return nil, fmt.Errorf("tcx link info: %s", err) - } - extra = &NetkitInfo{ - Ifindex: netkitInfo.Ifindex, - AttachType: netkitInfo.AttachType, - } - case KprobeMultiType: - var kprobeMultiInfo sys.KprobeMultiLinkInfo - if err := sys.ObjInfo(l.fd, &kprobeMultiInfo); err != nil { - return nil, fmt.Errorf("kprobe multi link info: %s", err) - } - extra = &KprobeMultiInfo{ - count: kprobeMultiInfo.Count, - flags: kprobeMultiInfo.Flags, - missed: kprobeMultiInfo.Missed, - } - case PerfEventType: - var perfEventInfo sys.PerfEventLinkInfo - if err := sys.ObjInfo(l.fd, &perfEventInfo); err != nil { - return nil, fmt.Errorf("perf event link info: %s", err) - } - - var extra2 interface{} - switch perfEventInfo.PerfEventType { - case sys.BPF_PERF_EVENT_KPROBE, sys.BPF_PERF_EVENT_KRETPROBE: - var kprobeInfo sys.KprobeLinkInfo - if err := sys.ObjInfo(l.fd, &kprobeInfo); err != nil { - return nil, fmt.Errorf("kprobe multi link info: %s", err) - } - extra2 = &KprobeInfo{ - address: kprobeInfo.Addr, - missed: kprobeInfo.Missed, - } - } - - extra = &PerfEventInfo{ - Type: perfEventInfo.PerfEventType, - extra: extra2, - } - default: - return nil, fmt.Errorf("unknown link info type: %d", info.Type) - } - return &Info{ info.Type, info.Id, 
ebpf.ProgramID(info.ProgId), - extra, + nil, }, nil } diff --git a/vendor/github.com/cilium/ebpf/link/netfilter.go b/vendor/github.com/cilium/ebpf/link/netfilter.go index 250c87677b..34be390859 100644 --- a/vendor/github.com/cilium/ebpf/link/netfilter.go +++ b/vendor/github.com/cilium/ebpf/link/netfilter.go @@ -67,4 +67,24 @@ func (*netfilterLink) Update(new *ebpf.Program) error { return fmt.Errorf("netfilter update: %w", ErrNotSupported) } +func (nf *netfilterLink) Info() (*Info, error) { + var info sys.NetfilterLinkInfo + if err := sys.ObjInfo(nf.fd, &info); err != nil { + return nil, fmt.Errorf("netfilter link info: %s", err) + } + extra := &NetfilterInfo{ + Pf: info.Pf, + Hooknum: info.Hooknum, + Priority: info.Priority, + Flags: info.Flags, + } + + return &Info{ + info.Type, + info.Id, + ebpf.ProgramID(info.ProgId), + extra, + }, nil +} + var _ Link = (*netfilterLink)(nil) diff --git a/vendor/github.com/cilium/ebpf/link/netkit.go b/vendor/github.com/cilium/ebpf/link/netkit.go index 36ed72a480..5eee3b023a 100644 --- a/vendor/github.com/cilium/ebpf/link/netkit.go +++ b/vendor/github.com/cilium/ebpf/link/netkit.go @@ -69,3 +69,21 @@ type netkitLink struct { } var _ Link = (*netkitLink)(nil) + +func (netkit *netkitLink) Info() (*Info, error) { + var info sys.NetkitLinkInfo + if err := sys.ObjInfo(netkit.fd, &info); err != nil { + return nil, fmt.Errorf("netkit link info: %s", err) + } + extra := &NetkitInfo{ + Ifindex: info.Ifindex, + AttachType: info.AttachType, + } + + return &Info{ + info.Type, + info.Id, + ebpf.ProgramID(info.ProgId), + extra, + }, nil +} diff --git a/vendor/github.com/cilium/ebpf/link/netns.go b/vendor/github.com/cilium/ebpf/link/netns.go index 344ecced6b..b1edd340a3 100644 --- a/vendor/github.com/cilium/ebpf/link/netns.go +++ b/vendor/github.com/cilium/ebpf/link/netns.go @@ -4,6 +4,7 @@ import ( "fmt" "github.com/cilium/ebpf" + "github.com/cilium/ebpf/internal/sys" ) // NetNsLink is a program attached to a network namespace. @@ -34,3 +35,21 @@ func AttachNetNs(ns int, prog *ebpf.Program) (*NetNsLink, error) { return &NetNsLink{*link}, nil } + +func (ns *NetNsLink) Info() (*Info, error) { + var info sys.NetNsLinkInfo + if err := sys.ObjInfo(ns.fd, &info); err != nil { + return nil, fmt.Errorf("netns link info: %s", err) + } + extra := &NetNsInfo{ + NetnsIno: info.NetnsIno, + AttachType: info.AttachType, + } + + return &Info{ + info.Type, + info.Id, + ebpf.ProgramID(info.ProgId), + extra, + }, nil +} diff --git a/vendor/github.com/cilium/ebpf/link/perf_event.go b/vendor/github.com/cilium/ebpf/link/perf_event.go index 5f7a628b3d..1d8feb58c1 100644 --- a/vendor/github.com/cilium/ebpf/link/perf_event.go +++ b/vendor/github.com/cilium/ebpf/link/perf_event.go @@ -3,6 +3,7 @@ package link import ( "errors" "fmt" + "os" "runtime" "unsafe" @@ -78,6 +79,18 @@ func (pe *perfEvent) Close() error { return nil } +// PerfEvent is implemented by some Link types which use a perf event under +// the hood. +type PerfEvent interface { + // PerfEvent returns a file for the underlying perf event. + // + // It is the caller's responsibility to close the returned file. + // + // Making changes to the associated perf event leads to + // undefined behaviour. + PerfEvent() (*os.File, error) +} + // perfEventLink represents a bpf perf link. type perfEventLink struct { RawLink @@ -86,30 +99,16 @@ type perfEventLink struct { func (pl *perfEventLink) isLink() {} -// Pinning requires the underlying perf event FD to stay open.
-// -// | PerfEvent FD | BpfLink FD | Works | -// |--------------|------------|-------| -// | Open | Open | Yes | -// | Closed | Open | No | -// | Open | Closed | No (Pin() -> EINVAL) | -// | Closed | Closed | No (Pin() -> EINVAL) | -// -// There is currently no pretty way to recover the perf event FD -// when loading a pinned link, so leave as not supported for now. -func (pl *perfEventLink) Pin(string) error { - return fmt.Errorf("perf event link pin: %w", ErrNotSupported) -} - -func (pl *perfEventLink) Unpin() error { - return fmt.Errorf("perf event link unpin: %w", ErrNotSupported) -} - func (pl *perfEventLink) Close() error { if err := pl.fd.Close(); err != nil { return fmt.Errorf("perf link close: %w", err) } + // when created from pinned link + if pl.pe == nil { + return nil + } + if err := pl.pe.Close(); err != nil { return fmt.Errorf("perf event close: %w", err) } @@ -120,6 +119,54 @@ func (pl *perfEventLink) Update(prog *ebpf.Program) error { return fmt.Errorf("perf event link update: %w", ErrNotSupported) } +var _ PerfEvent = (*perfEventLink)(nil) + +func (pl *perfEventLink) PerfEvent() (*os.File, error) { + // when created from pinned link + if pl.pe == nil { + return nil, ErrNotSupported + } + + fd, err := pl.pe.fd.Dup() + if err != nil { + return nil, err + } + + return fd.File("perf-event"), nil +} + +func (pl *perfEventLink) Info() (*Info, error) { + var info sys.PerfEventLinkInfo + if err := sys.ObjInfo(pl.fd, &info); err != nil { + return nil, fmt.Errorf("perf event link info: %s", err) + } + + var extra2 interface{} + switch info.PerfEventType { + case sys.BPF_PERF_EVENT_KPROBE, sys.BPF_PERF_EVENT_KRETPROBE: + var kprobeInfo sys.KprobeLinkInfo + if err := sys.ObjInfo(pl.fd, &kprobeInfo); err != nil { + return nil, fmt.Errorf("kprobe link info: %s", err) + } + extra2 = &KprobeInfo{ + address: kprobeInfo.Addr, + missed: kprobeInfo.Missed, + } + } + + extra := &PerfEventInfo{ + Type: info.PerfEventType, + extra: extra2, + } + + return &Info{ + info.Type, + info.Id, + ebpf.ProgramID(info.ProgId), + extra, + }, nil +} + // perfEventIoctl implements Link and handles the perf event lifecycle // via ioctl(). type perfEventIoctl struct { @@ -154,6 +201,17 @@ func (pi *perfEventIoctl) Info() (*Info, error) { return nil, fmt.Errorf("perf event ioctl info: %w", ErrNotSupported) } +var _ PerfEvent = (*perfEventIoctl)(nil) + +func (pi *perfEventIoctl) PerfEvent() (*os.File, error) { + fd, err := pi.fd.Dup() + if err != nil { + return nil, err + } + + return fd.File("perf-event"), nil +} + // attach the given eBPF prog to the perf event stored in pe. // pe must contain a valid perf event fd. // prog's type must match the program type stored in pe. 
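With the new PerfEvent interface, callers can recover the underlying perf event file from kprobe, uprobe and tracepoint links. A hedged usage sketch; the package name, symbol, and program are placeholders, not taken from this patch:

package probe

import (
	"log"

	"github.com/cilium/ebpf"
	"github.com/cilium/ebpf/link"
)

func perfEventFD(prog *ebpf.Program) {
	l, err := link.Kprobe("finish_task_switch", prog, nil) // example symbol
	if err != nil {
		log.Fatal(err)
	}
	defer l.Close()

	// Not every Link is backed by a perf event, so the assertion can fail.
	pe, ok := l.(link.PerfEvent)
	if !ok {
		return
	}
	f, err := pe.PerfEvent() // duplicated fd; the caller must close it
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()
	log.Printf("underlying perf event fd: %d", f.Fd())
}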
@@ -229,7 +287,11 @@ func openTracepointPerfEvent(tid uint64, pid int) (*sys.FD, error) { Wakeup: 1, } - fd, err := unix.PerfEventOpen(&attr, pid, 0, -1, unix.PERF_FLAG_FD_CLOEXEC) + cpu := 0 + if pid != perfAllThreads { + cpu = -1 + } + fd, err := unix.PerfEventOpen(&attr, pid, cpu, -1, unix.PERF_FLAG_FD_CLOEXEC) if err != nil { return nil, fmt.Errorf("opening tracepoint perf event: %w", err) } diff --git a/vendor/github.com/cilium/ebpf/link/tcx.go b/vendor/github.com/cilium/ebpf/link/tcx.go index 88f2237d29..ac045b71da 100644 --- a/vendor/github.com/cilium/ebpf/link/tcx.go +++ b/vendor/github.com/cilium/ebpf/link/tcx.go @@ -69,3 +69,21 @@ type tcxLink struct { } var _ Link = (*tcxLink)(nil) + +func (tcx *tcxLink) Info() (*Info, error) { + var info sys.TcxLinkInfo + if err := sys.ObjInfo(tcx.fd, &info); err != nil { + return nil, fmt.Errorf("tcx link info: %s", err) + } + extra := &TCXInfo{ + Ifindex: info.Ifindex, + AttachType: info.AttachType, + } + + return &Info{ + info.Type, + info.Id, + ebpf.ProgramID(info.ProgId), + extra, + }, nil +} diff --git a/vendor/github.com/cilium/ebpf/link/tracepoint.go b/vendor/github.com/cilium/ebpf/link/tracepoint.go index 95f5fae3b0..6fc78b9828 100644 --- a/vendor/github.com/cilium/ebpf/link/tracepoint.go +++ b/vendor/github.com/cilium/ebpf/link/tracepoint.go @@ -30,6 +30,8 @@ type TracepointOptions struct { // // Note that attaching eBPF programs to syscalls (sys_enter_*/sys_exit_*) is // only possible as of kernel 4.14 (commit cf5f5ce). +// +// The returned Link may implement [PerfEvent]. func Tracepoint(group, name string, prog *ebpf.Program, opts *TracepointOptions) (Link, error) { if group == "" || name == "" { return nil, fmt.Errorf("group and name cannot be empty: %w", errInvalidInput) diff --git a/vendor/github.com/cilium/ebpf/link/tracing.go b/vendor/github.com/cilium/ebpf/link/tracing.go index 1e1a7834d8..9e570afc96 100644 --- a/vendor/github.com/cilium/ebpf/link/tracing.go +++ b/vendor/github.com/cilium/ebpf/link/tracing.go @@ -18,6 +18,25 @@ func (f *tracing) Update(new *ebpf.Program) error { return fmt.Errorf("tracing update: %w", ErrNotSupported) } +func (f *tracing) Info() (*Info, error) { + var info sys.TracingLinkInfo + if err := sys.ObjInfo(f.fd, &info); err != nil { + return nil, fmt.Errorf("tracing link info: %s", err) + } + extra := &TracingInfo{ + TargetObjId: info.TargetObjId, + TargetBtfId: info.TargetBtfId, + AttachType: info.AttachType, + } + + return &Info{ + info.Type, + info.Id, + ebpf.ProgramID(info.ProgId), + extra, + }, nil +} + // AttachFreplace attaches the given eBPF program to the function it replaces. // // The program and name can either be provided at link time, or can be provided diff --git a/vendor/github.com/cilium/ebpf/link/uprobe.go b/vendor/github.com/cilium/ebpf/link/uprobe.go index ad85024e38..194d1d319a 100644 --- a/vendor/github.com/cilium/ebpf/link/uprobe.go +++ b/vendor/github.com/cilium/ebpf/link/uprobe.go @@ -222,6 +222,8 @@ func (ex *Executable) address(symbol string, address, offset uint64) (uint64, er // // Functions provided by shared libraries can currently not be traced and // will result in an ErrNotSupported. +// +// The returned Link may implement [PerfEvent]. 
func (ex *Executable) Uprobe(symbol string, prog *ebpf.Program, opts *UprobeOptions) (Link, error) { u, err := ex.uprobe(symbol, prog, opts, false) if err != nil { @@ -256,6 +258,8 @@ func (ex *Executable) Uprobe(symbol string, prog *ebpf.Program, opts *UprobeOpti // // Functions provided by shared libraries can currently not be traced and // will result in an ErrNotSupported. +// +// The returned Link may implement [PerfEvent]. func (ex *Executable) Uretprobe(symbol string, prog *ebpf.Program, opts *UprobeOptions) (Link, error) { u, err := ex.uprobe(symbol, prog, opts, true) if err != nil { diff --git a/vendor/github.com/cilium/ebpf/link/uprobe_multi.go b/vendor/github.com/cilium/ebpf/link/uprobe_multi.go index 9a8d329c8f..aea807b329 100644 --- a/vendor/github.com/cilium/ebpf/link/uprobe_multi.go +++ b/vendor/github.com/cilium/ebpf/link/uprobe_multi.go @@ -172,14 +172,6 @@ func (kml *uprobeMultiLink) Update(prog *ebpf.Program) error { return fmt.Errorf("update uprobe_multi: %w", ErrNotSupported) } -func (kml *uprobeMultiLink) Pin(string) error { - return fmt.Errorf("pin uprobe_multi: %w", ErrNotSupported) -} - -func (kml *uprobeMultiLink) Unpin() error { - return fmt.Errorf("unpin uprobe_multi: %w", ErrNotSupported) -} - var haveBPFLinkUprobeMulti = internal.NewFeatureTest("bpf_link_uprobe_multi", "6.6", func() error { prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{ Name: "probe_upm_link", diff --git a/vendor/github.com/cilium/ebpf/link/xdp.go b/vendor/github.com/cilium/ebpf/link/xdp.go index aa8dd3a4cb..2ec441229a 100644 --- a/vendor/github.com/cilium/ebpf/link/xdp.go +++ b/vendor/github.com/cilium/ebpf/link/xdp.go @@ -4,6 +4,7 @@ import ( "fmt" "github.com/cilium/ebpf" + "github.com/cilium/ebpf/internal/sys" ) // XDPAttachFlags represents how XDP program will be attached to interface. @@ -50,5 +51,30 @@ func AttachXDP(opts XDPOptions) (Link, error) { Flags: uint32(opts.Flags), }) - return rawLink, err + if err != nil { + return nil, fmt.Errorf("failed to attach link: %w", err) + } + + return &xdpLink{*rawLink}, nil +} + +type xdpLink struct { + RawLink +} + +func (xdp *xdpLink) Info() (*Info, error) { + var info sys.XDPLinkInfo + if err := sys.ObjInfo(xdp.fd, &info); err != nil { + return nil, fmt.Errorf("xdp link info: %s", err) + } + extra := &XDPInfo{ + Ifindex: info.Ifindex, + } + + return &Info{ + info.Type, + info.Id, + ebpf.ProgramID(info.ProgId), + extra, + }, nil } diff --git a/vendor/github.com/cilium/ebpf/map.go b/vendor/github.com/cilium/ebpf/map.go index e46fa3f12e..0b62101c3c 100644 --- a/vendor/github.com/cilium/ebpf/map.go +++ b/vendor/github.com/cilium/ebpf/map.go @@ -9,6 +9,7 @@ import ( "os" "path/filepath" "reflect" + "slices" "strings" "sync" "time" @@ -28,6 +29,10 @@ var ( ErrIterationAborted = errors.New("iteration aborted") ErrMapIncompatible = errors.New("map spec is incompatible with existing map") errMapNoBTFValue = errors.New("map spec does not contain a BTF Value") + + // pre-allocating these errors here since they may get called in hot code paths + // and cause unnecessary memory allocations + errMapLookupKeyNotExist = fmt.Errorf("lookup: %w", sysErrKeyNotExist) ) // MapOptions control loading a map into the kernel. 
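errMapLookupKeyNotExist above pre-wraps the sentinel once so that hot lookup paths do not pay an fmt.Errorf allocation on every miss. The same pattern in isolation (all names here are hypothetical):

package cache

import (
	"errors"
	"fmt"
)

var ErrKeyNotExist = errors.New("key does not exist")

// Wrapped once at package init instead of per call; errors.Is still matches.
var errLookupNotExist = fmt.Errorf("lookup: %w", ErrKeyNotExist)

func lookup(m map[string]int, key string) (int, error) {
	v, ok := m[key]
	if !ok {
		return 0, errLookupNotExist // no allocation on the miss path
	}
	return v, nil
}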
@@ -96,11 +101,20 @@ func (ms *MapSpec) Copy() *MapSpec { } cpy := *ms + cpy.Contents = slices.Clone(cpy.Contents) + cpy.Key = btf.Copy(cpy.Key) + cpy.Value = btf.Copy(cpy.Value) - cpy.Contents = make([]MapKV, len(ms.Contents)) - copy(cpy.Contents, ms.Contents) + if cpy.InnerMap == ms { + cpy.InnerMap = &cpy + } else { + cpy.InnerMap = ms.InnerMap.Copy() + } - cpy.InnerMap = ms.InnerMap.Copy() + if cpy.Extra != nil { + extra := *cpy.Extra + cpy.Extra = &extra + } return &cpy } @@ -499,9 +513,6 @@ func handleMapCreateError(attr sys.MapCreateAttr, spec *MapSpec, err error) erro return fmt.Errorf("map create: %w (ring map size %d not a multiple of page size %d)", err, maxEntries, pageSize) } } - if attr.BtfFd == 0 { - return fmt.Errorf("map create: %w (without BTF k/v)", err) - } return fmt.Errorf("map create: %w", err) } @@ -571,6 +582,24 @@ func (m *Map) Info() (*MapInfo, error) { return newMapInfoFromFd(m.fd) } +// Handle returns a reference to the Map's type information in the kernel. +// +// Returns ErrNotSupported if the kernel has no BTF support, or if there is no +// BTF associated with the Map. +func (m *Map) Handle() (*btf.Handle, error) { + info, err := m.Info() + if err != nil { + return nil, err + } + + id, ok := info.BTFID() + if !ok { + return nil, fmt.Errorf("map %s: retrieve BTF ID: %w", m, ErrNotSupported) + } + + return btf.NewHandleFromID(id) +} + // MapLookupFlags controls the behaviour of the map lookup calls. type MapLookupFlags uint64 @@ -652,7 +681,7 @@ func (m *Map) LookupBytes(key interface{}) ([]byte, error) { } func (m *Map) lookupPerCPU(key, valueOut any, flags MapLookupFlags) error { - slice, err := ensurePerCPUSlice(valueOut, int(m.valueSize)) + slice, err := ensurePerCPUSlice(valueOut) if err != nil { return err } @@ -677,13 +706,16 @@ func (m *Map) lookup(key interface{}, valueOut sys.Pointer, flags MapLookupFlags } if err = sys.MapLookupElem(&attr); err != nil { + if errors.Is(err, unix.ENOENT) { + return errMapLookupKeyNotExist + } return fmt.Errorf("lookup: %w", wrapMapError(err)) } return nil } func (m *Map) lookupAndDeletePerCPU(key, valueOut any, flags MapLookupFlags) error { - slice, err := ensurePerCPUSlice(valueOut, int(m.valueSize)) + slice, err := ensurePerCPUSlice(valueOut) if err != nil { return err } @@ -695,7 +727,7 @@ func (m *Map) lookupAndDeletePerCPU(key, valueOut any, flags MapLookupFlags) err } // ensurePerCPUSlice allocates a slice for a per-CPU value if necessary. -func ensurePerCPUSlice(sliceOrPtr any, elemLength int) (any, error) { +func ensurePerCPUSlice(sliceOrPtr any) (any, error) { sliceOrPtrType := reflect.TypeOf(sliceOrPtr) if sliceOrPtrType.Kind() == reflect.Slice { // The target is a slice, the caller is responsible for ensuring that @@ -985,7 +1017,11 @@ func (m *Map) guessNonExistentKey() ([]byte, error) { // the end of all possible results, even when partial results // are returned. It should be used to evaluate when lookup is "done". 
func (m *Map) BatchLookup(cursor *MapBatchCursor, keysOut, valuesOut interface{}, opts *BatchOptions) (int, error) { - return m.batchLookup(sys.BPF_MAP_LOOKUP_BATCH, cursor, keysOut, valuesOut, opts) + n, err := m.batchLookup(sys.BPF_MAP_LOOKUP_BATCH, cursor, keysOut, valuesOut, opts) + if err != nil { + return n, fmt.Errorf("map batch lookup: %w", err) + } + return n, nil } // BatchLookupAndDelete looks up many elements in a map at once, @@ -1005,7 +1041,11 @@ func (m *Map) BatchLookup(cursor *MapBatchCursor, keysOut, valuesOut interface{} // the end of all possible results, even when partial results // are returned. It should be used to evaluate when lookup is "done". func (m *Map) BatchLookupAndDelete(cursor *MapBatchCursor, keysOut, valuesOut interface{}, opts *BatchOptions) (int, error) { - return m.batchLookup(sys.BPF_MAP_LOOKUP_AND_DELETE_BATCH, cursor, keysOut, valuesOut, opts) + n, err := m.batchLookup(sys.BPF_MAP_LOOKUP_AND_DELETE_BATCH, cursor, keysOut, valuesOut, opts) + if err != nil { + return n, fmt.Errorf("map batch lookup and delete: %w", err) + } + return n, nil } // MapBatchCursor represents a starting point for a batch operation. @@ -1027,7 +1067,11 @@ func (m *Map) batchLookup(cmd sys.Cmd, cursor *MapBatchCursor, keysOut, valuesOu valueBuf := sysenc.SyscallOutput(valuesOut, count*int(m.fullValueSize)) n, err := m.batchLookupCmd(cmd, cursor, count, keysOut, valueBuf.Pointer(), opts) - if err != nil { + if errors.Is(err, unix.ENOSPC) { + // Hash tables return ENOSPC when the size of the batch is smaller than + // any bucket. + return n, fmt.Errorf("%w (batch size too small?)", err) + } else if err != nil { return n, err } diff --git a/vendor/github.com/cilium/ebpf/prog.go b/vendor/github.com/cilium/ebpf/prog.go index f4f3af7c36..9bc6325f88 100644 --- a/vendor/github.com/cilium/ebpf/prog.go +++ b/vendor/github.com/cilium/ebpf/prog.go @@ -46,13 +46,13 @@ const ( outputPad = 256 + 2 ) -// DefaultVerifierLogSize is the default number of bytes allocated for the -// verifier log. +// Deprecated: the correct log size is now detected automatically and this +// constant is unused. const DefaultVerifierLogSize = 64 * 1024 -// maxVerifierLogSize is the maximum size of verifier log buffer the kernel -// will accept before returning EINVAL. -const maxVerifierLogSize = math.MaxUint32 >> 2 +// minVerifierLogSize is the default number of bytes allocated for the +// verifier log. +const minVerifierLogSize = 64 * 1024 // ProgramOptions control loading a program into the kernel. type ProgramOptions struct { @@ -73,15 +73,8 @@ type ProgramOptions struct { // attempt at loading the program. LogLevel LogLevel - // Controls the output buffer size for the verifier log, in bytes. See the - // documentation on ProgramOptions.LogLevel for details about how this value - // is used. - // - // If this value is set too low to fit the verifier log, the resulting - // [ebpf.VerifierError]'s Truncated flag will be true, and the error string - // will also contain a hint to that effect. - // - // Defaults to DefaultVerifierLogSize. + // Deprecated: the correct log buffer size is determined automatically + // and this field is ignored. LogSize int // Disables the verifier log completely, regardless of other options. 
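Since LogSize is now ignored, callers only pick a LogLevel and read the complete log from the returned VerifierError. A sketch of the caller side under that assumption (package name and spec construction are placeholders):

package loader

import (
	"errors"
	"fmt"

	"github.com/cilium/ebpf"
)

func loadWithLog(spec *ebpf.ProgramSpec) (*ebpf.Program, error) {
	prog, err := ebpf.NewProgramWithOptions(spec, ebpf.ProgramOptions{
		// Request a level only; the library sizes the buffer automatically.
		LogLevel: ebpf.LogLevelInstruction,
	})
	var verr *ebpf.VerifierError
	if errors.As(err, &verr) {
		fmt.Printf("%+v\n", verr) // full log; no Truncated flag to consult
	}
	return prog, err
}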
@@ -262,10 +255,6 @@ func newProgramWithOptions(spec *ProgramSpec, opts ProgramOptions) (*Program, er return nil, fmt.Errorf("can't load %s program on %s", spec.ByteOrder, internal.NativeEndian) } - if opts.LogSize < 0 { - return nil, errors.New("ProgramOptions.LogSize must be a positive value; disable verifier logs using ProgramOptions.LogDisabled") - } - // Kernels before 5.0 (6c4fc209fcf9 "bpf: remove useless version check for prog load") // require the version field to be set to the value of the KERNEL_VERSION // macro for kprobe-type programs. @@ -404,37 +393,59 @@ func newProgramWithOptions(spec *ProgramSpec, opts ProgramOptions) (*Program, er } } - if opts.LogSize == 0 { - opts.LogSize = DefaultVerifierLogSize - } - - // The caller requested a specific verifier log level. Set up the log buffer. + // The caller requested a specific verifier log level. Set up the log buffer + // so that there is a chance of loading the program in a single shot. var logBuf []byte if !opts.LogDisabled && opts.LogLevel != 0 { - logBuf = make([]byte, opts.LogSize) + logBuf = make([]byte, minVerifierLogSize) attr.LogLevel = opts.LogLevel attr.LogSize = uint32(len(logBuf)) attr.LogBuf = sys.NewSlicePointer(logBuf) } - fd, err := sys.ProgLoad(attr) - if err == nil { - return &Program{unix.ByteSliceToString(logBuf), fd, spec.Name, "", spec.Type}, nil - } + for { + var fd *sys.FD + fd, err = sys.ProgLoad(attr) + if err == nil { + return &Program{unix.ByteSliceToString(logBuf), fd, spec.Name, "", spec.Type}, nil + } - // An error occurred loading the program, but the caller did not explicitly - // enable the verifier log. Re-run with branch-level verifier logs enabled to - // obtain more info. Preserve the original error to return it to the caller. - // An undersized log buffer will result in ENOSPC regardless of the underlying - // cause. - var err2 error - if !opts.LogDisabled && opts.LogLevel == 0 { - logBuf = make([]byte, opts.LogSize) - attr.LogLevel = LogLevelBranch - attr.LogSize = uint32(len(logBuf)) - attr.LogBuf = sys.NewSlicePointer(logBuf) + if opts.LogDisabled { + break + } - _, err2 = sys.ProgLoad(attr) + if attr.LogTrueSize != 0 && attr.LogSize >= attr.LogTrueSize { + // The log buffer already has the correct size. + break + } + + if attr.LogSize != 0 && !errors.Is(err, unix.ENOSPC) { + // Logging is enabled and the error is not ENOSPC, so we can infer + // that the log buffer is large enough. + break + } + + if attr.LogLevel == 0 { + // Logging is not enabled but loading the program failed. Enable + // basic logging. + attr.LogLevel = LogLevelBranch + } + + // Make an educated guess how large the buffer should be. Start + // at minVerifierLogSize and then double the size. + logSize := uint32(max(len(logBuf)*2, minVerifierLogSize)) + if int(logSize) < len(logBuf) { + return nil, errors.New("overflow while probing log buffer size") + } + + if attr.LogTrueSize != 0 { + // The kernel has given us a hint how large the log buffer has to be. 
+ logSize = attr.LogTrueSize + } + + logBuf = make([]byte, logSize) + attr.LogSize = logSize + attr.LogBuf = sys.NewSlicePointer(logBuf) } end := bytes.IndexByte(logBuf, 0) @@ -452,10 +463,6 @@ func newProgramWithOptions(spec *ProgramSpec, opts ProgramOptions) (*Program, er } case errors.Is(err, unix.EINVAL): - if opts.LogSize > maxVerifierLogSize { - return nil, fmt.Errorf("load program: %w (ProgramOptions.LogSize exceeds maximum value of %d)", err, maxVerifierLogSize) - } - if bytes.Contains(tail, coreBadCall) { err = errBadRelocation break @@ -479,8 +486,7 @@ func newProgramWithOptions(spec *ProgramSpec, opts ProgramOptions) (*Program, er } } - truncated := errors.Is(err, unix.ENOSPC) || errors.Is(err2, unix.ENOSPC) - return nil, internal.ErrorWithLog("load program", err, logBuf, truncated) + return nil, internal.ErrorWithLog("load program", err, logBuf) } // NewProgramFromFD creates a program from a raw fd. diff --git a/vendor/github.com/cilium/ebpf/ringbuf/doc.go b/vendor/github.com/cilium/ebpf/ringbuf/doc.go new file mode 100644 index 0000000000..9e45012187 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/ringbuf/doc.go @@ -0,0 +1,6 @@ +// Package ringbuf allows interacting with Linux BPF ring buffer. +// +// BPF allows submitting custom events to a BPF ring buffer map set up +// by userspace. This is very useful to push things like packet samples +// from BPF to a daemon running in user space. +package ringbuf diff --git a/vendor/github.com/cilium/ebpf/ringbuf/reader.go b/vendor/github.com/cilium/ebpf/ringbuf/reader.go new file mode 100644 index 0000000000..3d3ba0ecfa --- /dev/null +++ b/vendor/github.com/cilium/ebpf/ringbuf/reader.go @@ -0,0 +1,197 @@ +package ringbuf + +import ( + "errors" + "fmt" + "os" + "sync" + "time" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/internal/epoll" + "github.com/cilium/ebpf/internal/unix" +) + +var ( + ErrClosed = os.ErrClosed + ErrFlushed = epoll.ErrFlushed + errEOR = errors.New("end of ring") + errBusy = errors.New("sample not committed yet") +) + +// ringbufHeader from 'struct bpf_ringbuf_hdr' in kernel/bpf/ringbuf.c +type ringbufHeader struct { + Len uint32 + _ uint32 // pg_off, only used by kernel internals +} + +func (rh *ringbufHeader) isBusy() bool { + return rh.Len&unix.BPF_RINGBUF_BUSY_BIT != 0 +} + +func (rh *ringbufHeader) isDiscard() bool { + return rh.Len&unix.BPF_RINGBUF_DISCARD_BIT != 0 +} + +func (rh *ringbufHeader) dataLen() int { + return int(rh.Len & ^uint32(unix.BPF_RINGBUF_BUSY_BIT|unix.BPF_RINGBUF_DISCARD_BIT)) +} + +type Record struct { + RawSample []byte + + // The minimum number of bytes remaining in the ring buffer after this Record has been read. + Remaining int +} + +// Reader allows reading bpf_ringbuf_output +// from user space. +type Reader struct { + poller *epoll.Poller + + // mu protects read/write access to the Reader structure + mu sync.Mutex + ring *ringbufEventRing + epollEvents []unix.EpollEvent + haveData bool + deadline time.Time + bufferSize int + pendingErr error +} + +// NewReader creates a new BPF ringbuf reader. 
+func NewReader(ringbufMap *ebpf.Map) (*Reader, error) { + if ringbufMap.Type() != ebpf.RingBuf { + return nil, fmt.Errorf("invalid Map type: %s", ringbufMap.Type()) + } + + maxEntries := int(ringbufMap.MaxEntries()) + if maxEntries == 0 || (maxEntries&(maxEntries-1)) != 0 { + return nil, fmt.Errorf("ringbuffer map size %d is zero or not a power of two", maxEntries) + } + + poller, err := epoll.New() + if err != nil { + return nil, err + } + + if err := poller.Add(ringbufMap.FD(), 0); err != nil { + poller.Close() + return nil, err + } + + ring, err := newRingBufEventRing(ringbufMap.FD(), maxEntries) + if err != nil { + poller.Close() + return nil, fmt.Errorf("failed to create ringbuf ring: %w", err) + } + + return &Reader{ + poller: poller, + ring: ring, + epollEvents: make([]unix.EpollEvent, 1), + bufferSize: ring.size(), + }, nil +} + +// Close frees resources used by the reader. +// +// It interrupts calls to Read. +func (r *Reader) Close() error { + if err := r.poller.Close(); err != nil { + if errors.Is(err, os.ErrClosed) { + return nil + } + return err + } + + // Acquire the lock. This ensures that Read isn't running. + r.mu.Lock() + defer r.mu.Unlock() + + if r.ring != nil { + r.ring.Close() + r.ring = nil + } + + return nil +} + +// SetDeadline controls how long Read and ReadInto will block waiting for samples. +// +// Passing a zero time.Time will remove the deadline. +func (r *Reader) SetDeadline(t time.Time) { + r.mu.Lock() + defer r.mu.Unlock() + + r.deadline = t +} + +// Read the next record from the BPF ringbuf. +// +// Calling [Close] interrupts the method with [os.ErrClosed]. Calling [Flush] +// makes it return all records currently in the ring buffer, followed by [ErrFlushed]. +// +// Returns [os.ErrDeadlineExceeded] if a deadline was set and after all records +// have been read from the ring. +// +// See [ReadInto] for a more efficient version of this method. +func (r *Reader) Read() (Record, error) { + var rec Record + return rec, r.ReadInto(&rec) +} + +// ReadInto is like Read except that it allows reusing Record and associated buffers. +func (r *Reader) ReadInto(rec *Record) error { + r.mu.Lock() + defer r.mu.Unlock() + + if r.ring == nil { + return fmt.Errorf("ringbuffer: %w", ErrClosed) + } + + for { + if !r.haveData { + if pe := r.pendingErr; pe != nil { + r.pendingErr = nil + return pe + } + + _, err := r.poller.Wait(r.epollEvents[:cap(r.epollEvents)], r.deadline) + if errors.Is(err, os.ErrDeadlineExceeded) || errors.Is(err, ErrFlushed) { + // Ignoring this for reading a valid entry after timeout or flush. + // This can occur if the producer submitted to the ring buffer + // with BPF_RB_NO_WAKEUP. + r.pendingErr = err + } else if err != nil { + return err + } + r.haveData = true + } + + for { + err := r.ring.readRecord(rec) + // Not using errors.Is which is quite a bit slower + // For a tight loop it might make a difference + if err == errBusy { + continue + } + if err == errEOR { + r.haveData = false + break + } + return err + } + } +} + +// BufferSize returns the size in bytes of the ring buffer +func (r *Reader) BufferSize() int { + return r.bufferSize +} + +// Flush unblocks Read/ReadInto and successive Read/ReadInto calls will return pending samples at this point, +// until you receive a ErrFlushed error. 
+func (r *Reader) Flush() error { + return r.poller.Flush() +} diff --git a/vendor/github.com/cilium/ebpf/ringbuf/ring.go b/vendor/github.com/cilium/ebpf/ringbuf/ring.go new file mode 100644 index 0000000000..8f8f4bce36 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/ringbuf/ring.go @@ -0,0 +1,137 @@ +package ringbuf + +import ( + "fmt" + "io" + "os" + "runtime" + "sync/atomic" + "unsafe" + + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/unix" +) + +type ringbufEventRing struct { + prod []byte + cons []byte + *ringReader +} + +func newRingBufEventRing(mapFD, size int) (*ringbufEventRing, error) { + cons, err := unix.Mmap(mapFD, 0, os.Getpagesize(), unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED) + if err != nil { + return nil, fmt.Errorf("can't mmap consumer page: %w", err) + } + + prod, err := unix.Mmap(mapFD, (int64)(os.Getpagesize()), os.Getpagesize()+2*size, unix.PROT_READ, unix.MAP_SHARED) + if err != nil { + _ = unix.Munmap(cons) + return nil, fmt.Errorf("can't mmap data pages: %w", err) + } + + cons_pos := (*uint64)(unsafe.Pointer(&cons[0])) + prod_pos := (*uint64)(unsafe.Pointer(&prod[0])) + + ring := &ringbufEventRing{ + prod: prod, + cons: cons, + ringReader: newRingReader(cons_pos, prod_pos, prod[os.Getpagesize():]), + } + runtime.SetFinalizer(ring, (*ringbufEventRing).Close) + + return ring, nil +} + +func (ring *ringbufEventRing) Close() { + runtime.SetFinalizer(ring, nil) + + _ = unix.Munmap(ring.prod) + _ = unix.Munmap(ring.cons) + + ring.prod = nil + ring.cons = nil +} + +type ringReader struct { + // These point into mmap'ed memory and must be accessed atomically. + prod_pos, cons_pos *uint64 + mask uint64 + ring []byte +} + +func newRingReader(cons_ptr, prod_ptr *uint64, ring []byte) *ringReader { + return &ringReader{ + prod_pos: prod_ptr, + cons_pos: cons_ptr, + // cap is always a power of two + mask: uint64(cap(ring)/2 - 1), + ring: ring, + } +} + +// To be able to wrap around data, data pages in ring buffers are mapped twice in +// a single contiguous virtual region. +// Therefore the returned usable size is half the size of the mmaped region. +func (rr *ringReader) size() int { + return cap(rr.ring) / 2 +} + +// Read a record from an event ring. +func (rr *ringReader) readRecord(rec *Record) error { + prod := atomic.LoadUint64(rr.prod_pos) + cons := atomic.LoadUint64(rr.cons_pos) + + for { + if remaining := prod - cons; remaining == 0 { + return errEOR + } else if remaining < unix.BPF_RINGBUF_HDR_SZ { + return fmt.Errorf("read record header: %w", io.ErrUnexpectedEOF) + } + + // read the len field of the header atomically to ensure a happens before + // relationship with the xchg in the kernel. Without this we may see len + // without BPF_RINGBUF_BUSY_BIT before the written data is visible. + // See https://github.com/torvalds/linux/blob/v6.8/kernel/bpf/ringbuf.c#L484 + start := cons & rr.mask + len := atomic.LoadUint32((*uint32)((unsafe.Pointer)(&rr.ring[start]))) + header := ringbufHeader{Len: len} + + if header.isBusy() { + // the next sample in the ring is not committed yet so we + // exit without storing the reader/consumer position + // and start again from the same position. + return errBusy + } + + cons += unix.BPF_RINGBUF_HDR_SZ + + // Data is always padded to 8 byte alignment. 
+ dataLenAligned := uint64(internal.Align(header.dataLen(), 8)) + if remaining := prod - cons; remaining < dataLenAligned { + return fmt.Errorf("read sample data: %w", io.ErrUnexpectedEOF) + } + + start = cons & rr.mask + cons += dataLenAligned + + if header.isDiscard() { + // when the record header indicates that the data should be + // discarded, we skip it by just updating the consumer position + // to the next record. + atomic.StoreUint64(rr.cons_pos, cons) + continue + } + + if n := header.dataLen(); cap(rec.RawSample) < n { + rec.RawSample = make([]byte, n) + } else { + rec.RawSample = rec.RawSample[:n] + } + + copy(rec.RawSample, rr.ring[start:]) + rec.Remaining = int(prod - cons) + atomic.StoreUint64(rr.cons_pos, cons) + return nil + } +} diff --git a/vendor/github.com/cilium/ebpf/run-tests.sh b/vendor/github.com/cilium/ebpf/run-tests.sh deleted file mode 100644 index c7ff7ea333..0000000000 --- a/vendor/github.com/cilium/ebpf/run-tests.sh +++ /dev/null @@ -1,144 +0,0 @@ -#!/usr/bin/env bash -# Test the current package under a different kernel. -# Requires virtme and qemu to be installed. -# Examples: -# Run all tests on a 5.4 kernel -# $ ./run-tests.sh 5.4 -# Run a subset of tests: -# $ ./run-tests.sh 5.4 ./link -# Run using a local kernel image -# $ ./run-tests.sh /path/to/bzImage - -set -euo pipefail - -script="$(realpath "$0")" -readonly script - -source "$(dirname "$script")/testdata/sh/lib.sh" - -quote_env() { - for var in "$@"; do - if [ -v "$var" ]; then - printf "%s=%q " "$var" "${!var}" - fi - done -} - -declare -a preserved_env=( - PATH - CI_MAX_KERNEL_VERSION - TEST_SEED - KERNEL_VERSION -) - -# This script is a bit like a Matryoshka doll since it keeps re-executing itself -# in various different contexts: -# -# 1. invoked by the user like run-tests.sh 5.4 -# 2. invoked by go test like run-tests.sh --exec-vm -# 3. invoked by init in the vm like run-tests.sh --exec-test -# -# This allows us to use all available CPU on the host machine to compile our -# code, and then only use the VM to execute the test. This is because the VM -# is usually slower at compiling than the host. -if [[ "${1:-}" = "--exec-vm" ]]; then - shift - - input="$1" - shift - - # Use sudo if /dev/kvm isn't accessible by the current user. - sudo="" - if [[ ! -r /dev/kvm || ! -w /dev/kvm ]]; then - sudo="sudo" - fi - readonly sudo - - testdir="$(dirname "$1")" - output="$(mktemp -d)" - printf -v cmd "%q " "$@" - - if [[ "$(stat -c '%t:%T' -L /proc/$$/fd/0)" == "1:3" ]]; then - # stdin is /dev/null, which doesn't play well with qemu. Use a fifo as a - # blocking substitute. - mkfifo "${output}/fake-stdin" - # Open for reading and writing to avoid blocking. - exec 0<> "${output}/fake-stdin" - rm "${output}/fake-stdin" - fi - - if ! $sudo virtme-run --kimg "${input}/boot/vmlinuz" --cpus 2 --memory 1G --pwd \ - --rwdir="${testdir}=${testdir}" \ - --rodir=/run/input="${input}" \ - --rwdir=/run/output="${output}" \ - --script-sh "$(quote_env "${preserved_env[@]}") \"$script\" \ - --exec-test $cmd"; then - exit 23 - fi - - if ! 
[[ -e "${output}/status" ]]; then - exit 42 - fi - - rc=$(<"${output}/status") - $sudo rm -r "$output" - exit "$rc" -elif [[ "${1:-}" = "--exec-test" ]]; then - shift - - mount -t bpf bpf /sys/fs/bpf - mount -t tracefs tracefs /sys/kernel/debug/tracing - - if [[ -d "/run/input/usr/src/linux/tools/testing/selftests/bpf" ]]; then - export KERNEL_SELFTESTS="/run/input/usr/src/linux/tools/testing/selftests/bpf" - fi - - if [[ -d "/run/input/lib/modules" ]]; then - find /run/input/lib/modules -type f -name bpf_testmod.ko -exec insmod {} \; - fi - - dmesg --clear - rc=0 - "$@" || rc=$? - dmesg - echo $rc > "/run/output/status" - exit $rc # this return code is "swallowed" by qemu -fi - -if [[ -z "${1:-}" ]]; then - echo "Expecting kernel version or path as first argument" - exit 1 -fi - -input="$(mktemp -d)" -readonly input - -if [[ -f "${1}" ]]; then - # First argument is a local file. - readonly kernel="${1}" - cp "${1}" "${input}/boot/vmlinuz" -else - readonly kernel="${1}" - - # LINUX_VERSION_CODE test compares this to discovered value. - export KERNEL_VERSION="${1}" - - if ! extract_oci_image "ghcr.io/cilium/ci-kernels:${kernel}-selftests" "${input}"; then - extract_oci_image "ghcr.io/cilium/ci-kernels:${kernel}" "${input}" - fi -fi -shift - -args=(-short -coverpkg=./... -coverprofile=coverage.out -count 1 ./...) -if (( $# > 0 )); then - args=("$@") -fi - -export GOFLAGS=-mod=readonly -export CGO_ENABLED=0 - -echo Testing on "${kernel}" -go test -exec "$script --exec-vm $input" "${args[@]}" -echo "Test successful on ${kernel}" - -rm -r "${input}" diff --git a/vendor/modules.txt b/vendor/modules.txt index 45e5b5e4a4..70f612076f 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -26,12 +26,13 @@ github.com/bits-and-blooms/bitset # github.com/cespare/xxhash/v2 v2.3.0 ## explicit; go 1.11 github.com/cespare/xxhash/v2 -# github.com/cilium/ebpf v0.15.0 -## explicit; go 1.21.0 +# github.com/cilium/ebpf v0.16.0 +## explicit; go 1.21 github.com/cilium/ebpf github.com/cilium/ebpf/asm github.com/cilium/ebpf/btf github.com/cilium/ebpf/internal +github.com/cilium/ebpf/internal/epoll github.com/cilium/ebpf/internal/kallsyms github.com/cilium/ebpf/internal/kconfig github.com/cilium/ebpf/internal/sys @@ -39,6 +40,7 @@ github.com/cilium/ebpf/internal/sysenc github.com/cilium/ebpf/internal/tracefs github.com/cilium/ebpf/internal/unix github.com/cilium/ebpf/link +github.com/cilium/ebpf/ringbuf github.com/cilium/ebpf/rlimit # github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc ## explicit From a55466957f28311e63a10576c3eabcbb8b08d358 Mon Sep 17 00:00:00 2001 From: Sunil Thaha Date: Fri, 2 Aug 2024 14:40:35 +1000 Subject: [PATCH 2/3] fix: resolve pid 0 to system_processes The resolves pid 0 to system_processes without (throwing) any error since there is no command associated with it. This commit fixes the following log I0802 03:24:03.647656 733341 process_bpf_collector.go:100] failed to resolve comm for PID 0: process not running: stat /proc/0: no such file or directory, set comm=system_processes I0802 03:24:03.648079 733341 process_bpf_collector.go:100] failed to resolve comm for PID 0: process not running, set comm=system_processes 0802 process_bpf_collector.go:100] failed to resolve comm for PID 0: process not running: stat /proc/0: no such file or directory, set comm=system_processes ... 
I0802 process_bpf_collector.go:100] failed to resolve comm for PID 0: process not running, set comm=system_processes

Signed-off-by: Sunil Thaha
---
 pkg/comm/resolve_comm.go | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/pkg/comm/resolve_comm.go b/pkg/comm/resolve_comm.go
index 7a4ffe6053..8b42206901 100644
--- a/pkg/comm/resolve_comm.go
+++ b/pkg/comm/resolve_comm.go
@@ -7,10 +7,11 @@ import (
 	"strconv"
 	"strings"
 
+	"github.com/sustainable-computing-io/kepler/pkg/utils"
 	"golang.org/x/sys/unix"
 )
 
-const unknownComm = "unknown"
+const unknownComm = `unknown`
 
 type CommResolver struct {
 	cacheExist map[int]string
@@ -35,6 +36,10 @@ func NewTestCommResolver(procFsResolver func(pid int) (string, error)) *CommReso
 }
 
 func (r *CommResolver) ResolveComm(pid int) (string, error) {
+	if pid == 0 {
+		return utils.SystemProcessName, nil
+	}
+
 	if comm, ok := r.cacheExist[pid]; ok {
 		return comm, nil
 	}

From 5199fc9218f2d221beda6184a9ecedd90d22b534 Mon Sep 17 00:00:00 2001
From: Dave Tucker
Date: Mon, 5 Aug 2024 18:39:00 +0100
Subject: [PATCH 3/3] fix(pkg/bpf): Use channel to process events (#1671)

Processing events in the same goroutine as the ring buffer reader
requires acquiring a mutex, which blocks ringbuf event processing and
causes a backlog. To avoid this, send events via a buffered channel to
a dedicated event-processing goroutine so that the ringbuf reader
remains unblocked.

On my machine, this decreased CPU load from 1-3% to 0-1%.

Signed-off-by: Dave Tucker
---
 cmd/exporter/exporter.go |   5 ++
 pkg/bpf/exporter.go      | 164 ++++++++++++++++++++++++++-------------
 pkg/bpf/test_utils.go    |   3 +
 pkg/bpf/types.go         |   2 +
 4 files changed, 118 insertions(+), 56 deletions(-)

diff --git a/cmd/exporter/exporter.go b/cmd/exporter/exporter.go
index a30c9d6fda..6d4a2abae6 100644
--- a/cmd/exporter/exporter.go
+++ b/cmd/exporter/exporter.go
@@ -79,6 +79,7 @@ var (
 	apiserverEnabled = flag.Bool("apiserver", true, "if apiserver is disabled, we collect pod information from kubelet")
 	redfishCredFilePath = flag.String("redfish-cred-file-path", "", "path to the redfish credential file")
 	exposeEstimatedIdlePower = flag.Bool("expose-estimated-idle-power", false, "estimated idle power is meaningful only if Kepler is running on bare-metal or when there is only one virtual machine on the node")
+	bpfDebugMetricsEnabled = flag.Bool("bpf-debug-metrics", false, "whether to enable debug metrics for eBPF")
 )
 
 func healthProbe(w http.ResponseWriter, req *http.Request) {
@@ -150,6 +151,10 @@ func main() {
 		klog.Fatalf("failed to create eBPF exporter: %v", err)
 	}
 	defer bpfExporter.Detach()
+	if *bpfDebugMetricsEnabled {
+		bpfExporter.RegisterMetrics(registry)
+	}
+
 	stopCh := make(chan struct{})
 	bpfErrCh := make(chan error)
 	go func() {
diff --git a/pkg/bpf/exporter.go b/pkg/bpf/exporter.go
index c9814fcc69..b11089eb09 100644
--- a/pkg/bpf/exporter.go
+++ b/pkg/bpf/exporter.go
@@ -31,6 +31,7 @@ import (
 	"github.com/cilium/ebpf/ringbuf"
 	"github.com/cilium/ebpf/rlimit"
 	"github.com/jaypipes/ghw"
+	"github.com/prometheus/client_golang/prometheus"
 	"github.com/sustainable-computing-io/kepler/pkg/config"
 	"golang.org/x/sys/unix"
 	"k8s.io/apimachinery/pkg/util/sets"
@@ -51,6 +52,10 @@ type exporter struct {
 	enabledHardwareCounters sets.Set[string]
 	enabledSoftwareCounters sets.Set[string]
 
+	eventsRead      prometheus.Counter
+	eventsProcessed prometheus.Counter
+	channelDepth    prometheus.GaugeFunc
+
 	// Locks processMetrics and freedPIDs.
// Acquired in CollectProcesses - to prevent new events from being processed // while summarizing the metrics and resetting the counters. @@ -61,6 +66,9 @@ type exporter struct { mu *sync.Mutex processMetrics map[uint32]*bpfMetrics freedPIDs []int + + ringbufReader *ringbuf.Reader + eventsChan chan *keplerEvent } func NewExporter() (Exporter, error) { @@ -70,7 +78,28 @@ func NewExporter() (Exporter, error) { enabledSoftwareCounters: sets.New[string](), mu: &sync.Mutex{}, processMetrics: make(map[uint32]*bpfMetrics), + eventsChan: make(chan *keplerEvent, 1024), } + e.eventsRead = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "kepler_bpf_exporter_events_read_total", + Help: "Total number of events read from the ring buffer.", + }) + e.eventsProcessed = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "kepler_bpf_exporter_events_processed_total", + Help: "Total number of events processed.", + }) + e.channelDepth = prometheus.NewGaugeFunc( + prometheus.GaugeOpts{ + Name: "kepler_bpf_exporter_events_channel_depth", + Help: "Current depth of the events channel", + }, + func() float64 { + if e.eventsChan == nil { + return 0 + } + return float64(len(e.eventsChan)) + }, + ) err := e.attach() if err != nil { e.Detach() @@ -78,6 +107,12 @@ func NewExporter() (Exporter, error) { return e, err } +func (e *exporter) RegisterMetrics(registry *prometheus.Registry) { + registry.MustRegister(e.eventsRead) + registry.MustRegister(e.eventsProcessed) + registry.MustRegister(e.channelDepth) +} + func (e *exporter) SupportedMetrics() SupportedMetrics { return SupportedMetrics{ HardwareCounters: e.enabledHardwareCounters.Clone(), @@ -220,29 +255,38 @@ func (e *exporter) Detach() { } func (e *exporter) Start(stopChan <-chan struct{}) error { - rd, err := ringbuf.NewReader(e.bpfObjects.Rb) + var err error + e.ringbufReader, err = ringbuf.NewReader(e.bpfObjects.Rb) if err != nil { return fmt.Errorf("failed to create ring buffer reader: %w", err) } - defer rd.Close() + defer e.ringbufReader.Close() + + wg := &sync.WaitGroup{} + wg.Add(2) + go e.ringBufReader(wg, stopChan) + go e.eventProcessor(wg, stopChan) + wg.Wait() + return nil +} + +func (e *exporter) ringBufReader(wg *sync.WaitGroup, stopChan <-chan struct{}) { + defer wg.Done() for { var record *ringbuf.Record select { case <-stopChan: - if err := rd.Close(); err != nil { - return fmt.Errorf("closing ring buffer reader: %w", err) - } - return nil + return default: var event keplerEvent record = new(ringbuf.Record) - err := rd.ReadInto(record) + err := e.ringbufReader.ReadInto(record) if err != nil { if errors.Is(err, ringbuf.ErrClosed) { - return nil + return } if errors.Is(err, ringbuf.ErrFlushed) { record.RawSample = record.RawSample[:0] @@ -255,9 +299,63 @@ func (e *exporter) Start(stopChan <-chan struct{}) error { klog.Errorf("parsing ringbuf event: %s", err) continue } + // process events on another channel to avoid blocking the ring buffer reader + e.eventsChan <- &event + e.eventsRead.Inc() + } + } +} - // Process the event - e.handleEvent(event) +func (e *exporter) eventProcessor(wg *sync.WaitGroup, stopChan <-chan struct{}) { + defer wg.Done() + for { + select { + case <-stopChan: + return + case event := <-e.eventsChan: + e.mu.Lock() + + var p *bpfMetrics + + if _, ok := e.processMetrics[event.Pid]; !ok { + e.processMetrics[event.Pid] = newBpfMetrics() + } + p = e.processMetrics[event.Pid] + + switch event.EventType { + case uint64(keplerEventTypeSCHED_SWITCH): + // Handle the new task going on CPU + p.CPUCyles.Start(event.CpuId, 
event.Tid, event.CpuCycles) + p.CPUInstructions.Start(event.CpuId, event.Tid, event.CpuInstr) + p.CacheMiss.Start(event.CpuId, event.Tid, event.CacheMiss) + p.CPUTime.Start(event.CpuId, event.Tid, event.Ts) + + // Handle the task going OFF CPU + if _, ok := e.processMetrics[event.OffcpuPid]; !ok { + e.processMetrics[event.OffcpuPid] = newBpfMetrics() + } + offcpu := e.processMetrics[event.OffcpuPid] + offcpu.CPUCyles.Stop(event.CpuId, event.OffcpuTid, event.CpuCycles) + offcpu.CPUInstructions.Stop(event.CpuId, event.OffcpuTid, event.CpuInstr) + offcpu.CacheMiss.Stop(event.CpuId, event.OffcpuTid, event.CacheMiss) + offcpu.CPUTime.Stop(event.CpuId, event.OffcpuTid, event.Ts) + offcpu.CGroupID = event.OffcpuCgroupId + case uint64(keplerEventTypePAGE_CACHE_HIT): + p.PageCacheHit += 1 + case uint64(keplerEventTypeIRQ): + switch event.IrqNumber { + case uint32(keplerIrqTypeNET_TX): + p.TxIRQ += 1 + case uint32(keplerIrqTypeNET_RX): + p.RxIRQ += 1 + case uint32(keplerIrqTypeBLOCK): + p.BlockIRQ += 1 + } + case uint64(keplerEventTypeFREE): + e.freedPIDs = append(e.freedPIDs, int(event.Pid)) + } + e.mu.Unlock() + e.eventsProcessed.Inc() } } } @@ -343,52 +441,6 @@ func (p *PerCPUCounter) Reset() { p.Total = 0 } -func (e *exporter) handleEvent(event keplerEvent) { - e.mu.Lock() - defer e.mu.Unlock() - - var p *bpfMetrics - - if _, ok := e.processMetrics[event.Pid]; !ok { - e.processMetrics[event.Pid] = newBpfMetrics() - } - p = e.processMetrics[event.Pid] - - switch event.EventType { - case uint64(keplerEventTypeSCHED_SWITCH): - // Handle the new task going on CPU - p.CPUCyles.Start(event.CpuId, event.Tid, event.CpuCycles) - p.CPUInstructions.Start(event.CpuId, event.Tid, event.CpuInstr) - p.CacheMiss.Start(event.CpuId, event.Tid, event.CacheMiss) - p.CPUTime.Start(event.CpuId, event.Tid, event.Ts) - - // Handle the task going OFF CPU - if _, ok := e.processMetrics[event.OffcpuPid]; !ok { - e.processMetrics[event.OffcpuPid] = newBpfMetrics() - } - offcpu := e.processMetrics[event.OffcpuPid] - offcpu.CPUCyles.Stop(event.CpuId, event.OffcpuTid, event.CpuCycles) - offcpu.CPUInstructions.Stop(event.CpuId, event.OffcpuTid, event.CpuInstr) - offcpu.CacheMiss.Stop(event.CpuId, event.OffcpuTid, event.CacheMiss) - offcpu.CPUTime.Stop(event.CpuId, event.OffcpuTid, event.Ts) - offcpu.CGroupID = event.OffcpuCgroupId - case uint64(keplerEventTypePAGE_CACHE_HIT): - p.PageCacheHit += 1 - case uint64(keplerEventTypeIRQ): - switch event.IrqNumber { - case uint32(keplerIrqTypeNET_TX): - p.TxIRQ += 1 - case uint32(keplerIrqTypeNET_RX): - p.RxIRQ += 1 - case uint32(keplerIrqTypeBLOCK): - p.BlockIRQ += 1 - } - return - case uint64(keplerEventTypeFREE): - e.freedPIDs = append(e.freedPIDs, int(event.Pid)) - } -} - func (e *exporter) CollectProcesses() (ProcessMetricsCollection, error) { e.mu.Lock() defer e.mu.Unlock() diff --git a/pkg/bpf/test_utils.go b/pkg/bpf/test_utils.go index ef71b56b07..7a9855f47b 100644 --- a/pkg/bpf/test_utils.go +++ b/pkg/bpf/test_utils.go @@ -1,6 +1,7 @@ package bpf import ( + "github.com/prometheus/client_golang/prometheus" "github.com/sustainable-computing-io/kepler/pkg/config" "k8s.io/apimachinery/pkg/util/sets" ) @@ -68,3 +69,5 @@ func (m *mockExporter) CollectProcesses() (ProcessMetricsCollection, error) { FreedPIDs: []int{0}, }, nil } + +func (m *mockExporter) RegisterMetrics(registry *prometheus.Registry) {} diff --git a/pkg/bpf/types.go b/pkg/bpf/types.go index 077b69cd3e..71b5328ba5 100644 --- a/pkg/bpf/types.go +++ b/pkg/bpf/types.go @@ -17,6 +17,7 @@ limitations under the License. 
package bpf import ( + "github.com/prometheus/client_golang/prometheus" "k8s.io/apimachinery/pkg/util/sets" ) @@ -25,6 +26,7 @@ type Exporter interface { Detach() CollectProcesses() (ProcessMetricsCollection, error) Start(<-chan struct{}) error + RegisterMetrics(registry *prometheus.Registry) } type ProcessMetrics struct {