
Commit d598ae7

bpftune: add adaptive sampling
One approach to reducing overhead is to introduce sampling; i.e. a high-frequency program bails early and only does more expensive work for 1 out of every N invocations. bpftune has adaptive sampling to support this. It works as follows.

In your BPF object, add a struct bpftune_sample variable for the BPF program you wish to sample. For example, in tcp_buffer_tuner.bpf.c we have

```
struct bpftune_sample rcv_space_sample = { };
```

Since tcp_rcv_space_adjust() is called frequently, add the following early in the function:

```
bpftune_sample(rcv_space_sample);
```

The above will bail if it is not the Nth invocation of the program. Note that N is usually bpftune_sample_rate (default 4) but will be adaptively adjusted if the program runs too frequently. "Too frequently" means 2N invocations - i.e. collecting data twice - within a 10 msec interval; in such cases we double the sample rate. Similarly, if we fall outside that range we lower the sample rate back towards bpftune_sample_rate, so over time it adjusts to the rate of invocations adaptively.

To add reporting to your tuner on exit (how many times the program was invoked and what fraction of these we collected data for), in the init method add

```
bpftuner_bpf_sample_add(tcp_buffer, tuner, rcv_space_sample);
```

Then on tuner fini, you will see something like:

```
bpftune: Sample 'rcv_space_sample': associated program was called 598663 times, collected data every 2048 of these.
```

In this case the frequency dictated we increase the fractional rate of data collection from 1 in 4 to 1 in 2048. With this in place, overheads recorded via stress-ng are much reduced.

Signed-off-by: Alan Maguire <[email protected]>
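As an editorial illustration of the workflow above (not part of this commit), here is a minimal sketch of the BPF side for a hypothetical tuner. Only struct bpftune_sample and bpftune_sample() come from this change; the file name, program name, attach point and include path are assumptions.

```c
/* my_tuner.bpf.c - hypothetical example, not part of this commit.
 * Assumes the bpftune BPF header is installed as <bpftune/bpftune.bpf.h>
 * and pulls in the usual libbpf helpers (SEC(), BPF_PROG(), bpf_ktime_get_ns()).
 */
#include <bpftune/bpftune.bpf.h>

/* one sample state per BPF program we want to sample */
struct bpftune_sample my_sample = { };

SEC("fentry/tcp_rcv_space_adjust")
int BPF_PROG(my_prog, struct sock *sk)
{
    /* returns 0 early unless this is the Nth invocation; N adapts over time */
    bpftune_sample(my_sample);

    /* ...expensive data collection happens here, 1 time in N... */
    return 0;
}
```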
1 parent b0f6187 commit d598ae7

11 files changed: +244, -27 lines

CONTRIBUTING.md (+46)

@@ -232,6 +232,52 @@ $ TUNER=tcp_buffer_tuner.so sh qperf_test.sh
 
 Replace TUNER value with the name of the tuner you want to assess.
 
+One approach to reducing overhead is to introduce sampling; i.e.
+a high-frequency program bails early and only does more expensive
+work for 1 out of every N invocations. bpftune has adaptive sampling
+to support this. It works as follows.
+
+In your BPF object, add a struct bpftune_sample variable for the
+BPF program you wish to sample. For example in tcp_buffer_tuner.bpf.c
+we have
+
+```
+struct bpftune_sample rcv_space_sample = { };
+```
+
+Since tcp_rcv_space_adjust() is called frequently, add the following
+early in the function:
+
+```
+bpftune_sample(rcv_space_sample);
+```
+
+The above will bail if it is not the Nth invocation of the program.
+Note that N is usually bpftune_sample_rate - default 4 - but will
+be adaptively adjusted if the program runs too frequently. Too
+frequently is 2N invocations - i.e. collecting data twice - in a 10 msec
+interval. In such cases we double the sample rate. Similarly if
+we fall outside that range we lower the sample rate towards
+bpftune_sample_rate, so over time it should adjust to handle the
+rate of invocations adaptively.
+
+To add reporting to your tuner on exit (how many times the program
+was invoked and what fraction of these we collected data for),
+in the init method add
+
+```
+bpftuner_bpf_sample_add(tcp_buffer, tuner, rcv_space_sample);
+```
+
+Then on tuner fini, you will see something like:
+
+```
+bpftune: Sample 'rcv_space_sample': associated program was called 598663 times, collected data every 2048 of these.
+```
+
+In this case the frequency dictated we increase the fractional rate of
+data collection from 1 in 4 to 1 in 2048.
+
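As an illustration of the init-time registration described above, a minimal sketch for a hypothetical tuner follows; only bpftuner_bpf_sample_add() and bpftuner_bpf_attach() come from this change, and the tuner and sample names (and include path) are assumptions.

```c
/* my_tuner.c - hypothetical example, not part of this commit.
 * Assumes the userspace header is installed as <bpftune/libbpftune.h> and
 * that the skeleton was already set up earlier in init(), since
 * bpftuner_bpf_sample_add() resolves the sample symbol via tuner->skel.
 */
#include <bpftune/libbpftune.h>

int init(struct bpftuner *tuner)
{
    int err;

    /* register the BPF-side 'my_sample' so its stats are logged on tuner fini */
    bpftuner_bpf_sample_add(my_tuner, tuner, my_sample);

    err = bpftuner_bpf_attach(my_tuner, tuner);
    if (err)
        return err;
    return 0;
}
```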
 ## Tests
 
 Tests are mandatory for tuners; in the test directory you can see

include/bpftune/bpftune.bpf.h

+25-2
Original file line numberDiff line numberDiff line change
@@ -274,8 +274,31 @@ unsigned long long bpftune_init_net;
274274
bool debug;
275275

276276
unsigned int bpftune_sample_rate = 4;
277-
278-
#define bpftune_skip_sample(count) ((++count % bpftune_sample_rate) != 0)
277+
278+
#define TEN_MSEC ((__u64)10000000)
279+
280+
/* Auto-tune sample rate. Start with bpftune_sample_rate, and only
281+
* sample every bpftune_sample_rate samples (default 4). However
282+
* if we see multiple events in a 10msec window, double the sample
283+
* rate such that we only sample 1/8 etc. Similarly lower the sample
284+
* rate if we have previously increased it and not seen as many samples
285+
* in that window.
286+
*/
287+
#define bpftune_sample(sample) \
288+
do { \
289+
__u64 last_ts = 0; \
290+
\
291+
if (!sample.rate) \
292+
sample.rate = bpftune_sample_rate; \
293+
if (((++sample.count) % sample.rate) != 0) \
294+
return 0; \
295+
last_ts = sample.ts; \
296+
sample.ts = bpf_ktime_get_ns(); \
297+
if ((sample.ts - last_ts) < TEN_MSEC) \
298+
sample.rate = sample.rate << 1; \
299+
else if (sample.rate > bpftune_sample_rate) \
300+
sample.rate = sample.rate >> 1; \
301+
} while (0)
279302

280303
#define __barrier asm volatile("" ::: "memory")
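The adaptation above is just a doubling or halving of sample.rate based on the gap between consecutive collections. As an editorial illustration (not bpftune code), the stand-alone userspace sketch below simulates that logic to show how the effective 1-in-N rate evolves; timestamps are in nanoseconds to match bpf_ktime_get_ns().

```c
/* Illustrative simulation of the rate adaptation; not part of bpftune. */
#include <stdio.h>

#define TEN_MSEC 10000000ULL

int main(void)
{
    unsigned long long rate = 4;    /* bpftune_sample_rate default */
    unsigned long long gaps[] = {
        1000000ULL, 1000000ULL, 1000000ULL,      /* collections 1 msec apart */
        50000000ULL, 50000000ULL, 50000000ULL,   /* collections 50 msec apart */
    };

    for (int i = 0; i < 6; i++) {
        if (gaps[i] < TEN_MSEC)
            rate <<= 1;             /* busy: halve collection frequency */
        else if (rate > 4)
            rate >>= 1;             /* quieter: drift back toward the default */
        printf("gap %llu ns -> now collecting 1 in every %llu calls\n",
               gaps[i], rate);
    }
    return 0;
}
```

With the default rate of 4, three busy intervals take the rate to 32 (one collection per 32 calls), and it halves back toward 4 once the call rate drops. Because the rate only ever doubles or halves, it converges quickly under load and decays back to the configured default when things quieten down.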

include/bpftune/bpftune.h (+15)

@@ -160,6 +160,19 @@ enum bpftune_support_level {
     BPFTUNE_SUPPORT_NORMAL
 };
 
+struct bpftune_sample {
+    __u64 count;
+    __u64 ts;
+    __u64 rate;
+};
+
+struct bpftune_sample_desc {
+    const char *name;
+    struct bpftune_sample *sample;
+};
+
+#define BPFTUNE_MAX_SAMPLES 8
+
 struct bpftuner {
     unsigned int id;
     enum bpftune_state state;
@@ -189,6 +202,8 @@ struct bpftuner {
     struct bpftunable *tunables;
     unsigned int num_scenarios;
     struct bpftunable_scenario *scenarios;
+    unsigned int num_samples;
+    struct bpftune_sample_desc samples[BPFTUNE_MAX_SAMPLES];
 };
 
 /* from include/linux/log2.h */

include/bpftune/libbpftune.h (+26)

@@ -312,6 +312,32 @@ void bpftuner_tunables_fini(struct bpftuner *tuner);
 #define bpftuner_bpf_map_get(tuner_name, tuner, map) \
     bpftuner_bpf_skel_val(tuner_name, tuner, maps.map)
 
+#define bpftuner_bpf_sample_add(tuner_name, tuner, s) \
+    do { \
+        struct tuner_name##_tuner_bpf *__skel = tuner->skel; \
+        struct tuner_name##_tuner_bpf_legacy *__lskel = tuner->skel; \
+        struct tuner_name##_tuner_bpf_nobtf *__nskel = tuner->skel; \
+        struct bpftune_sample_desc *d; \
+        d = &tuner->samples[tuner->num_samples]; \
+        d->name = #s; \
+        switch (tuner->bpf_support) { \
+        case BPFTUNE_SUPPORT_NORMAL: \
+            d->sample = &__skel->bss->s; \
+            break; \
+        case BPFTUNE_SUPPORT_LEGACY: \
+            d->sample = &__lskel->bss->s; \
+            break; \
+        case BPFTUNE_SUPPORT_NOBTF: \
+            d->sample = &__nskel->bss->s; \
+        default: \
+            break; \
+        } \
+        tuner->num_samples++; \
+        bpftune_log(LOG_DEBUG, "%s: added sample '%s'\n", \
+                    #tuner_name, #s); \
+    } while (0)
+
+
 enum bpftune_support_level bpftune_bpf_support(void);
 bool bpftune_have_vmlinux_btf(void);
 void bpftune_force_bpf_support(enum bpftune_support_level);

src/libbpftune.c (+7)

@@ -791,6 +791,13 @@ void bpftuner_fini(struct bpftuner *tuner, enum bpftune_state state)
 
     bpftune_log(LOG_DEBUG, "cleaning up tuner %s with %d tunables, %d scenarios\n",
                 tuner->name, tuner->num_tunables, tuner->num_scenarios);
+    /* Show sample data before destroying BPF skeleton */
+    for (i = 0; i < tuner->num_samples; i++) {
+        bpftune_log(BPFTUNE_LOG_LEVEL, "Sample '%s': associated program was called %lu times, collected data every %lu of these.\n",
+                    tuner->samples[i].name,
+                    tuner->samples[i].sample->count,
+                    tuner->samples[i].sample->rate);
+    }
     if (tuner->fini)
         tuner->fini(tuner);
     /* report summary of events for tuner */

src/net_buffer_tuner.bpf.c (+8, -7)

@@ -33,6 +33,8 @@ int netdev_max_backlog = 0;
 int netdev_budget = 0;
 int netdev_budget_usecs = 0;
 
+struct bpftune_sample drop_sample = {};
+
 #ifdef BPFTUNE_LEGACY
 SEC("kretprobe/enqueue_to_backlog")
 int BPF_KRETPROBE(bpftune_enqueue_to_backlog, int ret)
@@ -55,9 +57,8 @@ int BPF_PROG(bpftune_enqueue_to_backlog, struct sk_buff *skb, int cpu,
     drop_count++;
 
     /* only sample subset of drops to reduce overhead. */
-    if (bpftune_skip_sample(drop_count))
-        return 0;
-
+    bpftune_sample(drop_sample);
+
     /* if we drop more than 1/16 of the backlog queue size/min,
      * increase backlog queue size. This means as the queue size
      * increases, the likliehood of hitting that limit decreases.
@@ -92,14 +93,14 @@ int BPF_PROG(bpftune_enqueue_to_backlog, struct sk_buff *skb, int cpu,
     return 0;
 }
 
+struct bpftune_sample rx_action_sample = {};
+
 #ifndef BPFTUNE_LEGACY
 
 BPF_MAP_DEF(time_squeeze_map, BPF_MAP_TYPE_PERCPU_ARRAY, unsigned int, unsigned int, 1, 0);
 
 extern const struct softnet_data softnet_data __ksym;
 
-__u64 rx_count = 0;
-
 SEC("fexit/net_rx_action")
 int BPF_PROG(net_rx_action)
 {
@@ -111,8 +112,8 @@ int BPF_PROG(net_rx_action)
     unsigned int *last_time_squeezep = NULL;
     unsigned int zero = 0;
 
-    if (bpftune_skip_sample(rx_count))
-        return 0;
+    bpftune_sample(rx_action_sample);
+
     sd = (struct softnet_data *)bpf_this_cpu_ptr(&softnet_data);
     if (!sd)
         return 0;

src/net_buffer_tuner.c (+2)

@@ -62,6 +62,8 @@ int init(struct bpftuner *tuner)
                          budget);
     bpftuner_bpf_var_set(net_buffer, tuner, netdev_budget_usecs,
                          budget_usecs);
+    bpftuner_bpf_sample_add(net_buffer, tuner, drop_sample);
+    bpftuner_bpf_sample_add(net_buffer, tuner, rx_action_sample);
     err = bpftuner_bpf_attach(net_buffer, tuner);
     if (err)
         return err;

src/tcp_buffer_tuner.bpf.c (+10, -17)

@@ -26,11 +26,6 @@
 bool under_memory_pressure = false;
 bool near_memory_pressure = false;
 bool near_memory_exhaustion = false;
-/* use global tcp sock count since tcp memory pressure/exhaustion are
- * computed as fraction of total system memory.
- */
-__s64 tcp_sock_count = 0;
-__s64 tcp_max_sock_count = 0;
 
 /* set from userspace */
 int kernel_page_size;
@@ -227,19 +222,27 @@ BPF_FENTRY(tcp_sndbuf_expand, struct sock *sk)
     return 0;
 }
 
+__u64 rcv_space_count = 0;
+
+struct bpftune_sample rcv_space_sample = { };
+
 /* sadly tcp_rcv_space_adjust() has checks internal to it so it is called
  * regardless of if we are under memory pressure or not; so use the variable
  * we set when memory pressure is triggered.
  */
 BPF_FENTRY(tcp_rcv_space_adjust, struct sock *sk)
 {
     struct bpftune_event event = { 0 };
-    struct net *net = BPFTUNE_CORE_READ(sk, sk_net.net);
     struct tcp_sock *tp = (struct tcp_sock *)sk;
     long rmem[3], rmem_new[3];
     __u8 sk_userlocks = 0;
+    struct net *net;
     long rcvbuf;
 
+    /* only sample subset of events to reduce overhead. */
+    bpftune_sample(rcv_space_sample);
+
+    net = BPFTUNE_CORE_READ(sk, sk_net.net);
     if (!sk || !net)
         return 0;
 
@@ -288,17 +291,7 @@ BPF_FENTRY(tcp_init_sock, struct sock *sk)
 {
     struct bpftune_event event = { 0 };
 
-    if (sk) {
-        if (++tcp_sock_count > tcp_max_sock_count)
-            tcp_max_sock_count = tcp_sock_count;
+    if (sk)
         (void) tcp_nearly_out_of_memory(sk, &event);
-    }
-    return 0;
-}
-
-BPF_FENTRY(tcp_release_cb, struct sock *sk)
-{
-    if (tcp_sock_count > 0)
-        tcp_sock_count--;
     return 0;
 }

src/tcp_buffer_tuner.c (+1)

@@ -164,6 +164,7 @@ int init(struct bpftuner *tuner)
                          ilog2(SK_MEM_QUANTUM));
     bpftuner_bpf_var_set(tcp_buffer, tuner, nr_free_buffer_pages,
                          nr_free_buffer_pages(true));
+    bpftuner_bpf_sample_add(tcp_buffer, tuner, rcv_space_sample);
     err = bpftuner_bpf_attach(tcp_buffer, tuner);
     if (err)
         return err;

test/Makefile (+1, -1)

@@ -24,7 +24,7 @@ TUNER_TESTS = support_test log_test service_test inotify_test cap_test \
     sample_test sample_legacy_test \
     strategy_test strategy_legacy_test \
     rollback_test rollback_legacy_test \
-    query_test \
+    query_test rate_test \
     many_netns_test many_netns_legacy_test \
     podman_globalonly_test podman_globalonly_legacy_test \
     sysctl_test sysctl_legacy_test sysctl_netns_test \
