Commit d10cafc

Author: Alexei Starovoitov (committed)
Merge branch 'free-htab-element-out-of-bucket-lock'
Hou Tao says:

====================

The patch set continues the previous work [1] to move all the freeing of htab elements out of the bucket lock. One motivation for the patch set is the locking problem reported by Sebastian [2]: freeing a bpf_timer under PREEMPT_RT may acquire a spin-lock (namely softirq_expiry_lock), but the freeing procedure for an htab element already holds a raw spin-lock (the bucket lock), so it triggers the warning "BUG: scheduling while atomic", as demonstrated by the selftests patch. Another motivation is to reduce the locked scope of the bucket lock. However, the patch set does not move all freeing of htab elements out of the bucket lock: the freeing of special fields of pre-allocated hash map elements in htab_map_update_elem() still happens under the protection of the bucket lock.

The patch set is structured as follows:

* Patch #1 moves the element freeing out of the bucket lock for htab_lru_map_delete_node(). However, the freeing still happens within the locked scope of the LRU raw spin-lock.
* Patches #2~#3 move the element freeing out of the bucket lock for __htab_map_lookup_and_delete_elem().
* Patch #4 cancels the bpf_timer in two steps to fix the locking problem in htab_map_update_elem() for PREEMPT_RT.
* Patch #5 adds a selftest for the locking problem.

Please see the individual patches for more details. Comments are always welcome.

---

v3:
* patch #1: update the commit message to state that the freeing of the special fields is still within the locked scope of the LRU raw spin-lock
* patch #4: cancel the bpf_timer in two steps only for PREEMPT_RT (suggested by Alexei)

v2: https://lore.kernel.org/bpf/[email protected]
* cancel the bpf timer in two steps instead of breaking the reuse and refill of per-cpu ->extra_elems into two steps

v1: https://lore.kernel.org/bpf/[email protected]

[1]: https://lore.kernel.org/bpf/[email protected]
[2]: https://lore.kernel.org/bpf/[email protected]

====================

Link: https://patch.msgid.link/[email protected]
Signed-off-by: Alexei Starovoitov <[email protected]>
Parents: 01f3ce5, 0a5d2ef
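The recurring pattern in the series is to unlink the element while the raw spin-lock (the bucket lock) is held and to defer any freeing work that may sleep, such as cancelling a bpf_timer under PREEMPT_RT, until after the lock has been dropped. Below is a minimal userspace sketch of that ordering only; it is not kernel code, and the list, lock, and function names are invented for the example:

/* Illustration only: unlink under the spinlock, free after unlock,
 * so the free path is allowed to block without holding the lock.
 */
#include <pthread.h>
#include <stdlib.h>

struct elem {
	struct elem *next;
	int key;
};

static struct elem *bucket_head;
static pthread_spinlock_t bucket_lock;

/* Unlink the matching element while holding the lock and return it;
 * the caller frees it after the lock has been released.
 */
static struct elem *delete_elem(int key)
{
	struct elem **pp, *e = NULL;

	pthread_spin_lock(&bucket_lock);
	for (pp = &bucket_head; *pp; pp = &(*pp)->next) {
		if ((*pp)->key == key) {
			e = *pp;
			*pp = e->next;	/* unlink under the lock */
			break;
		}
	}
	pthread_spin_unlock(&bucket_lock);

	return e;
}

int main(void)
{
	struct elem *e;

	pthread_spin_init(&bucket_lock, PTHREAD_PROCESS_PRIVATE);
	bucket_head = calloc(1, sizeof(*bucket_head));

	e = delete_elem(0);
	free(e);	/* freeing (which could sleep) happens with no lock held */

	pthread_spin_destroy(&bucket_lock);
	return 0;
}

In the kernel patches below, the same split shows up as hlist_nulls_del_rcu() under htab_lock_bucket(), followed by check_and_free_fields(), free_htab_elem(), or htab_lru_push_free() only after htab_unlock_bucket().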

4 files changed: +284, -30 lines

kernel/bpf/hashtab.c (+32, -28)
@@ -824,13 +824,14 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
 	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
 		if (l == tgt_l) {
 			hlist_nulls_del_rcu(&l->hash_node);
-			check_and_free_fields(htab, l);
 			bpf_map_dec_elem_count(&htab->map);
 			break;
 		}

 	htab_unlock_bucket(htab, b, tgt_l->hash, flags);

+	if (l == tgt_l)
+		check_and_free_fields(htab, l);
 	return l == tgt_l;
 }

@@ -1634,41 +1635,44 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
 	l = lookup_elem_raw(head, hash, key, key_size);
 	if (!l) {
 		ret = -ENOENT;
-	} else {
-		if (is_percpu) {
-			u32 roundup_value_size = round_up(map->value_size, 8);
-			void __percpu *pptr;
-			int off = 0, cpu;
+		goto out_unlock;
+	}

-			pptr = htab_elem_get_ptr(l, key_size);
-			for_each_possible_cpu(cpu) {
-				copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu));
-				check_and_init_map_value(&htab->map, value + off);
-				off += roundup_value_size;
-			}
-		} else {
-			u32 roundup_key_size = round_up(map->key_size, 8);
+	if (is_percpu) {
+		u32 roundup_value_size = round_up(map->value_size, 8);
+		void __percpu *pptr;
+		int off = 0, cpu;

-			if (flags & BPF_F_LOCK)
-				copy_map_value_locked(map, value, l->key +
-						      roundup_key_size,
-						      true);
-			else
-				copy_map_value(map, value, l->key +
-					       roundup_key_size);
-			/* Zeroing special fields in the temp buffer */
-			check_and_init_map_value(map, value);
+		pptr = htab_elem_get_ptr(l, key_size);
+		for_each_possible_cpu(cpu) {
+			copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu));
+			check_and_init_map_value(&htab->map, value + off);
+			off += roundup_value_size;
 		}
+	} else {
+		u32 roundup_key_size = round_up(map->key_size, 8);

-		hlist_nulls_del_rcu(&l->hash_node);
-		if (!is_lru_map)
-			free_htab_elem(htab, l);
+		if (flags & BPF_F_LOCK)
+			copy_map_value_locked(map, value, l->key +
+					      roundup_key_size,
+					      true);
+		else
+			copy_map_value(map, value, l->key +
+				       roundup_key_size);
+		/* Zeroing special fields in the temp buffer */
+		check_and_init_map_value(map, value);
 	}
+	hlist_nulls_del_rcu(&l->hash_node);

+out_unlock:
 	htab_unlock_bucket(htab, b, hash, bflags);

-	if (is_lru_map && l)
-		htab_lru_push_free(htab, l);
+	if (l) {
+		if (is_lru_map)
+			htab_lru_push_free(htab, l);
+		else
+			free_htab_elem(htab, l);
+	}

 	return ret;
 }

kernel/bpf/helpers.c (+16, -2)
@@ -1593,10 +1593,24 @@ void bpf_timer_cancel_and_free(void *val)
 	 * To avoid these issues, punt to workqueue context when we are in a
 	 * timer callback.
 	 */
-	if (this_cpu_read(hrtimer_running))
+	if (this_cpu_read(hrtimer_running)) {
 		queue_work(system_unbound_wq, &t->cb.delete_work);
-	else
+		return;
+	}
+
+	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+		/* If the timer is running on other CPU, also use a kworker to
+		 * wait for the completion of the timer instead of trying to
+		 * acquire a sleepable lock in hrtimer_cancel() to wait for its
+		 * completion.
+		 */
+		if (hrtimer_try_to_cancel(&t->timer) >= 0)
+			kfree_rcu(t, cb.rcu);
+		else
+			queue_work(system_unbound_wq, &t->cb.delete_work);
+	} else {
 		bpf_timer_delete_work(&t->cb.delete_work);
+	}
 }

 /* This function is called by map_delete/update_elem for individual element and
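The PREEMPT_RT branch above hinges on the return value of hrtimer_try_to_cancel(): it returns 0 when the timer was not queued, 1 when it was queued and has been cancelled, and -1 when the timer callback is currently executing on another CPU and therefore cannot be stopped. Only the last case needs the kworker, where hrtimer_cancel() may block while waiting for the callback to finish. A tiny illustrative helper restating that check (the function name is invented here, not part of the patch):

/* Illustration only: mirrors the "hrtimer_try_to_cancel(&t->timer) >= 0"
 * test in the PREEMPT_RT branch above.
 *
 *   ret == 0   timer was not queued                -> callback not running
 *   ret == 1   timer was queued and got cancelled  -> callback not running
 *   ret == -1  callback is running on another CPU  -> defer to a kworker
 */
static inline bool timer_can_be_freed_now(int try_cancel_ret)
{
	/* free via kfree_rcu() right away only when the callback cannot run */
	return try_cancel_ret >= 0;
}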
tools/testing/selftests/bpf/prog_tests/free_timer.c (new file, +165 lines)
// SPDX-License-Identifier: GPL-2.0
/* Copyright (C) 2025. Huawei Technologies Co., Ltd */
#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>
#include <test_progs.h>

#include "free_timer.skel.h"

struct run_ctx {
	struct bpf_program *start_prog;
	struct bpf_program *overwrite_prog;
	pthread_barrier_t notify;
	int loop;
	bool start;
	bool stop;
};

static void start_threads(struct run_ctx *ctx)
{
	ctx->start = true;
}

static void stop_threads(struct run_ctx *ctx)
{
	ctx->stop = true;
	/* Guarantee the order between ->stop and ->start */
	__atomic_store_n(&ctx->start, true, __ATOMIC_RELEASE);
}

static int wait_for_start(struct run_ctx *ctx)
{
	while (!__atomic_load_n(&ctx->start, __ATOMIC_ACQUIRE))
		usleep(10);

	return ctx->stop;
}

static void *overwrite_timer_fn(void *arg)
{
	struct run_ctx *ctx = arg;
	int loop, fd, err;
	cpu_set_t cpuset;
	long ret = 0;

	/* Pin on CPU 0 */
	CPU_ZERO(&cpuset);
	CPU_SET(0, &cpuset);
	pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset);

	/* Is the thread being stopped ? */
	err = wait_for_start(ctx);
	if (err)
		return NULL;

	fd = bpf_program__fd(ctx->overwrite_prog);
	loop = ctx->loop;
	while (loop-- > 0) {
		LIBBPF_OPTS(bpf_test_run_opts, opts);

		/* Wait for start thread to complete */
		pthread_barrier_wait(&ctx->notify);

		/* Overwrite timers */
		err = bpf_prog_test_run_opts(fd, &opts);
		if (err)
			ret |= 1;
		else if (opts.retval)
			ret |= 2;

		/* Notify start thread to start timers */
		pthread_barrier_wait(&ctx->notify);
	}

	return (void *)ret;
}

static void *start_timer_fn(void *arg)
{
	struct run_ctx *ctx = arg;
	int loop, fd, err;
	cpu_set_t cpuset;
	long ret = 0;

	/* Pin on CPU 1 */
	CPU_ZERO(&cpuset);
	CPU_SET(1, &cpuset);
	pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset);

	/* Is the thread being stopped ? */
	err = wait_for_start(ctx);
	if (err)
		return NULL;

	fd = bpf_program__fd(ctx->start_prog);
	loop = ctx->loop;
	while (loop-- > 0) {
		LIBBPF_OPTS(bpf_test_run_opts, opts);

		/* Run the prog to start timer */
		err = bpf_prog_test_run_opts(fd, &opts);
		if (err)
			ret |= 4;
		else if (opts.retval)
			ret |= 8;

		/* Notify overwrite thread to do overwrite */
		pthread_barrier_wait(&ctx->notify);

		/* Wait for overwrite thread to complete */
		pthread_barrier_wait(&ctx->notify);
	}

	return (void *)ret;
}

void test_free_timer(void)
{
	struct free_timer *skel;
	struct bpf_program *prog;
	struct run_ctx ctx;
	pthread_t tid[2];
	void *ret;
	int err;

	skel = free_timer__open_and_load();
	if (!ASSERT_OK_PTR(skel, "open_load"))
		return;

	memset(&ctx, 0, sizeof(ctx));

	prog = bpf_object__find_program_by_name(skel->obj, "start_timer");
	if (!ASSERT_OK_PTR(prog, "find start prog"))
		goto out;
	ctx.start_prog = prog;

	prog = bpf_object__find_program_by_name(skel->obj, "overwrite_timer");
	if (!ASSERT_OK_PTR(prog, "find overwrite prog"))
		goto out;
	ctx.overwrite_prog = prog;

	pthread_barrier_init(&ctx.notify, NULL, 2);
	ctx.loop = 10;

	err = pthread_create(&tid[0], NULL, start_timer_fn, &ctx);
	if (!ASSERT_OK(err, "create start_timer"))
		goto out;

	err = pthread_create(&tid[1], NULL, overwrite_timer_fn, &ctx);
	if (!ASSERT_OK(err, "create overwrite_timer")) {
		stop_threads(&ctx);
		goto out;
	}

	start_threads(&ctx);

	ret = NULL;
	err = pthread_join(tid[0], &ret);
	ASSERT_EQ(err | (long)ret, 0, "start_timer");
	ret = NULL;
	err = pthread_join(tid[1], &ret);
	ASSERT_EQ(err | (long)ret, 0, "overwrite_timer");
out:
	free_timer__destroy(skel);
}
tools/testing/selftests/bpf/progs/free_timer.c (new file, +71 lines)
// SPDX-License-Identifier: GPL-2.0
/* Copyright (C) 2025. Huawei Technologies Co., Ltd */
#include <linux/bpf.h>
#include <time.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_helpers.h>

#define MAX_ENTRIES 8

struct map_value {
	struct bpf_timer timer;
};

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, int);
	__type(value, struct map_value);
	__uint(max_entries, MAX_ENTRIES);
} map SEC(".maps");

static int timer_cb(void *map, void *key, struct map_value *value)
{
	volatile int sum = 0;
	int i;

	bpf_for(i, 0, 1024 * 1024) sum += i;

	return 0;
}

static int start_cb(int key)
{
	struct map_value *value;

	value = bpf_map_lookup_elem(&map, (void *)&key);
	if (!value)
		return 0;

	bpf_timer_init(&value->timer, &map, CLOCK_MONOTONIC);
	bpf_timer_set_callback(&value->timer, timer_cb);
	/* Hope 100us will be enough to wake-up and run the overwrite thread */
	bpf_timer_start(&value->timer, 100000, BPF_F_TIMER_CPU_PIN);

	return 0;
}

static int overwrite_cb(int key)
{
	struct map_value zero = {};

	/* Free the timer which may run on other CPU */
	bpf_map_update_elem(&map, (void *)&key, &zero, BPF_ANY);

	return 0;
}

SEC("syscall")
int BPF_PROG(start_timer)
{
	bpf_loop(MAX_ENTRIES, start_cb, NULL, 0);
	return 0;
}

SEC("syscall")
int BPF_PROG(overwrite_timer)
{
	bpf_loop(MAX_ENTRIES, overwrite_cb, NULL, 0);
	return 0;
}

char _license[] SEC("license") = "GPL";
