
Commit 509df67

Author: Alexei Starovoitov (committed)

Merge branch 'fixes-for-lpm-trie'
Hou Tao says:

====================
This patch set fixes several issues for LPM trie. These issues were found
while adding new test cases or were reported by syzbot.

The patch set is structured as follows:

Patch #1~#2 are clean-ups for lpm_trie_update_elem().
Patch #3 handles BPF_EXIST and BPF_NOEXIST correctly for LPM trie.
Patch #4 fixes the accounting of n_entries when doing an in-place update.
Patch #5 fixes the exact match condition in trie_get_next_key(); the old
condition may skip keys when the passed key is not found in the map.
Patch #6~#7 switch from kmalloc() to the bpf memory allocator for LPM trie
to fix several lock order warnings reported by syzbot. They also enable
raw_spinlock_t for LPM trie again. After these changes, the LPM trie will
be closer to being usable in any context (the reentrance check of
trie->lock is still missing, but it is on my todo list).
Patch #8 moves test_lpm_map to map_tests so that it runs regularly.
Patch #9 adds test cases for the issues fixed by patches #3~#5.

Please see the individual patches for more details. Comments are always
welcome.

Change Log:

v3:
  * patch #2: remove the unnecessary NULL-init for im_node
  * patch #6: allocate the leaf node before disabling IRQs to lower the
    possibility of -ENOMEM when leaf_size is large; free these nodes
    outside the trie lock (suggested by Alexei)
  * collect review and ack tags (thanks to Toke & Daniel)

v2: https://lore.kernel.org/bpf/[email protected]/
  * collect review tags (thanks to Toke)
  * drop the "Add bpf_mem_cache_is_mergeable() helper" patch
  * patch #3~#4: add Fixes tags
  * patch #4: rename the helper to trie_check_add_elem() and increase
    n_entries in it
  * patch #6: use one bpf mem allocator and update the commit message to
    clarify that using the bpf mem allocator is more appropriate
  * patch #7: update the commit message to add the possible max running
    time for the update operation
  * patch #9: update the commit message to specify the purpose of these
    test cases

v1: https://lore.kernel.org/bpf/[email protected]/
====================

Link: https://lore.kernel.org/all/[email protected]/
Signed-off-by: Alexei Starovoitov <[email protected]>
2 parents e2cf913 + 04d4ce9 commit 509df67

File tree

4 files changed: +484 -57 lines changed

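Before the diffs, here is a minimal userspace sketch (not part of this commit; the map name, IPv4 key layout, and sizes are illustrative, and libbpf >= 1.0 error conventions are assumed) of the update-flag semantics that patch #3 enforces: a duplicate BPF_NOEXIST insert should fail with EEXIST, and a BPF_EXIST update of a missing prefix should fail with ENOENT.

/* Illustrative only: exercises the LPM trie update flags via libbpf. */
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <bpf/bpf.h>

/* Hypothetical IPv4 key: prefixlen followed by the address, matching the
 * bpf_lpm_trie_key convention.
 */
struct ipv4_lpm_key {
	__u32 prefixlen;
	__u32 addr;
};

int main(void)
{
	LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_NO_PREALLOC);
	struct ipv4_lpm_key key = { .prefixlen = 24 };
	__u32 value = 1;
	int fd, err;

	/* LPM tries must be created with BPF_F_NO_PREALLOC. */
	fd = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE, "lpm_demo",
			    sizeof(key), sizeof(value), 16, &opts);
	if (fd < 0)
		return 1;

	inet_pton(AF_INET, "192.168.0.0", &key.addr);

	/* First insert of 192.168.0.0/24 with BPF_NOEXIST succeeds. */
	err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST);
	printf("insert: %d\n", err);

	/* A second BPF_NOEXIST insert of the same prefix must fail (EEXIST). */
	err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST);
	printf("duplicate insert: %s\n", strerror(-err));

	/* BPF_EXIST on a prefix that is not in the trie must fail (ENOENT). */
	key.prefixlen = 16;
	err = bpf_map_update_elem(fd, &key, &value, BPF_EXIST);
	printf("update of missing prefix: %s\n", strerror(-err));

	close(fd);
	return 0;
}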

kernel/bpf/lpm_trie.c
+85 -48
@@ -15,14 +15,14 @@
 #include <net/ipv6.h>
 #include <uapi/linux/btf.h>
 #include <linux/btf_ids.h>
+#include <linux/bpf_mem_alloc.h>
 
 /* Intermediate node */
 #define LPM_TREE_NODE_FLAG_IM BIT(0)
 
 struct lpm_trie_node;
 
 struct lpm_trie_node {
-	struct rcu_head rcu;
 	struct lpm_trie_node __rcu *child[2];
 	u32 prefixlen;
 	u32 flags;
@@ -32,10 +32,11 @@ struct lpm_trie_node {
 struct lpm_trie {
 	struct bpf_map map;
 	struct lpm_trie_node __rcu *root;
+	struct bpf_mem_alloc ma;
 	size_t n_entries;
 	size_t max_prefixlen;
 	size_t data_size;
-	spinlock_t lock;
+	raw_spinlock_t lock;
 };
 
 /* This trie implements a longest prefix match algorithm that can be used to
@@ -287,17 +288,18 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
 	return found->data + trie->data_size;
 }
 
-static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
-						 const void *value)
+static struct lpm_trie_node *lpm_trie_node_alloc(struct lpm_trie *trie,
+						 const void *value,
+						 bool disable_migration)
 {
 	struct lpm_trie_node *node;
-	size_t size = sizeof(struct lpm_trie_node) + trie->data_size;
 
-	if (value)
-		size += trie->map.value_size;
+	if (disable_migration)
+		migrate_disable();
+	node = bpf_mem_cache_alloc(&trie->ma);
+	if (disable_migration)
+		migrate_enable();
 
-	node = bpf_map_kmalloc_node(&trie->map, size, GFP_NOWAIT | __GFP_NOWARN,
-				    trie->map.numa_node);
 	if (!node)
 		return NULL;
 
@@ -310,12 +312,22 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
 	return node;
 }
 
+static int trie_check_add_elem(struct lpm_trie *trie, u64 flags)
+{
+	if (flags == BPF_EXIST)
+		return -ENOENT;
+	if (trie->n_entries == trie->map.max_entries)
+		return -ENOSPC;
+	trie->n_entries++;
+	return 0;
+}
+
 /* Called from syscall or from eBPF program */
 static long trie_update_elem(struct bpf_map *map,
			     void *_key, void *value, u64 flags)
 {
 	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
-	struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL;
+	struct lpm_trie_node *node, *im_node, *new_node;
 	struct lpm_trie_node *free_node = NULL;
 	struct lpm_trie_node __rcu **slot;
 	struct bpf_lpm_trie_key_u8 *key = _key;
@@ -330,22 +342,14 @@ static long trie_update_elem(struct bpf_map *map,
 	if (key->prefixlen > trie->max_prefixlen)
 		return -EINVAL;
 
-	spin_lock_irqsave(&trie->lock, irq_flags);
-
-	/* Allocate and fill a new node */
-
-	if (trie->n_entries == trie->map.max_entries) {
-		ret = -ENOSPC;
-		goto out;
-	}
-
-	new_node = lpm_trie_node_alloc(trie, value);
-	if (!new_node) {
-		ret = -ENOMEM;
-		goto out;
-	}
+	/* Allocate and fill a new node. Need to disable migration before
+	 * invoking bpf_mem_cache_alloc().
+	 */
+	new_node = lpm_trie_node_alloc(trie, value, true);
+	if (!new_node)
+		return -ENOMEM;
 
-	trie->n_entries++;
+	raw_spin_lock_irqsave(&trie->lock, irq_flags);
 
 	new_node->prefixlen = key->prefixlen;
 	RCU_INIT_POINTER(new_node->child[0], NULL);
@@ -364,8 +368,7 @@ static long trie_update_elem(struct bpf_map *map,
 		matchlen = longest_prefix_match(trie, node, key);
 
 		if (node->prefixlen != matchlen ||
-		    node->prefixlen == key->prefixlen ||
-		    node->prefixlen == trie->max_prefixlen)
+		    node->prefixlen == key->prefixlen)
 			break;
 
 		next_bit = extract_bit(key->data, node->prefixlen);
@@ -376,6 +379,10 @@ static long trie_update_elem(struct bpf_map *map,
	 * simply assign the @new_node to that slot and be done.
	 */
 	if (!node) {
+		ret = trie_check_add_elem(trie, flags);
+		if (ret)
+			goto out;
+
 		rcu_assign_pointer(*slot, new_node);
 		goto out;
 	}
@@ -384,18 +391,30 @@ static long trie_update_elem(struct bpf_map *map,
	 * which already has the correct data array set.
	 */
 	if (node->prefixlen == matchlen) {
+		if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) {
+			if (flags == BPF_NOEXIST) {
+				ret = -EEXIST;
+				goto out;
+			}
+		} else {
+			ret = trie_check_add_elem(trie, flags);
+			if (ret)
+				goto out;
+		}
+
 		new_node->child[0] = node->child[0];
 		new_node->child[1] = node->child[1];
 
-		if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
-			trie->n_entries--;
-
 		rcu_assign_pointer(*slot, new_node);
 		free_node = node;
 
 		goto out;
 	}
 
+	ret = trie_check_add_elem(trie, flags);
+	if (ret)
+		goto out;
+
 	/* If the new node matches the prefix completely, it must be inserted
	 * as an ancestor. Simply insert it between @node and *@slot.
	 */
@@ -406,8 +425,10 @@ static long trie_update_elem(struct bpf_map *map,
 		goto out;
 	}
 
-	im_node = lpm_trie_node_alloc(trie, NULL);
+	/* migration is disabled within the locked scope */
+	im_node = lpm_trie_node_alloc(trie, NULL, false);
 	if (!im_node) {
+		trie->n_entries--;
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -429,16 +450,13 @@ static long trie_update_elem(struct bpf_map *map,
 	rcu_assign_pointer(*slot, im_node);
 
 out:
-	if (ret) {
-		if (new_node)
-			trie->n_entries--;
+	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
 
-		kfree(new_node);
-		kfree(im_node);
-	}
-
-	spin_unlock_irqrestore(&trie->lock, irq_flags);
-	kfree_rcu(free_node, rcu);
+	migrate_disable();
+	if (ret)
+		bpf_mem_cache_free(&trie->ma, new_node);
+	bpf_mem_cache_free_rcu(&trie->ma, free_node);
+	migrate_enable();
 
 	return ret;
 }
@@ -459,7 +477,7 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
 	if (key->prefixlen > trie->max_prefixlen)
 		return -EINVAL;
 
-	spin_lock_irqsave(&trie->lock, irq_flags);
+	raw_spin_lock_irqsave(&trie->lock, irq_flags);
 
 	/* Walk the tree looking for an exact key/length match and keeping
	 * track of the path we traverse. We will need to know the node
@@ -535,9 +553,12 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
 	free_node = node;
 
 out:
-	spin_unlock_irqrestore(&trie->lock, irq_flags);
-	kfree_rcu(free_parent, rcu);
-	kfree_rcu(free_node, rcu);
+	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
+
+	migrate_disable();
+	bpf_mem_cache_free_rcu(&trie->ma, free_parent);
+	bpf_mem_cache_free_rcu(&trie->ma, free_node);
+	migrate_enable();
 
 	return ret;
 }
@@ -559,6 +580,8 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
 static struct bpf_map *trie_alloc(union bpf_attr *attr)
 {
 	struct lpm_trie *trie;
+	size_t leaf_size;
+	int err;
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 ||
@@ -581,9 +604,19 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
			  offsetof(struct bpf_lpm_trie_key_u8, data);
 	trie->max_prefixlen = trie->data_size * 8;
 
-	spin_lock_init(&trie->lock);
+	raw_spin_lock_init(&trie->lock);
 
+	/* Allocate intermediate and leaf nodes from the same allocator */
+	leaf_size = sizeof(struct lpm_trie_node) + trie->data_size +
+		    trie->map.value_size;
+	err = bpf_mem_alloc_init(&trie->ma, leaf_size, false);
+	if (err)
+		goto free_out;
 	return &trie->map;
+
+free_out:
+	bpf_map_area_free(trie);
+	return ERR_PTR(err);
 }
 
 static void trie_free(struct bpf_map *map)
@@ -615,13 +648,17 @@ static void trie_free(struct bpf_map *map)
				continue;
			}
 
-			kfree(node);
+			/* No bpf program may access the map, so freeing the
+			 * node without waiting for the extra RCU GP.
+			 */
+			bpf_mem_cache_raw_free(node);
			RCU_INIT_POINTER(*slot, NULL);
			break;
		}
	}
 
 out:
+	bpf_mem_alloc_destroy(&trie->ma);
 	bpf_map_area_free(trie);
 }
 
@@ -633,7 +670,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
 	struct lpm_trie_node **node_stack = NULL;
 	int err = 0, stack_ptr = -1;
 	unsigned int next_bit;
-	size_t matchlen;
+	size_t matchlen = 0;
 
 	/* The get_next_key follows postorder. For the 4 node example in
	 * the top of this file, the trie_get_next_key() returns the following
@@ -672,7 +709,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
 		next_bit = extract_bit(key->data, node->prefixlen);
 		node = rcu_dereference(node->child[next_bit]);
 	}
-	if (!node || node->prefixlen != key->prefixlen ||
+	if (!node || node->prefixlen != matchlen ||
	    (node->flags & LPM_TREE_NODE_FLAG_IM))
 		goto find_leftmost;

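The trie_get_next_key() change above tightens the exact-match test so that a lookup key that is not actually stored in the map no longer causes entries to be skipped during iteration (patch #5). For reference, a hedged sketch of the usual userspace iteration loop over an LPM trie; it is not code from this commit and reuses the fd, struct ipv4_lpm_key, and headers assumed in the earlier example.

/* Walk every prefix stored in the trie with bpf_map_get_next_key(). */
static void dump_all_prefixes(int fd)
{
	struct ipv4_lpm_key cur, next;
	char buf[INET_ADDRSTRLEN];
	int err;

	/* Passing NULL as the previous key returns the first key. */
	err = bpf_map_get_next_key(fd, NULL, &next);
	while (!err) {
		inet_ntop(AF_INET, &next.addr, buf, sizeof(buf));
		printf("%s/%u\n", buf, next.prefixlen);

		cur = next;
		err = bpf_map_get_next_key(fd, &cur, &next);
	}
	/* The walk ends with -ENOENT once the last key has been returned. */
}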
tools/testing/selftests/bpf/.gitignore
-1

@@ -5,7 +5,6 @@ bpf-syscall*
 test_verifier
 test_maps
 test_lru_map
-test_lpm_map
 test_tag
 FEATURE-DUMP.libbpf
 FEATURE-DUMP.selftests

tools/testing/selftests/bpf/Makefile
+1 -1

@@ -83,7 +83,7 @@ CLANG_CPUV4 := 1
 endif
 
 # Order correspond to 'make run_tests' order
-TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
+TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_progs \
	test_sockmap \
	test_tcpnotify_user test_sysctl \
	test_progs-no_alu32
