Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
391 changes: 391 additions & 0 deletions pocs/linux/kernelctf/CVE-2024-36978_mitigation/docs/exploit.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,391 @@
# Exploit

## Trigger Vulnerability
The vulnerability exists in the `multiq_tune()` function within the kernel's traffic control subsystem. The issue arises from a size mismatch between allocation and access boundaries.

```c
static int multiq_tune(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
struct multiq_sched_data *q = qdisc_priv(sch);
struct tc_multiq_qopt *qopt;
struct Qdisc **removed;
int i, n_removed = 0;

// ... validation checks ...

qopt = nla_data(opt);
qopt->bands = qdisc_dev(sch)->real_num_tx_queues;

// Allocation based on current q->bands value
removed = kmalloc(sizeof(*removed) * (q->max_bands - q->bands),
GFP_KERNEL);
if (!removed)
return -ENOMEM;

sch_tree_lock(sch);
q->bands = qopt->bands; // q->bands updated to new value

// Loop uses q->max_bands (unchanged) but removed was sized with old q->bands
for (i = q->bands; i < q->max_bands; i++) {
if (q->queues[i] != &noop_qdisc) {
struct Qdisc *child = q->queues[i];
q->queues[i] = &noop_qdisc;
qdisc_purge_queue(child);
removed[n_removed++] = child; // OOB write occurs here
}
}
// ...
}
```

The core issue is that `removed` is allocated using the difference `(q->max_bands - q->bands)` before `q->bands` gets updated. After the update, the loop iterates based on the new (smaller) `q->bands` value, potentially writing more entries than the allocated buffer can hold.

### Triggering Steps

1. Create a virtual ethernet device configured with 512 transmit queues
2. Reduce the transmit queue count to 465 to prevent kernel instability during exploitation
3. Attach a multiq qdisc to the interface - this stores 465 in `q->bands`
4. Further reduce transmit queues to 396
5. Invoke qdisc modification - now `q->bands` holds 465 while `qopt->bands` becomes 396, causing writes beyond the `removed` buffer boundary

## Leak kBase
Using a side channel, we can leak the kernel base (kBase) address.

## Overwrite Qdisc * to Fake Chunk
Let's look at `multiq_tune()`. We already analyzed that the `removed` array is undersized, so we can trigger an OOB access.

```c++
static int multiq_tune(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
struct multiq_sched_data *q = qdisc_priv(sch);
struct tc_multiq_qopt *qopt;
struct Qdisc **removed;
int i, n_removed = 0;

if (!netif_is_multiqueue(qdisc_dev(sch)))
return -EOPNOTSUPP;
if (nla_len(opt) < sizeof(*qopt))
return -EINVAL;

qopt = nla_data(opt);

qopt->bands = qdisc_dev(sch)->real_num_tx_queues;

// [0]
removed = kmalloc(sizeof(*removed) * (q->max_bands - q->bands),
GFP_KERNEL);
if (!removed)
return -ENOMEM;

sch_tree_lock(sch);
q->bands = qopt->bands;

// [1]
for (i = q->bands; i < q->max_bands; i++) {
if (q->queues[i] != &noop_qdisc) {
struct Qdisc *child = q->queues[i];

q->queues[i] = &noop_qdisc;
qdisc_purge_queue(child);
removed[n_removed++] = child;
}
}

// [race condition] zone start

// [2]
sch_tree_unlock(sch);
for (i = 0; i < n_removed; i++) // [race condition] zone fin (when i >= (q->max_bands - q->bands))
qdisc_put(removed[i]);

// [3]
kfree(removed);

for (i = 0; i < q->bands; i++) {
if (q->queues[i] == &noop_qdisc) {
struct Qdisc *child, *old;
child = qdisc_create_dflt(sch->dev_queue,
&pfifo_qdisc_ops,
TC_H_MAKE(sch->handle,
i + 1), extack);
if (child) {
sch_tree_lock(sch);
old = q->queues[i];
q->queues[i] = child;
if (child != &noop_qdisc)
qdisc_hash_add(child, true);

if (old != &noop_qdisc)
qdisc_purge_queue(old);
sch_tree_unlock(sch);
qdisc_put(old);
}
}
}
return 0;
}
```

In [1], we can see that `struct Qdisc *child` is pushed into the `removed` array. Because of the vulnerability, this write goes out of bounds of the `removed` array. If we can shape the memory layout as shown below, we can make the out-of-bounds `struct Qdisc *child` slot point to a fake chunk.

```
----------- removed -----------
| |
| ... |
| |
-------------------------------
| fake child pointer | <- oob page
```


However, the chunk is freed at [3]. Therefore, we need to finish the whole process between [1] and [3]. **There is no method to trigger a context switch (sleep, RCU, ...)** in this window, so we need to win a race condition between `[race condition] zone start` and `[race condition] zone fin` (once `i` exceeds the allocated size).

### What If the Vulnerability is UAF?
If this vulnerability were a non-race-condition UAF, the exploit would be much easier: just trigger the UAF and overwrite a fake chunk pointer in kmalloc-8k or kmalloc-16k. However, this vulnerability is an OOB combined with a race condition, and the code path contains no context-switching functions. Furthermore, it is just a contiguous slab OOB, so we need to preallocate adjacent memory in each per-cpu buddy (cpu 0, cpu 1).

### How to Make OOB Adjacent Page? There Exist Guard Between Different SLAB Page with CONFIG_SLAB_VIRTUAL!
Detail explained on [novel_technique.md](./novel_technique.md). The conclusion of this discussion is, we can allocate adjacent memory if we allocate more than `4k` size by `kmalloc()`.

### Put Adjacent Chunk to Each Per-Cpu Buddy
From the analysis in [How to Make OOB Adjacent Page? There Exist Guard Between Different SLAB Page!](#how-to-make-oob-adjacent-page-there-exist-guard-between-different-slab-page), we know that allocating adjacent memory is possible if we allocate chunks larger than `4k` via `kmalloc()`.

In my exploit,
1. set cpu affinity to cpu0
2. Spray 0x300 `16k` chunks to cpu0 (by using `struct simple_xattr`)
3. remove only even index of `16k` chunks allocated from (1)
4. set cpu affinity to cpu1
5. Spray 0x300 `16k` chunks to cpu1 (by using `struct simple_xattr`)

If the per-cpu buddy is full, the remaining pages go to the global buddy. So the chunks that could not be placed into the per-cpu buddy at (2) go to the global buddy and are reallocated to cpu1 at (5). Therefore, we can place adjacent memory into each per-cpu buddy.

Under the lts or cos configurations, it is hard to allocate adjacent memory in each per-cpu buddy, but under mitigation, because of CONFIG_SLAB_VIRTUAL, the buddy allocator is very quiet. Therefore, we can allocate adjacent memory in each per-cpu buddy stably.

### Race-Condition Between per-cpu buddy
Now we have allocated adjacent memory in each per-cpu buddy, and we can build the exploit scenario below.
Let the two adjacent chunks be `A` and `B`, where `A` is allocated on cpu0 and `B` is allocated on cpu1 (`A` < `B`).

To write into `B`, I use `sendmsg()`.
```c++
// https://nonetype.kr/posts/Linux-Kernel-Heap-Spraying
static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
struct msghdr *msg_sys, unsigned int flags,
struct used_address *used_address,
unsigned int allowed_msghdr_flags)
{
...

if (msg_sys->msg_controllen > INT_MAX)
goto out_freeiov;
flags |= (msg_sys->msg_flags & allowed_msghdr_flags);
ctl_len = msg_sys->msg_controllen;
if ((MSG_CMSG_COMPAT & flags) && ctl_len) {
err =
cmsghdr_from_user_compat_to_kern(msg_sys, sock->sk, ctl,
sizeof(ctl));
if (err)
goto out_freeiov;
ctl_buf = msg_sys->msg_control;
ctl_len = msg_sys->msg_controllen;
} else if (ctl_len) {
BUILD_BUG_ON(sizeof(struct cmsghdr) !=
CMSG_ALIGN(sizeof(struct cmsghdr)));
if (ctl_len > sizeof(ctl)) {

// [0]
ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL);
if (ctl_buf == NULL)
goto out_freeiov;
}
err = -EFAULT;
/*
* Careful! Before this, msg_sys->msg_control contains a user pointer.
* Afterwards, it will be a kernel pointer. Thus the compiler-assisted
* checking falls down on this.
*/

// [1]
if (copy_from_user(ctl_buf,
(void __user __force *)msg_sys->msg_control,
ctl_len))
goto out_freectl;
msg_sys->msg_control = ctl_buf;
}
msg_sys->msg_flags = flags;
...
```

Then, if the race condition succeeds as shown below, `multiq_tune()` will call `qdisc_put(fake_chunk)`.
```
CPU 0 | CPU 1
---------------------------------------|---------------------------------------
Allocate A at multiq_tune() [0] |
|
write A at multiq_tune() [1] |
|
| Allocate B at ___sys_sendmsg() [0]
|
| write B at ___sys_sendmsg() [1]
|
access A[0x800] at multiq_tune() [2] |
-------------------------------------------------------------------------------
```

Then, we need to determine the `fake_chunk` address. I use the CEA (`write_cpu_entry_area`), which is always allocated at a static kernel address. Userspace can write arbitrary values into the CEA, so we can control the data there.

### Two CPU - Three Race
If we use the CEA, as far as is known, it occupies one CPU permanently. Therefore, we would need three CPUs to exploit this vulnerability. But [novel_technique.md](./novel_technique.md) explains how to use only two CPUs when exploiting the race condition with the CEA.

## Stack Pivoting and Control RIP
Now we can control `fake_chunk`. Therefore, we need to analyze the flow of `qdisc_put()`. Our goal is to reach the `qdisc->ops->reset(qdisc)` call.

`struct Qdisc` is as follows:
```c++
struct Qdisc {
int (*enqueue)(struct sk_buff *skb,
struct Qdisc *sch,
struct sk_buff **to_free);
struct sk_buff * (*dequeue)(struct Qdisc *sch);
unsigned int flags;
#define TCQ_F_BUILTIN 1
#define TCQ_F_INGRESS 2
#define TCQ_F_CAN_BYPASS 4
#define TCQ_F_MQROOT 8
#define TCQ_F_ONETXQUEUE 0x10 /* dequeue_skb() can assume all skbs are for
* q->dev_queue : It can test
* netif_xmit_frozen_or_stopped() before
* dequeueing next packet.
* Its true for MQ/MQPRIO slaves, or non
* multiqueue device.
*/
#define TCQ_F_WARN_NONWC (1 << 16)
#define TCQ_F_CPUSTATS 0x20 /* run using percpu statistics */
#define TCQ_F_NOPARENT 0x40 /* root of its hierarchy :
* qdisc_tree_decrease_qlen() should stop.
*/
#define TCQ_F_INVISIBLE 0x80 /* invisible by default in dump */
#define TCQ_F_NOLOCK 0x100 /* qdisc does not require locking */
#define TCQ_F_OFFLOADED 0x200 /* qdisc is offloaded to HW */
u32 limit;
const struct Qdisc_ops *ops;
struct qdisc_size_table __rcu *stab;
struct hlist_node hash;
u32 handle;
u32 parent;

struct netdev_queue *dev_queue;

struct net_rate_estimator __rcu *rate_est;
struct gnet_stats_basic_sync __percpu *cpu_bstats;
struct gnet_stats_queue __percpu *cpu_qstats;
int pad;
refcount_t refcnt;

[...] //more
};
```

First, we need to reach `__qdisc_destroy()`, so the fake chunk must satisfy `!(qdisc->flags & TCQ_F_BUILTIN)` and `qdisc->refcnt == 1`.
```c++
void qdisc_put(struct Qdisc *qdisc)
{
if (!qdisc)
return;

if (qdisc->flags & TCQ_F_BUILTIN ||
!refcount_dec_and_test(&qdisc->refcnt))
return;

__qdisc_destroy(qdisc);
}
EXPORT_SYMBOL(qdisc_put);
```

Next, we need to pass `qdisc_hash_del()`, `qdisc_put_stab()`, `gen_kill_estimator()`.
```c++
static void __qdisc_destroy(struct Qdisc *qdisc)
{
const struct Qdisc_ops *ops = qdisc->ops;

#ifdef CONFIG_NET_SCHED
qdisc_hash_del(qdisc);

qdisc_put_stab(rtnl_dereference(qdisc->stab));
#endif
gen_kill_estimator(&qdisc->rate_est);

qdisc_reset(qdisc);

if (ops->destroy)
ops->destroy(qdisc);

module_put(ops->owner);
netdev_put(qdisc_dev(qdisc), &qdisc->dev_tracker);

trace_qdisc_destroy(qdisc);

call_rcu(&qdisc->rcu, qdisc_free_cb);
}
```

Therefore, the fake chunk's fields must satisfy `qdisc->parent == TC_H_ROOT`, `qdisc->stab == NULL`, and `qdisc->rate_est == NULL`.
```c++
void qdisc_hash_del(struct Qdisc *q)
{
if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
ASSERT_RTNL();
hash_del_rcu(&q->hash);
}
}
EXPORT_SYMBOL(qdisc_hash_del);

void qdisc_put_stab(struct qdisc_size_table *tab)
{
if (!tab)
return;

if (--tab->refcnt == 0) {
list_del(&tab->list);
kfree_rcu(tab, rcu);
}
}
EXPORT_SYMBOL(qdisc_put_stab);

void gen_kill_estimator(struct net_rate_estimator __rcu **rate_est)
{
struct net_rate_estimator *est;

est = xchg((__force struct net_rate_estimator **)rate_est, NULL);
if (est) {
del_timer_sync(&est->timer);
kfree_rcu(est, rcu);
}
}
EXPORT_SYMBOL(gen_kill_estimator);
```

Then, we can control RIP by using `qdisc->ops->reset(qdisc)`.
```c++
void qdisc_reset(struct Qdisc *qdisc)
{
const struct Qdisc_ops *ops = qdisc->ops;

trace_qdisc_reset(qdisc);

if (ops->reset)
ops->reset(qdisc);

__skb_queue_purge(&qdisc->gso_skb);
__skb_queue_purge(&qdisc->skb_bad_txq);

qdisc->q.qlen = 0;
qdisc->qstats.backlog = 0;
}
```

### Stack Pivoting
Looking at the assembly, `rbp` holds the `fake_struct` address, so the epilogue sequence `mov rsp, rbp ; pop rbp ; ret` triggers the stack pivot.

## Arbitrary Code Execution
The last problem is that the stack space available for the ROP chain is too small: we must finish the ROP in only 6 gadget slots, which is not enough to get the `flag`. Therefore, we use [novel_technique.md](./novel_technique.md) to extend the ROP stack, as was done in [exp183](../exp183/novel_technique.md).
Loading
Loading