Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
391 changes: 391 additions & 0 deletions pocs/linux/kernelctf/CVE-2024-36978_mitigation/docs/exploit.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,391 @@
# Exploit

## Trigger Vulnerability
The vulnerability exists in the `multiq_tune()` function within the kernel's traffic control subsystem. The issue arises from a size mismatch between allocation and access boundaries.

```c
static int multiq_tune(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
struct multiq_sched_data *q = qdisc_priv(sch);
struct tc_multiq_qopt *qopt;
struct Qdisc **removed;
int i, n_removed = 0;

// ... validation checks ...

qopt = nla_data(opt);
qopt->bands = qdisc_dev(sch)->real_num_tx_queues;

// Allocation based on current q->bands value
removed = kmalloc(sizeof(*removed) * (q->max_bands - q->bands),
GFP_KERNEL);
if (!removed)
return -ENOMEM;

sch_tree_lock(sch);
q->bands = qopt->bands; // q->bands updated to new value

// Loop uses q->max_bands (unchanged) but removed was sized with old q->bands
for (i = q->bands; i < q->max_bands; i++) {
if (q->queues[i] != &noop_qdisc) {
struct Qdisc *child = q->queues[i];
q->queues[i] = &noop_qdisc;
qdisc_purge_queue(child);
removed[n_removed++] = child; // OOB write occurs here
}
}
// ...
}
```

The core issue is that `removed` is allocated using the difference `(q->max_bands - q->bands)` before `q->bands` gets updated. After the update, the loop iterates based on the new (smaller) `q->bands` value, potentially writing more entries than the allocated buffer can hold.

### Triggering Steps

1. Create a virtual ethernet device configured with 512 transmit queues
2. Reduce the transmit queue count to 465 to prevent kernel instability during exploitation
3. Attach a multiq qdisc to the interface - this stores 465 in `q->bands`
4. Further reduce transmit queues to 396
5. Invoke qdisc modification - now `q->bands` holds 465 while `qopt->bands` becomes 396, causing writes beyond the `removed` buffer boundary

## Leak kBase
Using a side channel, we can leak the kernel base (kBase) address.

## Overwrite Qdisc * to Fake Chunk
Let's look at `multiq_tune()`. We already analyzed that the `removed` array is undersized, so we can trigger an OOB access.

```c++
static int multiq_tune(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
struct multiq_sched_data *q = qdisc_priv(sch);
struct tc_multiq_qopt *qopt;
struct Qdisc **removed;
int i, n_removed = 0;

if (!netif_is_multiqueue(qdisc_dev(sch)))
return -EOPNOTSUPP;
if (nla_len(opt) < sizeof(*qopt))
return -EINVAL;

qopt = nla_data(opt);

qopt->bands = qdisc_dev(sch)->real_num_tx_queues;

// [0]
removed = kmalloc(sizeof(*removed) * (q->max_bands - q->bands),
GFP_KERNEL);
if (!removed)
return -ENOMEM;

sch_tree_lock(sch);
q->bands = qopt->bands;

// [1]
for (i = q->bands; i < q->max_bands; i++) {
if (q->queues[i] != &noop_qdisc) {
struct Qdisc *child = q->queues[i];

q->queues[i] = &noop_qdisc;
qdisc_purge_queue(child);
removed[n_removed++] = child;
}
}

// [race condition] zone start

// [2]
sch_tree_unlock(sch);
for (i = 0; i < n_removed; i++) // [race condition] zone fin (when i >= (q->max_bands - q->bands))
qdisc_put(removed[i]);

// [3]
kfree(removed);

for (i = 0; i < q->bands; i++) {
if (q->queues[i] == &noop_qdisc) {
struct Qdisc *child, *old;
child = qdisc_create_dflt(sch->dev_queue,
&pfifo_qdisc_ops,
TC_H_MAKE(sch->handle,
i + 1), extack);
if (child) {
sch_tree_lock(sch);
old = q->queues[i];
q->queues[i] = child;
if (child != &noop_qdisc)
qdisc_hash_add(child, true);

if (old != &noop_qdisc)
qdisc_purge_queue(old);
sch_tree_unlock(sch);
qdisc_put(old);
}
}
}
return 0;
}
```

In [1], we can see that `struct Qdisc *child` is pushed into the `removed` array. Because of the vulnerability, this write goes out of bounds of the `removed` array. If we can shape the memory layout as shown below, we can make the out-of-bounds `struct Qdisc *child` slot point to a fake chunk.

```
----------- removed -----------
| |
| ... |
| |
-------------------------------
| fake child pointer | <- oob page
```


However, the chunk is freed at [3]. Therefore, we need to finish the whole process between [1] and [3]. **There is no method to trigger a context switch (sleep, RCU, ...)** in this window, so we need to win a race condition between `[race condition] zone start` and `[race condition] zone fin` (once `i` exceeds the allocated size).

### What If the Vulnerability is UAF?
If this vulnerability were a non-race-condition UAF, the exploit would be much easier: just trigger the UAF and overwrite a fake chunk pointer in kmalloc-8k or kmalloc-16k. However, this vulnerability is an OOB combined with a race condition, and the code path contains no context-switching functions. Furthermore, it is just a contiguous slab OOB, so we need to preallocate adjacent memory in each per-cpu buddy (cpu 0, cpu 1).

### How to Make OOB Adjacent Page? There Exist Guard Between Different SLAB Page with CONFIG_SLAB_VIRTUAL!
Detail explained on [novel_technique.md](./novel_technique.md). The conclusion of this discussion is, we can allocate adjacent memory if we allocate more than `4k` size by `kmalloc()`.

### Put Adjacent Chunk to Each Per-Cpu Buddy
From the analysis in [How to Make OOB Adjacent Page? There Exist Guard Between Different SLAB Page!](#how-to-make-oob-adjacent-page-there-exist-guard-between-different-slab-page), we know that allocating adjacent memory is possible if we allocate chunks larger than `4k` via `kmalloc()`.

In my exploit,
1. set cpu affinity to cpu0
2. Spray 0x300 `16k` chunks to cpu0 (by using `struct simple_xattr`)
3. remove only even index of `16k` chunks allocated from (1)
4. set cpu affinity to cpu1
5. Spray 0x300 `16k` chunks to cpu1 (by using `struct simple_xattr`)

If the per-cpu buddy is full, the remaining pages go to the global buddy. So the chunks that could not be placed into the per-cpu buddy at (2) go to the global buddy and are reallocated to cpu1 at (5). Therefore, we can place adjacent memory into each per-cpu buddy.

Under the lts or cos configurations, it is hard to allocate adjacent memory in each per-cpu buddy, but under mitigation, because of CONFIG_SLAB_VIRTUAL, the buddy allocator is very quiet. Therefore, we can allocate adjacent memory in each per-cpu buddy stably.

### Race-Condition Between per-cpu buddy
Now we have allocated adjacent memory in each per-cpu buddy, and we can build the exploit scenario below.
Let the two adjacent chunks be `A` and `B`, where `A` is allocated on cpu0 and `B` is allocated on cpu1 (`A` < `B`).

To write into `B`, I use `sendmsg()`.
```c++
// https://nonetype.kr/posts/Linux-Kernel-Heap-Spraying
static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
struct msghdr *msg_sys, unsigned int flags,
struct used_address *used_address,
unsigned int allowed_msghdr_flags)
{
...

if (msg_sys->msg_controllen > INT_MAX)
goto out_freeiov;
flags |= (msg_sys->msg_flags & allowed_msghdr_flags);
ctl_len = msg_sys->msg_controllen;
if ((MSG_CMSG_COMPAT & flags) && ctl_len) {
err =
cmsghdr_from_user_compat_to_kern(msg_sys, sock->sk, ctl,
sizeof(ctl));
if (err)
goto out_freeiov;
ctl_buf = msg_sys->msg_control;
ctl_len = msg_sys->msg_controllen;
} else if (ctl_len) {
BUILD_BUG_ON(sizeof(struct cmsghdr) !=
CMSG_ALIGN(sizeof(struct cmsghdr)));
if (ctl_len > sizeof(ctl)) {

// [0]
ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL);
if (ctl_buf == NULL)
goto out_freeiov;
}
err = -EFAULT;
/*
* Careful! Before this, msg_sys->msg_control contains a user pointer.
* Afterwards, it will be a kernel pointer. Thus the compiler-assisted
* checking falls down on this.
*/

// [1]
if (copy_from_user(ctl_buf,
(void __user __force *)msg_sys->msg_control,
ctl_len))
goto out_freectl;
msg_sys->msg_control = ctl_buf;
}
msg_sys->msg_flags = flags;
...
```

Then, if the race condition succeeds as shown below, `multiq_tune()` will call `qdisc_put(fake_chunk)`.
```
CPU 0 | CPU 1
---------------------------------------|---------------------------------------
Allocate A at multiq_tune() [0] |
|
write A at multiq_tune() [1] |
|
| Allocate B at ___sys_sendmsg() [0]
|
| write B at ___sys_sendmsg() [1]
|
access A[0x800] at multiq_tune() [2] |
-------------------------------------------------------------------------------
```

Then, we need to determine the `fake_chunk` address. I use the CEA (`write_cpu_entry_area`), which is always allocated at a static kernel address. Userspace can write arbitrary values into the CEA, so we can control the data there.

### Two CPU - Three Race
If we use the CEA, as far as is known, it occupies one CPU permanently. Therefore, we would need three CPUs to exploit this vulnerability. But [novel_technique.md](./novel_technique.md) explains how to use only two CPUs when exploiting the race condition with the CEA.

## Stack Pivoting and Control RIP
Now we can control `fake_chunk`. Therefore, we need to analyze the flow of `qdisc_put()`. Our goal is to reach the `qdisc->ops->reset(qdisc)` call.

`struct Qdisc` is as follows:
```c++
struct Qdisc {
int (*enqueue)(struct sk_buff *skb,
struct Qdisc *sch,
struct sk_buff **to_free);
struct sk_buff * (*dequeue)(struct Qdisc *sch);
unsigned int flags;
#define TCQ_F_BUILTIN 1
#define TCQ_F_INGRESS 2
#define TCQ_F_CAN_BYPASS 4
#define TCQ_F_MQROOT 8
#define TCQ_F_ONETXQUEUE 0x10 /* dequeue_skb() can assume all skbs are for
* q->dev_queue : It can test
* netif_xmit_frozen_or_stopped() before
* dequeueing next packet.
* Its true for MQ/MQPRIO slaves, or non
* multiqueue device.
*/
#define TCQ_F_WARN_NONWC (1 << 16)
#define TCQ_F_CPUSTATS 0x20 /* run using percpu statistics */
#define TCQ_F_NOPARENT 0x40 /* root of its hierarchy :
* qdisc_tree_decrease_qlen() should stop.
*/
#define TCQ_F_INVISIBLE 0x80 /* invisible by default in dump */
#define TCQ_F_NOLOCK 0x100 /* qdisc does not require locking */
#define TCQ_F_OFFLOADED 0x200 /* qdisc is offloaded to HW */
u32 limit;
const struct Qdisc_ops *ops;
struct qdisc_size_table __rcu *stab;
struct hlist_node hash;
u32 handle;
u32 parent;

struct netdev_queue *dev_queue;

struct net_rate_estimator __rcu *rate_est;
struct gnet_stats_basic_sync __percpu *cpu_bstats;
struct gnet_stats_queue __percpu *cpu_qstats;
int pad;
refcount_t refcnt;

[...] //more
};
```

First, we need to reach `__qdisc_destroy()`, so the fake chunk must satisfy `!(qdisc->flags & TCQ_F_BUILTIN)` and `qdisc->refcnt == 1`.
```c++
void qdisc_put(struct Qdisc *qdisc)
{
if (!qdisc)
return;

if (qdisc->flags & TCQ_F_BUILTIN ||
!refcount_dec_and_test(&qdisc->refcnt))
return;

__qdisc_destroy(qdisc);
}
EXPORT_SYMBOL(qdisc_put);
```

Next, we need to pass `qdisc_hash_del()`, `qdisc_put_stab()`, `gen_kill_estimator()`.
```c++
static void __qdisc_destroy(struct Qdisc *qdisc)
{
const struct Qdisc_ops *ops = qdisc->ops;

#ifdef CONFIG_NET_SCHED
qdisc_hash_del(qdisc);

qdisc_put_stab(rtnl_dereference(qdisc->stab));
#endif
gen_kill_estimator(&qdisc->rate_est);

qdisc_reset(qdisc);

if (ops->destroy)
ops->destroy(qdisc);

module_put(ops->owner);
netdev_put(qdisc_dev(qdisc), &qdisc->dev_tracker);

trace_qdisc_destroy(qdisc);

call_rcu(&qdisc->rcu, qdisc_free_cb);
}
```

Therefore, the fake chunk's fields must satisfy `qdisc->parent == TC_H_ROOT`, `qdisc->stab == NULL`, and `qdisc->rate_est == NULL`.
```c++
void qdisc_hash_del(struct Qdisc *q)
{
if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
ASSERT_RTNL();
hash_del_rcu(&q->hash);
}
}
EXPORT_SYMBOL(qdisc_hash_del);

void qdisc_put_stab(struct qdisc_size_table *tab)
{
if (!tab)
return;

if (--tab->refcnt == 0) {
list_del(&tab->list);
kfree_rcu(tab, rcu);
}
}
EXPORT_SYMBOL(qdisc_put_stab);

void gen_kill_estimator(struct net_rate_estimator __rcu **rate_est)
{
struct net_rate_estimator *est;

est = xchg((__force struct net_rate_estimator **)rate_est, NULL);
if (est) {
del_timer_sync(&est->timer);
kfree_rcu(est, rcu);
}
}
EXPORT_SYMBOL(gen_kill_estimator);
```

Then, we can control RIP by using `qdisc->ops->reset(qdisc)`.
```c++
void qdisc_reset(struct Qdisc *qdisc)
{
const struct Qdisc_ops *ops = qdisc->ops;

trace_qdisc_reset(qdisc);

if (ops->reset)
ops->reset(qdisc);

__skb_queue_purge(&qdisc->gso_skb);
__skb_queue_purge(&qdisc->skb_bad_txq);

qdisc->q.qlen = 0;
qdisc->qstats.backlog = 0;
}
```

### Stack Pivoting
Looking at the assembly, `rbp` holds the `fake_struct` address, so the epilogue sequence `mov rsp, rbp ; pop rbp ; ret` triggers the stack pivot.

## Arbitrary Code Execution
The last problem is that the stack space available for the ROP chain is too small: we must finish the ROP in only 6 gadget slots, which is not enough to get the `flag`. Therefore, we use [novel_technique.md](./novel_technique.md) to extend the ROP stack, as was done in [exp183](../exp183/novel_technique.md).
Loading
Loading