diff --git a/AspBaseline/AspBaseline.csproj b/AspBaseline/AspBaseline.csproj new file mode 100644 index 0000000..82792b7 --- /dev/null +++ b/AspBaseline/AspBaseline.csproj @@ -0,0 +1,12 @@ + + + + net10.0 + enable + enable + true + false + true + + + diff --git a/AspBaseline/Program.cs b/AspBaseline/Program.cs new file mode 100644 index 0000000..25e8b5a --- /dev/null +++ b/AspBaseline/Program.cs @@ -0,0 +1,39 @@ +using System.Text.Json; +using Microsoft.Extensions.Logging; + +var builder = WebApplication.CreateBuilder(args); +builder.Logging.SetMinimumLevel(LogLevel.Warning); +builder.WebHost.ConfigureKestrel(o => o.ListenAnyIP(8080)); // default Kestrel socket transport + +var app = builder.Build(); + +// Same knob + same object as Minima's handler: serialize a WORK_ITEMS-element object to +// JSON per request and discard it (the work stands in for a serializing endpoint). +// 0 / unset = no work (plain "ok"). The handler already runs on the thread pool here — +// no Task.Run needed — which is exactly Kestrel's model. +int workItems = 1000; +Payload largeObject = BuildPayload(Math.Max(workItems, 1)); + +app.MapGet("/", () => +{ + if (workItems > 0) + { + _ = JsonSerializer.SerializeToUtf8Bytes(largeObject); + } + return "ok"; +}); + +app.Run(); + +static Payload BuildPayload(int count) +{ + var items = new Item[count]; + for (int i = 0; i < count; i++) + { + items[i] = new Item(i, $"item-{i}", i * 1.5, (i & 1) == 0, $"category-{i % 8}"); + } + return new Payload(DateTime.UtcNow.ToString("O"), count, items); +} + +internal sealed record Item(int Id, string Name, double Value, bool Active, string Category); +internal sealed record Payload(string Generated, int Count, Item[] Items); diff --git a/Examples/ZeroAlloc/Basic/Rings_as_ReadOnlySpan.cs b/Examples/ZeroAlloc/Basic/Rings_as_ReadOnlySpan.cs index 9d0e068..bf55d1f 100644 --- a/Examples/ZeroAlloc/Basic/Rings_as_ReadOnlySpan.cs +++ b/Examples/ZeroAlloc/Basic/Rings_as_ReadOnlySpan.cs @@ -1,3 +1,4 @@ +using System.Text.Json; using zerg; using zerg.core; @@ -5,6 +6,26 @@ namespace Examples.ZeroAlloc.Basic; internal sealed class Rings_as_ReadOnlySpan { + internal sealed record Item(int Id, string Name, double Value, bool Active, string Category); + internal sealed record Payload(string Generated, int Count, Item[] Items); + + // Real async-work knob: serialize an in-memory object of WORK_ITEMS elements to JSON + // on the THREAD POOL (via Task.Run) per request. 0 / unset = disabled (pure inline + // reactor path). Genuine CPU + allocation, not a busy-spin. + private static readonly int WorkItems = 50; + + private static readonly Payload LargeObject = BuildPayload(Math.Max(WorkItems, 1)); + + private static Payload BuildPayload(int count) + { + var items = new Item[count]; + for (int i = 0; i < count; i++) + { + items[i] = new Item(i, $"item-{i}", i * 1.5, (i & 1) == 0, $"category-{i % 8}"); + } + return new Payload(DateTime.UtcNow.ToString("O"), count, items); + } + internal static async Task HandleConnectionAsync(Connection connection) { while (true) @@ -19,6 +40,11 @@ internal static async Task HandleConnectionAsync(Connection connection) var sequence = rings.ToReadOnlySequence(); // Process received data... + if (WorkItems > 0) + { + //_ = await Task.Run(static () => JsonSerializer.SerializeToUtf8Bytes(LargeObject)); + JsonSerializer.SerializeToUtf8Bytes(LargeObject); + } // Return rings to the kernel foreach (var ring in rings) diff --git a/Minima/Connection/Connection.Write.cs b/Minima/Connection/Connection.Write.cs index a2bab99..2f4c0ce 100644 --- a/Minima/Connection/Connection.Write.cs +++ b/Minima/Connection/Connection.Write.cs @@ -90,6 +90,15 @@ public void Write(ReadOnlySpan source) // Flush inner buffer data to the kernel public ValueTask FlushAsync() { + // Connection already torn down (reactor saw EOF/error → MarkClosed): don't flush + // a removed connection — the handoff would reach a reactor that no longer knows + // this fd and the awaiter would hang. Return completed so the handler unwinds to + // its next ReadAsync, sees IsClosed, and exits. + if (Volatile.Read(ref _closed) == 1) + { + return default; + } + if (Interlocked.Exchange(ref _flushInProgress, 1) == 1) { throw new InvalidOperationException("FlushAsync already in progress."); @@ -115,6 +124,14 @@ public ValueTask FlushAsync() _reactor.EnqueueFlush(ClientFd); + // Race recovery (mirrors ReadAsync): if close raced in after the guard above, + // self-complete so we don't hang waiting on a send the reactor will never make. + if (Volatile.Read(ref _closed) == 1 && Interlocked.Exchange(ref _flushArmed, 0) == 1) + { + Volatile.Write(ref _flushInProgress, 0); + _flushSignal.SetResult(true); + } + return new ValueTask(this, (short)gen); } diff --git a/Minima/Connection/Connection.cs b/Minima/Connection/Connection.cs index 291a73a..b6b3a06 100644 --- a/Minima/Connection/Connection.cs +++ b/Minima/Connection/Connection.cs @@ -12,7 +12,13 @@ public sealed unsafe partial class Connection // Bumped on Clear(); the low 16 bits are used as the IVTS token so stale // awaiters can be detected after pool reuse. private int _generation; - + + // Refcount: the connection has two owners — the reactor (recv side) and the + // handler (which may run off-reactor). Init to 2 on accept; each owner DecRef's + // when done; teardown (Recycle) runs only at refs==0, so a connection is never + // recycled or pool-reused while a handler is still in flight on another thread. + private int _refs; + public Connection(Reactor reactor, int fd, int writeSlabSize = 1024 * 16) { _reactor = reactor; @@ -53,7 +59,20 @@ public void MarkClosed() _flushSignal.SetResult(true); } } - + + // Init to 2 (reactor + handler) at accept. + internal void InitRefs() => Volatile.Write(ref _refs, 2); + + // Release one owner's ref. Whoever drives it to 0 hands the connection to the + // reactor for teardown (close + Clear + pool) — never recycled before both done. + internal void DecRef() + { + if (Interlocked.Decrement(ref _refs) == 0) + { + _reactor.EnqueueRecycle(this); + } + } + internal void Clear() { // Bump generation first — readers of IVTS plumbing observe this via diff --git a/Minima/Program.cs b/Minima/Program.cs index fa2782c..c4e38a0 100644 --- a/Minima/Program.cs +++ b/Minima/Program.cs @@ -1,5 +1,6 @@ using System.Buffers; using System.IO.Pipelines; +using System.Text.Json; using Minima.Utils; namespace Minima; @@ -22,6 +23,7 @@ private static int Main() var config = new ServerConfig() { UsePipe = false, + ReactorCount = 24 }; Console.WriteLine($"[Minima] starting {config.ReactorCount} reactors on port {config.Port} (incremental={config.Incremental})"); @@ -50,6 +52,23 @@ private static int Main() internal static class Handler { + // Real async-work knob: serialize an in-memory object of WORK_ITEMS elements to JSON + // on the THREAD POOL (via Task.Run) per request. 0 / unset = disabled (pure inline + // reactor path). Genuine CPU + allocation, not a busy-spin. + private static readonly int WorkItems = 50; + + private static readonly Payload LargeObject = BuildPayload(Math.Max(WorkItems, 1)); + + private static Payload BuildPayload(int count) + { + var items = new Item[count]; + for (int i = 0; i < count; i++) + { + items[i] = new Item(i, $"item-{i}", i * 1.5, (i & 1) == 0, $"category-{i % 8}"); + } + return new Payload(DateTime.UtcNow.ToString("O"), count, items); + } + public static async Task HandleAsync(Reactor reactor, Connection conn) { try @@ -73,6 +92,16 @@ public static async Task HandleAsync(Reactor reactor, Connection conn) } } + // Real async work: serialize a large object to JSON on the THREAD POOL. + // The handler resumes OFF-REACTOR, so the FlushAsync below pays the eventfd + // handoff the pure-inline path avoids — and the serialization is genuine + // CPU + GC pressure on the pool, not a busy-spin. + if (WorkItems > 0) + { + //_ = await Task.Run(static () => JsonSerializer.SerializeToUtf8Bytes(LargeObject)); + JsonSerializer.SerializeToUtf8Bytes(LargeObject); + } + // One response per recv burst — accumulate in the connection's // per-connection write slab, then submit and await ack. conn.Write(Program.Response); @@ -94,6 +123,10 @@ public static async Task HandleAsync(Reactor reactor, Connection conn) // Reactor will clean the connection up via the recv-error path // (or SPSC overflow) on the next CQE for this fd. } + finally + { + conn.DecRef(); // release the handler's ref; teardown runs once the reactor releases too + } } // PipeReader/PipeWriter variant — same behavior, driven through the BCL @@ -134,6 +167,10 @@ public static async Task HandlePipeAsync(Reactor reactor, Connection conn) { reader.Complete(); writer.Complete(); + conn.DecRef(); } } } + +internal sealed record Item(int Id, string Name, double Value, bool Active, string Category); +internal sealed record Payload(string Generated, int Count, Item[] Items); diff --git a/Minima/Reactor/Reactor.Incremental.cs b/Minima/Reactor/Reactor.Incremental.cs index 6da3f68..13b1351 100644 --- a/Minima/Reactor/Reactor.Incremental.cs +++ b/Minima/Reactor/Reactor.Incremental.cs @@ -175,6 +175,7 @@ private void LoopIncremental() { DrainReturnQIncremental(); DrainFlushQ(); + DrainRecycleQ(); int rc = Ring.SubmitAndWait(1); if (rc < 0 && rc != -EINTR && rc != -EAGAIN && rc != -EBUSY) @@ -220,6 +221,7 @@ private void DispatchIncremental(in IoUringCqe cqe) ? pooled.SetFd(clientFd) : new Connection(this, clientFd, _config.WriteSlabSize); Connections[clientFd] = conn; + conn.InitRefs(); SetupConnectionBufRing(conn); SubmitRecvMultishot(clientFd, conn.Bgid); @@ -247,9 +249,10 @@ private void DispatchIncremental(in IoUringCqe cqe) // Peer EOF / recv error — the whole per-conn ring is freed in Recycle. if (Connections.Remove(fd, out var dyingConn)) { - Recycle(dyingConn, fd); + dyingConn.MarkClosed(); + dyingConn.DecRef(); } - + return; } @@ -284,8 +287,9 @@ private void DispatchIncremental(in IoUringCqe cqe) if (cqe.res <= 0) { Connections.Remove(fd); - Recycle(conn, fd); - + conn.MarkClosed(); + conn.DecRef(); + return; } conn.WriteHead += cqe.res; diff --git a/Minima/Reactor/Reactor.cs b/Minima/Reactor/Reactor.cs index 8483a68..0346f62 100644 --- a/Minima/Reactor/Reactor.cs +++ b/Minima/Reactor/Reactor.cs @@ -1,3 +1,4 @@ +using System.Collections.Concurrent; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using Minima.Utils; @@ -52,6 +53,12 @@ public sealed unsafe partial class Reactor private readonly Mpsc _returnQ = new(1 << 14); // 16384 slots private readonly Mpsc _flushQ = new(1 << 12); // 4096 slots + // Teardown handoff: when a connection's refcount hits 0 off-reactor (handler exited + // on the thread pool), the recycle must run on the reactor (it touches the buf_ring + // and the reactor-only pool). Connection is a ref type, so this is a ConcurrentQueue + // rather than the unmanaged Mpsc. + private readonly ConcurrentQueue _recycleQ = new(); + // Connection pool. Reactor-thread-only — accept and teardown both run on // this reactor, so a plain Stack is sufficient (no MPMC primitive // needed). PoolMax caps the slab footprint per reactor: @@ -215,6 +222,27 @@ private void DrainFlushQ() } } + // Called by Connection.DecRef when the refcount hits 0. Teardown must run on the + // reactor (buf_ring + pool are reactor-owned), so off-reactor callers hand off. + internal void EnqueueRecycle(Connection conn) + { + if (Environment.CurrentManagedThreadId == _reactorThreadId) + { + Recycle(conn, conn.ClientFd); + return; + } + _recycleQ.Enqueue(conn); + WakeFdWrite(); + } + + private void DrainRecycleQ() + { + while (_recycleQ.TryDequeue(out Connection? conn)) + { + Recycle(conn, conn.ClientFd); + } + } + private void ArmWakePoll() { IoUringSqe* sqe = GetSqeOrFlush(); @@ -277,6 +305,7 @@ private void LoopShared() // Drain MPSC queues from off-reactor handlers. Cheap when empty. DrainReturnQ(); DrainFlushQ(); + DrainRecycleQ(); int rc = Ring.SubmitAndWait(1); if (rc < 0 && rc != -EINTR && rc != -EAGAIN && rc != -EBUSY) @@ -325,6 +354,7 @@ private void Dispatch(in IoUringCqe cqe) ? pooled.SetFd(clientFd) : new Connection(this, clientFd, _config.WriteSlabSize); Connections[clientFd] = conn; + conn.InitRefs(); SubmitRecvMultishot(clientFd); _ = _config.UsePipe @@ -355,7 +385,8 @@ private void Dispatch(in IoUringCqe cqe) } if (Connections.Remove(fd, out var dyingConn)) { - Recycle(dyingConn, fd); + dyingConn.MarkClosed(); // signal the handler to exit + dyingConn.DecRef(); // release the reactor's ref; teardown at refs==0 } return; } @@ -386,9 +417,10 @@ private void Dispatch(in IoUringCqe cqe) } if (cqe.res <= 0) { - // Send error — reactor owns teardown. + // Send error — release the reactor's ref; teardown when the handler exits too. Connections.Remove(fd); - Recycle(conn, fd); + conn.MarkClosed(); + conn.DecRef(); return; } conn.WriteHead += cqe.res; diff --git a/Shrike.Playground/Program.cs b/Shrike.Playground/Program.cs index 20f219c..070112f 100644 --- a/Shrike.Playground/Program.cs +++ b/Shrike.Playground/Program.cs @@ -1,5 +1,6 @@ // ReSharper disable always SuggestVarOrType_BuiltInTypes using System.Runtime.CompilerServices; +using System.Text.Json; using Shrike; [SkipLocalsInit] @@ -20,6 +21,23 @@ public static void Main() engine.Build().Run(); } + // Same knob + object as Minima / AspBaseline / SocketBaseline: serialize a + // WORK_ITEMS-element object to JSON on the THREAD POOL per request. 0/unset = inline. + private static readonly int WorkItems = + int.TryParse(Environment.GetEnvironmentVariable("WORK_ITEMS"), out int n) ? n : 0; + + private static readonly Payload LargeObject = BuildPayload(Math.Max(WorkItems, 1)); + + private static Payload BuildPayload(int count) + { + var items = new Item[count]; + for (int i = 0; i < count; i++) + { + items[i] = new Item(i, $"item-{i}", i * 1.5, (i & 1) == 0, $"category-{i % 8}"); + } + return new Payload(DateTime.UtcNow.ToString("O"), count, items); + } + /// /// The per-connection handler — Minima-style. The handler owns the request /// lifecycle through the connection's IVTS-backed read/flush: @@ -45,7 +63,16 @@ private static async Task HandleAsync(Connection conn) } if (wrote) + { + // Real async work on the THREAD POOL — handler resumes off-worker. Shrike's + // FlushAsync does a thread-safe send() directly (epoll), so no handoff here. + if (WorkItems > 0) + { + _ = await Task.Run(static () => JsonSerializer.SerializeToUtf8Bytes(LargeObject)); + } + await conn.FlushAsync(); + } } } @@ -71,3 +98,6 @@ private static unsafe void CommitPlainTextResponse(Connection connection) dst[1] = (byte)('0' + ones); } } + +internal sealed record Item(int Id, string Name, double Value, bool Active, string Category); +internal sealed record Payload(string Generated, int Count, Item[] Items); diff --git a/SocketBaseline/Program.cs b/SocketBaseline/Program.cs new file mode 100644 index 0000000..cd20f3d --- /dev/null +++ b/SocketBaseline/Program.cs @@ -0,0 +1,66 @@ +using System.Net; +using System.Net.Sockets; +using System.Text.Json; + +// Raw System.Net.Sockets HTTP/1.1 server — NO ASP.NET, NO Kestrel. A single async accept +// loop; each connection is handled on the thread pool via the runtime's async socket engine +// (epoll-backed on Linux). Same WORK_ITEMS knob + same object as Minima / AspBaseline. +int workItems = 50; +Payload largeObject = BuildPayload(Math.Max(workItems, 1)); + +byte[] response = "HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nContent-Length: 2\r\n\r\nok"u8.ToArray(); + +using var listener = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); +listener.SetSocketOption(SocketOptionLevel.Socket, SocketOptionName.ReuseAddress, true); +listener.Bind(new IPEndPoint(IPAddress.Any, 8080)); +listener.Listen(512); +Console.WriteLine($"[SocketBaseline] listening on :8080 (WORK_ITEMS={workItems})"); + +while (true) +{ + Socket client = await listener.AcceptAsync(); + _ = HandleAsync(client); +} + +async Task HandleAsync(Socket client) +{ + client.NoDelay = true; // TCP_NODELAY + byte[] buf = new byte[16 * 1024]; + try + { + while (true) + { + int read = await client.ReceiveAsync(buf.AsMemory(), SocketFlags.None); + if (read <= 0) break; // peer closed + + // Same work as Minima/AspBaseline: serialize the object on the thread pool + // (the handler already runs there) and discard. WORK_ITEMS=0 → plain "ok". + if (workItems > 0) + { + // Force threadpool + _ = await Task.Run(() => JsonSerializer.SerializeToUtf8Bytes(largeObject)); + } + + int sent = 0; + while (sent < response.Length) + { + sent += await client.SendAsync(response.AsMemory(sent), SocketFlags.None); + } + } + } + catch { } + finally { client.Dispose(); } +} + +static Payload BuildPayload(int count) +{ + var items = new Item[count]; + for (int i = 0; i < count; i++) + { + items[i] = new Item(i, $"item-{i}", i * 1.5, (i & 1) == 0, $"category-{i % 8}"); + } + return new Payload(DateTime.UtcNow.ToString("O"), count, items); +} + +internal sealed record Item(int Id, string Name, double Value, bool Active, string Category); +internal sealed record Payload(string Generated, int Count, Item[] Items); diff --git a/SocketBaseline/SocketBaseline.csproj b/SocketBaseline/SocketBaseline.csproj new file mode 100644 index 0000000..6b3e5d8 --- /dev/null +++ b/SocketBaseline/SocketBaseline.csproj @@ -0,0 +1,13 @@ + + + + Exe + net10.0 + enable + enable + true + false + true + + + diff --git a/Twinflow/TwinflowConnection.cs b/Twinflow/TwinflowConnection.cs index 2cf2dd2..4d3e215 100644 --- a/Twinflow/TwinflowConnection.cs +++ b/Twinflow/TwinflowConnection.cs @@ -62,7 +62,7 @@ public async Task RunOutputPump() int off = 0; while (off < seg.Length) { - int sent = TrySend(seg.Span.Slice(off), out bool wouldBlock, out bool closed); + int sent = TrySend(seg.Span[off..], out bool wouldBlock, out bool closed); if (closed) { fail = true; diff --git a/docs/blog/blog.css b/docs/blog/blog.css index 4d24e06..4c77021 100644 --- a/docs/blog/blog.css +++ b/docs/blog/blog.css @@ -281,13 +281,13 @@ nav .links a.active { color: var(--accent-zerg); } font-family: var(--font-mono); font-size: 0.9rem; } -.glossary div { margin-bottom: 0.35rem; } +.glossary div { display: flex; gap: 1rem; margin-bottom: 0.35rem; } .glossary div:last-child { margin-bottom: 0; } .glossary .term { color: var(--accent-zerg); font-weight: 700; - display: inline-block; - width: 3.5rem; + flex: 0 0 auto; + min-width: 4rem; } .glossary .def { color: var(--text-muted); } diff --git a/docs/blog/images/part-5-hero.png b/docs/blog/images/part-5-hero.png new file mode 100644 index 0000000..305088f Binary files /dev/null and b/docs/blog/images/part-5-hero.png differ diff --git a/docs/blog/index.html b/docs/blog/index.html index 44f6f6b..9e7cf21 100644 --- a/docs/blog/index.html +++ b/docs/blog/index.html @@ -25,30 +25,37 @@

Blog

Notes on io_uring, low-level networking and the internals of zerg.

    +
  • + + +
    C# Networking Deep Dive With io_uring — Part 5: io_uring and the Thread Pool
    +
    A rant, not the planned Kestrel integration. Why io_uring's frightening speed in the pinned-reactor model is conditional: it holds only while the handler never leaves the reactor, and the entire .NET backend (thread pool, async/await, Kestrel) is built on leaving it. The reactor deadlock when an off-reactor thread can't get its SQE submitted, Minima's eventfd wake, why SQPOLL only relocates the problem, and why epoll sidesteps the whole thing.
    +
    +
  • - +
    C# Networking Deep Dive With io_uring — Part 4: Zero Copy Receive
    A theoretical side-quest into io_uring zero copy receive (zcrx), where the NIC DMAs payload straight into registered user memory so the kernel never copies it. A high-level Minima vs MinimaZero comparison across buffer ownership, ZCRX_IFQ registration, RECV_ZC, the 32-byte CQE token, the refill queue, and why zcrx collapses the multi-reactor model to a single ring. Scaffold only, no runnable code, since there was no zcrx-capable NIC to test on.
  • - +
    C# Networking Deep Dive With io_uring — Part 3: Touching the Bytes
    Extending the async model from part 2 to expose the received bytes themselves. We add a byte* to each ring item, then introduce UnmanagedMemoryManager — a MemoryManager<byte> subclass that wraps an unmanaged pointer so the recv slabs can flow into the BCL Memory<byte> ecosystem (PipeReader, ReadOnlySequence, async APIs).
  • - +
    C# Networking Deep Dive With io_uring — Part 2: Bridge the Async Model
    Wrapping the io_uring loop with C# async/await via IValueTaskSource. A zero-allocation producer/consumer rendezvous between the kernel CQE dispatcher and the application handler, with a small SPSC ring that buffers back-to-back CQEs and a snapshot pattern that lets the handler drain in batches without chasing a moving tail.
  • - +
    io_uring from scratch in C# — Part 1
    Bypassing every abstraction and using the kernel interface directly for high-efficiency TCP networking on Linux. We build a minimum viable io_uring wrapper in C# and walk through the setup, the SQE/CQE protocol, and the basic event loop.
    diff --git a/docs/blog/io-uring-minima-part-1.html b/docs/blog/io-uring-minima-part-1.html index ac840ca..ba7d8d7 100644 --- a/docs/blog/io-uring-minima-part-1.html +++ b/docs/blog/io-uring-minima-part-1.html @@ -27,7 +27,7 @@

    io_uring from scratch in C# — Part 1

    - +
    diff --git a/docs/blog/io-uring-minima-part-2.html b/docs/blog/io-uring-minima-part-2.html index 46a3bfe..391ed5c 100644 --- a/docs/blog/io-uring-minima-part-2.html +++ b/docs/blog/io-uring-minima-part-2.html @@ -25,7 +25,7 @@

    C# Networking Deep Dive With io_uring — Part 2: Bridge the Async Model

    - +
    diff --git a/docs/blog/io-uring-minima-part-3.html b/docs/blog/io-uring-minima-part-3.html index 25db29d..5c19545 100644 --- a/docs/blog/io-uring-minima-part-3.html +++ b/docs/blog/io-uring-minima-part-3.html @@ -27,7 +27,7 @@

    C# Networking Deep Dive With io_uring — Part 3: Touching the Bytes

    - +
    diff --git a/docs/blog/io-uring-minima-part-4.html b/docs/blog/io-uring-minima-part-4.html index 379f82f..58d7ee1 100644 --- a/docs/blog/io-uring-minima-part-4.html +++ b/docs/blog/io-uring-minima-part-4.html @@ -27,7 +27,7 @@

    C# Networking Deep Dive With io_uring — Part 4: Zero Copy Receive

    - +
    diff --git a/docs/blog/io-uring-minima-part-5.html b/docs/blog/io-uring-minima-part-5.html new file mode 100644 index 0000000..8640fb3 --- /dev/null +++ b/docs/blog/io-uring-minima-part-5.html @@ -0,0 +1,101 @@ + + + + + + C# Networking Deep Dive With io_uring — Part 5 — zerg + + + + + +
    + ← All posts + +
    + A wide seascape where two bodies of water with different colours meet along a visible line, deep blue on one side and bright turquoise on the other, without mixing. + +
    +

    C# Networking Deep Dive With io_uring — Part 5: io_uring and the Thread Pool

    + +
    + +
    +
    SQESubmission Queue Entry
    +
    CQECompletion Queue Entry
    +
    SQPOLLkernel submission-queue poller
    +
    eventfdkernel event / wait fd
    +
    + +
    +

    Part 5 was going to be about integrating with Kestrel. I actually did it, the integration is done and tested, and it even came out ahead on some benchmarks, but overall the performance lands not far from Kestrel's stock Socket transport. So instead of a walkthrough, this turned into a rant about io_uring and the thread pool.

    + +

    This story doesn't begin with io_uring. To be honest with you, I love epoll (plot twist :o), and the reason I've been experimenting with and researching io_uring for 7 months now is to understand whether it's truly a better alternative to epoll... for networking.

    + +

    Now, don't get me wrong, io_uring is great. I love so many things about it. It was originally created for disk/file I/O and excels at it, but in my humble opinion it can be a mismatch for typical networking and back-end applications.

    + +

    So at this point you're probably wondering what the hell is going on, and why I'm saying this when so many people treat io_uring as their coca-cola in the desert. For benchmark enthusiasts who want to push and squeeze numbers, io_uring is indeed fast. But that's exactly where it shines, micro-benchmarks.

    + +

    io_uring is a perfect match for the reactor pattern we've been exploring in this series. It performs especially fast when the reactor is pinned to a thread for its entire lifetime, which, again, lines up perfectly with how IValueTaskSource works. I haven't shared any benchmarks with you yet, but let me tell you, Minima is fast. Frighteningly fast. I'll save the numbers for a future part.

    + +

    This speed comes with a shackle, though.

    + +

    io_uring's speed in this model (Minima's multi-reactor) is conditional. Two things have to hold at the same time, the reactor is the sole submitter of the ring (SINGLE_ISSUER with DEFER_TASKRUN), and the handler runs inline on the reactor thread, so the IValueTaskSource resumes without ever leaving it. Both hold only as long as the handler never leaves the reactor. But guess what? The entire .NET backend world is built on leaving it, the thread pool, async/await resuming off-thread, and of course Kestrel, whose whole model is "hand the connection to the pool." So the moment we await any real async work, the handler moves off the reactor thread, the response can no longer be submitted from that thread, and we're forced into a cross-thread handoff. And now that we're juggling multiple threads, every kind of race condition and deadlock becomes possible, and fixing them is not free.

    + +

    The reactor deadlock

    + +

    Putting an SQE in the ring and bumping the SQ tail does nothing by itself. The kernel only looks at the SQ when you call io_uring_enter (assuming no SQPOLL), which is an explicit syscall that, in Minima's model, only the reactor can make. The reactor's wake-up, however, is gated on completions (CQEs), the loop blocks in io_uring_enter(to_submit, min_complete=1, GETEVENTS), submits whatever is pending, and then sleeps in the kernel until at least one CQE is available.

    + +

    Let's dissect,

    + +
      +
    1. All connections are idle (keep-alive, no in-flight requests). Every reactor is asleep inside io_uring_enter, waiting for any completion.
    2. +
    3. A handler finishes on a pool thread and needs to send a response on connection C (owned by reactor R). It produces a SEND SQE and writes it into R's ring, bumping the tail.
    4. +
    5. But it does not call io_uring_enter (single-issuer, so only R may submit). The SEND SQE now sits in the ring, unsubmitted.
    6. +
    7. R won't run again until a CQE wakes it, and the only CQE that would wake it is the completion of that SEND, which is never submitted, because R is the one that submits and R is asleep. If C was the only connection with work, no other completion is coming.
    8. +
    + +

    So to sum up, the pool thread is waiting for the reactor to submit its SQE, and the reactor is waiting for a completion that only that submission would produce. Each waits on the other. Deadlock.

    + +

    There are ways to avoid this, though. One is to use a wait syscall that accepts a timeout, if no CQE arrives, it simply times out and the reactor loop comes back around. This is zerg's solution, and it has an obvious problem, we might hit that timeout too often. If the timeout is too large, latency takes a hit, if it's too small, we keep waking up for nothing when traffic is low, burning CPU.

    + +

    Minima's solution is more elegant, an eventfd wake. The reactor keeps a multishot poll armed on an eventfd inside its own ring. After enqueuing the work, the pool thread does a small write() to that eventfd, the write makes it readable, the armed poll turns that into a CQE, and the reactor wakes. It comes with an ironic cost though, an extra syscall. The very thing we were trying to escape from epoll.

    + +

    Can SQPOLL solve this problem?

    + +

    On paper, it does, a kernel thread polls the submission queue, so the pool thread's SQE gets picked up and submitted without the reactor ever calling io_uring_enter. No sleeping reactor to wake. But the irony kicks in again. The poller itself can go to sleep, and then it needs a wake-up of its own, which is the same problem all over again. On top of that, SQPOLL and DEFER_TASKRUN are mutually exclusive, so we'd surrender the very completion-batching that makes the model fast in the first place. So SQPOLL doesn't remove the wake, it relocates it, burns a kernel thread per reactor, and makes us give up DEFER_TASKRUN on the way.

    + +

    We could set SQPOLL so it never sleeps, or build a more complex mechanism to wake it automatically. I might explore that in a future part, but to be honest, io_uring with SQPOLL has been subpar in my own tests, so I'd rather just go with epoll.

    + +

    Now, epoll sidesteps all of this because it never separates doing the I/O from submitting it. send and recv are plain syscalls that any thread calls directly. A much cleaner model, no hacks.

    + +

    So how much faster than epoll is io_uring, really? On a micro-benchmark where the work isn't delegated to the thread pool, it can be a tad faster, 5-10% in my own benchmarks. But on a real workload, it's as fast at best, and given everything io_uring drags along (security concerns, restricted kernel access, recent-kernel requirements, and implementation complexity), as of today it just ain't worth it, in my opinion. I might change my mind with more research.

    + +

    Regardless, io_uring has its place when the handler/endpoint logic is lightweight, such as websockets, or the kind of mostly-synchronous workloads that reactor/event-loop servers like nginx and Redis are built for (and it's no accident that those live on an event-loop model).

    + +

    For the next parts I'll keep going with Minima and explore the reactor pattern and the send branch. Even though io_uring may not be a great fit for Kestrel, it can still shine in the right kind of application.

    +
    +
    +
    + + + + + + + + diff --git a/zerg.sln b/zerg.sln index f34baf4..eacd5b6 100644 --- a/zerg.sln +++ b/zerg.sln @@ -58,6 +58,10 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Twinflow", "Twinflow\Twinfl EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Twinflow.Demo", "Twinflow.Demo\Twinflow.Demo.csproj", "{90E3B96A-6BEB-42A1-9FF7-72C1CBE3CAF4}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "AspBaseline", "AspBaseline\AspBaseline.csproj", "{666B4039-BAB4-43E7-B133-69EA2B8778DE}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SocketBaseline", "SocketBaseline\SocketBaseline.csproj", "{D7731326-7DD1-4417-B803-C80850444208}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -416,6 +420,30 @@ Global {90E3B96A-6BEB-42A1-9FF7-72C1CBE3CAF4}.Release|x64.Build.0 = Release|Any CPU {90E3B96A-6BEB-42A1-9FF7-72C1CBE3CAF4}.Release|x86.ActiveCfg = Release|Any CPU {90E3B96A-6BEB-42A1-9FF7-72C1CBE3CAF4}.Release|x86.Build.0 = Release|Any CPU + {666B4039-BAB4-43E7-B133-69EA2B8778DE}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {666B4039-BAB4-43E7-B133-69EA2B8778DE}.Debug|Any CPU.Build.0 = Debug|Any CPU + {666B4039-BAB4-43E7-B133-69EA2B8778DE}.Debug|x64.ActiveCfg = Debug|Any CPU + {666B4039-BAB4-43E7-B133-69EA2B8778DE}.Debug|x64.Build.0 = Debug|Any CPU + {666B4039-BAB4-43E7-B133-69EA2B8778DE}.Debug|x86.ActiveCfg = Debug|Any CPU + {666B4039-BAB4-43E7-B133-69EA2B8778DE}.Debug|x86.Build.0 = Debug|Any CPU + {666B4039-BAB4-43E7-B133-69EA2B8778DE}.Release|Any CPU.ActiveCfg = Release|Any CPU + {666B4039-BAB4-43E7-B133-69EA2B8778DE}.Release|Any CPU.Build.0 = Release|Any CPU + {666B4039-BAB4-43E7-B133-69EA2B8778DE}.Release|x64.ActiveCfg = Release|Any CPU + {666B4039-BAB4-43E7-B133-69EA2B8778DE}.Release|x64.Build.0 = Release|Any CPU + {666B4039-BAB4-43E7-B133-69EA2B8778DE}.Release|x86.ActiveCfg = Release|Any CPU + {666B4039-BAB4-43E7-B133-69EA2B8778DE}.Release|x86.Build.0 = Release|Any CPU + {D7731326-7DD1-4417-B803-C80850444208}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {D7731326-7DD1-4417-B803-C80850444208}.Debug|Any CPU.Build.0 = Debug|Any CPU + {D7731326-7DD1-4417-B803-C80850444208}.Debug|x64.ActiveCfg = Debug|Any CPU + {D7731326-7DD1-4417-B803-C80850444208}.Debug|x64.Build.0 = Debug|Any CPU + {D7731326-7DD1-4417-B803-C80850444208}.Debug|x86.ActiveCfg = Debug|Any CPU + {D7731326-7DD1-4417-B803-C80850444208}.Debug|x86.Build.0 = Debug|Any CPU + {D7731326-7DD1-4417-B803-C80850444208}.Release|Any CPU.ActiveCfg = Release|Any CPU + {D7731326-7DD1-4417-B803-C80850444208}.Release|Any CPU.Build.0 = Release|Any CPU + {D7731326-7DD1-4417-B803-C80850444208}.Release|x64.ActiveCfg = Release|Any CPU + {D7731326-7DD1-4417-B803-C80850444208}.Release|x64.Build.0 = Release|Any CPU + {D7731326-7DD1-4417-B803-C80850444208}.Release|x86.ActiveCfg = Release|Any CPU + {D7731326-7DD1-4417-B803-C80850444208}.Release|x86.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE